PyTorch
ssl-aasist
custom_code
Commit fb0facd · verified · committed by ash56 · 1 Parent(s): 9742bb8

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. fairseq/examples/mms/lid_rerank/mms/merge_by_lang.py +33 -0
  2. fairseq/examples/mms/lid_rerank/mms/prep_wav_list.py +23 -0
  3. fairseq/examples/mms/lid_rerank/mms/split_by_lang.py +90 -0
  4. fairseq/examples/mms/lid_rerank/nllb/infer.py +46 -0
  5. fairseq/examples/mms/lid_rerank/rerank/rerank.py +132 -0
  6. fairseq/examples/mms/lid_rerank/rerank/tune_coefficients.py +138 -0
  7. fairseq/examples/mms/lid_rerank/whisper/infer_lid.py +65 -0
  8. fairseq/examples/moe_lm/data_card.md +221 -0
  9. fairseq/examples/moe_lm/model_card.md +170 -0
  10. fairseq/examples/mr_hubert/README.md +187 -0
  11. fairseq/examples/mr_hubert/config/decode/infer.yaml +30 -0
  12. fairseq/examples/mr_hubert/config/decode/infer_lm.yaml +37 -0
  13. fairseq/examples/mr_hubert/config/decode/run/submitit_slurm.yaml +17 -0
  14. fairseq/examples/mr_hubert/config/decode/run/submitit_slurm_8gpu.yaml +17 -0
  15. fairseq/examples/mr_hubert/config/finetune/base_100h.yaml +97 -0
  16. fairseq/examples/mr_hubert/config/finetune/base_100h_large.yaml +97 -0
  17. fairseq/examples/mr_hubert/config/finetune/base_10h.yaml +101 -0
  18. fairseq/examples/mr_hubert/config/finetune/base_10h_large.yaml +101 -0
  19. fairseq/examples/mr_hubert/config/finetune/base_1h.yaml +100 -0
  20. fairseq/examples/mr_hubert/config/finetune/base_1h_large.yaml +99 -0
  21. fairseq/examples/mr_hubert/config/pretrain/mrhubert_base_librispeech.yaml +103 -0
  22. fairseq/examples/mr_hubert/config/pretrain/mrhubert_large_librilight.yaml +107 -0
  23. fairseq/examples/mr_hubert/config/pretrain/run/submitit_reg.yaml +20 -0
  24. fairseq/examples/mr_hubert/train.sh +45 -0
  25. fairseq/examples/multilingual/ML50_langs.txt +52 -0
  26. fairseq/examples/multilingual/README.md +158 -0
  27. fairseq/examples/multilingual/data_scripts/README.md +24 -0
  28. fairseq/examples/multilingual/data_scripts/binarize.py +200 -0
  29. fairseq/examples/multilingual/data_scripts/check_iswlt_test_data.py +67 -0
  30. fairseq/examples/multilingual/data_scripts/check_self_overlaps.py +103 -0
  31. fairseq/examples/multilingual/data_scripts/check_valid_test_overlaps.py +124 -0
  32. fairseq/examples/multilingual/data_scripts/dedup_all.py +52 -0
  33. fairseq/examples/multilingual/data_scripts/download_ML50_v1.sh +30 -0
  34. fairseq/examples/multilingual/data_scripts/download_af_xh.sh +164 -0
  35. fairseq/examples/multilingual/data_scripts/download_flores_data.sh +246 -0
  36. fairseq/examples/multilingual/data_scripts/download_iitb.sh +35 -0
  37. fairseq/examples/multilingual/data_scripts/download_iwslt_and_extract.sh +225 -0
  38. fairseq/examples/multilingual/data_scripts/download_lotus.sh +46 -0
  39. fairseq/examples/multilingual/data_scripts/download_ted_and_extract.py +338 -0
  40. fairseq/examples/multilingual/data_scripts/download_wat19_my.sh +36 -0
  41. fairseq/examples/multilingual/data_scripts/download_wmt19_and_before.py +899 -0
  42. fairseq/examples/multilingual/data_scripts/download_wmt20.sh +547 -0
  43. fairseq/examples/multilingual/data_scripts/preprocess_ML50_v1.sh +27 -0
  44. fairseq/examples/multilingual/data_scripts/remove_valid_test_in_train.py +290 -0
  45. fairseq/examples/multilingual/data_scripts/requirement.txt +2 -0
  46. fairseq/examples/multilingual/data_scripts/utils/dedup.py +41 -0
  47. fairseq/examples/multilingual/data_scripts/utils/fasttext_multi_filter.py +63 -0
  48. fairseq/examples/multilingual/data_scripts/utils/strip_sgm.sh +1 -0
  49. fairseq/examples/multilingual/finetune_multilingual_model.sh +32 -0
  50. fairseq/examples/multilingual/multilingual_fairseq_gen.sh +26 -0
fairseq/examples/mms/lid_rerank/mms/merge_by_lang.py ADDED
@@ -0,0 +1,33 @@
+ import argparse
+ import json
+ from collections import defaultdict
+ import os
+ import soundfile as sf
+ from tqdm import tqdm
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description='Example argument parser')
+     parser.add_argument('--exp', type=str)
+     parser.add_argument('--dump', type=str)
+     args = parser.parse_args()
+
+     langs = [d for d in os.listdir(args.dump) if os.path.isdir(os.path.join(args.dump, d))]
+
+     data = {}
+
+     for lang in langs:
+         ids = [int(x.strip()) for x in open(args.dump + "/" + lang + "/ids.txt", "r").readlines()]
+         word_hyps = [x.strip() for x in open(args.exp + "/" + lang + "/hypo.word.reord", "r").readlines()]
+         scores = [x.strip() for x in open(args.exp + "/" + lang + "/asr_score.reord", "r").readlines()]
+         assert len(ids) == len(word_hyps)
+         assert len(ids) == len(scores)
+         for id, word_hyp, s in zip(ids, word_hyps, scores):
+             if id in data:
+                 print("Duplicate ID found")
+                 import pdb;pdb.set_trace()
+             data[id] = (word_hyp, s)
+
+     with open(args.exp + "/nbest_asr_hyp", "w") as f1, open(args.exp + "/asr_score", "w") as f2:
+         for i in range(len(data.keys())):
+             f1.write(data[i][0] + "\n")
+             f2.write(data[i][1] + "\n")
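The script above reassembles per-language decoding outputs into a single, globally ordered n-best list. Below is a minimal, self-contained sketch of that id-keyed merge; the per-language inputs are made-up stand-ins for `<dump>/<lang>/ids.txt`, `<exp>/<lang>/hypo.word.reord` and `<exp>/<lang>/asr_score.reord`.

```python
# Illustrative sketch of the merge performed by merge_by_lang.py, with in-memory
# data instead of the on-disk per-language files.
per_lang = {
    "eng": {"ids": [0, 2], "hyps": ["hello world", "good morning"], "scores": ["-1.2", "-0.8"]},
    "fra": {"ids": [1],    "hyps": ["bonjour"],                     "scores": ["-0.5"]},
}

merged = {}
for lang, d in per_lang.items():
    assert len(d["ids"]) == len(d["hyps"]) == len(d["scores"])
    for i, hyp, score in zip(d["ids"], d["hyps"], d["scores"]):
        assert i not in merged, f"duplicate id {i}"
        merged[i] = (hyp, score)

# Ids are assumed to be contiguous 0..N-1, so iterating over range(len(merged))
# restores the original (pre-split) utterance order.
for i in range(len(merged)):
    print(merged[i][0], merged[i][1], sep="\t")
```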
fairseq/examples/mms/lid_rerank/mms/prep_wav_list.py ADDED
@@ -0,0 +1,23 @@
+ import soundfile as sf
+ import argparse
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description='Example argument parser')
+     parser.add_argument('--src', type=str)
+     parser.add_argument('--dst', type=str)
+     args = parser.parse_args()
+
+     wavs = [x.strip() for x in open(args.src, "r").readlines()]
+
+     new_lines = ["/"]
+     for wav in wavs:
+         # Read the wav file
+         data, sample_rate = sf.read(wav)
+
+         # Number of samples is the length of the data array
+         num_samples = len(data)
+
+         new_lines.append(wav+"\t"+str(num_samples))
+
+     with open(args.dst, "w") as f:
+         f.writelines([x+"\n" for x in new_lines])
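The output of `prep_wav_list.py` is a fairseq-style audio manifest: a root line followed by one `<path>\t<num_samples>` row per wav. A rough sketch of that format, with hypothetical paths and sample counts standing in for the values soundfile would read from disk:

```python
# Sketch of the manifest format prep_wav_list.py emits: the first line is the
# root ("/" here, because the wav paths are absolute), then one
# "<path>\t<num_samples>" row per file. Paths and sample counts are hypothetical.
rows = ["/"]
for path, num_samples in [("/data/audio/utt1.wav", 160000),
                          ("/data/audio/utt2.wav", 95200)]:
    rows.append(f"{path}\t{num_samples}")

print("\n".join(rows))
```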
fairseq/examples/mms/lid_rerank/mms/split_by_lang.py ADDED
@@ -0,0 +1,90 @@
+ import argparse
+ import json
+ from collections import defaultdict
+ import os
+ import soundfile as sf
+ from tqdm import tqdm
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description='Example argument parser')
+     parser.add_argument('--wavs_tsv', type=str)
+     parser.add_argument('--lid_preds', type=str)
+     parser.add_argument('--dst', type=str)
+     parser.add_argument('--refs', type=str, default=None)
+     parser.add_argument('--langs', type=str, default=None)
+     parser.add_argument('--confs', type=str, default=None)
+     args = parser.parse_args()
+
+     # split wavs into dst/lang/wav.txt and dst/lang/ids.txt
+     # uses lid_preds to create topk asr; 1 wav has k different lid
+
+     wavs_tsv = [x for x in open(args.wavs_tsv, "r").readlines()]
+     root = wavs_tsv[0]
+     wavs = wavs_tsv[1:]
+     lid_preds = [eval(x) for x in open(args.lid_preds, "r").readlines()]
+     if args.refs is not None:
+         refs = [x.strip() for x in open(args.refs, "r").readlines()]
+         assert len(wavs) == len(refs)
+         refs_filt = []
+     if args.langs is not None:
+         langs = [x.strip() for x in open(args.langs, "r").readlines()]
+         assert len(wavs) == len(langs)
+         langs_filt = []
+     if args.confs is not None:
+         confs = [x.strip() for x in open(args.confs, "r").readlines()]
+         assert len(wavs) == len(confs)
+         confs_filt = []
+
+     assert len(wavs) == len(lid_preds)
+
+     topk_wavs = []
+     topk_langs = []
+
+     for i, (w, p) in enumerate(zip(wavs, lid_preds)):
+         if p == "n/a":
+             continue
+
+         assert len(p) == len(lid_preds[0])
+
+         for l, _ in p:
+             topk_wavs.append(w)
+             topk_langs.append(l)
+
+         if args.refs is not None:
+             refs_filt.append(refs[i])
+         if args.langs is not None:
+             langs_filt.append(langs[i])
+         if args.confs is not None:
+             confs_filt.append(confs[i])
+
+     lang_split = defaultdict(list)
+     for id, (wav,lid) in enumerate(zip(topk_wavs, topk_langs)):
+         lang_split[lid].append((id, wav))
+
+     for lang in tqdm(lang_split.keys()):
+         if not os.path.exists(args.dst + "/" + lang):
+             os.makedirs(args.dst + "/" + lang)
+
+         with open(args.dst + "/" + lang + "/test.tsv", "w") as f1, \
+              open(args.dst + "/" + lang + "/ids.txt", "w") as f2:
+             f1.write(root)
+             f1.writelines([x[1] for x in lang_split[lang]])
+             f2.writelines([str(x[0]) + "\n" for x in lang_split[lang]])
+
+         with open(args.dst + "/" + lang + "/test.ltr", "w") as fw:
+             fw.write("d u m m y | d u m m y |\n"*len(lang_split[lang]))
+         with open(args.dst + "/" + lang + "/test.wrd", "w") as fw:
+             fw.write("dummy dummy\n"*len(lang_split[lang]))
+
+     with open(args.dst + "/lid.txt", "w") as f:
+         f.writelines([x+"\n" for x in topk_langs])
+
+     if args.refs is not None:
+         with open(args.dst + "/refs.txt", "w") as f:
+             f.writelines([x+"\n" for x in refs_filt])
+     if args.langs is not None:
+         with open(args.dst + "/langs.txt", "w") as f:
+             f.writelines([x+"\n" for x in langs_filt])
+     if args.confs is not None:
+         with open(args.dst + "/confs.txt", "w") as f:
+             f.writelines([x+"\n" for x in confs_filt])
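`split_by_lang.py` expands every utterance into its top-k LID candidates and buckets the resulting (global index, wav) pairs by candidate language, so each language directory can be decoded independently and merged back later by id. A small illustrative sketch of that bucketing (wav paths and LID predictions are invented):

```python
from collections import defaultdict

# Every utterance is paired with each of its top-k LID candidates, and the
# (global_index, wav) entries are grouped by candidate language.
wavs = ["utt0.wav", "utt1.wav"]
lid_preds = [[("eng", 0.7), ("fra", 0.2)],   # top-k (lang, prob) per utterance
             [("deu", 0.6), ("eng", 0.3)]]

topk_wavs, topk_langs = [], []
for wav, preds in zip(wavs, lid_preds):
    for lang, _prob in preds:
        topk_wavs.append(wav)
        topk_langs.append(lang)

lang_split = defaultdict(list)
for idx, (wav, lang) in enumerate(zip(topk_wavs, topk_langs)):
    lang_split[lang].append((idx, wav))   # idx recovers the n-best position later

for lang, entries in lang_split.items():
    print(lang, entries)
```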
fairseq/examples/mms/lid_rerank/nllb/infer.py ADDED
@@ -0,0 +1,46 @@
+ #!/usr/bin/env python3
+ # -*- encoding: utf8 -*-
+ import fasttext
+ from tqdm import tqdm
+ import argparse
+ import os
+ import math
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--txt", type=str)
+ parser.add_argument("--dst", type=str)
+ parser.add_argument("--model", type=str)
+ parser.add_argument('--lid', type=str)
+ args = parser.parse_args()
+
+ mapping = {"arb":"ara", "azj":"aze", "pes":"fas", "fuv":"ful", "lvs":"lav", "khk":"mon", "zsm":"zlm", "gaz":"orm", "pbt":"pus", "uzn":"uzb", "zho":"cmn"}
+
+ def fix_code(x):
+     code = x.split("_")[-2]
+     if code in mapping:
+         code = mapping[code]
+     return code
+
+ if __name__ == "__main__":
+     if not os.path.exists(args.dst):
+         os.makedirs(args.dst)
+
+     pretrained_lang_model = args.model
+     model = fasttext.load_model(pretrained_lang_model)
+
+     txts = [x.strip() for x in open(args.txt, "r").readlines()]
+     lids = [x.strip() for x in open(args.lid, "r").readlines()]
+     assert len(txts) == len(lids)
+
+     with open(args.dst + "/wlid_score", "w") as f:
+         for t,l in tqdm(zip(txts, lids)):
+             predictions = model.predict(t, k=218) # max 218
+             predictions = [(fix_code(x), y) for x, y in zip(predictions[0], predictions[1])]
+
+             try:
+                 pred_langs = [x[0] for x in predictions]
+                 idx = pred_langs.index(l)
+                 score = math.log(predictions[idx][-1])
+             except:
+                 score = -1000
+             f.write(str(score) + "\n")
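The score written to `wlid_score` is a written-LID score: the log-probability that the fasttext text-LID model assigns to the hypothesized language for a candidate transcript, with a floor of -1000 when that language is missing from the ranking. A sketch of the scoring rule, with the fasttext ranking replaced by a hard-coded list so the example runs without the model binary:

```python
import math

# Stand-in for the fasttext output: a ranked list of (language, probability).
def wlid_score(ranked_predictions, hyp_lang):
    langs = [lang for lang, _ in ranked_predictions]
    try:
        prob = ranked_predictions[langs.index(hyp_lang)][1]
        return math.log(prob)
    except ValueError:
        # Hypothesized language not in the ranking: large negative fallback.
        return -1000

fake_ranking = [("eng", 0.85), ("deu", 0.10), ("fra", 0.03)]
print(wlid_score(fake_ranking, "eng"))   # about -0.16
print(wlid_score(fake_ranking, "swh"))   # -1000 fallback
```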
fairseq/examples/mms/lid_rerank/rerank/rerank.py ADDED
@@ -0,0 +1,132 @@
+ import argparse
+ import json
+ from collections import defaultdict
+ import os
+ from tqdm import tqdm
+ import sys
+ import subprocess
+ import re
+ import math
+ import numpy as np
+ import editdistance
+ from sklearn.preprocessing import StandardScaler
+ from multiprocessing import Pool
+ from functools import partial
+ import random
+
+ cer_langs = [x.strip() for x in open("cer_langs.txt", "r").readlines()]
+
+ def select(w, feats, ref_lid, nbest_lid, ref_asr, nbest_asr, n=10, exclude=None):
+     assert len(w) == len(feats[0])
+     scores = []
+     for f in feats:
+         s = 0
+         for i in range(len(w)):
+             s += w[i]*f[i]
+         scores.append(s)
+
+     lid_correct = 0
+     lid_total = 0
+     asr_err = 0
+     asr_total = 0
+     text = []
+     lang = []
+
+     for i in range(len(ref_lid)):
+         if exclude is not None:
+             if ref_lid[i] in exclude:
+                 continue
+
+         start_idx = i * n
+         end_idx = start_idx + n
+         cand_scores = scores[start_idx:end_idx]
+         max_idx, max_val = max(enumerate(cand_scores), key=lambda x: x[1])
+
+         cand_feats = feats[start_idx:end_idx]
+
+         lang.append(nbest_lid[start_idx:end_idx][max_idx])
+         if ref_lid[i] == nbest_lid[start_idx:end_idx][max_idx]:
+             lid_correct += 1
+         lid_total += 1
+
+         hyp = nbest_asr[start_idx:end_idx][max_idx]
+         text.append(hyp)
+         ref = ref_asr[i]
+         hyp = hyp.lower()
+         ref = ref.lower()
+         hyp = hyp.replace(".", "").replace(",", "").replace("?", "").replace("!", "").replace(":", "").replace(")", "").replace("(", "").replace("-", "")
+         ref = ref.replace(".", "").replace(",", "").replace("?", "").replace("!", "").replace(":", "").replace(")", "").replace("(", "").replace("-", "")
+         if ref_lid[i] in cer_langs:
+             hyp = " ".join(hyp)
+             ref = " ".join(ref)
+
+         hyp_words = hyp.split()
+         tgt_words = ref.split()
+         errs = editdistance.eval(hyp_words, tgt_words)
+         asr_err += errs
+         asr_total += len(tgt_words)
+
+     results = {"lid_acc": lid_correct / lid_total, "asr_wer": asr_err / asr_total, "weights": w}
+
+     return results, text, lang
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description='Example argument parser')
+     parser.add_argument('--slid', type=str)
+     parser.add_argument('--wlid', type=str)
+     parser.add_argument('--asr', type=str)
+     parser.add_argument('--lm', type=str)
+     parser.add_argument('--uasr', type=str)
+     parser.add_argument('--n', type=int, default=10)
+     parser.add_argument('--dst', type=str)
+     parser.add_argument('--ref_lid', type=str)
+     parser.add_argument('--nbest_lid', type=str)
+     parser.add_argument('--ref_asr', type=str)
+     parser.add_argument('--nbest_asr', type=str)
+     parser.add_argument('--w', type=str)
+     parser.add_argument('--tag', type=str, default = None)
+     parser.add_argument('--exclude', nargs="*", default=None) # exclude langs
+     args = parser.parse_args()
+
+     slid = [float(x.strip()) for x in open(args.slid, "r").readlines()]
+     wlid = [float(x.strip()) for x in open(args.wlid, "r").readlines()]
+     asr = [float(x.strip()) for x in open(args.asr, "r").readlines()]
+     lm = [float(x.strip()) for x in open(args.lm, "r").readlines()]
+     uasr = [float(x.strip()) for x in open(args.uasr, "r").readlines()]
+
+     assert len(slid) == len(wlid)
+     assert len(wlid) == len(asr)
+     assert len(asr) == len(lm)
+     assert len(lm) == len(uasr)
+
+     ref_lid = [x.strip() for x in open(args.ref_lid, "r").readlines()]
+     nbest_lid = [x.strip() for x in open(args.nbest_lid, "r").readlines()]
+     ref_asr = [x.strip() for x in open(args.ref_asr, "r").readlines()]
+     nbest_asr = [x.strip() for x in open(args.nbest_asr, "r").readlines()]
+
+     assert len(ref_lid) * args.n == len(nbest_lid)
+     assert len(ref_asr) * args.n == len(nbest_asr)
+     assert len(ref_lid) == len(ref_asr)
+
+     lengths = [len(x) for x in nbest_asr]
+
+     feats = [[s, w, a, l, u, le] for s,w,a,l,u,le in zip(slid, wlid, asr, lm, uasr, lengths)]
+
+     weight = eval(open(args.w, "r").read())['weights']
+
+     results, text, lang = select(weight, feats, ref_lid, nbest_lid, ref_asr, nbest_asr, n=args.n, exclude=args.exclude)
+
+     if args.tag is not None:
+         tag_text = "." + args.tag
+     else:
+         tag_text = ""
+
+     with open(args.dst + "/reranked_1best_asr_hyp" + tag_text, "w") as f_out:
+         f_out.writelines([x+"\n" for x in text])
+
+     with open(args.dst + "/reranked_1best_lid" + tag_text, "w") as f_out:
+         f_out.writelines([x+"\n" for x in lang])
+
+     with open(args.dst + "/text.result" + tag_text, "w") as f_out:
+         for k in results.keys():
+             f_out.write(k + "\t" + str(results[k]) + "\n")
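The heart of `rerank.py` is a weighted linear combination of the per-candidate features (spoken LID, written LID, ASR, LM and u-ASR scores plus hypothesis length): within each block of `n` candidates for an utterance, the candidate with the highest combined score wins. A compact sketch of that selection rule with made-up weights and features:

```python
# Each utterance contributes a contiguous block of n candidates; each candidate
# has one feature vector; the candidate with the largest weighted sum wins.
def pick_best(weights, feats, n):
    combined = [sum(w * f for w, f in zip(weights, fv)) for fv in feats]
    picks = []
    for i in range(len(feats) // n):
        block = combined[i * n:(i + 1) * n]
        best_idx, _ = max(enumerate(block), key=lambda x: x[1])
        picks.append(i * n + best_idx)   # global index of the winning candidate
    return picks

weights = [1.0, 0.5, 0.2, 0.2, 0.1, -0.01]          # one weight per feature
feats = [[0.9, 0.8, -3.0, -2.0, -4.0, 42],          # utterance 0, candidate 0
         [0.2, 0.1, -2.5, -2.2, -4.1, 40],          # utterance 0, candidate 1
         [0.7, 0.6, -3.2, -2.1, -3.9, 38],          # utterance 1, candidate 0
         [0.8, 0.9, -2.9, -1.9, -3.8, 41]]          # utterance 1, candidate 1
print(pick_best(weights, feats, n=2))               # [0, 3]
```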
fairseq/examples/mms/lid_rerank/rerank/tune_coefficients.py ADDED
@@ -0,0 +1,138 @@
+ import argparse
+ import os
+ from tqdm import tqdm
+ import numpy as np
+ import editdistance
+ from multiprocessing import Pool
+ from functools import partial
+
+ cer_langs = [x.strip() for x in open("cer_langs.txt", "r").readlines()]
+
+ def compute(w, feats, ref_lid, nbest_lid, ref_asr, nbest_asr, n=10, exclude=None):
+     assert len(w) == len(feats[0])
+     scores = []
+     for f in feats:
+         s = 0
+         for i in range(len(w)):
+             s += w[i]*f[i]
+         scores.append(s)
+
+     lid_correct = 0
+     lid_total = 0
+     asr_err = 0
+     asr_total = 0
+
+     for i in range(len(ref_lid)):
+         if exclude is not None:
+             if ref_lid[i] in exclude:
+                 continue
+
+         start_idx = i * n
+         end_idx = start_idx + n
+         cand_scores = scores[start_idx:end_idx]
+         max_idx, max_val = max(enumerate(cand_scores), key=lambda x: x[1])
+
+         if ref_lid[i] == nbest_lid[start_idx:end_idx][max_idx]:
+             lid_correct += 1
+         lid_total += 1
+
+         hyp = nbest_asr[start_idx:end_idx][max_idx]
+         ref = ref_asr[i]
+         hyp = hyp.lower()
+         ref = ref.lower()
+         hyp = hyp.replace(".", "").replace(",", "").replace("?", "").replace("!", "").replace(":", "").replace(")", "").replace("(", "").replace("-", "")
+         ref = ref.replace(".", "").replace(",", "").replace("?", "").replace("!", "").replace(":", "").replace(")", "").replace("(", "").replace("-", "")
+         if ref_lid[i] in cer_langs:
+             hyp = " ".join(hyp)
+             ref = " ".join(ref)
+
+         hyp_words = hyp.split()
+         tgt_words = ref.split()
+         errs = editdistance.eval(hyp_words, tgt_words)
+         asr_err += errs
+         asr_total += len(tgt_words)
+
+     return {"lid_acc": lid_correct / lid_total, "asr_wer": asr_err / asr_total, "weights": w}
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description='Example argument parser')
+     parser.add_argument('--slid', type=str)
+     parser.add_argument('--wlid', type=str)
+     parser.add_argument('--asr', type=str)
+     parser.add_argument('--lm', type=str)
+     parser.add_argument('--uasr', type=str)
+     parser.add_argument('--n', type=int, default=10)
+     parser.add_argument('--dst', type=str)
+     parser.add_argument('--ref_lid', type=str)
+     parser.add_argument('--nbest_lid', type=str)
+     parser.add_argument('--ref_asr', type=str)
+     parser.add_argument('--nbest_asr', type=str)
+     parser.add_argument('--iters', type=int, default=10000)
+     parser.add_argument('--slid_scale', type=int, default = 100)
+     parser.add_argument('--wlid_scale', type=int, default = 100)
+     parser.add_argument('--asr_scale', type=int, default = 10)
+     parser.add_argument('--lm_scale', type=int, default = 10)
+     parser.add_argument('--uasr_scale', type=int, default = 10)
+     parser.add_argument('--len_scale', type=int, default = 1)
+     parser.add_argument('--num_jobs', type=int, default = 64)
+     parser.add_argument('--exclude', nargs="*", default=None) # exclude langs
+     args = parser.parse_args()
+
+     slid = [float(x.strip()) for x in open(args.slid, "r").readlines()]
+     wlid = [float(x.strip()) for x in open(args.wlid, "r").readlines()]
+     asr = [float(x.strip()) for x in open(args.asr, "r").readlines()]
+     lm = [float(x.strip()) for x in open(args.lm, "r").readlines()]
+     uasr = [float(x.strip()) for x in open(args.uasr, "r").readlines()]
+
+     assert len(slid) == len(wlid)
+     assert len(wlid) == len(asr)
+     assert len(asr) == len(lm)
+     assert len(lm) == len(uasr)
+
+     ref_lid = [x.strip() for x in open(args.ref_lid, "r").readlines()]
+     nbest_lid = [x.strip() for x in open(args.nbest_lid, "r").readlines()]
+     ref_asr = [x.strip() for x in open(args.ref_asr, "r").readlines()]
+     nbest_asr = [x.strip() for x in open(args.nbest_asr, "r").readlines()]
+
+     assert len(ref_lid) * args.n == len(nbest_lid)
+     assert len(ref_asr) * args.n == len(nbest_asr)
+     assert len(ref_lid) == len(ref_asr)
+
+     lengths = [len(x) for x in nbest_asr]
+
+     feats = [[s, w, a, l, u, le] for s,w,a,l,u,le in zip(slid, wlid, asr, lm, uasr, lengths)]
+
+     weights = []
+     for i in range(args.iters):
+         s_w = np.random.rand() * args.slid_scale
+         w_w = np.random.rand() * args.wlid_scale
+         a_w = np.random.rand() * args.asr_scale
+         l_w = np.random.rand() * args.lm_scale
+         u_w = np.random.rand() * args.uasr_scale
+         le_w = (np.random.rand() -0.5) * args.len_scale
+         weights.append([s_w, w_w, a_w, l_w, u_w, le_w])
+
+     num_tries = len(weights)
+     print("Total number of search points", num_tries)
+     threads = args.num_jobs
+     pool = Pool(threads)
+     compute_fxn = partial(compute, feats=feats, ref_lid=ref_lid, nbest_lid=nbest_lid, ref_asr=ref_asr, nbest_asr=nbest_asr, n=args.n, exclude=args.exclude)
+     results = pool.map(compute_fxn, weights)
+     pool.close()
+     pool.join()
+
+     assert len(results) == len(weights)
+
+     wer_best = 100
+     best = ""
+     if not os.path.exists(args.dst):
+         os.makedirs(args.dst)
+     with open(args.dst + "/results.all", "w") as f_out:
+         for result in results:
+             f_out.write(str(result)+"\n")
+             if result["asr_wer"] < wer_best:
+                 wer_best = result["asr_wer"]
+                 best = result
+
+     with open(args.dst + "/best_coefficients", "w") as f_out:
+         f_out.write(str(best)+"\n")
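Coefficient tuning here is plain random search: draw many candidate weight vectors (one coefficient per feature, each scaled by its own range), score every draw on the development set in a worker pool, and keep the weights with the lowest WER. A self-contained sketch of that loop, with the dev-set scoring replaced by a toy objective so it runs without any data files:

```python
import numpy as np
from multiprocessing import Pool

# Stand-in for compute(): pretend the ideal weight vector is all ones.
def toy_objective(weights):
    return {"asr_wer": float(np.sum((np.asarray(weights) - 1.0) ** 2)),
            "weights": weights}

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    scales = [100, 100, 10, 10, 10, 1]                 # per-feature search ranges
    trials = [list(rng.random(len(scales)) * scales) for _ in range(200)]

    # Score all trial weight vectors in parallel and keep the lowest "WER".
    with Pool(4) as pool:
        results = pool.map(toy_objective, trials)

    best = min(results, key=lambda r: r["asr_wer"])
    print(best["asr_wer"], best["weights"])
```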
fairseq/examples/mms/lid_rerank/whisper/infer_lid.py ADDED
@@ -0,0 +1,65 @@
+ #!/usr/bin/env python3
+ # -*- encoding: utf8 -*-
+ import argparse
+ import itertools
+ import os
+ import re
+ import sys
+ from pathlib import Path
+ import math
+
+ import whisper
+ from tqdm import tqdm
+
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--wavs", type=str)
+ parser.add_argument("--dst", type=str)
+ parser.add_argument("--model", type=str)
+ parser.add_argument("--n", type=int, default=10)
+ parser.add_argument("--mapping", type=str, default="whisper/lid_mapping.txt")
+ args = parser.parse_args()
+
+ if __name__ == "__main__":
+     model = whisper.load_model(args.model)
+
+     print(args)
+
+     wavs = [x.strip() for x in open(args.wavs, "r").readlines()]
+     if not os.path.exists(args.dst):
+         os.makedirs(args.dst)
+
+     if args.mapping is not None:
+         # whisper_lid_code:mms_lid_code
+         mapping = {x[0]:x[1] for x in [l.strip().split(";", 1) for l in open(args.mapping, "r").readlines()]}
+     else:
+         mapping = None
+
+     with open(args.dst + "/predictions", "w") as f:
+         for wav in tqdm(wavs):
+             # load audio and pad/trim it to fit 30 seconds
+             audio = whisper.load_audio(wav)
+             audio = whisper.pad_or_trim(audio)
+
+             # make log-Mel spectrogram and move to the same device as the model
+             mel = whisper.log_mel_spectrogram(audio).to(model.device)
+
+             _, probs = model.detect_language(mel)
+             result = sorted(probs.items(), key=lambda x:x[1], reverse=True)[:args.n]
+             f.write(str(result) + "\n")
+
+     lid_preds = [eval(x) for x in open(args.dst + "/predictions", "r").readlines()]
+     lids = []
+     scores = []
+     for p in lid_preds:
+         assert len(p) == len(lid_preds[0])
+         for l, s in p:
+             if args.mapping is not None:
+                 lids.append(mapping[l])
+             else:
+                 lids.append(l)
+             scores.append(math.log(s))
+     with open(args.dst + "/nbest_lid", "w") as f:
+         f.writelines([x+"\n" for x in lids])
+     with open(args.dst + "/slid_score", "w") as f:
+         f.writelines([str(x)+"\n" for x in scores])
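After writing the raw `predictions` file, the script maps Whisper language codes to MMS codes through the `whisper_code;mms_code` table in `lid_mapping.txt` and stores each candidate's probability as a log-score. A small sketch of that post-processing with an illustrative mapping and n-best list:

```python
import math

# The mapping rows and the n-best list below are illustrative stand-ins for
# whisper/lid_mapping.txt and one utterance's output of detect_language.
mapping_rows = ["en;eng", "de;deu", "fr;fra"]
mapping = dict(row.strip().split(";", 1) for row in mapping_rows)

nbest = [("en", 0.93), ("de", 0.04), ("fr", 0.01)]   # (whisper_code, probability)
lids = [mapping[lang] for lang, _ in nbest]
slid_scores = [math.log(prob) for _, prob in nbest]

print(lids)          # ['eng', 'deu', 'fra']  -> nbest_lid
print(slid_scores)   # log-probabilities      -> slid_score
```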
fairseq/examples/moe_lm/data_card.md ADDED
@@ -0,0 +1,221 @@
+ # Data card for the paper "Efficient Large Scale Language Modeling with Mixtures of Experts"
+ ## Version 1.0.0
+
+ We follow the recommendations of Gebru et al. (2018) and provide a datacard for the dataset used to train the 1.1T parameter model.
+
+ ## Motivation
+ * **For what purpose was the dataset created? Was there a specific task in mind? Was there a specific gap that needed to be filled? Please provide a description.**
+ The pre-training data for the 1.1T model was created as a union of six English-language datasets, including the five datasets used by RoBERTa (Liu et al., 2019) and the English subset of CC100. The purpose of creating this dataset was to pre-train the language model.
+
+ * **Who created the dataset (e.g., which team, research group) and on behalf of which entity (e.g., company, institution, organization)?**
+ FAIR (Fundamental Artificial Intelligence Research)
+
+ * **Who funded the creation of the dataset? If there is an associated grant, please provide the name of the grantor and the grant name and number.**
+ FAIR (Fundamental Artificial Intelligence Research)
+
+ * **Any other comments?**
+ No.
+
+ ## Composition
+
+ * **What do the instances that comprise the dataset represent (e.g., documents, photos, people, countries)? Are there multiple types of instances (e.g., movies, users, and ratings; people and interactions between them; nodes and edges)? Please provide a description.**
+ The instances are textual documents. The overall dataset is composed from a union of the following datasets:
+ * BookCorpus (Zhu et al., 2019) consists of more than 10K unpublished books (4GB);
+ * English Wikipedia, excluding lists, tables and headers (12GB);
+ * CC-News (Nagel, 2016) contains 63 million English news articles crawled between September 2016 and February 2019 (76GB);
+ * OpenWebText (Gokaslan and Cohen, 2019), an open source recreation of the WebText dataset used to train GPT-2 (38GB);
+ * CC-Stories (Trinh and Le, 2018) contains a subset of CommonCrawl data filtered to match the story-like style of Winograd schemas (31GB);
+ * English CC100 (Wenzek et al., 2020), a dataset extracted from CommonCrawl snapshots between January 2018 and December 2018, filtered to match the style of Wikipedia (292GB).
+
+ * **How many instances are there in total (of each type, if appropriate)?**
+ The training data contains 112B tokens corresponding to 453 GB of data.
+
+ * **Does the dataset contain all possible instances or is it a sample (not necessarily random) of instances from a larger set? If the dataset is a sample, then what is the larger set? Is the sample representative of the larger set (e.g., geographic coverage)? If so, please describe how this representativeness was validated/verified. If it is not representative of the larger set, please describe why not (e.g., to cover a more diverse range of instances, because instances were withheld or unavailable).**
+ The English CC100 section of the dataset is a subset of CommonCrawl snapshots extracted between January 2018 and December 2018, filtered to match the style of Wikipedia. The CC-Stories dataset contains a subset of CommonCrawl data filtered to match the story-like style of Winograd schemas.
+
+ * **What data does each instance consist of? “Raw” data (e.g., unprocessed text or images) or features? In either case, please provide a description.**
+ Each instance consists of raw text data.
+
+ * **Is there a label or target associated with each instance? If so, please provide a description.**
+ No.
+
+ * **Is any information missing from individual instances? If so, please provide a description, explaining why this information is missing (e.g., because it was unavailable). This does not include intentionally removed information, but might include, e.g., redacted text.**
+ No.
+
+ * **Are relationships between individual instances made explicit (e.g., users' movie ratings, social network links)? If so, please describe how these relationships are made explicit.**
+ There are no explicit relationships between individual instances.
+
+ * **Are there recommended data splits (e.g., training, development/validation, testing)? If so, please provide a description of these splits, explaining the rationale behind them.**
+ We hold out a random validation set of approximately 150MB from the pretraining data, sampled proportionally to each dataset's size in the pretraining corpus.
+
+ * **Are there any errors, sources of noise, or redundancies in the dataset? If so, please provide a description.**
+ N/A
+
+ * **Is the dataset self-contained, or does it link to or otherwise rely on external resources (e.g., websites, tweets, other datasets)?**
+ It is self-contained.
+
+ * **Does the dataset contain data that might be considered confidential (e.g., data that is protected by legal privilege or by doctor-patient confidentiality, data that includes the content of individuals' non-public communications)? If so, please provide a description.**
+ The datasets used are publicly available, and the information in them is not considered confidential.
+
+ * **Does the dataset contain data that, if viewed directly, might be offensive, insulting, threatening, or might otherwise cause anxiety? If so, please describe why.**
+ Parts of the dataset are a subset of public Common Crawl data, which could contain sentences that, if viewed directly, might be offensive, insulting, threatening, or might otherwise cause anxiety.
+
+ * **Does the dataset relate to people? If not, you may skip the remaining questions in this section.**
+ Some documents in this dataset relate to people, such as news articles, Wikipedia descriptions, etc.
+
+ * **Does the dataset identify any subpopulations (e.g., by age, gender)? If so, please describe how these subpopulations are identified and provide a description of their respective distributions within the dataset.**
+ No.
+
+ * **Is it possible to identify individuals (i.e., one or more natural persons), either directly or indirectly (i.e., in combination with other data) from the dataset? If so, please describe how.**
+ In addition to individuals who have Wikipedia pages (celebrities, politicians, etc.), it may be possible to identify other individuals by their names, Twitter account names, etc. if that information is present in Common Crawl.
+
+ * **Does the dataset contain data that might be considered sensitive in any way (e.g., data that reveals racial or ethnic origins, sexual orientations, religious beliefs, political opinions or union memberships, or locations; financial or health data; biometric or genetic data; forms of government identification, such as social security numbers; criminal history)? If so, please provide a description.**
+ The training dataset is partially derived from Common Crawl, which may contain some sensitive information.
+
+ * **Any other comments?**
+ No.
+
+
+ ## Collection Process
+
+ * **How was the data associated with each instance acquired? Was the data directly observable (e.g., raw text, movie ratings), reported by subjects (e.g., survey responses), or indirectly inferred/derived from other data (e.g., part-of-speech tags, model-based guesses for age or language)? If data was reported by subjects or indirectly inferred/derived from other data, was the data validated/verified? If so, please describe how.**
+ N/A. The dataset is a union of six publicly available datasets.
+
+ * **What mechanisms or procedures were used to collect the data (e.g., hardware apparatus or sensor, manual human curation, software program, software API)? How were these mechanisms or procedures validated?**
+ N/A
+
+ * **If the dataset is a sample from a larger set, what was the sampling strategy (e.g., deterministic, probabilistic with specific sampling probabilities)?**
+ Please refer to the main document for details.
+
+ * **Who was involved in the data collection process (e.g., students, crowdworkers, contractors) and how were they compensated (e.g., how much were crowdworkers paid)?**
+ This data is mined, filtered and sampled by machines.
+
+ * **Over what timeframe was the data collected? Does this timeframe match the creation timeframe of the data associated with the instances (e.g., recent crawl of old news articles)? If not, please describe the timeframe in which the data associated with the instances was created.**
+ Different parts of the dataset were mined over different time periods.
+ 1. The CC-News dataset contains English news articles crawled between September 2016 and February 2019.
+ 2. The English CC100 dataset was extracted from CommonCrawl snapshots between January 2018 and December 2018.
+
+ * **Were any ethical review processes conducted (e.g., by an institutional review board)? If so, please provide a description of these review processes, including the outcomes, as well as a link or other access point to any supporting documentation.**
+ No.
+
+ * **Does the dataset relate to people? If not, you may skip the remainder of the questions in this section.**
+ No.
+
+ * **Did you collect the data from the individuals in question directly, or obtain it via third parties or other sources (e.g., websites)?**
+ N/A
+
+ * **Were the individuals in question notified about the data collection? If so, please describe (or show with screenshots or other information) how notice was provided, and provide a link or other access point to, or otherwise reproduce, the exact language of the notification itself.**
+ N/A
+
+ * **Did the individuals in question consent to the collection and use of their data? If so, please describe (or show with screenshots or other information) how consent was requested and provided, and provide a link or other access point to, or otherwise reproduce, the exact language to which the individuals consented.**
+ N/A
+
+ * **If consent was obtained, were the consenting individuals provided with a mechanism to revoke their consent in the future or for certain uses? If so, please provide a description, as well as a link or other access point to the mechanism (if appropriate).**
+ N/A
+
+ * **Has an analysis of the potential impact of the dataset and its use on data subjects (e.g., a data protection impact analysis) been conducted? If so, please provide a description of this analysis, including the outcomes, as well as a link or other access point to any supporting documentation.**
+ Some responsible AI related evaluations were performed. Please refer to the main document and the model card for the paper.
+
+ * **Any other comments?**
+ No.
+
+
+ ## Preprocessing/cleaning/labeling
+
+
+ * **Was any preprocessing/cleaning/labeling of the data done (e.g., discretization or bucketing, tokenization, part-of-speech tagging, SIFT feature extraction, removal of instances, processing of missing values)? If so, please provide a description. If not, you may skip the remainder of the questions in this section.**
+ The component datasets went through standard cleaning and re-formatting practices, including removing repetitive/non-informative text such as "Chapter One" or "This ebook by Project Gutenberg".
+
+ * **Was the “raw” data saved in addition to the preprocessed/cleaned/labeled data (e.g., to support unanticipated future uses)? If so, please provide a link or other access point to the “raw” data.**
+ The "raw" component datasets are publicly available in their respective locations (more details can be found in the respective papers linked in the references).
+
+ * **Is the software used to preprocess/clean/label the instances available? If so, please provide a link or other access point.**
+ The software is proprietary to Meta Platforms and currently unavailable publicly.
+
+ * **Any other comments?**
+ No.
+
+
+ ## Uses
+
+ * **Has the dataset been used for any tasks already? If so, please provide a description.**
+ Yes, this dataset was used to pre-train the models described in the paper.
+
+ * **Is there a repository that links to any or all papers or systems that use the dataset? If so, please provide a link or other access point.**
+ No.
+
+ * **What (other) tasks could the dataset be used for?**
+ This data can be used to pretrain English language models, which are the foundation of many current and future language tasks.
+
+ * **Is there anything about the composition of the dataset or the way it was collected and preprocessed/cleaned/labeled that might impact future uses? For example, is there anything that a future user might need to know to avoid uses that could result in unfair treatment of individuals or groups (e.g., stereotyping, quality of service issues) or other undesirable harms (e.g., financial harms, legal risks)? If so, please provide a description. Is there anything a future user could do to mitigate these undesirable harms?**
+ The pipeline for creating this dataset paves the way for building a scalable infrastructure for mining datasets to be used for training large-scale models.
+
+ * **Are there tasks for which the dataset should not be used? If so, please provide a description.**
+ No.
+
+ * **Any other comments?**
+ No.
+
+ ## Distribution
+
+
+ * **Will the dataset be distributed to third parties outside of the entity (e.g., company, institution, organization) on behalf of which the dataset was created? If so, please provide a description.**
+ No.
+
+ * **How will the dataset be distributed (e.g., tarball on website, API, GitHub)? Does the dataset have a digital object identifier (DOI)?**
+ N/A
+
+ * **When will the dataset be distributed?**
+ N/A
+
+ * **Will the dataset be distributed under a copyright or other intellectual property (IP) license, and/or under applicable terms of use (ToU)? If so, please describe this license and/or ToU, and provide a link or other access point to, or otherwise reproduce, any relevant licensing terms or ToU, as well as any fees associated with these restrictions.**
+ No.
+
+ * **Have any third parties imposed IP-based or other restrictions on the data associated with the instances? If so, please describe these restrictions, and provide a link or other access point to, or otherwise reproduce, any relevant licensing terms, as well as any fees associated with these restrictions.**
+ No.
+
+ * **Do any export controls or other regulatory restrictions apply to the dataset or to individual instances? If so, please describe these restrictions, and provide a link or other access point to, or otherwise reproduce, any supporting documentation.**
+ N/A
+
+ * **Any other comments?**
+ No.
+
+ ## Maintenance
+
+ * **Who is supporting/hosting/maintaining the dataset?**
+ FAIR (Fundamental Artificial Intelligence Research)
+
+ * **How can the owner/curator/manager of the dataset be contacted (e.g., email address)?**
+ Refer to the main document.
+
+ * **Is there an erratum? If so, please provide a link or other access point.**
+ N/A
+
+ * **Will the dataset be updated (e.g., to correct labeling errors, add new instances, delete instances)? If so, please describe how often, by whom, and how updates will be communicated to users (e.g., mailing list, GitHub)?**
+ There is no plan for updating the dataset.
+
+ * **If the dataset relates to people, are there applicable limits on the retention of the data associated with the instances (e.g., were individuals in question told that their data would be retained for a fixed period of time and then deleted)? If so, please describe these limits and explain how they will be enforced.**
+ N/A
+
+ * **Will older versions of the dataset continue to be supported/hosted/maintained? If so, please describe how. If not, please describe how its obsolescence will be communicated to users.**
+ N/A
+
+ * **If others want to extend/augment/build on/contribute to the dataset, is there a mechanism for them to do so? If so, please provide a description. Will these contributions be validated/verified? If so, please describe how. If not, why not? Is there a process for communicating/distributing these contributions to other users? If so, please provide a description.**
+ No.
+
+ * **Any other comments?**
+ No.
+
+ ## References
+ Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. RoBERTa: A robustly optimized BERT pretraining approach. arXiv preprint arXiv:1907.11692.
+
+ Yukun Zhu, Ryan Kiros, Richard Zemel, Ruslan Salakhutdinov, Raquel Urtasun, Antonio Torralba, and Sanja Fidler. 2019. Aligning books and movies: Towards story-like visual explanations by watching movies and reading books. arXiv:1506.06724.
+
+ Sebastian Nagel. 2016. CC-News. http://web.archive.org/save/http://commoncrawl.org/2016/10/news-dataset-available.
+
+ Aaron Gokaslan and Vanya Cohen. 2019. OpenWebText corpus. http://web.archive.org/save/http://Skylion007.github.io/OpenWebTextCorpus
+
+ Trieu H Trinh and Quoc V Le. 2018. A simple method for commonsense reasoning. arXiv preprint arXiv:1806.02847.
+
+ Guillaume Wenzek, Marie-Anne Lachaux, Alexis Conneau, Vishrav Chaudhary, Francisco Guzmán, Armand Joulin, and Edouard Grave. 2020. CCNet: Extracting high quality monolingual datasets from web crawl data. In Proceedings of the 12th Language Resources and Evaluation Conference, pages 4003–4012, Marseille, France. European Language Resources Association.
+
fairseq/examples/moe_lm/model_card.md ADDED
@@ -0,0 +1,170 @@
+ # Model card for the paper "Efficient Large Scale Language Modeling with Mixtures of Experts"
+ ## Version 1.0.0
+
+ ### Model developer
+ FAIR (Fundamental Artificial Intelligence Research)
+
+ ### Model type
+ An autoregressive English language model trained on a union of six English-language datasets. We explore dense and sparse (MoE-based) architectures in the paper.
+ * Dense models - Our dense models range from 125M parameters to 13B parameters.
+ * Sparse (MoE) models - Our MoE-based models range from 15B parameters to 1.1 trillion parameters.
+ This model card focuses on the 1.1 trillion parameter model, but the discussion
+ applies to all of the models explored in this work.
+
+ ### Citation details
+ Artetxe et al. (2021): Efficient Large Scale Language Modeling with Mixtures of Experts
+
+ ### Model Feedback Channel
+ fairseq
+
+ ## Intended use
+ ### Primary intended use
+ For research purposes only, e.g. reproducing model evaluation results. Generation is only used in a limited capacity for explanation/justification or for prompting/probing/priming for class labels.
+
+ ### Out of scope uses
+ The primary purpose of the model is not to generate language, although the model is capable of doing that.
+
+ ## Factors influencing model performance
+ This section discusses potential risks associated with using the model.
+
+ ### Relevant factors
+ Based on known problems with NLP technology, potential relevant factors include bias (gender, profession, race and religion).
+
+ ### Evaluation factors
+ The 1.1T model was evaluated on the StereoSet and CrowS-Pairs datasets to quantify encoded bias in the model.
+
+ ## Metrics
+ ### Model performance measures
+ The 1.1T parameter model was primarily evaluated on:
+ 1. In-domain and out-of-domain language modeling perplexity.
+ 2. Zero-shot and few-shot priming.
+ 3. Fully supervised finetuning.
+
+ ### Approaches to handle uncertainty
+ For few-shot learning, we report the average results across 25 runs, randomly sampling a different set of few-shot examples from the training set each time.
+
+ ## Evaluation data
+ ## Zero Shot evaluation
+
+ ### HellaSwag
+ #### Description
+ HellaSwag is a dataset for evaluating commonsense reasoning.
+
+ ### PIQA
+ #### Description
+ PIQA is a dataset designed to evaluate reasoning about physical commonsense in natural language.
+
+ ### ReCoRD
+ #### Description
+ Reading Comprehension with Commonsense Reasoning Dataset (ReCoRD) is a large-scale reading comprehension dataset which requires commonsense reasoning. ReCoRD consists of queries automatically generated from CNN/Daily Mail news articles; the answer to each query is a text span from a summarizing passage of the corresponding news article. The goal of ReCoRD is to evaluate a machine's ability to perform commonsense reasoning in reading comprehension.
+
+ ## Few Shot evaluation
+ ### Winogrande
+ #### Description
+ Winogrande is a benchmark for commonsense reasoning. The dataset contains pronoun resolution problems originally designed to be unsolvable for statistical models that rely on selectional preferences or word associations.
+
+ ### StoryCloze
+ #### Description
+ StoryCloze is a new commonsense reasoning framework for evaluating story understanding, story generation, and script learning. This test requires a system to choose the correct ending to a four-sentence story.
+
+ ### OpenBookQA
+ #### Description
+ OpenBookQA is a new kind of question-answering dataset modeled after open book exams for assessing human understanding of a subject. It consists of 5,957 multiple-choice elementary-level science questions (4,957 train, 500 dev, 500 test), which probe the understanding of a small “book” of 1,326 core science facts and the application of these facts to novel situations.
+
+ ## Fully supervised evaluation
+
+ ### BoolQ
+ #### Description
+ BoolQ is a question answering dataset for yes/no questions containing 15,942 examples. These questions are naturally occurring – they are generated in unprompted and unconstrained settings. Each example is a triplet of (question, passage, answer), with the title of the page as optional additional context.
+
+ ### SST-2
+ #### Description
+ SST-2 (or SST-binary) is a binary classification dataset where the goal is to differentiate between negative or somewhat negative vs. somewhat positive or positive.
+
+ ### MNLI
+ #### Description
+ The Multi-Genre Natural Language Inference (MultiNLI) corpus is a crowd-sourced collection of 433k sentence pairs annotated with textual entailment information. The corpus is modeled on the SNLI corpus, but differs in that it covers a range of genres of spoken and written text, and supports a distinctive cross-genre generalization evaluation.
+
+ ## Responsible AI (RAI) evaluation
+ ### StereoSet
+ #### Description
+ A large-scale natural dataset in English to measure stereotypical biases in four domains: gender, profession, race, and religion.
+
+ #### Motivation for dataset use
+ The motivation for evaluating the 1.1T parameter model on this dataset is to evaluate the model's stereotype bias in gender, profession, race, and religion.
+
+ ### CrowS-Pairs
+ #### Description
+ A challenge dataset for measuring social biases in masked language models.
+
+ #### Motivation for dataset use
+ The motivation for evaluating the 1.1T parameter model on this dataset is to evaluate the model's bias in the domains of race, religion and age.
+
+ ----
+
+ ## Training data
+ ### BookCorpus
+ #### Description
+ A dataset consisting of more than 10K unpublished books. 4GB in size. (Zhu et al., 2019)
+
+ ### English Wikipedia
+ #### Description
+ Data from English Wikipedia, excluding lists, tables and headers. 12GB in size.
+
+ ### CC-News
+ #### Description
+ A dataset containing 63 million English news articles crawled between September 2016 and February 2019. 76GB in size. (Nagel, 2016)
+
+ ### OpenWebText
+ #### Description
+ An open source recreation of the WebText dataset used to train GPT-2. 38GB in size. (Gokaslan and Cohen, 2019)
+
+ ### CC-Stories
+ #### Description
+ A dataset containing a subset of CommonCrawl data filtered to match the story-like style of Winograd schemas. 31GB in size. (Trinh and Le, 2018)
+
+ ### English CC100
+ #### Description
+ A dataset extracted from CommonCrawl snapshots between January 2018 and December 2018, filtered to match the style of Wikipedia following the methodology introduced in CCNet (https://arxiv.org/abs/1911.00359). 292GB in size. (Wenzek et al., 2020)
+
+ ## Responsible AI (RAI) Dimensions
+ ### Fairness (Bias and inclusion)
+ The 1.1T parameter model was evaluated on the StereoSet and CrowS-Pairs datasets for inherent bias in the model, and bias as a result of the data. Similar to StereoSet, we observe that both the dense and MoE models get worse in terms of the Stereotype Score (SS) with scale.
+
+ ### Privacy and security
+ The 1.1T model did not have any special privacy and security considerations. The training data and evaluation data were both public and went through standard Meta privacy and licensing procedures.
+
+ ### Transparency and control
+ In the spirit of transparency and accountability, we have created this model card for the 1.1T parameter model and a data card for the training data (referenced in Artetxe et al. (2021)).
+
+ ### Efficiency (Green AI)
+ The 1.1T parameter model is trained as a Mixture of Experts (MoE) model. Mixture of experts (MoE) models are efficient because they leverage sparse computation, i.e., only a small fraction of the parameters are active for any given input. For instance, our 1.1T parameter MoE model requires only 30% more FLOPS compared to a 6.7B parameter dense model, i.e., a 160x increase in parameters with only a 30% increase in FLOPS. Notably, MoE models achieve much better validation perplexity for a given compute budget compared to dense models.
+
+ ## References
+ Rowan Zellers, Ari Holtzman, Yonatan Bisk, Ali Farhadi, and Yejin Choi. 2019. HellaSwag: Can a machine really finish your sentence? In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pages 4791–4800, Florence, Italy. Association for Computational Linguistics.
+
+ Yonatan Bisk, Rowan Zellers, Ronan Le Bras, Jianfeng Gao, and Yejin Choi. 2020. PIQA: Reasoning about physical commonsense in natural language. Proceedings of the AAAI Conference on Artificial Intelligence, 34(05):7432–7439.
+
+ Sheng Zhang, Xiaodong Liu, Jingjing Liu, Jianfeng Gao, Kevin Duh, and Benjamin Van Durme. 2018. ReCoRD: Bridging the gap between human and machine commonsense reading comprehension. arXiv preprint arXiv:1810.12885.
+
+ Keisuke Sakaguchi, Ronan Le Bras, Chandra Bhagavatula, and Yejin Choi. 2020. WinoGrande: An adversarial Winograd schema challenge at scale. Proceedings of the AAAI Conference on Artificial Intelligence, 34(05):8732–8740.
+
+ Nasrin Mostafazadeh, Nathanael Chambers, Xiaodong He, Devi Parikh, Dhruv Batra, Lucy Vanderwende, Pushmeet Kohli, and James Allen. 2016. A corpus and cloze evaluation for deeper understanding of commonsense stories. In Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pages 839–849, San Diego, California. Association for Computational Linguistics.
+
+ Todor Mihaylov, Peter Clark, Tushar Khot, and Ashish Sabharwal. 2018. Can a suit of armor conduct electricity? A new dataset for open book question answering. In Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pages 2381–2391, Brussels, Belgium. Association for Computational Linguistics.
+
+ Christopher Clark, Kenton Lee, Ming-Wei Chang, Tom Kwiatkowski, Michael Collins, and Kristina Toutanova. 2019. BoolQ: Exploring the surprising difficulty of natural yes/no questions.
+
+ Moin Nadeem, Anna Bethke, and Siva Reddy. 2021. StereoSet: Measuring stereotypical bias in pretrained language models. In Association for Computational Linguistics (ACL).
+
+ Nikita Nangia, Clara Vania, Rasika Bhalerao, and Samuel R. Bowman. 2020. CrowS-Pairs: A challenge dataset for measuring social biases in masked language models. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pages 1953–1967, Online. Association for Computational Linguistics.
+
+ Yukun Zhu, Ryan Kiros, Richard Zemel, Ruslan Salakhutdinov, Raquel Urtasun, Antonio Torralba, and Sanja Fidler. 2019. Aligning books and movies: Towards story-like visual explanations by watching movies and reading books. arXiv:1506.06724.
+
+ Sebastian Nagel. 2016. CC-News. http://web.archive.org/save/http://commoncrawl.org/2016/10/news-dataset-available.
+
+ Aaron Gokaslan and Vanya Cohen. 2019. OpenWebText corpus. http://web.archive.org/save/http://Skylion007.github.io/OpenWebTextCorpus
+
+ Trieu H Trinh and Quoc V Le. 2018. A simple method for commonsense reasoning. arXiv preprint arXiv:1806.02847.
+
+ Guillaume Wenzek, Marie-Anne Lachaux, Alexis Conneau, Vishrav Chaudhary, Francisco Guzmán, Armand Joulin, and Edouard Grave. 2020. CCNet: Extracting high quality monolingual datasets from web crawl data. In Proceedings of the 12th Language Resources and Evaluation Conference, pages 4003–4012, Marseille, France. European Language Resources Association.
fairseq/examples/mr_hubert/README.md ADDED
@@ -0,0 +1,187 @@
+ # MR-HuBERT
+
+ ## Pre-trained models
+
+ ### Main models
+ Model | Pretraining Data | Download | Paper Reference
+ |---|---|---|---
+ MR-HuBERT Base (~97M) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/mono_base/mrhubert_mono_base.pt) | mono\_base
+ MR-HuBERT Large (~321M) | [Libri-Light](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/mono_large/mrhubert_mono_large.pt) | mono\_large
+ Multilingual MR-HuBERT Base (~97M) | [Voxpopuli](https://github.com/facebookresearch/voxpopuli) 100k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/multi_base/multi_base.pt) | multi\_base
+ Multilingual MR-HuBERT Large (~321M) | [Voxpopuli](https://github.com/facebookresearch/voxpopuli) 100k hr | [download 400k steps](https://dl.fbaipublicfiles.com/mrhubert/multi_large/multi_large_400k.pt) or [download 600k steps](https://dl.fbaipublicfiles.com/mrhubert/multi_large/multi_large_600k.pt) | Not in the paper
+
+
+ ### Ablation models
+ Model | Pretraining Data | Download | Paper Reference
+ |---|---|---|---
+ MR-HuBERT Base (2-4-6 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b1-a/b1-a.pt) | (B.1)-a
+ MR-HuBERT Base (5-2-5 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b1-b/b1-b.pt) | (B.1)-b
+ MR-HuBERT Base (6-4-2 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b1-c/b1-c.pt) | (B.1)-c
+ MR-HuBERT Base (3res 3-2-2-2-3 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b2-a/b2-a.pt) | (B.2)-a
+ MR-HuBERT Base (3res 2-2-4-2-2 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b2-b/b2-b.pt) | (B.2)-b
+ MR-HuBERT Base (3res 2-2-2-2-2 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b2-c/b2-c.pt) | (B.2)-c
+ MR-HuBERT Base (Simple sampling) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b3-a/b3-a.pt) | (B.3)-a
+ MR-HuBERT Base (Single target) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b4-a/b4-a.pt) | (B.4)-a
+ MR-HuBERT Base (Simple sampling + single target) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b4-b/b4-b.pt) | (B.4)-b
+ MR-HuBERT Base (Mono-resolution 20ms) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b5-a/b5-a.pt) | (B.5)-a
+ MR-HuBERT Base (3-3-3 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b6-a/b6-a.pt) | (B.6)-a
+ MR-HuBERT Base (Mono-resolution 20ms, 3-3-3 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b6-b/b6-b.pt) | (B.6)-b
+ MR-HuBERT Base (HuBERT 20ms&40ms units) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b7-a/b7-a.pt) | (B.7)-a
+ MR-HuBERT Base (Encodec 50Hz unit) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b7-b/b7-b.pt) | (B.7)-b
+ MR-HuBERT Base (Encodec 50Hz units and 25Hz units) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b7-c/b7-c.pt) | (B.7)-c
+ MR-HuBERT Base (Encodec 50Hz units stream 0&1) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b7-d/b7-d.pt) | (B.7)-d
+ MR-HuBERT Large (no audio norm) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-a/b8-a.pt) | (B.8)-a
+ MR-HuBERT Large (check paper) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-b/b8-b.pt) | (B.8)-b
+ MR-HuBERT Large (check paper) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-c/b8-c.pt) | (B.8)-c
+ MR-HuBERT Large (check paper) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-d/b8-d.pt) | (B.8)-d
+ MR-HuBERT Large (check paper) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-e/b8-e.pt) | (B.8)-e
+ MR-HuBERT Large (check paper ) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-e/b8-e.pt) | (B.8)-e
38
+ MR-HuBERT Large (check paper ) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-f/b8-f.pt) | (B.8)-f
39
+ MR-HuBERT Large (check paper ) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-g/b8-g.pt) | (B.8)-g
40
+ MR-HuBERT Large (check paper ) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-h/b8-h.pt) | (B.8)-h
41
+ MR-HuBERT Large (check paper ) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-i/b8-i.pt) | (B.8)-i
42
+ MR-HuBERT Large (check paper ) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-j/b8-j.pt) | (B.8)-j
43
+ Multilingual MR-HuBERT Large (Simple sampling) | [Voxpopuli](https://github.com/facebookresearch/voxpopuli) 100k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/multi_large_simple/multi_large_simple.pt) | Not in paper
44
+ MR-HuBERT xLarge (from HuBERT-base label) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/mono_xlarge/v1.pt) | Not in paper
45
+ MR-HuBERT xLarge (from HuBERT-large label) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/mono_xlarge/v2.pt) | Not in paper
46
+
47
+ ## Load a model
48
+ ```python
49
+ import fairseq
+
+ ckpt_path = "/path/to/the/checkpoint.pt"
50
+ models, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
51
+ model = models[0]
52
+ ```
53
+
54
+ ## Train a new model
55
+
56
+ ### Data preparation
57
+
58
+ Follow the steps in `./simple_kmeans` to create:
59
+ - `{train,valid}.tsv` waveform list files with length information
60
+ ```
61
+ /path/to/your/audio/files
62
+ file1.wav\t160000
63
+ file2.wav\t154600
64
+ ...
65
+ filen.wav\t54362
66
+ ```
67
+ `{train,valid}.km` frame-aligned pseudo-label files (in the same order as the waveform files in the tsv file).
68
+ ```
69
+ 44 44 44 48 48 962 962 962 962 962 962 962 962 967 967 967 967 967 967 967 967 370 852 370 ... 18 18 745 745
70
+ 44 44 44 48 48 962 962 962 147 147 147 147 147 147 147 147 147 147 147 147 176 176 271 271 ... 27 27 745 745
71
+ ...
72
+ 44 44 44 48 962 962 962 962 962 962 377 377 377 77 77 852 696 694 433 578 578 82 740 622 ... 27 27 745 745
73
+ ```
74
+ `dict.km.txt` a dummy dictionary (the first column is the label id, the second is a dummy count)
75
+ ```
76
+ 0 1
77
+ 1 1
78
+ 2 1
79
+ ...
80
+ 999 1
81
+ ```
82
+
83
+ The `label_rate` is the same as the feature frame rate used for clustering,
84
+ which is 100Hz for MFCC features and 50Hz for HuBERT features by default.
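+
+ As an optional sanity check (a minimal sketch, not part of the official tooling), the snippet below verifies that each pseudo-label file has one line per waveform listed in the corresponding tsv file and writes the dummy `dict.km.txt`; it assumes 1000 k-means units (matching the example dictionary above) and hypothetical `/path/to/data` and `/path/to/labels` directories.
+
+ ```python
+ # Hypothetical paths; adjust to your setup.
+ DATA_DIR = "/path/to/data"
+ LABEL_DIR = "/path/to/labels"
+ NUM_CLUSTERS = 1000  # assumption: number of k-means units used for the labels
+
+ for split in ["train", "valid"]:
+     with open(f"{DATA_DIR}/{split}.tsv") as f:
+         # the first line of the tsv is the audio root directory, the rest are waveforms
+         num_wavs = sum(1 for _ in f) - 1
+     with open(f"{LABEL_DIR}/{split}.km") as f:
+         num_labels = sum(1 for _ in f)
+     assert num_wavs == num_labels, f"{split}: {num_wavs} waveforms vs {num_labels} label lines"
+
+ # dummy dictionary: "<id> 1" per unit
+ with open(f"{LABEL_DIR}/dict.km.txt", "w") as f:
+     for i in range(NUM_CLUSTERS):
+         f.write(f"{i} 1\n")
+ ```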
85
+
86
+ ### Pre-train a MR-HuBERT model
87
+
88
+ Suppose `{train,valid}.tsv` are saved at `/path/to/data`, `{train,valid}.km`
89
+ are saved at `/path/to/labels`, and the label rate is 100Hz.
90
+
91
+ To train a base model (12 layer transformer), run:
92
+ ```sh
93
+ $ python fairseq_cli/hydra_train.py \
94
+ --config-dir /path/to/fairseq-py/examples/mr_hubert/config/pretrain \
95
+ --config-name mrhubert_base_librispeech \
96
+ task.data=/path/to/data task.label_dir=/path/to/labels \
97
+ task.labels='["km"]' model.label_rate=100 \
98
+ task.label_rate_ratios='[1, 2]'
99
+ ```
100
+
101
+ Please see the sample pre-training script `train.sh` for an example.
102
+
103
+ ### Fine-tune a MR-HuBERT model with a CTC loss
104
+
105
+ Suppose `{train,valid}.tsv` are saved at `/path/to/data`, and their
106
+ corresponding character transcripts `{train,valid}.ltr` are saved at
107
+ `/path/to/trans`. A typical ltr file follows the same order as the waveform files in the tsv, e.g.
108
+ ```
109
+ HOW | ARE | YOU
110
+ ...
111
+ THANK | YOU
112
+ ```
113
+
114
+ To fine-tune a pre-trained MR-HuBERT model at `/path/to/checkpoint`, run
115
+ ```sh
116
+ $ python fairseq_cli/hydra_train.py \
117
+ --config-dir /path/to/fairseq-py/examples/mr_hubert/config/finetune \
118
+ --config-name base_10h \
119
+ task.data=/path/to/data task.label_dir=/path/to/trans \
120
+ model.w2v_path=/path/to/checkpoint
121
+ ```
122
+
123
+ Please see the sample fine-tuning script `finetune.sh` for an example.
124
+
125
+ ### Decode a MR-HuBERT model
126
+
127
+ Suppose the `test.tsv` and `test.ltr` are the waveform list and transcripts of
128
+ the split to be decoded, saved at `/path/to/data`, and the fine-tuned model is
129
+ saved at `/path/to/checkpoint`.
130
+
131
+
132
+ We support three decoding modes:
133
+ - Viterbi decoding: greedy decoding without a language model
134
+ - KenLM decoding: decoding with an arpa-format KenLM n-gram language model
135
+ - Fairseq-LM decoding: decoding with a Fairseq neural language model (not fully tested)
136
+
137
+
138
+ #### Viterbi decoding
139
+
140
+ `task.normalize` needs to be consistent with the value used during fine-tuning.
141
+ Decoding results will be saved at
142
+ `/path/to/experiment/directory/decode/viterbi/test`.
143
+
144
+ ```sh
145
+ $ python examples/speech_recognition/new/infer.py \
146
+ --config-dir /path/to/fairseq-py/examples/mr_hubert/config/decode \
147
+ --config-name infer \
148
+ task.data=/path/to/data \
149
+ task.normalize=[true|false] \
150
+ decoding.exp_dir=/path/to/experiment/directory \
151
+ common_eval.path=/path/to/checkpoint \
152
+ dataset.gen_subset=test
153
+ ```
154
+
155
+ #### KenLM / Fairseq-LM decoding
156
+
157
+ Suppose the pronunciation lexicon and the n-gram LM are saved at
158
+ `/path/to/lexicon` and `/path/to/arpa`, respectively. Decoding results will be
159
+ saved at `/path/to/experiment/directory/decode/kenlm/test`.
160
+
161
+ ```sh
162
+ $ python examples/speech_recognition/new/infer.py \
163
+ --config-dir /path/to/fairseq-py/examples/mr_hubert/config/decode \
164
+ --config-name infer_lm \
165
+ task.data=/path/to/data \
166
+ task.normalize=[true|false] \
167
+ decoding.exp_dir=/path/to/experiment/directory \
168
+ common_eval.path=/path/to/checkpoint \
169
+ dataset.gen_subset=test \
170
+ decoding.decoder.lexicon=/path/to/lexicon \
171
+ decoding.decoder.lmpath=/path/to/arpa
172
+ ```
173
+
174
+ The command above uses the default decoding hyperparameters, which can be found
175
+ in `examples/speech_recognition/hydra/decoder.py`. These parameters can be
176
+ configured from the command line. For example, to search with a beam size of
177
+ 500, we can append the command above with `decoding.decoder.beam=500`.
178
+ Important parameters include:
179
+ - decoding.decoder.beam
180
+ - decoding.decoder.beamthreshold
181
+ - decoding.decoder.lmweight
182
+ - decoding.decoder.wordscore
183
+ - decoding.decoder.silweight
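+
+ As a rough illustration (the override values below are placeholders rather than recommended settings), several of these parameters can be combined on top of the KenLM command above:
+
+ ```sh
+ $ python examples/speech_recognition/new/infer.py \
+ --config-dir /path/to/fairseq-py/examples/mr_hubert/config/decode \
+ --config-name infer_lm \
+ task.data=/path/to/data \
+ task.normalize=[true|false] \
+ decoding.exp_dir=/path/to/experiment/directory \
+ common_eval.path=/path/to/checkpoint \
+ dataset.gen_subset=test \
+ decoding.decoder.lexicon=/path/to/lexicon \
+ decoding.decoder.lmpath=/path/to/arpa \
+ decoding.decoder.beam=500 \
+ decoding.decoder.beamthreshold=100 \
+ decoding.decoder.lmweight=2.0 \
+ decoding.decoder.wordscore=-1.0 \
+ decoding.decoder.silweight=0
+ ```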
184
+
185
+ To decode with a Fairseq LM, you may check the usage examples in the wav2vec 2.0 or HuBERT examples.
186
+
187
+ Please see the sample decoding script `decode.sh` for an example.
fairseq/examples/mr_hubert/config/decode/infer.yaml ADDED
@@ -0,0 +1,30 @@
1
+ # @package _group_
2
+
3
+ defaults:
4
+ - model: null
5
+
6
+ hydra:
7
+ run:
8
+ dir: ${common_eval.results_path}/viterbi
9
+ sweep:
10
+ dir: ${common_eval.results_path}
11
+ subdir: viterbi
12
+
13
+ task:
14
+ _name: multires_hubert_pretraining
15
+ single_target: true
16
+ fine_tuning: true
17
+ label_rate_ratios: ???
18
+ data: ???
19
+ normalize: false
20
+
21
+ decoding:
22
+ type: viterbi
23
+ unique_wer_file: true
24
+ common_eval:
25
+ results_path: ???
26
+ path: ???
27
+ post_process: letter
28
+ dataset:
29
+ max_tokens: 1100000
30
+ gen_subset: ???
fairseq/examples/mr_hubert/config/decode/infer_lm.yaml ADDED
@@ -0,0 +1,37 @@
1
+ # @package _group_
2
+
3
+ defaults:
4
+ - model: null
5
+
6
+ hydra:
7
+ run:
8
+ dir: ${common_eval.results_path}/beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight}
9
+ sweep:
10
+ dir: ${common_eval.results_path}
11
+ subdir: beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight}
12
+
13
+ task:
14
+ _name: multires_hubert_pretraining
15
+ single_target: true
16
+ fine_tuning: true
17
+ data: ???
18
+ label_rate_ratios: ???
19
+ normalize: ???
20
+
21
+ decoding:
22
+ type: kenlm
23
+ lexicon: ???
24
+ lmpath: ???
25
+ beamthreshold: 100
26
+ beam: 500
27
+ lmweight: 1.5
28
+ wordscore: -1
29
+ silweight: 0
30
+ unique_wer_file: true
31
+ common_eval:
32
+ results_path: ???
33
+ path: ???
34
+ post_process: letter
35
+ dataset:
36
+ max_tokens: 1100000
37
+ gen_subset: ???
fairseq/examples/mr_hubert/config/decode/run/submitit_slurm.yaml ADDED
@@ -0,0 +1,17 @@
1
+ # @package _global_
2
+ hydra:
3
+ launcher:
4
+ cpus_per_task: ${distributed_training.distributed_world_size}
5
+ gpus_per_node: ${distributed_training.distributed_world_size}
6
+ tasks_per_node: ${hydra.launcher.gpus_per_node}
7
+ nodes: 1
8
+ mem_gb: 200
9
+ timeout_min: 4320
10
+ max_num_timeout: 50
11
+ name: ${hydra.job.config_name}
12
+ submitit_folder: ${hydra.sweep.dir}/submitit
13
+
14
+ distributed_training:
15
+ distributed_world_size: 1
16
+ distributed_no_spawn: true
17
+ distributed_port: 29761
fairseq/examples/mr_hubert/config/decode/run/submitit_slurm_8gpu.yaml ADDED
@@ -0,0 +1,17 @@
1
+ # @package _global_
2
+ hydra:
3
+ launcher:
4
+ cpus_per_task: ${distributed_training.distributed_world_size}
5
+ gpus_per_node: ${distributed_training.distributed_world_size}
6
+ tasks_per_node: ${hydra.launcher.gpus_per_node}
7
+ nodes: 1
8
+ mem_gb: 200
9
+ timeout_min: 4320
10
+ max_num_timeout: 50
11
+ name: ${hydra.job.config_name}
12
+ submitit_folder: ${hydra.sweep.dir}/submitit
13
+
14
+ distributed_training:
15
+ distributed_world_size: 8
16
+ distributed_no_spawn: true
17
+ distributed_port: 29761
fairseq/examples/mr_hubert/config/finetune/base_100h.yaml ADDED
@@ -0,0 +1,97 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tblog
8
+ seed: 1337
9
+
10
+ checkpoint:
11
+ no_epoch_checkpoints: true
12
+ best_checkpoint_metric: wer
13
+
14
+ distributed_training:
15
+ ddp_backend: c10d
16
+ find_unused_parameters: true
17
+ distributed_world_size: 8
18
+ distributed_port: 29671
19
+ nprocs_per_node: 8
20
+
21
+ task:
22
+ _name: multires_hubert_pretraining
23
+ data: ???
24
+ fine_tuning: true
25
+ label_dir: ???
26
+ label_rate_ratios: ???
27
+ normalize: false # must be consistent with pre-training
28
+ labels: ["ltr"]
29
+ single_target: true
30
+
31
+ dataset:
32
+ num_workers: 0
33
+ max_tokens: 3200000
34
+ validate_after_updates: ${model.freeze_finetune_updates}
35
+ validate_interval: 5
36
+ train_subset: train_100h
37
+ valid_subset: dev_other
38
+
39
+ criterion:
40
+ _name: ctc
41
+ zero_infinity: true
42
+
43
+ optimization:
44
+ max_update: 80000
45
+ lr: [3e-5]
46
+ sentence_avg: true
47
+ update_freq: [1]
48
+
49
+ optimizer:
50
+ _name: adam
51
+ adam_betas: (0.9,0.98)
52
+ adam_eps: 1e-08
53
+
54
+ lr_scheduler:
55
+ _name: tri_stage
56
+ phase_ratio: [0.1, 0.4, 0.5]
57
+ final_lr_scale: 0.05
58
+
59
+ model:
60
+ _name: multires_hubert_ctc
61
+ multires_hubert_path: ???
62
+ apply_mask: true
63
+ mask_selection: static
64
+ mask_length: 10
65
+ mask_other: 0
66
+ mask_prob: 0.75
67
+ mask_channel_selection: static
68
+ mask_channel_length: 64
69
+ mask_channel_other: 0
70
+ mask_channel_prob: 0.5
71
+ layerdrop: 0.1
72
+ dropout: 0.0
73
+ activation_dropout: 0.1
74
+ attention_dropout: 0.0
75
+ feature_grad_mult: 0.0
76
+ freeze_finetune_updates: 10000
77
+
78
+ hydra:
79
+ job:
80
+ config:
81
+ override_dirname:
82
+ kv_sep: '-'
83
+ item_sep: '__'
84
+ exclude_keys:
85
+ - run
86
+ - task.data
87
+ - task.label_dir
88
+ - model.multires_hubert_path
89
+ - dataset.train_subset
90
+ - dataset.valid_subset
91
+ - criterion.wer_kenlm_model
92
+ - criterion.wer_lexicon
93
+ run:
94
+ dir: ???
95
+ sweep:
96
+ dir: ???
97
+ subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
fairseq/examples/mr_hubert/config/finetune/base_100h_large.yaml ADDED
@@ -0,0 +1,97 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tblog
8
+ seed: 1337
9
+
10
+ checkpoint:
11
+ no_epoch_checkpoints: true
12
+ best_checkpoint_metric: wer
13
+
14
+ distributed_training:
15
+ ddp_backend: c10d
16
+ find_unused_parameters: true
17
+ distributed_world_size: 8
18
+ distributed_port: 29671
19
+ nprocs_per_node: 8
20
+
21
+ task:
22
+ _name: multires_hubert_pretraining
23
+ data: ???
24
+ fine_tuning: true
25
+ label_dir: ???
26
+ label_rate_ratios: ???
27
+ normalize: true # must be consistent with pre-training
28
+ labels: ["ltr"]
29
+ single_target: true
30
+
31
+ dataset:
32
+ num_workers: 0
33
+ max_tokens: 1600000
34
+ validate_after_updates: ${model.freeze_finetune_updates}
35
+ validate_interval: 5
36
+ train_subset: train_100h
37
+ valid_subset: dev_other
38
+
39
+ criterion:
40
+ _name: ctc
41
+ zero_infinity: true
42
+
43
+ optimization:
44
+ max_update: 80000
45
+ lr: [3e-5]
46
+ sentence_avg: true
47
+ update_freq: [2]
48
+
49
+ optimizer:
50
+ _name: adam
51
+ adam_betas: (0.9,0.98)
52
+ adam_eps: 1e-08
53
+
54
+ lr_scheduler:
55
+ _name: tri_stage
56
+ phase_ratio: [0.1, 0.4, 0.5]
57
+ final_lr_scale: 0.05
58
+
59
+ model:
60
+ _name: multires_hubert_ctc
61
+ multires_hubert_path: ???
62
+ apply_mask: true
63
+ mask_selection: static
64
+ mask_length: 10
65
+ mask_other: 0
66
+ mask_prob: 0.75
67
+ mask_channel_selection: static
68
+ mask_channel_length: 64
69
+ mask_channel_other: 0
70
+ mask_channel_prob: 0.5
71
+ layerdrop: 0.1
72
+ dropout: 0.0
73
+ activation_dropout: 0.1
74
+ attention_dropout: 0.0
75
+ feature_grad_mult: 0.0
76
+ freeze_finetune_updates: 10000
77
+
78
+ hydra:
79
+ job:
80
+ config:
81
+ override_dirname:
82
+ kv_sep: '-'
83
+ item_sep: '__'
84
+ exclude_keys:
85
+ - run
86
+ - task.data
87
+ - task.label_dir
88
+ - model.multires_hubert_path
89
+ - dataset.train_subset
90
+ - dataset.valid_subset
91
+ - criterion.wer_kenlm_model
92
+ - criterion.wer_lexicon
93
+ run:
94
+ dir: ???
95
+ sweep:
96
+ dir: ???
97
+ subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
fairseq/examples/mr_hubert/config/finetune/base_10h.yaml ADDED
@@ -0,0 +1,101 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tblog
8
+ seed: 1337
9
+
10
+ checkpoint:
11
+ save_interval: 5
12
+ keep_interval_updates: 1
13
+ no_epoch_checkpoints: true
14
+ best_checkpoint_metric: wer
15
+
16
+ distributed_training:
17
+ ddp_backend: c10d
18
+ find_unused_parameters: true
19
+ distributed_world_size: 8
20
+ distributed_port: 29671
21
+ nprocs_per_node: 8
22
+
23
+ task:
24
+ _name: multires_hubert_pretraining
25
+ data: ???
26
+ fine_tuning: true
27
+ label_dir: ???
28
+ label_rate_ratios: ???
29
+ normalize: false # must be consistent with pre-training
30
+ labels: ["ltr"]
31
+ single_target: true
32
+
33
+ dataset:
34
+ num_workers: 0
35
+ max_tokens: 3200000
36
+ validate_after_updates: ${model.freeze_finetune_updates}
37
+ validate_interval: 5
38
+ train_subset: train_10h
39
+ valid_subset: dev
40
+
41
+ criterion:
42
+ _name: ctc
43
+ zero_infinity: true
44
+
45
+ optimization:
46
+ max_update: 25000
47
+ lr: [2e-5]
48
+ sentence_avg: true
49
+ update_freq: [1]
50
+
51
+ optimizer:
52
+ _name: adam
53
+ adam_betas: (0.9,0.98)
54
+ adam_eps: 1e-08
55
+
56
+ lr_scheduler:
57
+ _name: tri_stage
58
+ warmup_steps: 8000
59
+ hold_steps: 0
60
+ decay_steps: 72000
61
+ final_lr_scale: 0.05
62
+
63
+ model:
64
+ _name: multires_hubert_ctc
65
+ multires_hubert_path: ???
66
+ apply_mask: true
67
+ mask_selection: static
68
+ mask_length: 10
69
+ mask_other: 0
70
+ mask_prob: 0.75
71
+ mask_channel_selection: static
72
+ mask_channel_length: 64
73
+ mask_channel_other: 0
74
+ mask_channel_prob: 0.5
75
+ layerdrop: 0.1
76
+ dropout: 0.0
77
+ activation_dropout: 0.1
78
+ attention_dropout: 0.0
79
+ feature_grad_mult: 0.0
80
+ freeze_finetune_updates: 10000
81
+
82
+ hydra:
83
+ job:
84
+ config:
85
+ override_dirname:
86
+ kv_sep: '-'
87
+ item_sep: '__'
88
+ exclude_keys:
89
+ - run
90
+ - task.data
91
+ - task.label_dir
92
+ - model.multires_hubert_path
93
+ - dataset.train_subset
94
+ - dataset.valid_subset
95
+ - criterion.wer_kenlm_model
96
+ - criterion.wer_lexicon
97
+ run:
98
+ dir: ???
99
+ sweep:
100
+ dir: ???
101
+ subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
fairseq/examples/mr_hubert/config/finetune/base_10h_large.yaml ADDED
@@ -0,0 +1,101 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tblog
8
+ seed: 1337
9
+
10
+ checkpoint:
11
+ save_interval: 5
12
+ keep_interval_updates: 1
13
+ no_epoch_checkpoints: true
14
+ best_checkpoint_metric: wer
15
+
16
+ distributed_training:
17
+ ddp_backend: c10d
18
+ find_unused_parameters: true
19
+ distributed_world_size: 8
20
+ distributed_port: 29671
21
+ nprocs_per_node: 8
22
+
23
+ task:
24
+ _name: multires_hubert_pretraining
25
+ data: ???
26
+ fine_tuning: true
27
+ label_dir: ???
28
+ label_rate_ratios: ???
29
+ normalize: true # must be consistent with pre-training
30
+ labels: ["ltr"]
31
+ single_target: true
32
+
33
+ dataset:
34
+ num_workers: 0
35
+ max_tokens: 3200000
36
+ validate_after_updates: ${model.freeze_finetune_updates}
37
+ validate_interval: 5
38
+ train_subset: train_10h
39
+ valid_subset: dev
40
+
41
+ criterion:
42
+ _name: ctc
43
+ zero_infinity: true
44
+
45
+ optimization:
46
+ max_update: 25000
47
+ lr: [2e-5]
48
+ sentence_avg: true
49
+ update_freq: [1]
50
+
51
+ optimizer:
52
+ _name: adam
53
+ adam_betas: (0.9,0.98)
54
+ adam_eps: 1e-08
55
+
56
+ lr_scheduler:
57
+ _name: tri_stage
58
+ warmup_steps: 8000
59
+ hold_steps: 0
60
+ decay_steps: 72000
61
+ final_lr_scale: 0.05
62
+
63
+ model:
64
+ _name: multires_hubert_ctc
65
+ multires_hubert_path: ???
66
+ apply_mask: true
67
+ mask_selection: static
68
+ mask_length: 10
69
+ mask_other: 0
70
+ mask_prob: 0.75
71
+ mask_channel_selection: static
72
+ mask_channel_length: 64
73
+ mask_channel_other: 0
74
+ mask_channel_prob: 0.5
75
+ layerdrop: 0.1
76
+ dropout: 0.0
77
+ activation_dropout: 0.1
78
+ attention_dropout: 0.0
79
+ feature_grad_mult: 0.0
80
+ freeze_finetune_updates: 10000
81
+
82
+ hydra:
83
+ job:
84
+ config:
85
+ override_dirname:
86
+ kv_sep: '-'
87
+ item_sep: '__'
88
+ exclude_keys:
89
+ - run
90
+ - task.data
91
+ - task.label_dir
92
+ - model.multires_hubert_path
93
+ - dataset.train_subset
94
+ - dataset.valid_subset
95
+ - criterion.wer_kenlm_model
96
+ - criterion.wer_lexicon
97
+ run:
98
+ dir: ???
99
+ sweep:
100
+ dir: ???
101
+ subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
fairseq/examples/mr_hubert/config/finetune/base_1h.yaml ADDED
@@ -0,0 +1,100 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tblog
8
+ seed: 1337
9
+
10
+ checkpoint:
11
+ save_interval: 50
12
+ keep_interval_updates: 1
13
+ save_interval_updates: 1000
14
+ no_epoch_checkpoints: true
15
+ best_checkpoint_metric: wer
16
+
17
+ distributed_training:
18
+ ddp_backend: c10d
19
+ find_unused_parameters: true
20
+ distributed_world_size: 8
21
+ distributed_port: 29671
22
+ nprocs_per_node: 8
23
+
24
+ task:
25
+ _name: multires_hubert_pretraining
26
+ data: ???
27
+ fine_tuning: true
28
+ label_dir: ???
29
+ label_rate_ratios: ???
30
+ normalize: false # must be consistent with pre-training
31
+ labels: ["ltr"]
32
+ single_target: true
33
+
34
+ dataset:
35
+ num_workers: 0
36
+ max_tokens: 3200000
37
+ validate_after_updates: ${model.freeze_finetune_updates}
38
+ validate_interval: 1000
39
+ train_subset: train_1h
40
+ valid_subset: dev_other
41
+
42
+ criterion:
43
+ _name: ctc
44
+ zero_infinity: true
45
+
46
+ optimization:
47
+ max_update: 13000
48
+ lr: [5e-5]
49
+ sentence_avg: true
50
+ update_freq: [4]
51
+
52
+ optimizer:
53
+ _name: adam
54
+ adam_betas: (0.9,0.98)
55
+ adam_eps: 1e-08
56
+
57
+ lr_scheduler:
58
+ _name: tri_stage
59
+ phase_ratio: [0.1, 0.4, 0.5]
60
+ final_lr_scale: 0.05
61
+
62
+ model:
63
+ _name: multires_hubert_ctc
64
+ multires_hubert_path: ???
65
+ apply_mask: true
66
+ mask_selection: static
67
+ mask_length: 10
68
+ mask_other: 0
69
+ mask_prob: 0.75
70
+ mask_channel_selection: static
71
+ mask_channel_length: 64
72
+ mask_channel_other: 0
73
+ mask_channel_prob: 0.5
74
+ layerdrop: 0.1
75
+ dropout: 0.0
76
+ activation_dropout: 0.1
77
+ attention_dropout: 0.0
78
+ feature_grad_mult: 0.0
79
+ freeze_finetune_updates: 10000
80
+
81
+ hydra:
82
+ job:
83
+ config:
84
+ override_dirname:
85
+ kv_sep: '-'
86
+ item_sep: '__'
87
+ exclude_keys:
88
+ - run
89
+ - task.data
90
+ - task.label_dir
91
+ - model.multires_hubert_path
92
+ - dataset.train_subset
93
+ - dataset.valid_subset
94
+ - criterion.wer_kenlm_model
95
+ - criterion.wer_lexicon
96
+ run:
97
+ dir: ???
98
+ sweep:
99
+ dir: ???
100
+ subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
fairseq/examples/mr_hubert/config/finetune/base_1h_large.yaml ADDED
@@ -0,0 +1,99 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ tensorboard_logdir: tblog
8
+ seed: 1337
9
+
10
+ checkpoint:
11
+ save_interval: 1000
12
+ keep_interval_updates: 1
13
+ no_epoch_checkpoints: true
14
+ best_checkpoint_metric: wer
15
+
16
+ distributed_training:
17
+ ddp_backend: c10d
18
+ find_unused_parameters: true
19
+ distributed_world_size: 8
20
+ distributed_port: 29671
21
+ nprocs_per_node: 8
22
+
23
+ task:
24
+ _name: multires_hubert_pretraining
25
+ data: ???
26
+ fine_tuning: true
27
+ label_dir: ???
28
+ label_rate_ratios: ???
29
+ normalize: true # must be consistent with pre-training
30
+ labels: ["ltr"]
31
+ single_target: true
32
+
33
+ dataset:
34
+ num_workers: 0
35
+ max_tokens: 1280000
36
+ validate_after_updates: ${model.freeze_finetune_updates}
37
+ validate_interval: 5
38
+ train_subset: train_10h
39
+ valid_subset: dev
40
+
41
+ criterion:
42
+ _name: ctc
43
+ zero_infinity: true
44
+
45
+ optimization:
46
+ max_update: 25000
47
+ lr: [3e-4]
48
+ sentence_avg: true
49
+ update_freq: [5]
50
+
51
+ optimizer:
52
+ _name: adam
53
+ adam_betas: (0.9,0.98)
54
+ adam_eps: 1e-08
55
+
56
+ lr_scheduler:
57
+ _name: tri_stage
58
+ phase_ratio: [0.1, 0.4, 0.5]
59
+ final_lr_scale: 0.05
60
+
61
+ model:
62
+ _name: multires_hubert_ctc
63
+ multires_hubert_path: ???
64
+ apply_mask: true
65
+ mask_selection: static
66
+ mask_length: 10
67
+ mask_other: 0
68
+ mask_prob: 0.75
69
+ mask_channel_selection: static
70
+ mask_channel_length: 64
71
+ mask_channel_other: 0
72
+ mask_channel_prob: 0.5
73
+ layerdrop: 0.1
74
+ dropout: 0.0
75
+ activation_dropout: 0.1
76
+ attention_dropout: 0.0
77
+ feature_grad_mult: 0.0
78
+ freeze_finetune_updates: 10000
79
+
80
+ hydra:
81
+ job:
82
+ config:
83
+ override_dirname:
84
+ kv_sep: '-'
85
+ item_sep: '__'
86
+ exclude_keys:
87
+ - run
88
+ - task.data
89
+ - task.label_dir
90
+ - model.multires_hubert_path
91
+ - dataset.train_subset
92
+ - dataset.valid_subset
93
+ - criterion.wer_kenlm_model
94
+ - criterion.wer_lexicon
95
+ run:
96
+ dir: ???
97
+ sweep:
98
+ dir: ???
99
+ subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
fairseq/examples/mr_hubert/config/pretrain/mrhubert_base_librispeech.yaml ADDED
@@ -0,0 +1,103 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ seed: 1337
8
+ tensorboard_logdir: tblog
9
+ min_loss_scale: 1e-8
10
+
11
+ checkpoint:
12
+ save_interval_updates: 25000
13
+ keep_interval_updates: 1
14
+ no_epoch_checkpoints: true
15
+
16
+ distributed_training:
17
+ ddp_backend: no_c10d
18
+ distributed_backend: 'nccl'
19
+ distributed_world_size: 32
20
+ distributed_port: 29671
21
+ nprocs_per_node: 8
22
+ find_unused_parameters: true
23
+
24
+ task:
25
+ _name: multires_hubert_pretraining
26
+ data: ???
27
+ label_dir: ???
28
+ labels: ???
29
+ label_rate: ${model.label_rate}
30
+ label_rate_ratios: ???
31
+ sample_rate: 16000
32
+ max_sample_size: 250000
33
+ min_sample_size: 32000
34
+ pad_audio: false
35
+ random_crop: true
36
+ normalize: false # must be consistent with extractor
37
+ # max_keep_size: 300000
38
+ # max_keep_size: 50000
39
+
40
+
41
+ dataset:
42
+ num_workers: 0
43
+ max_tokens: 1000000
44
+ skip_invalid_size_inputs_valid_test: true
45
+ validate_interval: 5
46
+ validate_interval_updates: 10000
47
+
48
+ criterion:
49
+ _name: hubert
50
+ pred_masked_weight: 1.0
51
+ pred_nomask_weight: 0.0
52
+ loss_weights: [10,]
53
+
54
+ optimization:
55
+ max_update: 400000
56
+ lr: [0.0005]
57
+ clip_norm: 10.0
58
+
59
+ optimizer:
60
+ _name: adam
61
+ adam_betas: (0.9,0.98)
62
+ adam_eps: 1e-06
63
+ weight_decay: 0.01
64
+
65
+ lr_scheduler:
66
+ _name: polynomial_decay
67
+ warmup_updates: 32000
68
+
69
+ model:
70
+ _name: multires_hubert
71
+ label_rate: ???
72
+ label_rate_ratios: ${task.label_rate_ratios}
73
+ skip_masked: false
74
+ skip_nomask: false
75
+ mask_prob: 0.80
76
+ extractor_mode: default
77
+ conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
78
+ final_dim: 256
79
+ encoder_layers: 4
80
+ encoder_layerdrop: 0.05
81
+ dropout_input: 0.1
82
+ dropout_features: 0.1
83
+ dropout: 0.1
84
+ attention_dropout: 0.1
85
+ feature_grad_mult: 0.1
86
+ untie_final_proj: true
87
+ activation_dropout: 0.0
88
+ conv_adapator_kernal: 1
89
+ use_single_target: true
90
+
91
+ hydra:
92
+ job:
93
+ config:
94
+ override_dirname:
95
+ kv_sep: '-'
96
+ item_sep: '/'
97
+ exclude_keys:
98
+ - run
99
+ - task.data
100
+ - task.label_dir
101
+ - common.min_loss_scale
102
+ - common.log_interval
103
+ - optimization.clip_norm
fairseq/examples/mr_hubert/config/pretrain/mrhubert_large_librilight.yaml ADDED
@@ -0,0 +1,107 @@
1
+ # @package _group_
2
+
3
+ common:
4
+ memory_efficient_fp16: true
5
+ log_format: json
6
+ log_interval: 200
7
+ seed: 1337
8
+ tensorboard_logdir: tblog
9
+
10
+ checkpoint:
11
+ save_interval_updates: 25000
12
+ keep_interval_updates: 1
13
+ no_epoch_checkpoints: true
14
+
15
+
16
+ distributed_training:
17
+ ddp_backend: no_c10d
18
+ distributed_backend: 'nccl'
19
+ distributed_world_size: 128
20
+ distributed_port: 29671
21
+ nprocs_per_node: 8
22
+ find_unused_parameters: true
23
+
24
+ task:
25
+ _name: multires_hubert_pretraining
26
+ data: ???
27
+ label_dir: ???
28
+ labels: ???
29
+ label_rate: ${model.label_rate}
30
+ label_rate_ratios: ???
31
+ sample_rate: 16000
32
+ max_sample_size: 250000
33
+ min_sample_size: 32000
34
+ pad_audio: false
35
+ random_crop: true
36
+ normalize: true # must be consistent with extractor
37
+ # max_keep_size: 50000
38
+
39
+ dataset:
40
+ num_workers: 0
41
+ max_tokens: 300000
42
+ skip_invalid_size_inputs_valid_test: true
43
+ validate_interval: 5
44
+ validate_interval_updates: 10000
45
+
46
+ criterion:
47
+ _name: hubert
48
+ pred_masked_weight: 1.0
49
+ pred_nomask_weight: 0.0
50
+ loss_weights: [10,]
51
+
52
+ optimization:
53
+ max_update: 400000
54
+ lr: [0.0015]
55
+ clip_norm: 1.0
56
+ update_freq: [3]
57
+
58
+ optimizer:
59
+ _name: adam
60
+ adam_betas: (0.9,0.98)
61
+ adam_eps: 1e-06
62
+ weight_decay: 0.01
63
+
64
+ lr_scheduler:
65
+ _name: polynomial_decay
66
+ warmup_updates: 32000
67
+
68
+ model:
69
+ _name: multires_hubert
70
+ label_rate: ???
71
+ label_rate_ratios: ${task.label_rate_ratios}
72
+ encoder_layers: 8
73
+ encoder_embed_dim: 1024
74
+ encoder_ffn_embed_dim: 4096
75
+ encoder_attention_heads: 16
76
+ final_dim: 768
77
+ skip_masked: false
78
+ skip_nomask: false
79
+ mask_prob: 0.80
80
+ extractor_mode: layer_norm
81
+ conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2'
82
+ encoder_layerdrop: 0.0
83
+ dropout_input: 0.0
84
+ dropout_features: 0.0
85
+ dropout: 0.0
86
+ attention_dropout: 0.0
87
+ layer_norm_first: true
88
+ feature_grad_mult: 1.0
89
+ untie_final_proj: true
90
+ activation_dropout: 0.0
91
+ conv_adapator_kernal: 1
92
+ use_single_target: true
93
+
94
+ hydra:
95
+ job:
96
+ config:
97
+ override_dirname:
98
+ kv_sep: '-'
99
+ item_sep: '__'
100
+ exclude_keys:
101
+ - run
102
+ - task.data
103
+ run:
104
+ dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt
105
+ sweep:
106
+ dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt
107
+ subdir: ${hydra.job.config_name}__${hydra.job.override_dirname}
fairseq/examples/mr_hubert/config/pretrain/run/submitit_reg.yaml ADDED
@@ -0,0 +1,20 @@
1
+ # @package _global_
2
+
3
+ hydra:
4
+ launcher:
5
+ cpus_per_task: 8
6
+ gpus_per_node: 8
7
+ tasks_per_node: ${hydra.launcher.gpus_per_node}
8
+ nodes: 4
9
+ comment: null
10
+ mem_gb: 384
11
+ timeout_min: 4320
12
+ max_num_timeout: 100
13
+ constraint: volta32gb
14
+ name: ${hydra.job.config_name}/${hydra.job.override_dirname}
15
+ submitit_folder: ${hydra.sweep.dir}/submitit/%j
16
+
17
+ distributed_training:
18
+ distributed_world_size: 32
19
+ distributed_port: 29671
20
+ nprocs_per_node: 8
fairseq/examples/mr_hubert/train.sh ADDED
@@ -0,0 +1,45 @@
1
+ #!/bin/bash
2
+
3
+ FAIRSEQ= # Setup your fairseq directory
4
+
5
+ config_dir=${FAIRSEQ}/examples/mr_hubert/config/pretrain
6
+ config_name=mrhubert_base_librispeech
7
+
8
+ # Prepared Data Directory
9
+ data_dir=librispeech
10
+ # -- data_dir
11
+ # -- train.tsv
12
+ # -- valid.tsv
13
+
14
+ label_dir=labels
15
+ # -- label_dir
16
+ # -- train.km
17
+ # -- valid.km
18
+ # -- dict.km.txt
19
+
20
+
21
+ exp_dir=exp # Target experiments directory
22
+ ratios="[1, 2]" # Default label rate ratios
23
+ label_rate=50 # Base label rate
24
+
25
+
26
+ _opts=
27
+
28
+ # If using Slurm, uncomment this line and modify the job submission settings as needed
29
+ # _opts="${_opts} hydra/launcher=submitit_slurm +hydra.launcher.partition=${your_slurm_partition} +run=submitit_reg"
30
+
31
+ # If you want to set an additional experiment tag, uncomment this line
32
+ # _opts="${_opts} hydra.sweep.subdir=${your_experiment_tag}"
33
+
34
+
35
+ python ${FAIRSEQ}/fairseq_cli/hydra_train.py \
36
+ -m --config-dir ${config_dir} --config-name ${config_name} ${_opts} \
37
+ task.data=${data_dir} \
38
+ task.label_dir=${label_dir} \
39
+ task.labels='["km"]' \
40
+ model.label_rate=${label_rate} \
41
+ task.label_rate_ratios="${ratios}" \
42
+ hydra.sweep.dir=${exp_dir} &
43
+
44
+
45
+
fairseq/examples/multilingual/ML50_langs.txt ADDED
@@ -0,0 +1,52 @@
1
+ ar_AR
2
+ cs_CZ
3
+ de_DE
4
+ en_XX
5
+ es_XX
6
+ et_EE
7
+ fi_FI
8
+ fr_XX
9
+ gu_IN
10
+ hi_IN
11
+ it_IT
12
+ ja_XX
13
+ kk_KZ
14
+ ko_KR
15
+ lt_LT
16
+ lv_LV
17
+ my_MM
18
+ ne_NP
19
+ nl_XX
20
+ ro_RO
21
+ ru_RU
22
+ si_LK
23
+ tr_TR
24
+ vi_VN
25
+ zh_CN
26
+ af_ZA
27
+ az_AZ
28
+ bn_IN
29
+ fa_IR
30
+ he_IL
31
+ hr_HR
32
+ id_ID
33
+ ka_GE
34
+ km_KH
35
+ mk_MK
36
+ ml_IN
37
+ mn_MN
38
+ mr_IN
39
+ pl_PL
40
+ ps_AF
41
+ pt_XX
42
+ sv_SE
43
+ sw_KE
44
+ ta_IN
45
+ te_IN
46
+ th_TH
47
+ tl_XX
48
+ uk_UA
49
+ ur_PK
50
+ xh_ZA
51
+ gl_ES
52
+ sl_SI
fairseq/examples/multilingual/README.md ADDED
@@ -0,0 +1,158 @@
1
+ # Multilingual Translation
2
+
3
+ [[Multilingual Translation with Extensible Multilingual Pretraining and Finetuning, https://arxiv.org/abs/2008.00401]](https://arxiv.org/abs/2008.00401)
4
+
5
+ ## Introduction
6
+
7
+ This work is about training multilingual translation models with multiple bitext datasets. The framework supports the following (see [[training section]](#Training) and [[finetuning section]](#Finetuning) for examples):
8
+
9
+ * temperature-based sampling over unbalanced datasets of different translation directions
10
+ - --sampling-method with choices=['uniform', 'temperature', 'concat']
12
+ - --sampling-temperature
13
+ * configurable options to automatically add source and/or target language tokens to source/target sentences, using data prepared in the same way as for bilingual training
14
+ - --encoder-langtok with choices=['src', 'tgt', None] to specify whether to add source or target language tokens to the source sentences
15
+ - --decoder-langtok (binary option) to specify whether to add target language tokens to the target sentences or not
16
+ * finetuning mBART pretrained models for multilingual translation
17
+ - --finetune-from-model to specify the path from which to load the pretrained model
18
+
19
+ ## Preprocessing data
20
+ Multilingual training requires a joint BPE vocab. Please follow [mBART's preprocessing steps](https://github.com/pytorch/fairseq/tree/main/examples/mbart#bpe-data) to reuse our pretrained sentence-piece model.
21
+
22
+ You can also train a joint BPE model on your own dataset and then follow the steps in [[link]](https://github.com/pytorch/fairseq/tree/main/examples/translation#multilingual-translation).
23
+
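+ As a minimal sketch of applying the pretrained sentencepiece model (assuming a fairseq checkout and hypothetical raw files train.en_XX-ro_RO.{en_XX,ro_RO}; the model URL below is the one used by [binarize.py](./data_scripts/binarize.py)):
+
+ ```bash
+ wget https://dl.fbaipublicfiles.com/fairseq/models/mbart50/sentence.bpe.model
+
+ python scripts/spm_encode.py \
+ --model sentence.bpe.model \
+ --output_format=piece \
+ --inputs train.en_XX-ro_RO.en_XX train.en_XX-ro_RO.ro_RO \
+ --outputs train.bpe.en_XX-ro_RO.en_XX train.bpe.en_XX-ro_RO.ro_RO
+ ```
+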
24
+ ## Training
25
+
26
+
27
+ ```bash
28
+ lang_pairs=<language pairs to be trained, e.g. "en-cs,cs-en">
29
+ path_2_data=<set to data path>
30
+ lang_list=<a file which contains a list of languages separated by new lines>
31
+
32
+ fairseq-train $path_2_data \
33
+ --encoder-normalize-before --decoder-normalize-before \
34
+ --arch transformer --layernorm-embedding \
35
+ --task translation_multi_simple_epoch \
36
+ --sampling-method "temperature" \
37
+ --sampling-temperature 1.5 \
38
+ --encoder-langtok "src" \
39
+ --decoder-langtok \
40
+ --lang-dict "$lang_list" \
41
+ --lang-pairs "$lang_pairs" \
42
+ --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
43
+ --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
44
+ --lr-scheduler inverse_sqrt --lr 3e-05 --warmup-updates 2500 --max-update 40000 \
45
+ --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
46
+ --max-tokens 1024 --update-freq 2 \
47
+ --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \
48
+ --seed 222 --log-format simple --log-interval 2
49
+ ```
50
+
51
+ ## Finetuning
52
+ We can also finetune multilingual models from a pretrained model, e.g. [mBART](https://github.com/pytorch/fairseq/tree/main/examples/mbart).
53
+ ```bash
54
+ lang_pairs=<language pairs to be trained, e.g. "en-cs,cs-en">
55
+ path_2_data=<set to data path>
56
+ lang_list=<a file which contains a list of languages separated by new lines>
57
+ pretrained_model=<path to the pretrained model, e.g. mbart or another trained multilingual model>
58
+
59
+ fairseq-train $path_2_data \
60
+ --finetune-from-model $pretrained_model \
61
+ --encoder-normalize-before --decoder-normalize-before \
62
+ --arch transformer --layernorm-embedding \
63
+ --task translation_multi_simple_epoch \
64
+ --sampling-method "temperature" \
65
+ --sampling-temperature 1.5 \
66
+ --encoder-langtok "src" \
67
+ --decoder-langtok \
68
+ --lang-dict "$lang_list" \
69
+ --lang-pairs "$lang_pairs" \
70
+ --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
71
+ --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
72
+ --lr-scheduler inverse_sqrt --lr 3e-05 --warmup-updates 2500 --max-update 40000 \
73
+ --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
74
+ --max-tokens 1024 --update-freq 2 \
75
+ --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \
76
+ --seed 222 --log-format simple --log-interval 2
77
+ ```
78
+ ## Generate
79
+ The following command uses the multilingual task (translation_multi_simple_epoch) to generate translations from $source_lang to $target_lang on the test dataset. During generation, the source language tokens are added to the source sentences and the target language token is added as the starting token when decoding the target sentences. Options --lang-dict and --lang-pairs are needed to tell the generation process the ordered list of languages and the translation directions that the trained model is aware of; they need to be consistent with training.
80
+
81
+ ```bash
82
+ model=<multilingual model>
83
+ source_lang=<source language>
84
+ target_lang=<target language>
85
+
86
+ fairseq-generate $path_2_data \
87
+ --path $model \
88
+ --task translation_multi_simple_epoch \
89
+ --gen-subset test \
90
+ --source-lang $source_lang \
91
+ --target-lang $target_lang \
92
+ --sacrebleu --remove-bpe 'sentencepiece' \
93
+ --batch-size 32 \
94
+ --encoder-langtok "src" \
95
+ --decoder-langtok \
96
+ --lang-dict "$lang_list" \
97
+ --lang-pairs "$lang_pairs" > ${source_lang}_${target_lang}.txt
98
+ ```
99
+ Fairseq will write the translations to the file ${source_lang}_${target_lang}.txt, with the sacrebleu score at the end.
100
+
101
+ You can also use a customized tokenizer to compare performance with the literature. For example, you can get a tokenizer [here](https://github.com/rsennrich/wmt16-scripts) and do the following:
102
+ ```bash
103
+ TOKENIZER=<path to a customized tokenizer for decoding evaluation>
104
+ TOK_CMD=<"$TOKENIZER $target_lang" or cat for sacrebleu>
105
+
106
+ cat ${source_lang}_${target_lang}.txt | grep -P "^H" |sort -V |cut -f 3- |$TOK_CMD > ${source_lang}_${target_lang}.hyp
107
+ cat ${source_lang}_${target_lang}.txt | grep -P "^T" |sort -V |cut -f 2- |$TOK_CMD > ${source_lang}_${target_lang}.ref
108
+ sacrebleu -tok 'none' -s 'none' ${source_lang}_${target_lang}.ref < ${source_lang}_${target_lang}.hyp
109
+ ```
110
+
111
+ # mBART50 models
112
+
113
+ * [mBART 50 pretrained model](https://dl.fbaipublicfiles.com/fairseq/models/mbart50/mbart50.pretrained.tar.gz).
114
+ * [mBART 50 finetuned many-to-one](https://dl.fbaipublicfiles.com/fairseq/models/mbart50/mbart50.ft.n1.tar.gz).
115
+ * [mBART 50 finetuned one-to-many](https://dl.fbaipublicfiles.com/fairseq/models/mbart50/mbart50.ft.1n.tar.gz).
116
+ * [mBART 50 finetuned many-to-many](https://dl.fbaipublicfiles.com/fairseq/models/mbart50/mbart50.ft.nn.tar.gz).
117
+
118
+ Please download and extract from the above tarballs. Each tarball contains
119
+ * The fairseq model checkpoint: model.pt
120
+ * The list of supported languages: ML50_langs.txt
121
+ * Sentence piece model: sentence.bpe.model
122
+ * Fairseq dictionary of each language: dict.{lang}.txt (please replace lang with a language specified in ML50_langs.txt)
123
+
124
+ To use the trained models,
125
+ * use the tool [binarize.py](./data_scripts/binarize.py) to binarize your data using sentence.bpe.model and dict.{lang}.txt, and copy the dictionaries to your data path (a sketch of this step is shown after the generation command below)
126
+ * then run the generation command:
127
+ ```bash
128
+ path_2_data=<path to your binarized data with fairseq dictionaries>
129
+ model=<path_to_extracted_folder>/model.pt
130
+ lang_list=<path_to_extracted_folder>/ML50_langs.txt
131
+ source_lang=<source language>
132
+ target_lang=<target language>
133
+
134
+ fairseq-generate $path_2_data \
135
+ --path $model \
136
+ --task translation_multi_simple_epoch \
137
+ --gen-subset test \
138
+ --source-lang $source_lang \
139
+ --target-lang $target_lang \
140
+ --sacrebleu --remove-bpe 'sentencepiece' \
141
+ --batch-size 32 \
142
+ --encoder-langtok "src" \
143
+ --decoder-langtok \
144
+ --lang-dict "$lang_list"
145
+ ```
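+
+ For the binarization step referenced above, a minimal sketch (paths are placeholders; binarize.py expects the WORKDIR_ROOT and SPM_PATH environment variables and reads raw text from $WORKDIR_ROOT/ML50/raw):
+
+ ```bash
+ export WORKDIR_ROOT=/path/to/workdir # raw text files go under $WORKDIR_ROOT/ML50/raw
+ export SPM_PATH=/path/to/spm_encode.py # e.g. fairseq's scripts/spm_encode.py
+
+ python examples/multilingual/data_scripts/binarize.py \
+ --data_root $WORKDIR_ROOT/ML50 \
+ --raw-folder raw \
+ --bpe-folder bpe \
+ --databin-folder databin
+ ```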
146
+
147
+ ## Citation
148
+
149
+ ```bibtex
150
+ @article{tang2020multilingual,
151
+ title={Multilingual Translation with Extensible Multilingual Pretraining and Finetuning},
152
+ author={Yuqing Tang and Chau Tran and Xian Li and Peng-Jen Chen and Naman Goyal and Vishrav Chaudhary and Jiatao Gu and Angela Fan},
153
+ year={2020},
154
+ eprint={2008.00401},
155
+ archivePrefix={arXiv},
156
+ primaryClass={cs.CL}
157
+ }
158
+ ```
fairseq/examples/multilingual/data_scripts/README.md ADDED
@@ -0,0 +1,24 @@
1
+
2
+ # Install dependency
3
+ ```bash
4
+ pip install -r requirement.txt
5
+ ```
6
+
7
+ # Download the data set
8
+ ```bash
9
+ export WORKDIR_ROOT=<a directory which will hold all working files>
10
+ bash ./download_ML50_v1.sh
11
+ ```
12
+ The downloaded data will be at $WORKDIR_ROOT/ML50
13
+
14
+ # Preprocess the data
15
+ Install SPM [here](https://github.com/google/sentencepiece)
16
+ ```bash
17
+ export WORKDIR_ROOT=<a directory which will hold all working files>
18
+ export SPM_PATH=<a path pointing to sentencepiece spm_encode.py>
19
+ ```
20
+ * $WORKDIR_ROOT/ML50/raw: extracted raw data
21
+ * $WORKDIR_ROOT/ML50/dedup: deduplicated data
22
+ * $WORKDIR_ROOT/ML50/clean: data with valid and test sentences removed from the deduplicated data
23
+
24
+
fairseq/examples/multilingual/data_scripts/binarize.py ADDED
@@ -0,0 +1,200 @@
1
+ import shutil
2
+ import os, sys
3
+ from subprocess import check_call, check_output
4
+ import glob
5
+ import argparse
6
+ import shutil
7
+ import pathlib
8
+ import itertools
9
+
10
+ def call_output(cmd):
11
+ print(f"Executing: {cmd}")
12
+ ret = check_output(cmd, shell=True)
13
+ print(ret)
14
+ return ret
15
+
16
+ def call(cmd):
17
+ print(cmd)
18
+ check_call(cmd, shell=True)
19
+
20
+
21
+ WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)
22
+
23
+ if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
24
+ print('Please specify your working directory root in the OS environment variable WORKDIR_ROOT. Exiting...')
25
+ sys.exit(-1)
26
+
27
+ SPM_PATH = os.environ.get('SPM_PATH', None)
28
+
29
+ if SPM_PATH is None or not SPM_PATH.strip():
30
+ print("Please install sentencepiece from https://github.com/google/sentencepiece and set SPM_PATH to point to the installed spm_encode.py. Exiting...")
31
+ sys.exit(-1)
32
+
33
+
34
+ SPM_MODEL = f'{WORKDIR_ROOT}/sentence.bpe.model'
35
+ SPM_VOCAB = f'{WORKDIR_ROOT}/dict_250k.txt'
36
+
37
+ SPM_ENCODE = f'{SPM_PATH}'
38
+
39
+ if not os.path.exists(SPM_MODEL):
40
+ call(f"wget https://dl.fbaipublicfiles.com/fairseq/models/mbart50/sentence.bpe.model -O {SPM_MODEL}")
41
+
42
+
43
+ if not os.path.exists(SPM_VOCAB):
44
+ call(f"wget https://dl.fbaipublicfiles.com/fairseq/models/mbart50/dict_250k.txt -O {SPM_VOCAB}")
45
+
46
+
47
+
48
+ def get_data_size(raw):
49
+ cmd = f'wc -l {raw}'
50
+ ret = call_output(cmd)
51
+ return int(ret.split()[0])
52
+
53
+ def encode_spm(model, direction, prefix='', splits=['train', 'test', 'valid'], pairs_per_shard=None):
54
+ src, tgt = direction.split('-')
55
+
56
+ for split in splits:
57
+ src_raw, tgt_raw = f'{RAW_DIR}/{split}{prefix}.{direction}.{src}', f'{RAW_DIR}/{split}{prefix}.{direction}.{tgt}'
58
+ if os.path.exists(src_raw) and os.path.exists(tgt_raw):
59
+ cmd = f"""python {SPM_ENCODE} \
60
+ --model {model}\
61
+ --output_format=piece \
62
+ --inputs {src_raw} {tgt_raw} \
63
+ --outputs {BPE_DIR}/{direction}{prefix}/{split}.bpe.{src} {BPE_DIR}/{direction}{prefix}/{split}.bpe.{tgt} """
64
+ print(cmd)
65
+ call(cmd)
66
+
67
+
68
+ def binarize_(
69
+ bpe_dir,
70
+ databin_dir,
71
+ direction, spm_vocab=SPM_VOCAB,
72
+ splits=['train', 'test', 'valid'],
73
+ ):
74
+ src, tgt = direction.split('-')
75
+
76
+ try:
77
+ shutil.rmtree(f'{databin_dir}', ignore_errors=True)
78
+ os.mkdir(f'{databin_dir}')
79
+ except OSError as error:
80
+ print(error)
81
+ cmds = [
82
+ "fairseq-preprocess",
83
+ f"--source-lang {src} --target-lang {tgt}",
84
+ f"--destdir {databin_dir}/",
85
+ f"--workers 8",
86
+ ]
87
+ if isinstance(spm_vocab, tuple):
88
+ src_vocab, tgt_vocab = spm_vocab
89
+ cmds.extend(
90
+ [
91
+ f"--srcdict {src_vocab}",
92
+ f"--tgtdict {tgt_vocab}",
93
+ ]
94
+ )
95
+ else:
96
+ cmds.extend(
97
+ [
98
+ f"--joined-dictionary",
99
+ f"--srcdict {spm_vocab}",
100
+ ]
101
+ )
102
+ input_options = []
103
+ if 'train' in splits and glob.glob(f"{bpe_dir}/train.bpe*"):
104
+ input_options.append(
105
+ f"--trainpref {bpe_dir}/train.bpe",
106
+ )
107
+ if 'valid' in splits and glob.glob(f"{bpe_dir}/valid.bpe*"):
108
+ input_options.append(f"--validpref {bpe_dir}/valid.bpe")
109
+ if 'test' in splits and glob.glob(f"{bpe_dir}/test.bpe*"):
110
+ input_options.append(f"--testpref {bpe_dir}/test.bpe")
111
+ if len(input_options) > 0:
112
+ cmd = " ".join(cmds + input_options)
113
+ print(cmd)
114
+ call(cmd)
115
+
116
+
117
+ def binarize(
118
+ databin_dir,
119
+ direction, spm_vocab=SPM_VOCAB, prefix='',
120
+ splits=['train', 'test', 'valid'],
121
+ pairs_per_shard=None,
122
+ ):
123
+ def move_databin_files(from_folder, to_folder):
124
+ for bin_file in glob.glob(f"{from_folder}/*.bin") \
125
+ + glob.glob(f"{from_folder}/*.idx") \
126
+ + glob.glob(f"{from_folder}/dict*"):
127
+ try:
128
+ shutil.move(bin_file, to_folder)
129
+ except OSError as error:
130
+ print(error)
131
+ bpe_databin_dir = f"{BPE_DIR}/{direction}{prefix}_databin"
132
+ bpe_dir = f"{BPE_DIR}/{direction}{prefix}"
133
+ if pairs_per_shard is None:
134
+ binarize_(bpe_dir, bpe_databin_dir, direction, spm_vocab=spm_vocab, splits=splits)
135
+ move_databin_files(bpe_databin_dir, databin_dir)
136
+ else:
137
+ # binarize valid and test which will not be sharded
138
+ binarize_(
139
+ bpe_dir, bpe_databin_dir, direction,
140
+ spm_vocab=spm_vocab, splits=[s for s in splits if s != "train"])
141
+ for shard_bpe_dir in glob.glob(f"{bpe_dir}/shard*"):
142
+ path_strs = os.path.split(shard_bpe_dir)
143
+ shard_str = path_strs[-1]
144
+ shard_folder = f"{bpe_databin_dir}/{shard_str}"
145
+ databin_shard_folder = f"{databin_dir}/{shard_str}"
146
+ print(f'working from {shard_folder} to {databin_shard_folder}')
147
+ os.makedirs(databin_shard_folder, exist_ok=True)
148
+ binarize_(
149
+ shard_bpe_dir, shard_folder, direction,
150
+ spm_vocab=spm_vocab, splits=["train"])
151
+
152
+ for test_data in glob.glob(f"{bpe_databin_dir}/valid.*") + glob.glob(f"{bpe_databin_dir}/test.*"):
153
+ filename = os.path.split(test_data)[-1]
154
+ try:
155
+ os.symlink(test_data, f"{databin_shard_folder}/{filename}")
156
+ except OSError as error:
157
+ print(error)
158
+ move_databin_files(shard_folder, databin_shard_folder)
159
+
160
+
161
+ def load_langs(path):
162
+ with open(path) as fr:
163
+ langs = [l.strip() for l in fr]
164
+ return langs
165
+
166
+ if __name__ == '__main__':
167
+ parser = argparse.ArgumentParser()
168
+ parser.add_argument("--data_root", default=f"{WORKDIR_ROOT}/ML50")
169
+ parser.add_argument("--raw-folder", default='raw')
170
+ parser.add_argument("--bpe-folder", default='bpe')
171
+ parser.add_argument("--databin-folder", default='databin')
172
+
173
+ args = parser.parse_args()
174
+
175
+ DATA_PATH = args.data_root #'/private/home/yuqtang/public_data/ML50'
176
+ RAW_DIR = f'{DATA_PATH}/{args.raw_folder}'
177
+ BPE_DIR = f'{DATA_PATH}/{args.bpe_folder}'
178
+ DATABIN_DIR = f'{DATA_PATH}/{args.databin_folder}'
179
+ os.makedirs(BPE_DIR, exist_ok=True)
180
+
181
+ raw_files = itertools.chain(
182
+ glob.glob(f'{RAW_DIR}/train*'),
183
+ glob.glob(f'{RAW_DIR}/valid*'),
184
+ glob.glob(f'{RAW_DIR}/test*'),
185
+ )
186
+
187
+ directions = [os.path.split(file_path)[-1].split('.')[1] for file_path in raw_files]
188
+
189
+ for direction in directions:
190
+ prefix = ""
191
+ splits = ['train', 'valid', 'test']
192
+ try:
193
+ shutil.rmtree(f'{BPE_DIR}/{direction}{prefix}', ignore_errors=True)
194
+ os.mkdir(f'{BPE_DIR}/{direction}{prefix}')
195
+ os.makedirs(DATABIN_DIR, exist_ok=True)
196
+ except OSError as error:
197
+ print(error)
198
+ spm_model, spm_vocab = SPM_MODEL, SPM_VOCAB
199
+ encode_spm(spm_model, direction=direction, splits=splits)
200
+ binarize(DATABIN_DIR, direction, spm_vocab=spm_vocab, splits=splits)
fairseq/examples/multilingual/data_scripts/check_iswlt_test_data.py ADDED
@@ -0,0 +1,67 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ import os, sys
8
+ import subprocess
9
+ import re
10
+ from subprocess import check_call, check_output
11
+
12
+ WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)
13
+
14
+ if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
15
+ print('Please specify your working directory root in the OS environment variable WORKDIR_ROOT. Exiting...')
16
+ sys.exit(-1)
17
+
18
+
19
+ BLEU_REGEX = re.compile("^BLEU\\S* = (\\S+) ")
20
+ def run_eval_bleu(cmd):
21
+ output = check_output(cmd, shell=True, stderr=subprocess.STDOUT).decode("utf-8").strip()
22
+ print(output)
23
+ bleu = -1.0
24
+ for line in output.strip().split('\n'):
25
+ m = BLEU_REGEX.search(line)
26
+ if m is not None:
27
+ bleu = m.groups()[0]
28
+ bleu = float(bleu)
29
+ break
30
+ return bleu
31
+
32
+ def check_data_test_bleu(raw_folder, data_lang_pairs):
33
+ not_matchings = []
34
+ for sacrebleu_set, src_tgts in data_lang_pairs:
35
+ for src_tgt in src_tgts:
36
+ print(f'checking test bleus for: {src_tgt} at {sacrebleu_set}')
37
+ src, tgt = src_tgt.split('-')
38
+ ssrc, stgt = src[:2], tgt[:2]
39
+ if os.path.exists(f'{raw_folder}/test.{tgt}-{src}.{src}'):
40
+ # reversed direction may have different test set
41
+ test_src = f'{raw_folder}/test.{tgt}-{src}.{src}'
42
+ else:
43
+ test_src = f'{raw_folder}/test.{src}-{tgt}.{src}'
44
+ cmd1 = f'cat {test_src} | sacrebleu -t "{sacrebleu_set}" -l {stgt}-{ssrc}; [ $? -eq 0 ] || echo ""'
45
+ test_tgt = f'{raw_folder}/test.{src}-{tgt}.{tgt}'
46
+ cmd2 = f'cat {test_tgt} | sacrebleu -t "{sacrebleu_set}" -l {ssrc}-{stgt}; [ $? -eq 0 ] || echo ""'
47
+ bleu1 = run_eval_bleu(cmd1)
48
+ if bleu1 != 100.0:
49
+ not_matchings.append(f'{sacrebleu_set}:{src_tgt} source side not matching: {test_src}')
50
+ bleu2 = run_eval_bleu(cmd2)
51
+ if bleu2 != 100.0:
52
+ not_matchings.append(f'{sacrebleu_set}:{src_tgt} target side not matching: {test_tgt}')
53
+ return not_matchings
54
+
55
+ if __name__ == "__main__":
56
+ to_data_path = f'{WORKDIR_ROOT}/iwsltv2'
57
+ not_matching = check_data_test_bleu(
58
+ f'{to_data_path}/raw',
59
+ [
60
+ ('iwslt17', ['en_XX-ar_AR', 'en_XX-ko_KR', 'ar_AR-en_XX', 'ko_KR-en_XX']),
61
+ ('iwslt17', ['en_XX-it_IT', 'en_XX-nl_XX', 'it_IT-en_XX', 'nl_XX-en_XX']),
62
+ ('iwslt17/tst2015', ['en_XX-vi_VN', "vi_VN-en_XX"]),
63
+ ]
64
+ )
65
+ if len(not_matching) > 0:
66
+ print('the following datasets do not have matching test datasets:\n\t', '\n\t'.join(not_matching))
67
+
fairseq/examples/multilingual/data_scripts/check_self_overlaps.py ADDED
@@ -0,0 +1,103 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ import os
8
+ import glob
9
+ import argparse
10
+ from utils.dedup import deup
11
+ import sys
12
+
13
+ WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)
14
+
15
+ if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
16
+ print('please specify your working directory root in OS environment variable WORKDIR_ROOT. Exiting...')
17
+ sys.exit(-1)
18
+
19
+ def get_directions(folder):
20
+ raw_files = glob.glob(f'{folder}/train*')
21
+ directions = [os.path.split(file_path)[-1].split('.')[1] for file_path in raw_files]
22
+ return directions
23
+
24
+ def diff_list(lhs, rhs):
25
+ return set(lhs).difference(set(rhs))
26
+
27
+ def check_diff(
28
+ from_src_file, from_tgt_file,
29
+ to_src_file, to_tgt_file,
30
+ ):
31
+ seen_in_from = set()
32
+ seen_src_in_from = set()
33
+ seen_tgt_in_from = set()
34
+ from_count = 0
35
+ with open(from_src_file, encoding='utf-8') as fsrc, \
36
+ open(from_tgt_file, encoding='utf-8') as ftgt:
37
+ for s, t in zip(fsrc, ftgt):
38
+ seen_in_from.add((s, t))
39
+ seen_src_in_from.add(s)
40
+ seen_tgt_in_from.add(t)
41
+ from_count += 1
42
+ common = 0
43
+ common_src = 0
44
+ common_tgt = 0
45
+ to_count = 0
46
+ seen = set()
47
+
48
+ with open(to_src_file, encoding='utf-8') as fsrc, \
49
+ open(to_tgt_file, encoding='utf-8') as ftgt:
50
+ for s, t in zip(fsrc, ftgt):
51
+ to_count += 1
52
+ if (s, t) not in seen:
53
+ if (s, t) in seen_in_from:
54
+ common += 1
55
+ if s in seen_src_in_from:
56
+ common_src += 1
57
+ seen_src_in_from.remove(s)
58
+ if t in seen_tgt_in_from:
59
+ common_tgt += 1
60
+ seen_tgt_in_from.remove(t)
61
+ seen.add((s, t))
62
+ return common, common_src, common_tgt, from_count, to_count
63
+
64
+ def main():
65
+ parser = argparse.ArgumentParser()
66
+ parser.add_argument("--folder", type=str, required=True,
67
+ help="the data folder ")
68
+ parser.add_argument("--split", type=str, default='test',
69
+ help="split (valid, test) to check against training data")
70
+ parser.add_argument('--directions', type=str, default=None, required=False)
71
+
72
+ args = parser.parse_args()
73
+
74
+ if args.directions is None:
75
+ directions = set(get_directions(args.folder))
76
+ directions = sorted(directions)
77
+ else:
78
+ directions = args.directions.split(',')
79
+ directions = sorted(set(directions))
80
+
81
+ results = []
82
+ print(f'checking where {args.split} split data are in training')
83
+ print(f'direction\tcommon_count\tsrc common\ttgt common\tfrom_size\tto_size')
84
+
85
+ for direction in directions:
86
+ src, tgt = direction.split('-')
87
+ from_src_file = f'{args.folder}/{args.split}.{src}-{tgt}.{src}'
88
+ from_tgt_file = f'{args.folder}/{args.split}.{src}-{tgt}.{tgt}'
89
+ if not os.path.exists(from_src_file):
90
+ # some test/valid data might be in reverse directions:
91
+ from_src_file = f'{args.folder}/{args.split}.{tgt}-{src}.{src}'
92
+ from_tgt_file = f'{args.folder}/{args.split}.{tgt}-{src}.{tgt}'
93
+ to_src_file = f'{args.folder}/train.{src}-{tgt}.{src}'
94
+ to_tgt_file = f'{args.folder}/train.{src}-{tgt}.{tgt}'
95
+ if not os.path.exists(to_src_file) or not os.path.exists(from_src_file):
96
+ continue
97
+ r = check_diff(from_src_file, from_tgt_file, to_src_file, to_tgt_file)
98
+ results.append(r)
99
+ print(f'{direction}\t', '\t'.join(map(str, r)))
100
+
101
+
102
+ if __name__ == "__main__":
103
+ main()
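A usage sketch for check_self_overlaps.py, assuming the raw bitext sits in a single folder of train/valid/test files (the folder and direction values are illustrative; --directions may be omitted, in which case directions are inferred from the train.* files):

    export WORKDIR_ROOT=/path/to/workdir
    python check_self_overlaps.py \
        --folder $WORKDIR_ROOT/ML50/raw \
        --split test \
        --directions si_LK-en_XX,ne_NP-en_XX
    # prints one tab-separated row per direction:
    # direction  common_count  src_common  tgt_common  from_size  to_size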
fairseq/examples/multilingual/data_scripts/check_valid_test_overlaps.py ADDED
@@ -0,0 +1,124 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ import os
8
+ import argparse
9
+ import pandas as pd
10
+ import sys
11
+
12
+
13
+ WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)
14
+
15
+ if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
16
+ print('please specify your working directory root in OS environment variable WORKDIR_ROOT. Exiting...')
17
+ sys.exit(-1)
18
+
19
+ def load_langs(path):
20
+ with open(path) as fr:
21
+ langs = [l.strip() for l in fr]
22
+ return langs
23
+
24
+
25
+
26
+ def load_sentences(raw_data, split, direction):
27
+ src, tgt = direction.split('-')
28
+ src_path = f"{raw_data}/{split}.{direction}.{src}"
29
+ tgt_path = f"{raw_data}/{split}.{direction}.{tgt}"
30
+ if os.path.exists(src_path) and os.path.exists(tgt_path):
31
+ return [(src, open(src_path).read().splitlines()), (tgt, open(tgt_path).read().splitlines())]
32
+ else:
33
+ return []
34
+
35
+ def swap_direction(d):
36
+ src, tgt = d.split('-')
37
+ return f'{tgt}-{src}'
38
+
39
+ def get_all_test_data(raw_data, directions, split='test'):
40
+ test_data = [
41
+ x
42
+ for dd in directions
43
+ for d in [dd, swap_direction(dd)]
44
+ for x in load_sentences(raw_data, split, d)
45
+ ]
46
+ # all_test_data = {s for _, d in test_data for s in d}
47
+ all_test_data = {}
48
+ for lang, d in test_data:
49
+ for s in d:
50
+ s = s.strip()
51
+ lgs = all_test_data.get(s, set())
52
+ lgs.add(lang)
53
+ all_test_data[s] = lgs
54
+ return all_test_data, test_data
55
+
56
+
57
+ def check_train_sentences(src_path, tgt_path, direction, all_test_data, mess_up_train={}):
58
+ # src, tgt = direction.split('-')
59
+ print(f'check training data for {direction} in {src_path} and {tgt_path}')
60
+ size = 0
61
+ overlapped_size_counted_dup = 0
62
+ if not os.path.exists(tgt_path) or not os.path.exists(src_path):
63
+ return mess_up_train, size, overlapped_size_counted_dup
64
+
65
+ with open(src_path) as f, open(tgt_path) as g:
66
+ for src_line, tgt_line in zip(f, g):
67
+ s = src_line.strip()
68
+ t = tgt_line.strip()
69
+ size += 1
70
+ if s in all_test_data:
71
+ langs = mess_up_train.get(s, set())
72
+ langs.add(direction)
73
+ mess_up_train[s] = langs
74
+ overlapped_size_counted_dup += 1
75
+ if t in all_test_data:
76
+ langs = mess_up_train.get(t, set())
77
+ langs.add(direction)
78
+ mess_up_train[t] = langs
79
+ overlapped_size_counted_dup += 1
80
+ print(f'{direction}: size={size}, overlapped={overlapped_size_counted_dup}')
81
+ return mess_up_train, size, overlapped_size_counted_dup
82
+
83
+ def check_train_all(raw_data, directions, all_test_data):
84
+ mess_up_train = {}
85
+ data_sizes = {}
86
+ # raw_data = '~chau/data-bin/MineBART/multilingual_mined_100M/en_XX/et_EE-en_XX/all.{en_XX, et_EE}'
87
+ print(f'checking training data against {len(all_test_data)} test sentences')
88
+ print(f'example test data: ', [s for i, s in enumerate(all_test_data.keys()) if i < 10])
89
+ for direction in directions:
90
+ src, tgt = direction.split('-')
91
+ path = f'{raw_data}/en_XX/{direction}/all'
92
+ src_path = f'{path}.{src}'
93
+ tgt_path = f'{path}.{tgt}'
94
+ print(f'checking {src_path} {tgt_path}')
95
+ _, size, overlapped_size_counted_dup = check_train_sentences(src_path, tgt_path, direction, all_test_data, mess_up_train)
96
+ data_sizes[direction] = (size, overlapped_size_counted_dup)
97
+ return mess_up_train, data_sizes
98
+
99
+
100
+
101
+
102
+ def main():
103
+ parser = argparse.ArgumentParser()
104
+ parser.add_argument("--folder", type=str, required=True,
105
+ help="the data folder ")
106
+ parser.add_argument("--test-data", type=str, required=True,
107
+ help="the test data folder ")
108
+ parser.add_argument('--directions', type=str, default=None, required=False)
109
+
110
+ args = parser.parse_args()
111
+ directions = args.directions.split(',')
112
+ directions = sorted(set(directions))
113
+
114
+ results = []
115
+ # print(f'checking where {args.split} split data are in training')
116
+ # print(f'direction\tcommon_count\tsrc common\ttgt common\tfrom_size\tto_size')
117
+ raw_data = args.folder
118
+ all_test_data, test_data = get_all_test_data(args.test_data, directions, split='test')
119
+ mess_up_train, data_sizes = check_train_all(raw_data, directions, all_test_data)
120
+ print(data_sizes)
121
+
122
+
123
+ if __name__ == "__main__":
124
+ main()
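A usage sketch for check_valid_test_overlaps.py; --directions is required in practice, and the training data is expected at {folder}/en_XX/{direction}/all.{lang} as hard-coded in check_train_all (the paths and directions below are illustrative):

    export WORKDIR_ROOT=/path/to/workdir
    python check_valid_test_overlaps.py \
        --folder $WORKDIR_ROOT/mined_data \
        --test-data $WORKDIR_ROOT/ML50/raw \
        --directions et_EE-en_XX,hi_IN-en_XX
    # finishes by printing {direction: (train_size, overlapped_count)}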
fairseq/examples/multilingual/data_scripts/dedup_all.py ADDED
@@ -0,0 +1,52 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+
8
+ import os
9
+ import glob
10
+ import argparse
11
+ from utils.dedup import deup
12
+
13
+ import sys
14
+ WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)
15
+
16
+ if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
17
+ print('please specify your working directory root in OS environment variable WORKDIR_ROOT. Exiting...')
18
+ sys.exit(-1)
19
+
20
+
21
+ def main():
22
+ parser = argparse.ArgumentParser()
23
+ parser.add_argument("--from-folder", type=str, required=True,
24
+ help="the data folder to be dedup")
25
+ parser.add_argument("--to-folder", type=str, required=True,
26
+ help="the data folder to save deduped data")
27
+ parser.add_argument('--directions', type=str, default=None, required=False)
28
+
29
+ args = parser.parse_args()
30
+
31
+ if args.directions is None:
32
+ raw_files = glob.glob(f'{args.from_folder}/train*')
33
+
34
+ directions = [os.path.split(file_path)[-1].split('.')[1] for file_path in raw_files]
35
+ else:
36
+ directions = args.directions.split(',')
37
+ directions = sorted(set(directions))
38
+
39
+ for direction in directions:
40
+ src, tgt = direction.split('-')
41
+ src_file = f'{args.from_folder}/train.{src}-{tgt}.{src}'
42
+ tgt_file = f'{args.from_folder}/train.{src}-{tgt}.{tgt}'
43
+ src_file_out = f'{args.to_folder}/train.{src}-{tgt}.{src}'
44
+ tgt_file_out = f'{args.to_folder}/train.{src}-{tgt}.{tgt}'
45
+ assert src_file != src_file_out
46
+ assert tgt_file != tgt_file_out
47
+ print(f'deduping {src_file}, {tgt_file}')
48
+ deup(src_file, tgt_file, src_file_out, tgt_file_out)
49
+
50
+
51
+ if __name__ == "__main__":
52
+ main()
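A usage sketch for dedup_all.py, assuming deduplicated output goes to a separate folder so the asserts on distinct input/output paths hold (paths are illustrative; utils/dedup.py must be importable from the current directory):

    export WORKDIR_ROOT=/path/to/workdir
    mkdir -p $WORKDIR_ROOT/ML50/dedup
    python dedup_all.py \
        --from-folder $WORKDIR_ROOT/ML50/raw \
        --to-folder $WORKDIR_ROOT/ML50/dedup
    # directions are inferred from train.* files unless --directions is given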
fairseq/examples/multilingual/data_scripts/download_ML50_v1.sh ADDED
@@ -0,0 +1,30 @@
1
+ #!/bin/bash
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ if [ -z $WORKDIR_ROOT ] ;
9
+ then
10
+ echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
11
+ exit
12
+ fi
13
+
14
+ # first run download_wmt20.sh; it will install a few useful tools for other scripts
15
+ # TODO: print out instructions for downloading the few files that require manual authentication on their websites
16
+ bash ./download_wmt20.sh
17
+
18
+ python ./download_wmt19_and_before.py
19
+ bash ./download_wat19_my.sh
20
+ python ./download_ted_and_extract.py
21
+ bash ./download_lotus.sh
22
+ bash ./download_iitb.sh
23
+ bash ./download_af_xh.sh
24
+
25
+
26
+ # IWSLT download URLs have changed since this script was written; TODO: fix them:
27
+ bash ./download_iwslt_and_extract.sh
28
+
29
+ # TODO: globalvoices URLs changed; need to be fixed
30
+ bash ./download_flores_data.sh
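A minimal end-to-end sketch for the master script above, run from this data_scripts directory (the workdir path is illustrative, and the IWSLT and Global Voices steps may still fail until the URL TODOs are resolved):

    export WORKDIR_ROOT=/path/to/ML50_workdir   # illustrative scratch space
    cd fairseq/examples/multilingual/data_scripts
    pip install wget pandas                     # Python packages imported by the helper scripts
    bash ./download_ML50_v1.sh                  # raw splits are written to $WORKDIR_ROOT/ML50/raw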
fairseq/examples/multilingual/data_scripts/download_af_xh.sh ADDED
@@ -0,0 +1,164 @@
1
+ #!/bin/bash
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ # set -x -e
9
+
10
+ if [ -z $WORKDIR_ROOT ] ;
11
+ then
12
+ echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
13
+ exit
14
+ fi
15
+
16
+
17
+ # put intermediate files
18
+ TMP_DIR=$WORKDIR_ROOT/temp/af_xhv2
19
+ # output {train,valid,test} files to dest
20
+ DEST=${WORKDIR_ROOT}/ML50/raw
21
+
22
+
23
+
24
+ ROOT=${WORKDIR_ROOT}
25
+ UTILS=$PWD/utils
26
+ TMX2CORPUS="${UTILS}/tmx2corpus"
27
+ TMX_TOOL="python ${TMX2CORPUS}/tmx2corpus.py"
28
+
29
+ mkdir -p $TMP_DIR
30
+ mkdir -p $DEST
31
+ mkdir -p $UTILS
32
+
33
+ function download_opus(){
34
+ src=$1
35
+ tgt=$2
36
+ subset=$3
37
+ url=$4
38
+
39
+ mkdir extract_$subset.$src-$tgt
40
+ pushd extract_$subset.$src-$tgt
41
+ if [ ! -f "$subset.$src-$tgt.tmx.gz" ]; then
42
+ wget $url -O "$subset.$src-$tgt.tmx.gz"
43
+ gzip -d "$subset.$src-$tgt.tmx.gz"
44
+ f=$subset.$src-$tgt.tmx
45
+ $TMX_TOOL $f
46
+ mv bitext.$src ../$subset.$src-$tgt.$src
47
+ mv bitext.$tgt ../$subset.$src-$tgt.$tgt
48
+ fi
49
+ popd
50
+ }
51
+
52
+ function concat_subsets(){
53
+ src=$1
54
+ tgt=$2
55
+ subsets=$3
56
+ src_train=raw_train.$src-$tgt.$src
57
+ tgt_train=raw_train.$src-$tgt.$tgt
58
+ > $src_train
59
+ > $tgt_train
60
+ for subset in $subsets; do
61
+ cat $subset.$src-$tgt.$src >> $src_train
62
+ cat $subset.$src-$tgt.$tgt >> $tgt_train
63
+ done
64
+ }
65
+
66
+
67
+
68
+ function get_seeded_random()
69
+ {
70
+ seed="$1"
71
+ openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt \
72
+ </dev/zero 2>/dev/null
73
+ }
74
+
75
+ function split_train_valid(){
76
+ src=$1
77
+ tgt=$2
78
+ raw_src_train=raw_train.$src-$tgt.$src
79
+ raw_tgt_train=raw_train.$src-$tgt.$tgt
80
+
81
+ shuf --random-source=<(get_seeded_random 43) $raw_src_train > shuffled.$src-$tgt.$src
82
+ shuf --random-source=<(get_seeded_random 43) $raw_tgt_train > shuffled.$src-$tgt.$tgt
83
+
84
+ head -n 1500 shuffled.$src-$tgt.$src > valid.$src-$tgt.$src
85
+ head -n 1500 shuffled.$src-$tgt.$tgt > valid.$src-$tgt.$tgt
86
+
87
+ tail +1501 shuffled.$src-$tgt.$src > train.$src-$tgt.$src
88
+ tail +1501 shuffled.$src-$tgt.$tgt > train.$src-$tgt.$tgt
89
+ }
90
+
91
+ function copy2dst(){
92
+ lsrc=$1
93
+ ltgt=$2
94
+ src=${lsrc:0:2}
95
+ tgt=${ltgt:0:2}
96
+
97
+
98
+ cp valid.$src-$tgt.$src $DEST/valid.$lsrc-$ltgt.$lsrc
99
+ cp valid.$src-$tgt.$tgt $DEST/valid.$lsrc-$ltgt.$ltgt
100
+
101
+ cp train.$src-$tgt.$src $DEST/train.$lsrc-$ltgt.$lsrc
102
+ cp train.$src-$tgt.$tgt $DEST/train.$lsrc-$ltgt.$ltgt
103
+ }
104
+
105
+
106
+
107
+
108
+ #for xh-en
109
+ declare -A xh_en_urls
110
+ xh_en_urls=(
111
+ [Tatoeba]=https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/tmx/en-xh.tmx.gz
112
+ [wikimedia]=https://object.pouta.csc.fi/OPUS-wikimedia/v20190628/tmx/en-xh.tmx.gz
113
+ [memat]=https://object.pouta.csc.fi/OPUS-memat/v1/tmx/en-xh.tmx.gz
114
+ [uedin]=https://object.pouta.csc.fi/OPUS-bible-uedin/v1/tmx/en-xh.tmx.gz
115
+ [GNOME]=https://object.pouta.csc.fi/OPUS-GNOME/v1/tmx/en-xh.tmx.gz
116
+ [XhosaNavy]=https://object.pouta.csc.fi/OPUS-XhosaNavy/v1/tmx/en-xh.tmx.gz
117
+ [KDE4]=https://object.pouta.csc.fi/OPUS-KDE4/v2/tmx/en-xh.tmx.gz
118
+ [Ubuntu]=https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/tmx/en-xh.tmx.gz
119
+ )
120
+
121
+ mkdir $TMP_DIR/xh-en
122
+ pushd $TMP_DIR/xh-en
123
+ for k in "${!xh_en_urls[@]}"
124
+ do
125
+ name=$k
126
+ url=${xh_en_urls[$k]}
127
+ echo "$name: $url"
128
+ download_opus xh en $name $url
129
+ done
130
+ concat_subsets xh en "${!xh_en_urls[@]}"
131
+ split_train_valid xh en
132
+ copy2dst xh_ZA en_XX
133
+ popd
134
+
135
+
136
+ ##
137
+ #for af-en
138
+ declare -A af_en_urls
139
+ af_en_urls=(
140
+ [Tatoeba]=https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/tmx/af-en.tmx.gz
141
+ [uedin]=https://object.pouta.csc.fi/OPUS-bible-uedin/v1/tmx/af-en.tmx.gz
142
+ [GNOME]=https://object.pouta.csc.fi/OPUS-GNOME/v1/tmx/af-en.tmx.gz
143
+ [QED]=https://object.pouta.csc.fi/OPUS-QED/v2.0a/tmx/af-en.tmx.gz
144
+ [KDE4]=https://object.pouta.csc.fi/OPUS-KDE4/v2/tmx/af-en.tmx.gz
145
+ [OpenSubtitles]=https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/tmx/af-en.tmx.gz
146
+ [SPC]=https://object.pouta.csc.fi/OPUS-SPC/v1/tmx/af-en.tmx.gz
147
+ [Ubuntu]=https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/tmx/af-en.tmx.gz
148
+ )
149
+
150
+ mkdir $TMP_DIR/af-en
151
+ pushd $TMP_DIR/af-en
152
+ for k in "${!af_en_urls[@]}"
153
+ do
154
+ name=$k
155
+ url=${af_en_urls[$k]}
156
+ echo "$name: $url"
157
+ download_opus af en $name $url
158
+ done
159
+ concat_subsets af en "${!af_en_urls[@]}"
160
+ split_train_valid af en
161
+ copy2dst af_ZA en_XX
162
+ popd
163
+
164
+
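A short prerequisite sketch for download_af_xh.sh; the script expects a tmx2corpus checkout under ./utils and reads WORKDIR_ROOT from the environment (paths are illustrative):

    export WORKDIR_ROOT=/path/to/workdir
    ls utils/tmx2corpus/tmx2corpus.py   # the TMX converter the script invokes must be in place
    bash ./download_af_xh.sh            # train/valid splits land in $WORKDIR_ROOT/ML50/raw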
fairseq/examples/multilingual/data_scripts/download_flores_data.sh ADDED
@@ -0,0 +1,246 @@
1
+ #!/bin/bash
2
+
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ # All rights reserved.
5
+ #
6
+ # This source code is licensed under the license found in the
7
+ # LICENSE file in the root directory of this source tree.
8
+ #
9
+
10
+ if [ -z $WORKDIR_ROOT ] ;
11
+ then
12
+ echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
13
+ exit
14
+ fi
15
+
16
+
17
+ set -e
18
+ set -o pipefail
19
+
20
+ SRC=en
21
+ SI_TGT=si
22
+ NE_TGT=ne
23
+
24
+ DESTDIR=${WORKDIR_ROOT}/ML50/raw/
25
+
26
+ ROOT=${WORKDIR_ROOT}/tmp
27
+ mkdir -p $ROOT
28
+ DATA=$ROOT/data
29
+ NE_ROOT=$DATA/all-clean-ne
30
+ SI_ROOT=$DATA/all-clean-si
31
+
32
+ mkdir -p $DATA $NE_ROOT $SI_ROOT
33
+
34
+ SI_OPUS_DATASETS=(
35
+ "$SI_ROOT/GNOME.en-si"
36
+ "$SI_ROOT/Ubuntu.en-si"
37
+ "$SI_ROOT/KDE4.en-si"
38
+ "$SI_ROOT/OpenSubtitles.en-si"
39
+ )
40
+
41
+ SI_OPUS_URLS=(
42
+ "https://object.pouta.csc.fi/OPUS-GNOME/v1/moses/en-si.txt.zip"
43
+ "https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/moses/en-si.txt.zip"
44
+ "https://object.pouta.csc.fi/OPUS-KDE4/v2/moses/en-si.txt.zip"
45
+ "https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/moses/en-si.txt.zip"
46
+ )
47
+
48
+ NE_OPUS_DATASETS=(
49
+ "$NE_ROOT/GNOME.en-ne"
50
+ "$NE_ROOT/Ubuntu.en-ne"
51
+ "$NE_ROOT/KDE4.en-ne"
52
+ )
53
+
54
+ NE_OPUS_URLS=(
55
+ "https://object.pouta.csc.fi/OPUS-GNOME/v1/moses/en-ne.txt.zip"
56
+ "https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/moses/en-ne.txt.zip"
57
+ "https://object.pouta.csc.fi/OPUS-KDE4/v2/moses/en-ne.txt.zip"
58
+ )
59
+
60
+ REMOVE_FILE_PATHS=()
61
+
62
+ # Download data
63
+ download_data() {
64
+ CORPORA=$1
65
+ URL=$2
66
+
67
+ if [ -f $CORPORA ]; then
68
+ echo "$CORPORA already exists, skipping download"
69
+ else
70
+ echo "Downloading $URL"
71
+ wget $URL -O $CORPORA --no-check-certificate || rm -f $CORPORA
72
+ if [ -f $CORPORA ]; then
73
+ echo "$URL successfully downloaded."
74
+ else
75
+ echo "$URL not successfully downloaded."
76
+ rm -f $CORPORA
77
+ exit -1
78
+ fi
79
+ fi
80
+ }
81
+
82
+ # Example: download_opus_data $LANG_ROOT $TGT
83
+ download_opus_data() {
84
+ LANG_ROOT=$1
85
+ TGT=$2
86
+
87
+ if [ "$TGT" = "si" ]; then
88
+ URLS=("${SI_OPUS_URLS[@]}")
89
+ DATASETS=("${SI_OPUS_DATASETS[@]}")
90
+ else
91
+ URLS=("${NE_OPUS_URLS[@]}")
92
+ DATASETS=("${NE_OPUS_DATASETS[@]}")
93
+ fi
94
+
95
+ # Download and extract data
96
+ for ((i=0;i<${#URLS[@]};++i)); do
97
+ URL=${URLS[i]}
98
+ CORPORA=${DATASETS[i]}
99
+
100
+ download_data $CORPORA $URL
101
+ unzip -o $CORPORA -d $LANG_ROOT
102
+ REMOVE_FILE_PATHS+=( $CORPORA $CORPORA.xml $CORPORA.ids $LANG_ROOT/README $LANG_ROOT/LICENSE )
103
+ done
104
+
105
+ cat ${DATASETS[0]}.$SRC ${DATASETS[1]}.$SRC ${DATASETS[2]}.$SRC > $LANG_ROOT/GNOMEKDEUbuntu.$SRC-$TGT.$SRC
106
+ cat ${DATASETS[0]}.$TGT ${DATASETS[1]}.$TGT ${DATASETS[2]}.$TGT > $LANG_ROOT/GNOMEKDEUbuntu.$SRC-$TGT.$TGT
107
+
108
+ REMOVE_FILE_PATHS+=( ${DATASETS[0]}.$SRC ${DATASETS[1]}.$SRC ${DATASETS[2]}.$SRC )
109
+ REMOVE_FILE_PATHS+=( ${DATASETS[0]}.$TGT ${DATASETS[1]}.$TGT ${DATASETS[2]}.$TGT )
110
+ }
111
+
112
+ download_opus_data $SI_ROOT $SI_TGT
113
+ cp ${SI_OPUS_DATASETS[3]}.$SRC $SI_ROOT/OpenSubtitles2018.$SRC-$SI_TGT.$SRC
114
+ cp ${SI_OPUS_DATASETS[3]}.$SI_TGT $SI_ROOT/OpenSubtitles2018.$SRC-$SI_TGT.$SI_TGT
115
+ REMOVE_FILE_PATHS+=( ${SI_OPUS_DATASETS[3]}.$SRC ${SI_OPUS_DATASETS[3]}.$SI_TGT )
116
+
117
+ download_opus_data $NE_ROOT $NE_TGT
118
+
119
+
120
+ # Download and extract Global Voices data
121
+ GLOBAL_VOICES="$NE_ROOT/globalvoices.2018q4.ne-en"
122
+ GLOBAL_VOICES_URL="http://www.casmacat.eu/corpus/global-voices/globalvoices.ne-en.xliff.gz"
123
+
124
+ download_data $GLOBAL_VOICES.gz $GLOBAL_VOICES_URL
125
+ gunzip -Nf $GLOBAL_VOICES.gz
126
+
127
+ sed -ne 's?.*<source>\(.*\)</source>.*?\1?p' $GLOBAL_VOICES > $GLOBAL_VOICES.$NE_TGT
128
+ sed -ne 's?.*<target[^>]*>\(.*\)</target>.*?\1?p' $GLOBAL_VOICES > $GLOBAL_VOICES.$SRC
129
+
130
+ REMOVE_FILE_PATHS+=( $GLOBAL_VOICES )
131
+
132
+ # Download and extract the bible dataset
133
+ BIBLE_TOOLS=bible-corpus-tools
134
+ XML_BIBLES=XML_Bibles
135
+ XML_BIBLES_DUP=XML_Bibles_dup
136
+
137
+ if [ ! -e $BIBLE_TOOLS ]; then
138
+ echo "Cloning bible-corpus-tools repository..."
139
+ git clone https://github.com/christos-c/bible-corpus-tools.git
140
+ fi
141
+
142
+ mkdir -p $BIBLE_TOOLS/bin $XML_BIBLES $XML_BIBLES_DUP
143
+ javac -cp "$BIBLE_TOOLS/lib/*" -d $BIBLE_TOOLS/bin $BIBLE_TOOLS/src/bible/readers/*.java $BIBLE_TOOLS/src/bible/*.java
144
+
145
+ download_data bible.tar.gz "https://github.com/christos-c/bible-corpus/archive/v1.2.1.tar.gz"
146
+ tar xvzf bible.tar.gz
147
+
148
+ cp bible-corpus-1.2.1/bibles/{Greek.xml,English.xml,Nepali.xml} $XML_BIBLES/
149
+ cp bible-corpus-1.2.1/bibles/{Greek.xml,English-WEB.xml,Nepali.xml} $XML_BIBLES_DUP/
150
+
151
+ java -cp $BIBLE_TOOLS/lib/*:$BIBLE_TOOLS/bin bible.CreateMLBooks $XML_BIBLES
152
+ java -cp $BIBLE_TOOLS/lib/*:$BIBLE_TOOLS/bin bible.CreateMLBooks $XML_BIBLES_DUP
153
+ java -cp $BIBLE_TOOLS/lib/*:$BIBLE_TOOLS/bin bible.CreateVerseAlignedBooks $XML_BIBLES
154
+ java -cp $BIBLE_TOOLS/lib/*:$BIBLE_TOOLS/bin bible.CreateVerseAlignedBooks $XML_BIBLES_DUP
155
+
156
+ cat $XML_BIBLES/aligned/*/English.txt > $NE_ROOT/bible.$SRC-$NE_TGT.$SRC
157
+ cat $XML_BIBLES/aligned/*/Nepali.txt > $NE_ROOT/bible.$SRC-$NE_TGT.$NE_TGT
158
+ cat $XML_BIBLES_DUP/aligned/*/English-WEB.txt > $NE_ROOT/bible_dup.$SRC-$NE_TGT.$SRC
159
+ cat $XML_BIBLES_DUP/aligned/*/Nepali.txt > $NE_ROOT/bible_dup.$SRC-$NE_TGT.$NE_TGT
160
+ REMOVE_FILE_PATHS+=( bible-corpus-1.2.1 bible.tar.gz $BIBLE_TOOLS $XML_BIBLES $XML_BIBLES_DUP )
161
+
162
+ # Download and extract the Penn Treebank dataset
163
+ NE_TAGGED=$ROOT/new_submissions_parallel_corpus_project_Nepal
164
+ NE_TAGGED_URL="http://www.cle.org.pk/Downloads/ling_resources/parallelcorpus/NepaliTaggedCorpus.zip"
165
+ EN_TAGGED_PATCH_URL="https://dl.fbaipublicfiles.com/fairseq/data/nepali-penn-treebank.en.patch"
166
+ NE_TAGGED_PATCH_URL="https://dl.fbaipublicfiles.com/fairseq/data/nepali-penn-treebank.ne.patch"
167
+ MOSES=mosesdecoder
168
+ MOSES_TOK=$MOSES/scripts/tokenizer
169
+ EN_PATCH_REGEX="{s:\\\/:\/:g;s/\*\T\*\-\n+//g;s/\-LCB\-/\{/g;s/\-RCB\-/\}/g; s/\-LSB\-/\[/g; s/\-RSB\-/\]/g;s/\-LRB\-/\(/g; s/\-RRB\-/\)/g; s/\'\'/\"/g; s/\`\`/\"/g; s/\ +\'s\ +/\'s /g; s/\ +\'re\ +/\'re /g; s/\"\ +/\"/g; s/\ +\"/\"/g; s/\ n't([\ \.\"])/n't\1/g; s/\r+(.)/\1/g;}"
170
+ NE_PATCH_REGEX="{s:\p{Cf}::g;s:\\\/:\/:g;s/\*\T\*\-\n+//g;s/\-LCB\-/\{/g;s/\-RCB\-/\}/g; s/\-LSB\-/\[/g; s/\-RSB\-/\]/g;s/\-LRB\-/\(/g; s/\-RRB\-/\)/g; s/\'\'/\"/g; s/\`\`/\"/g; s/\ +\'s\ +/\'s /g; s/\ +\'re\ +/\'re /g; s/\"\ +/\"/g; s/\ +\"/\"/g; s/\ n't([\ \.\"])/n't\1/g; s/\r+(.)/\1/g;}"
171
+
172
+ download_data $DATA/nepali-penn-treebank.$SRC.patch $EN_TAGGED_PATCH_URL
173
+ download_data $DATA/nepali-penn-treebank.$NE_TGT.patch $NE_TAGGED_PATCH_URL
174
+ download_data original.zip $NE_TAGGED_URL
175
+ unzip -o original.zip -d $ROOT
176
+
177
+ cat $NE_TAGGED/00.txt $NE_TAGGED/01.txt $NE_TAGGED/02.txt > $NE_TAGGED/nepali-penn-treebank.$SRC
178
+ cat $NE_TAGGED/00ne_revised.txt $NE_TAGGED/01ne_revised.txt $NE_TAGGED/02ne_revised.txt > $NE_TAGGED/nepali-penn-treebank.$NE_TGT
179
+
180
+ patch $NE_TAGGED/nepali-penn-treebank.$SRC -i $DATA/nepali-penn-treebank.$SRC.patch -o $NE_TAGGED/nepali-penn-treebank-patched.$SRC
181
+ patch $NE_TAGGED/nepali-penn-treebank.$NE_TGT -i $DATA/nepali-penn-treebank.$NE_TGT.patch -o $NE_TAGGED/nepali-penn-treebank-patched.$NE_TGT
182
+
183
+ if [ ! -e $MOSES ]; then
184
+ echo "Cloning moses repository..."
185
+ git clone https://github.com/moses-smt/mosesdecoder.git
186
+ fi
187
+
188
+ cat $NE_TAGGED/nepali-penn-treebank-patched.$SRC | \
189
+ perl -anpe "$EN_PATCH_REGEX" | \
190
+ $MOSES_TOK/tokenizer.perl -l $SRC | \
191
+ $MOSES_TOK/detokenizer.perl -l $SRC > $NE_ROOT/nepali-penn-treebank.$SRC
192
+
193
+ cat $NE_TAGGED/nepali-penn-treebank-patched.$NE_TGT | \
194
+ perl -CIO -anpe "$NE_PATCH_REGEX" | \
195
+ $MOSES_TOK/detokenizer.perl -l $SRC > $NE_ROOT/nepali-penn-treebank.$NE_TGT
196
+
197
+
198
+ # Download nepali dictionary data
199
+ NE_DICT=$NE_ROOT/dictionaries
200
+ download_data $NE_DICT "http://www.seas.upenn.edu/~nlp/resources/TACL-data-release/dictionaries.tar.gz"
201
+ tar xvzf $NE_DICT
202
+ cp dictionaries/dict.ne $NE_ROOT/dictionary.$NE_TGT-$SRC
203
+ REMOVE_FILE_PATHS+=( $NE_DICT dictionaries )
204
+
205
+ REMOVE_FILE_PATHS+=( $MOSES $NE_TAGGED original.zip $DATA/nepali-penn-treebank.$SRC.patch $DATA/nepali-penn-treebank.$NE_TGT.patch )
206
+
207
+
208
+ # Remove the temporary files
209
+ for ((i=0;i<${#REMOVE_FILE_PATHS[@]};++i)); do
210
+ rm -rf ${REMOVE_FILE_PATHS[i]}
211
+ done
212
+
213
+ # Copy the training data
214
+ si=si_LK
215
+ ne=ne_NP
216
+ en=en_XX
217
+ cat $SI_ROOT/GNOMEKDEUbuntu.en-si.si $SI_ROOT/OpenSubtitles2018.en-si.si > $DESTDIR/train.$si-$en.$si
218
+ cat $SI_ROOT/GNOMEKDEUbuntu.en-si.en $SI_ROOT/OpenSubtitles2018.en-si.en > $DESTDIR/train.$si-$en.$en
219
+
220
+ cat $NE_ROOT/bible_dup.en-ne.ne $NE_ROOT/bible.en-ne.ne $NE_ROOT/globalvoices.2018q4.ne-en.ne $NE_ROOT/GNOMEKDEUbuntu.en-ne.ne $NE_ROOT/nepali-penn-treebank.ne > $DESTDIR/train.$ne-$en.$ne
221
+ cat $NE_ROOT/bible_dup.en-ne.en $NE_ROOT/bible.en-ne.en $NE_ROOT/globalvoices.2018q4.ne-en.en $NE_ROOT/GNOMEKDEUbuntu.en-ne.en $NE_ROOT/nepali-penn-treebank.en > $DESTDIR/train.$ne-$en.$en
222
+
223
+
224
+ #Download the test sets
225
+ wget https://github.com/facebookresearch/flores/raw/master/data/wikipedia_en_ne_si_test_sets.tgz
226
+ tar -xvzf wikipedia_en_ne_si_test_sets.tgz
227
+
228
+ cp wikipedia_en_ne_si_test_sets/wikipedia.dev.ne-en.ne $DESTDIR/valid.$ne-$en.$ne
229
+ cp wikipedia_en_ne_si_test_sets/wikipedia.dev.ne-en.en $DESTDIR/valid.$ne-$en.$en
230
+
231
+ cp wikipedia_en_ne_si_test_sets/wikipedia.dev.si-en.si $DESTDIR/valid.$si-$en.$si
232
+ cp wikipedia_en_ne_si_test_sets/wikipedia.dev.si-en.en $DESTDIR/valid.$si-$en.$en
233
+
234
+ cp wikipedia_en_ne_si_test_sets/wikipedia.devtest.ne-en.ne $DESTDIR/devtest.$ne-$en.$ne
235
+ cp wikipedia_en_ne_si_test_sets/wikipedia.devtest.ne-en.en $DESTDIR/devtest.$ne-$en.$en
236
+
237
+ cp wikipedia_en_ne_si_test_sets/wikipedia.devtest.si-en.si $DESTDIR/devtest.$si-$en.$si
238
+ cp wikipedia_en_ne_si_test_sets/wikipedia.devtest.si-en.en $DESTDIR/devtest.$si-$en.$en
239
+
240
+ cp wikipedia_en_ne_si_test_sets/wikipedia.test.ne-en.ne $DESTDIR/test.$ne-$en.$ne
241
+ cp wikipedia_en_ne_si_test_sets/wikipedia.test.ne-en.en $DESTDIR/test.$ne-$en.$en
242
+
243
+ cp wikipedia_en_ne_si_test_sets/wikipedia.test.si-en.si $DESTDIR/test.$si-$en.$si
244
+ cp wikipedia_en_ne_si_test_sets/wikipedia.test.si-en.en $DESTDIR/test.$si-$en.$en
245
+
246
+ rm -rf wikipedia_en_ne_si_test_sets.tgz wikipedia_en_ne_si_test_sets
fairseq/examples/multilingual/data_scripts/download_iitb.sh ADDED
@@ -0,0 +1,35 @@
1
+ #!/bin/bash
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+
9
+ if [ -z $WORKDIR_ROOT ] ;
10
+ then
11
+ echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
12
+ exit
13
+ fi
14
+
15
+ IITB=$WORKDIR_ROOT/IITB
16
+ mkdir -p $IITB
17
+ pushd $IITB
18
+
19
+ wget http://www.cfilt.iitb.ac.in/~moses/iitb_en_hi_parallel/iitb_corpus_download/parallel.tgz
20
+ tar -xvzf parallel.tgz
21
+
22
+ wget http://www.cfilt.iitb.ac.in/~moses/iitb_en_hi_parallel/iitb_corpus_download/dev_test.tgz
23
+ tar -xvzf dev_test.tgz
24
+
25
+ DESTDIR=${WORKDIR_ROOT}/ML50/raw/
26
+
27
+ cp parallel/IITB.en-hi.en $DESTDIR/train.hi_IN-en_XX.en_XX
28
+ cp parallel/IITB.en-hi.hi $DESTDIR/train.hi_IN-en_XX.hi_IN
29
+
30
+ cp dev_test/dev.en $DESTDIR/valid.hi_IN-en_XX.en_XX
31
+ cp dev_test/dev.hi $DESTDIR/valid.hi_IN-en_XX.hi_IN
32
+
33
+ cp dev_test/test.en $DESTDIR/test.hi_IN-en_XX.en_XX
34
+ cp dev_test/test.hi $DESTDIR/test.hi_IN-en_XX.hi_IN
35
+ popd
fairseq/examples/multilingual/data_scripts/download_iwslt_and_extract.sh ADDED
@@ -0,0 +1,225 @@
1
+ #!/bin/bash
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ #echo 'Cloning Moses github repository (for tokenization scripts)...'
9
+ #git clone https://github.com/moses-smt/mosesdecoder.git
10
+
11
+ if [ -z $WORKDIR_ROOT ] ;
12
+ then
13
+ echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
14
+ exit
15
+ fi
16
+
17
+
18
+
19
+ data_root=${WORKDIR_ROOT}/iwsltv2
20
+ DESTDIR=${WORKDIR_ROOT}/ML50/raw
21
+
22
+
23
+ langs="ar_AR it_IT nl_XX ko_KR vi_VN"
24
+ echo "data_root: $data_root"
25
+
26
+ download_path=${data_root}/downloads
27
+ raw=${DESTDIR}
28
+ tmp=${data_root}/tmp
29
+ orig=${data_root}/orig
30
+
31
+ mkdir -p $download_path $orig $raw $tmp
32
+ #######################
33
+ download_iwslt(){
34
+ iwslt_key=$1
35
+ src=$2
36
+ tgt=$3
37
+ save_prefix=$4
38
+ pushd ${download_path}
39
+ if [[ ! -f ${save_prefix}$src-$tgt.tgz ]]; then
40
+ wget https://wit3.fbk.eu/archive/${iwslt_key}/texts/$src/$tgt/$src-$tgt.tgz -O ${save_prefix}$src-$tgt.tgz
41
+ [ $? -eq 0 ] && return 0
42
+ fi
43
+ popd
44
+ }
45
+
46
+ extract_iwslt(){
47
+ src=$1
48
+ tgt=$2
49
+ prefix=$3
50
+ pushd $orig
51
+ tar zxvf ${download_path}/${prefix}$src-${tgt}.tgz
52
+ popd
53
+ }
54
+
55
+ generate_train(){
56
+ lsrc=$1
57
+ ltgt=$2
58
+ src=${lsrc:0:2}
59
+ tgt=${ltgt:0:2}
60
+ for ll in $lsrc $ltgt; do
61
+ l=${ll:0:2}
62
+ f="$orig/*/train.tags.$src-$tgt.$l"
63
+ f_raw=$raw/train.$lsrc-$ltgt.$ll
64
+ cat $f \
65
+ | grep -v '<url>' \
66
+ | grep -v '<talkid>' \
67
+ | grep -v '<keywords>' \
68
+ | grep -v '<speaker>' \
69
+ | grep -v '<reviewer' \
70
+ | grep -v '<translator' \
71
+ | grep -v '<doc' \
72
+ | grep -v '</doc>' \
73
+ | sed -e 's/<title>//g' \
74
+ | sed -e 's/<\/title>//g' \
75
+ | sed -e 's/<description>//g' \
76
+ | sed -e 's/<\/description>//g' \
77
+ | sed 's/^\s*//g' \
78
+ | sed 's/\s*$//g' \
79
+ > $f_raw
80
+ [ $? -eq 0 ] && echo "extracted $f to $f_raw"
81
+ done
82
+ return 0
83
+ }
84
+
85
+ convert_valid_test(){
86
+ src=$1
87
+ tgt=$2
88
+ for l in $src $tgt; do
89
+ echo "lang: ${l}"
90
+ for o in `ls $orig/*/IWSLT*.TED*.$src-$tgt.$l.xml`; do
91
+ fname=${o##*/}
92
+ f=$tmp/${fname%.*}
93
+ echo "$o => $f"
94
+ grep '<seg id' $o \
95
+ | sed -e 's/<seg id="[0-9]*">\s*//g' \
96
+ | sed -e 's/\s*<\/seg>\s*//g' \
97
+ | sed -e "s/\’/\'/g" \
98
+ > $f
99
+ echo ""
100
+ done
101
+ done
102
+ }
103
+
104
+ generate_subset(){
105
+ lsrc=$1
106
+ ltgt=$2
107
+ src=${lsrc:0:2}
108
+ tgt=${ltgt:0:2}
109
+ subset=$3
110
+ prefix=$4
111
+ for ll in $lsrc $ltgt; do
112
+ l=${ll:0:2}
113
+ f=$tmp/$prefix.${src}-${tgt}.$l
114
+ if [[ -f $f ]]; then
115
+ cp $f $raw/$subset.${lsrc}-$ltgt.${ll}
116
+ fi
117
+ done
118
+ }
119
+ #################
120
+
121
+ echo "downloading iwslt training and dev data"
122
+ # using multilingual for it, nl
123
+ download_iwslt "2017-01-trnmted" DeEnItNlRo DeEnItNlRo
124
+ download_iwslt "2017-01-trnted" ar en
125
+ download_iwslt "2017-01-trnted" en ar
126
+ download_iwslt "2017-01-trnted" ko en
127
+ download_iwslt "2017-01-trnted" en ko
128
+ download_iwslt "2015-01" vi en
129
+ download_iwslt "2015-01" en vi
130
+
131
+ echo "donwloading iwslt test data"
132
+ download_iwslt "2017-01-mted-test" it en "test."
133
+ download_iwslt "2017-01-mted-test" en it "test."
134
+ download_iwslt "2017-01-mted-test" nl en "test."
135
+ download_iwslt "2017-01-mted-test" en nl "test."
136
+
137
+ download_iwslt "2017-01-ted-test" ar en "test."
138
+ download_iwslt "2017-01-ted-test" en ar "test."
139
+ download_iwslt "2017-01-ted-test" ko en "test."
140
+ download_iwslt "2017-01-ted-test" en ko "test."
141
+ download_iwslt "2015-01-test" vi en "test."
142
+ download_iwslt "2015-01-test" en vi "test."
143
+
144
+ echo "extract training data tar balls"
145
+ extract_iwslt DeEnItNlRo DeEnItNlRo
146
+ extract_iwslt ar en
147
+ extract_iwslt en ar
148
+ extract_iwslt ko en
149
+ extract_iwslt en ko
150
+ extract_iwslt vi en
151
+ extract_iwslt en vi
152
+
153
+
154
+ echo "extracting iwslt test data"
155
+ for lang in $langs; do
156
+ l=${lang:0:2}
157
+ extract_iwslt $l en "test."
158
+ extract_iwslt en $l "test."
159
+ done
160
+
161
+ echo "convert dev and test data"
162
+ for lang in $langs; do
163
+ s_lang=${lang:0:2}
164
+ convert_valid_test $s_lang en
165
+ convert_valid_test en $s_lang
166
+ done
167
+
168
+
169
+
170
+ echo "creating training data into $raw"
171
+ for lang in $langs; do
172
+ generate_train $lang en_XX
173
+ generate_train en_XX $lang
174
+ done
175
+
176
+ echo "creating iwslt dev data into raw"
177
+ generate_subset en_XX vi_VN valid "IWSLT15.TED.tst2013"
178
+ generate_subset vi_VN en_XX valid "IWSLT15.TED.tst2013"
179
+
180
+ generate_subset en_XX ar_AR valid "IWSLT17.TED.tst2016"
181
+ generate_subset ar_AR en_XX valid "IWSLT17.TED.tst2016"
182
+ generate_subset en_XX ko_KR valid "IWSLT17.TED.tst2016"
183
+ generate_subset ko_KR en_XX valid "IWSLT17.TED.tst2016"
184
+
185
+
186
+ generate_subset en_XX it_IT valid "IWSLT17.TED.tst2010"
187
+ generate_subset it_IT en_XX valid "IWSLT17.TED.tst2010"
188
+ generate_subset en_XX nl_XX valid "IWSLT17.TED.tst2010"
189
+ generate_subset nl_XX en_XX valid "IWSLT17.TED.tst2010"
190
+
191
+ echo "creating iswslt test data into raw"
192
+ generate_subset en_XX vi_VN test "IWSLT15.TED.tst2015"
193
+ generate_subset vi_VN en_XX test "IWSLT15.TED.tst2015"
194
+
195
+ generate_subset en_XX ar_AR test "IWSLT17.TED.tst2017"
196
+ generate_subset ar_AR en_XX test "IWSLT17.TED.tst2017"
197
+ generate_subset en_XX ko_KR test "IWSLT17.TED.tst2017"
198
+ generate_subset ko_KR en_XX test "IWSLT17.TED.tst2017"
199
+
200
+ generate_subset en_XX it_IT test "IWSLT17.TED.tst2017.mltlng"
201
+ generate_subset it_IT en_XX test "IWSLT17.TED.tst2017.mltlng"
202
+ generate_subset en_XX nl_XX test "IWSLT17.TED.tst2017.mltlng"
203
+ generate_subset nl_XX en_XX test "IWSLT17.TED.tst2017.mltlng"
204
+
205
+ # normalize iwslt directions into x-en
206
+ pushd $raw
207
+ for lang in $langs; do
208
+ for split in test valid; do
209
+ x_en_f1=$split.$lang-en_XX.en_XX
210
+ x_en_f2=$split.$lang-en_XX.${lang}
211
+
212
+ en_x_f1=$split.en_XX-$lang.en_XX
213
+ en_x_f2=$split.en_XX-$lang.${lang}
214
+
215
+ if [ -f $en_x_f1 ] && [ ! -f $x_en_f1 ]; then
216
+ echo "cp $en_x_f1 $x_en_f1"
217
+ cp $en_x_f1 $x_en_f1
218
+ fi
219
+ if [ -f $en_x_f2 ] && [ ! -f $x_en_f2 ]; then
220
+ echo "cp $en_x_f2 $x_en_f2"
221
+ cp $en_x_f2 $x_en_f2
222
+ fi
223
+ done
224
+ done
225
+ popd
fairseq/examples/multilingual/data_scripts/download_lotus.sh ADDED
@@ -0,0 +1,46 @@
1
+ #!/bin/bash
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+
9
+ if [ -z $WORKDIR_ROOT ] ;
10
+ then
11
+ echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
12
+ exit
13
+ fi
14
+
15
+
16
+ SRCDIR=$WORKDIR_ROOT/indic_languages_corpus
17
+ DESTDIR=${WORKDIR_ROOT}/ML50/raw/
18
+ mkdir -p $SRCDIR
19
+ mkdir -p $DESTDIR
20
+
21
+ cd $SRCDIR
22
+ wget http://lotus.kuee.kyoto-u.ac.jp/WAT/indic-multilingual/indic_languages_corpus.tar.gz
23
+ tar -xvzf indic_languages_corpus.tar.gz
24
+
25
+ SRC_EXTRACT_DIR=$SRCDIR/indic_languages_corpus/bilingual
26
+
27
+ cp $SRC_EXTRACT_DIR/ml-en/train.ml $DESTDIR/train.ml_IN-en_XX.ml_IN
28
+ cp $SRC_EXTRACT_DIR/ml-en/train.en $DESTDIR/train.ml_IN-en_XX.en_XX
29
+ cp $SRC_EXTRACT_DIR/ml-en/dev.ml $DESTDIR/valid.ml_IN-en_XX.ml_IN
30
+ cp $SRC_EXTRACT_DIR/ml-en/dev.en $DESTDIR/valid.ml_IN-en_XX.en_XX
31
+ cp $SRC_EXTRACT_DIR/ml-en/test.ml $DESTDIR/test.ml_IN-en_XX.ml_IN
32
+ cp $SRC_EXTRACT_DIR/ml-en/test.en $DESTDIR/test.ml_IN-en_XX.en_XX
33
+
34
+ cp $SRC_EXTRACT_DIR/ur-en/train.ur $DESTDIR/train.ur_PK-en_XX.ur_PK
35
+ cp $SRC_EXTRACT_DIR/ur-en/train.en $DESTDIR/train.ur_PK-en_XX.en_XX
36
+ cp $SRC_EXTRACT_DIR/ur-en/dev.ur $DESTDIR/valid.ur_PK-en_XX.ur_PK
37
+ cp $SRC_EXTRACT_DIR/ur-en/dev.en $DESTDIR/valid.ur_PK-en_XX.en_XX
38
+ cp $SRC_EXTRACT_DIR/ur-en/test.ur $DESTDIR/test.ur_PK-en_XX.ur_PK
39
+ cp $SRC_EXTRACT_DIR/ur-en/test.en $DESTDIR/test.ur_PK-en_XX.en_XX
40
+
41
+ cp $SRC_EXTRACT_DIR/te-en/train.te $DESTDIR/train.te_IN-en_XX.te_IN
42
+ cp $SRC_EXTRACT_DIR/te-en/train.en $DESTDIR/train.te_IN-en_XX.en_XX
43
+ cp $SRC_EXTRACT_DIR/te-en/dev.te $DESTDIR/valid.te_IN-en_XX.te_IN
44
+ cp $SRC_EXTRACT_DIR/te-en/dev.en $DESTDIR/valid.te_IN-en_XX.en_XX
45
+ cp $SRC_EXTRACT_DIR/te-en/test.te $DESTDIR/test.te_IN-en_XX.te_IN
46
+ cp $SRC_EXTRACT_DIR/te-en/test.en $DESTDIR/test.te_IN-en_XX.en_XX
fairseq/examples/multilingual/data_scripts/download_ted_and_extract.py ADDED
@@ -0,0 +1,338 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ import itertools
8
+ import os
9
+ import csv
10
+ from collections import defaultdict
11
+ from six.moves import zip
12
+ import io
13
+ import wget
14
+ import sys
15
+
16
+ from subprocess import check_call, check_output
17
+
18
+ # scripts and data locations
19
+ CWD = os.getcwd()
20
+ UTILS = f"{CWD}/utils"
21
+
22
+ MOSES = f"{UTILS}/mosesdecoder"
23
+
24
+ WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)
25
+
26
+ if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
27
+ print('please specify your working directory root in OS environment variable WORKDIR_ROOT. Exiting...')
28
+ sys.exit(-1)
29
+
30
+
31
+ # please download mosesdecoder here:
32
+ detok_cmd = f'{MOSES}/scripts/tokenizer/detokenizer.perl'
33
+
34
+
35
+ def call(cmd):
36
+ print(f"Executing: {cmd}")
37
+ check_call(cmd, shell=True)
38
+
39
+ class MultiLingualAlignedCorpusReader(object):
40
+ """A class to read TED talk dataset
41
+ """
42
+
43
+ def __init__(self, corpus_path, delimiter='\t',
44
+ target_token=True, bilingual=True, corpus_type='file',
45
+ lang_dict={'source': ['fr'], 'target': ['en']},
46
+ eval_lang_dict=None, zero_shot=False,
47
+ detok=True,
48
+ ):
49
+
50
+ self.empty_line_flag = 'NULL'
51
+ self.corpus_path = corpus_path
52
+ self.delimiter = delimiter
53
+ self.bilingual = bilingual
54
+ self.lang_dict = lang_dict
55
+ self.lang_set = set()
56
+ self.target_token = target_token
57
+ self.zero_shot = zero_shot
58
+ self.eval_lang_dict = eval_lang_dict
59
+ self.corpus_type = corpus_type
60
+ self.detok = detok
61
+
62
+ for list_ in self.lang_dict.values():
63
+ for lang in list_:
64
+ self.lang_set.add(lang)
65
+
66
+ self.data = dict()
67
+ self.data['train'] = self.read_aligned_corpus(split_type='train')
68
+ self.data['test'] = self.read_aligned_corpus(split_type='test')
69
+ self.data['dev'] = self.read_aligned_corpus(split_type='dev')
70
+
71
+ def read_data(self, file_loc_):
72
+ data_list = list()
73
+ with io.open(file_loc_, 'r', encoding='utf8') as fp:
74
+ for line in fp:
75
+ try:
76
+ text = line.strip()
77
+ except IndexError:
78
+ text = self.empty_line_flag
79
+ data_list.append(text)
80
+ return data_list
81
+
82
+ def filter_text(self, dict_):
83
+ if self.target_token:
84
+ field_index = 1
85
+ else:
86
+ field_index = 0
87
+ data_dict = defaultdict(list)
88
+ list1 = dict_['source']
89
+ list2 = dict_['target']
90
+ for sent1, sent2 in zip(list1, list2):
91
+ try:
92
+ src_sent = ' '.join(sent1.split()[field_index: ])
93
+ except IndexError:
94
+ src_sent = 'NULL'
95
+
96
+ if src_sent.find(self.empty_line_flag) != -1 or len(src_sent) == 0:
97
+ continue
98
+
99
+ elif sent2.find(self.empty_line_flag) != -1 or len(sent2) == 0:
100
+ continue
101
+
102
+ else:
103
+ data_dict['source'].append(sent1)
104
+ data_dict['target'].append(sent2)
105
+ return data_dict
106
+
107
+ def read_file(self, split_type, data_type):
108
+ return self.data[split_type][data_type]
109
+
110
+ def save_file(self, path_, split_type, data_type, lang):
111
+ tok_file = tok_file_name(path_, lang)
112
+ with io.open(tok_file, 'w', encoding='utf8') as fp:
113
+ for line in self.data[split_type][data_type]:
114
+ fp.write(line + '\n')
115
+ if self.detok:
116
+ de_tok(tok_file, lang)
117
+
118
+ def add_target_token(self, list_, lang_id):
119
+ new_list = list()
120
+ token = '__' + lang_id + '__'
121
+ for sent in list_:
122
+ new_list.append(token + ' ' + sent)
123
+ return new_list
124
+
125
+ def read_from_single_file(self, path_, s_lang, t_lang):
126
+ data_dict = defaultdict(list)
127
+ with io.open(path_, 'r', encoding='utf8') as fp:
128
+ reader = csv.DictReader(fp, delimiter='\t', quoting=csv.QUOTE_NONE)
129
+ for row in reader:
130
+ data_dict['source'].append(row[s_lang])
131
+ data_dict['target'].append(row[t_lang])
132
+
133
+ if self.target_token:
134
+ text = self.add_target_token(data_dict['source'], t_lang)
135
+ data_dict['source'] = text
136
+
137
+ return data_dict['source'], data_dict['target']
138
+
139
+ def read_aligned_corpus(self, split_type='train'):
140
+ data_dict = defaultdict(list)
141
+ iterable = []
142
+ s_list = []
143
+ t_list = []
144
+
145
+ if self.zero_shot:
146
+ if split_type == "train":
147
+ iterable = zip(self.lang_dict['source'], self.lang_dict['target'])
148
+ else:
149
+ iterable = zip(self.eval_lang_dict['source'], self.eval_lang_dict['target'])
150
+
151
+ elif self.bilingual:
152
+ iterable = itertools.product(self.lang_dict['source'], self.lang_dict['target'])
153
+
154
+ for s_lang, t_lang in iterable:
155
+ if s_lang == t_lang:
156
+ continue
157
+ if self.corpus_type == 'file':
158
+ split_type_file_path = os.path.join(self.corpus_path,
159
+ "all_talks_{}.tsv".format(split_type))
160
+ s_list, t_list = self.read_from_single_file(split_type_file_path,
161
+ s_lang=s_lang,
162
+ t_lang=t_lang)
163
+ data_dict['source'] += s_list
164
+ data_dict['target'] += t_list
165
+ new_data_dict = self.filter_text(data_dict)
166
+ return new_data_dict
167
+
168
+
169
+ def read_langs(corpus_path):
170
+ split_type_file_path = os.path.join(corpus_path, 'extracted',
171
+ "all_talks_dev.tsv")
172
+ with io.open(split_type_file_path, 'r', encoding='utf8') as fp:
173
+ reader = csv.DictReader(fp, delimiter='\t', quoting=csv.QUOTE_NONE)
174
+ header = next(reader)
175
+ return [k for k in header.keys() if k != 'talk_name']
176
+
177
+ def extra_english(corpus_path, split):
178
+ split_type_file_path = os.path.join(corpus_path,
179
+ f"all_talks_{split}.tsv")
180
+ output_split_type_file_path = os.path.join(corpus_path,
181
+ f"all_talks_{split}.en")
182
+ with io.open(split_type_file_path, 'r', encoding='utf8') as fp, io.open(output_split_type_file_path, 'w', encoding='utf8') as fw:
183
+ reader = csv.DictReader(fp, delimiter='\t', quoting=csv.QUOTE_NONE)
184
+ for row in reader:
185
+ line = row['en']
186
+ fw.write(line + '\n')
187
+ de_tok(output_split_type_file_path, 'en')
188
+
189
+
190
+
191
+ def tok_file_name(filename, lang):
192
+ seps = filename.split('.')
193
+ seps.insert(-1, 'tok')
194
+ tok_file = '.'.join(seps)
195
+ return tok_file
196
+
197
+ def de_tok(tok_file, lang):
198
+ # seps = tok_file.split('.')
199
+ # seps.insert(-1, 'detok')
200
+ # de_tok_file = '.'.join(seps)
201
+ de_tok_file = tok_file.replace('.tok.', '.')
202
+ cmd = 'perl {detok_cmd} -l {lang} < {tok_file} > {de_tok_file}'.format(
203
+ detok_cmd=detok_cmd, tok_file=tok_file,
204
+ de_tok_file=de_tok_file, lang=lang[:2])
205
+ call(cmd)
206
+
207
+ def extra_bitex(
208
+ ted_data_path,
209
+ lsrc_lang,
210
+ ltrg_lang,
211
+ target_token,
212
+ output_data_path,
213
+ ):
214
+ def get_ted_lang(lang):
215
+ long_langs = ['pt-br', 'zh-cn', 'zh-tw', 'fr-ca']
216
+ if lang[:5] in long_langs:
217
+ return lang[:5]
218
+ elif lang[:4] =='calv':
219
+ return lang[:5]
220
+ elif lang in ['pt_BR', 'zh_CN', 'zh_TW', 'fr_CA']:
221
+ return lang.lower().replace('_', '-')
222
+ return lang[:2]
223
+ src_lang = get_ted_lang(lsrc_lang)
224
+ trg_lang = get_ted_lang(ltrg_lang)
225
+ train_lang_dict={'source': [src_lang], 'target': [trg_lang]}
226
+ eval_lang_dict = {'source': [src_lang], 'target': [trg_lang]}
227
+
228
+ obj = MultiLingualAlignedCorpusReader(corpus_path=ted_data_path,
229
+ lang_dict=train_lang_dict,
230
+ target_token=target_token,
231
+ corpus_type='file',
232
+ eval_lang_dict=eval_lang_dict,
233
+ zero_shot=False,
234
+ bilingual=True)
235
+
236
+ os.makedirs(output_data_path, exist_ok=True)
237
+ lsrc_lang = lsrc_lang.replace('-', '_')
238
+ ltrg_lang = ltrg_lang.replace('-', '_')
239
+ obj.save_file(output_data_path + f"/train.{lsrc_lang}-{ltrg_lang}.{lsrc_lang}",
240
+ split_type='train', data_type='source', lang=src_lang)
241
+ obj.save_file(output_data_path + f"/train.{lsrc_lang}-{ltrg_lang}.{ltrg_lang}",
242
+ split_type='train', data_type='target', lang=trg_lang)
243
+
244
+ obj.save_file(output_data_path + f"/test.{lsrc_lang}-{ltrg_lang}.{lsrc_lang}",
245
+ split_type='test', data_type='source', lang=src_lang)
246
+ obj.save_file(output_data_path + f"/test.{lsrc_lang}-{ltrg_lang}.{ltrg_lang}",
247
+ split_type='test', data_type='target', lang=trg_lang)
248
+
249
+ obj.save_file(output_data_path + f"/valid.{lsrc_lang}-{ltrg_lang}.{lsrc_lang}",
250
+ split_type='dev', data_type='source', lang=src_lang)
251
+ obj.save_file(output_data_path + f"/valid.{lsrc_lang}-{ltrg_lang}.{ltrg_lang}",
252
+ split_type='dev', data_type='target', lang=trg_lang)
253
+
254
+
255
+ def bar_custom(current, total, width=80):
256
+ print("Downloading: %d%% [%d / %d] Ks" % (current / total * 100, current / 1000, total / 1000), end='\r')
257
+
258
+
259
+ def download_and_extract(download_to, extract_to):
260
+ url = 'http://phontron.com/data/ted_talks.tar.gz'
261
+ filename = f"{download_to}/ted_talks.tar.gz"
262
+ if os.path.exists(filename):
263
+ print(f'{filename} has already been downloaded so skip')
264
+ else:
265
+ filename = wget.download(url, filename, bar=bar_custom)
266
+ if os.path.exists(f'{extract_to}/all_talks_train.tsv'):
267
+ print(f'Already extracted so skip')
268
+ else:
269
+ extract_cmd = f'tar xzfv "{filename}" -C "{extract_to}"'
270
+ call(extract_cmd)
271
+
272
+
273
+ if __name__ == "__main__":
274
+ import argparse
275
+ parser = argparse.ArgumentParser()
276
+ parser.add_argument('--ted_data_path', type=str, default=WORKDIR_ROOT, required=False)
277
+ parser.add_argument(
278
+ '--direction-list',
279
+ type=str,
280
+ # default=None,
281
+ #for ML50
282
+ default=(
283
+ "bn_IN-en_XX,he_IL-en_XX,fa_IR-en_XX,id_ID-en_XX,sv_SE-en_XX,pt_XX-en_XX,ka_GE-en_XX,ka_GE-en_XX,th_TH-en_XX,"
284
+ "mr_IN-en_XX,hr_HR-en_XX,uk_UA-en_XX,az_AZ-en_XX,mk_MK-en_XX,gl_ES-en_XX,sl_SI-en_XX,mn_MN-en_XX,"
285
+ #non-english directions
286
+ # "fr_XX-de_DE," # replaced with wmt20
287
+ # "ja_XX-ko_KR,es_XX-pt_XX,ru_RU-sv_SE,hi_IN-bn_IN,id_ID-ar_AR,cs_CZ-pl_PL,ar_AR-tr_TR"
288
+ ),
289
+ required=False)
290
+ parser.add_argument('--target-token', action='store_true', default=False)
291
+ parser.add_argument('--extract-all-english', action='store_true', default=False)
292
+
293
+ args = parser.parse_args()
294
+
295
+ import sys
296
+ import json
297
+
298
+ # TED Talks data directory
299
+ ted_data_path = args.ted_data_path
300
+
301
+ download_to = f'{ted_data_path}/downloads'
302
+ extract_to = f'{ted_data_path}/extracted'
303
+
304
+ #DESTDIR=${WORKDIR_ROOT}/ML50/raw/
305
+ output_path = f'{ted_data_path}/ML50/raw'
306
+ os.makedirs(download_to, exist_ok=True)
307
+ os.makedirs(extract_to, exist_ok=True)
308
+ os.makedirs(output_path, exist_ok=True)
309
+ download_and_extract(download_to, extract_to)
310
+
311
+
312
+ if args.extract_all_english:
313
+ for split in ['train', 'dev', 'test']:
314
+ extra_english(ted_data_path, split)
315
+ exit(0)
316
+ if args.direction_list is not None:
317
+ directions = args.direction_list.strip().split(',')
318
+ directions = [tuple(d.strip().split('-', 1)) for d in directions if d]
319
+ else:
320
+ langs = read_langs(ted_data_path)
321
+ # directions = [
322
+ # '{}.{}'.format(src, tgt)
323
+ # for src in langs
324
+ # for tgt in langs
325
+ # if src < tgt
326
+ # ]
327
+ directions = [('en', tgt) for tgt in langs if tgt != 'en']
328
+ print(f'num directions={len(directions)}: {directions}')
329
+
330
+ for src_lang, trg_lang in directions:
331
+ print('--working on {}-{}'.format(src_lang, trg_lang))
332
+ extra_bitex(
333
+ extract_to,
334
+ src_lang,
335
+ trg_lang,
336
+ target_token=args.target_token,
337
+ output_data_path=output_path
338
+ )
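A usage sketch for download_ted_and_extract.py, assuming mosesdecoder is available under ./utils for detokenization and the wget Python package is installed (the direction list shown is a small illustrative subset of the ML50 default):

    export WORKDIR_ROOT=/path/to/workdir
    pip install wget                              # used to fetch ted_talks.tar.gz
    python ./download_ted_and_extract.py \
        --ted_data_path $WORKDIR_ROOT \
        --direction-list bn_IN-en_XX,he_IL-en_XX
    # bitext is written to $WORKDIR_ROOT/ML50/raw as {train,valid,test}.{src}-{tgt}.{lang}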
fairseq/examples/multilingual/data_scripts/download_wat19_my.sh ADDED
@@ -0,0 +1,36 @@
1
+ #!/bin/bash
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+
9
+ if [ -z $WORKDIR_ROOT ] ;
10
+ then
11
+ echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
12
+ exit
13
+ fi
14
+
15
+
16
+ SRCDIR=$WORKDIR_ROOT/indic_languages_corpus
17
+ DESTDIR=$WORKDIR_ROOT/ML50/raw
18
+ mkdir -p $SRCDIR
19
+ mkdir -p $DESTDIR
20
+
21
+ WAT_MY_EN=wat2020.my-en.zip
22
+ cd $SRCDIR
23
+ # please refer to http://lotus.kuee.kyoto-u.ac.jp/WAT/my-en-data/ for latest URL if the following url expired
24
+ #- The data used for WAT2020 are identical to those used in WAT2019.
25
+ wget http://lotus.kuee.kyoto-u.ac.jp/WAT/my-en-data/$WAT_MY_EN
26
+ unzip $WAT_MY_EN
27
+
28
+
29
+ SRC_EXTRACT_DIR=$SRCDIR/wat2020.my-en/alt
30
+
31
+ cp $SRC_EXTRACT_DIR/train.alt.en $DESTDIR/train.my_MM-en_XX.en_XX
32
+ cp $SRC_EXTRACT_DIR/train.alt.my $DESTDIR/train.my_MM-en_XX.my_MM
33
+ cp $SRC_EXTRACT_DIR/dev.alt.en $DESTDIR/valid.my_MM-en_XX.en_XX
34
+ cp $SRC_EXTRACT_DIR/dev.alt.my $DESTDIR/valid.my_MM-en_XX.my_MM
35
+ cp $SRC_EXTRACT_DIR/test.alt.en $DESTDIR/test.my_MM-en_XX.en_XX
36
+ cp $SRC_EXTRACT_DIR/test.alt.my $DESTDIR/test.my_MM-en_XX.my_MM
fairseq/examples/multilingual/data_scripts/download_wmt19_and_before.py ADDED
@@ -0,0 +1,899 @@
1
+ from typing import NamedTuple, List
2
+ from urllib.parse import urlparse
3
+ import os, sys
4
+ import subprocess
5
+ from subprocess import check_call, check_output
6
+ import glob
7
+ import wget
8
+ import re
9
+ import multiprocessing as mp
10
+ from functools import partial
11
+ import pathlib
12
+ from collections import OrderedDict
13
+
14
+ WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)
15
+
16
+ if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
17
+ print('please specify your working directory root in OS environment variable WORKDIR_ROOT. Exiting...')
18
+ sys.exit(-1)
19
+
20
+ # scripts and data locations
21
+ CWD = os.getcwd()
22
+ UTILS = f"{CWD}/utils"
23
+
24
+ MOSES = f"{UTILS}/mosesdecoder"
25
+ SGM_TOOL = f'{MOSES}/scripts/ems/support/input-from-sgm.perl'
26
+
27
+ TMX2CORPUS = f"{UTILS}/tmx2corpus"
28
+ TMX_TOOL = f'python {TMX2CORPUS}/tmx2corpus.py'
29
+
30
+ to_data_path = f'{WORKDIR_ROOT}/wmt'
31
+ download_to = f'{to_data_path}/downloads'
32
+ manually_downloads = f'{to_data_path}/downloads'
33
+ extract_to = f'{to_data_path}/extracted'
34
+ #DESTDIR=${WORKDIR_ROOT}/ML50/raw/
35
+ raw_data = f'{WORKDIR_ROOT}/ML50/raw'
36
+ ####
37
+
38
+ class DLDataset(NamedTuple):
39
+ name: str
40
+ train_urls: List[str]
41
+ valid_urls: List[str]
42
+ test_urls: List[str]
43
+ train_files_patterns: List[str] = []
44
+ valid_files_patterns: List[str] = []
45
+ test_files_patterns: List[str] = []
46
+
47
+
48
+
49
+ def bar_custom(current, total, width=80):
50
+ print("Downloading: %d%% [%d / %d] Ks" % (current / total * 100, current / 1000, total / 1000), end='\r')
51
+
52
+ def get_downloaded_file(dl_folder, url):
53
+ if isinstance(url, tuple):
54
+ url, f = url
55
+ else:
56
+ url_f = urlparse(url)
57
+ # f = os.path.split(url_f.path)[-1]
58
+ f = '_'.join(url_f.path.split('/')[1:])
59
+ return url, f"{dl_folder}/{f}"
60
+
61
+ def download_parts_and_combine(dl_folder, urls, filename):
62
+ parts = []
63
+ for url_record in urls:
64
+ url, part_file = get_downloaded_file(dl_folder, url_record)
65
+ if os.path.exists(part_file):
66
+ print(f'{part_file} has already been downloaded so skip')
67
+ else:
68
+ part_file = wget.download(url, part_file, bar=bar_custom)
69
+ parts.append(part_file)
70
+
71
+ def get_combine_cmd(parts):
72
+ #default as tar.gz.??
73
+ return f'cat {" ".join(parts)} > {filename}'
74
+
75
+ combine_cmd = get_combine_cmd(parts)
76
+ call(combine_cmd, debug=True)
77
+ return filename
78
+
79
+ def download_a_url(dl_folder, url):
80
+ url, filename = get_downloaded_file(dl_folder, url)
81
+ if os.path.exists(filename):
82
+ print(f'{filename} has already been downloaded so skip')
83
+ return filename
84
+
85
+ print(f'downloading {url} to {filename}')
86
+ if isinstance(url, list) or isinstance(url, tuple):
87
+ download_parts_and_combine(dl_folder, url, filename)
88
+ else:
89
+ wget.download(url, filename, bar=bar_custom)
90
+ print(f'downloaded: {filename}')
91
+ return filename
92
+
93
+ def download_files(dl_folder, urls, completed_urls={}):
94
+ for url_record in urls:
95
+ url, _ = get_downloaded_file(dl_folder, url_record)
96
+ filename = download_a_url(dl_folder, url_record)
97
+ completed_urls[str(url)] = filename
98
+ return completed_urls
99
+
100
+ def check_need_manual_downalod(dl_folder, to_manually_download_urls):
101
+ to_be_manually_dowloaded = []
102
+ manually_completed_urls = {}
103
+ for url_record, instruction in to_manually_download_urls:
104
+ url, filename = get_downloaded_file(dl_folder, url_record)
105
+ if not os.path.exists(filename):
106
+ print(f'{url} needs to be downloaded manually; please follow {instruction} and copy it to {filename}')
107
+ to_be_manually_dowloaded.append((url, filename))
108
+ else:
109
+ manually_completed_urls[url] = filename
110
+ # if len(to_be_manually_dowloaded) > 0:
111
+ # raise ValueError('Missing files that need to be downloaded manually; stop the process now.')
112
+ return to_be_manually_dowloaded
113
+
114
+ def download_dataset(to_folder, dl_dataset, completed_urls={}):
115
+ download_files(to_folder, dl_dataset.train_urls, completed_urls)
116
+ download_files(to_folder, dl_dataset.valid_urls, completed_urls)
117
+ download_files(to_folder, dl_dataset.test_urls, completed_urls)
118
+ print('completed downloading')
119
+ return completed_urls
120
+
121
+ def call(cmd, debug=False):
122
+ if debug:
123
+ print(cmd)
124
+ check_call(cmd, shell=True)
125
+
126
+
127
+ def get_extract_name(file_path):
128
+ path = os.path.split(file_path)
129
+ return path[-1] + '_extract' #.split('.')[0]
130
+
131
+ def extract_file(downloaded_file, extract_folder, get_extract_name=get_extract_name, debug=False):
132
+ extract_name = get_extract_name(downloaded_file)
133
+ extract_to = f'{extract_folder}/{extract_name}'
134
+ os.makedirs(extract_to, exist_ok=True)
135
+ if os.path.exists(f'{extract_to}/DONE'):
136
+ print(f'{downloaded_file} has already been extracted to {extract_to} so skip')
137
+ return extract_to
138
+ def get_extract_cmd(filename):
139
+ if filename.endswith('.tgz') or filename.endswith('tar.gz'):
140
+ return f'tar xzfv {filename} -C {extract_to}'
141
+ elif filename.endswith('.gz.tar'):
142
+ return f'tar xfv {filename} -C {extract_to}; (cd {extract_to}; gzip -d *.gz; [ $? -eq 0 ] || gzip -d */*.gz)'
143
+ elif filename.endswith('.tar'):
144
+ return f'tar xfv {filename} -C {extract_to}'
145
+ elif filename.endswith('.gz'):
146
+ return f'cp {filename} {extract_to}; (cd {extract_to}; gzip -d *.gz)'
147
+ elif filename.endswith('.zip'):
148
+ return f'unzip {filename} -d {extract_to}'
149
+ extract_cmd = get_extract_cmd(downloaded_file)
150
+ print(f'extracting {downloaded_file}')
151
+ if isinstance(extract_cmd, list):
152
+ for c in extract_cmd:
153
+ call(c, debug=debug)
154
+ else:
155
+ call(extract_cmd, debug=debug)
156
+ call(f'echo DONE > {extract_to}/DONE')
157
+ return extract_to
158
+
159
+
160
+ def extract_all_files(
161
+ completed_urls, extract_folder,
162
+ get_extract_name=get_extract_name,
163
+ completed_extraction={},
164
+ debug=False):
165
+ extracted_folders = OrderedDict()
166
+ for url, downloaded_file in set(completed_urls.items()):
167
+ if downloaded_file in completed_extraction:
168
+ print(f'{downloaded_file} is already extracted; so skip')
169
+ continue
170
+ folder = extract_file(downloaded_file, extract_folder, get_extract_name, debug)
171
+ extracted_folders[url] = folder
172
+ return extracted_folders
173
+
174
+
175
+ def my_glob(folder):
176
+ for p in [f'{folder}/*', f'{folder}/*/*', f'{folder}/*/*/*']:
177
+ for f in glob.glob(p):
178
+ yield f
179
+
180
+
181
+ def sgm2raw(sgm, debug):
182
+ to_file = sgm[0:len(sgm) - len('.sgm')]
183
+ if os.path.exists(to_file):
184
+ debug and print(f'{sgm} already converted to {to_file}; so skip')
185
+ return to_file
186
+ cmd = f'{SGM_TOOL} < {sgm} > {to_file}'
187
+ call(cmd, debug)
188
+ return to_file
189
+
190
+ def tmx2raw(tmx, debug):
191
+ to_file = tmx[0:len(tmx) - len('.tmx')]
192
+ to_folder = os.path.join(*os.path.split(tmx)[:-1])
193
+ if os.path.exists(f'{to_folder}/bitext.en'):
194
+ debug and print(f'{tmx} already extracted to {to_file}; so skip')
195
+ return to_file
196
+ cmd = f'(cd {to_folder}; {TMX_TOOL} {tmx})'
197
+ call(cmd, debug)
198
+ return to_file
199
+
200
+ CZENG16_REGEX = re.compile(r'.*?data.plaintext-format/0[0-9]train$')
201
+ WMT19_WIKITITLES_REGEX = re.compile(r'.*?wikititles-v1.(\w\w)-en.tsv.gz')
202
+ TSV_REGEX = re.compile(r'.*?(\w\w)-(\w\w).tsv$')
203
+
204
+
205
+
206
+ def cut_wikitles(wiki_file, debug):
207
+ # different languages have different file names:
208
+ if wiki_file.endswith('wiki/fi-en/titles.fi-en'):
209
+ to_file1 = f'{wiki_file}.fi'
210
+ to_file2 = f'{wiki_file}.en'
211
+ BACKSLASH = '\\'
212
+ cmd1 = f"cat {wiki_file} | sed 's/|||/{BACKSLASH}t/g' |cut -f1 |awk '{{$1=$1}};1' > {to_file1}"
213
+ cmd2 = f"cat {wiki_file} | sed 's/|||/{BACKSLASH}t/g' |cut -f2 |awk '{{$1=$1}};1' > {to_file2}"
214
+ # elif WMT19_WIKITITLES_REGEX.match(wiki_file):
215
+ # src = WMT19_WIKITITLES_REGEX.match(wiki_file).groups()[0]
216
+ # to_file1 = f'{wiki_file}.{src}'
217
+ # to_file2 = f'{wiki_file}.en'
218
+ # cmd1 = f"cat {wiki_file} | cut -f1 |awk '{{$1=$1}};1' > {to_file1}"
219
+ # cmd2 = f"cat {wiki_file} | cut -f2 |awk '{{$1=$1}};1' > {to_file2}"
220
+ else:
221
+ return None
222
+ if os.path.exists(to_file1) and os.path.exists(to_file2):
223
+ debug and print(f'{wiki_file} already processed to {to_file1} and {to_file2}; so skip')
224
+ return wiki_file
225
+
226
+ call(cmd1, debug=debug)
227
+ call(cmd2, debug=debug)
228
+ return wiki_file
229
+
230
+ def cut_tsv(file, debug):
231
+ m = TSV_REGEX.match(file)
232
+ if m is None:
233
+ raise ValueError(f'{file} is not matching tsv pattern')
234
+ src = m.groups()[0]
235
+ tgt = m.groups()[1]
236
+
237
+ to_file1 = f'{file}.{src}'
238
+ to_file2 = f'{file}.{tgt}'
239
+ cmd1 = f"cat {file} | cut -f1 |awk '{{$1=$1}};1' > {to_file1}"
240
+ cmd2 = f"cat {file} | cut -f2 |awk '{{$1=$1}};1' > {to_file2}"
241
+ if os.path.exists(to_file1) and os.path.exists(to_file2):
242
+ debug and print(f'{file} already processed to {to_file1} and {to_file2}; so skip')
243
+ return file
244
+
245
+ call(cmd1, debug=debug)
246
+ call(cmd2, debug=debug)
247
+ return file
248
+
249
+
250
+ def convert_file_if_needed(file, debug):
251
+ if file.endswith('.sgm'):
252
+ return sgm2raw(file, debug)
253
+ elif file.endswith('.tmx'):
254
+ return tmx2raw(file, debug)
255
+ elif file.endswith('wiki/fi-en/titles.fi-en'):
256
+ return cut_wikitles(file, debug)
257
+ # elif WMT19_WIKITITLES_REGEX.match(file):
258
+ # return cut_wikitles(file, debug)
259
+ elif file.endswith('.tsv'):
260
+ return cut_tsv(file, debug)
261
+ elif CZENG16_REGEX.match(file):
262
+ return convert2czeng17(file, debug)
263
+ else:
264
+ return file
265
+
266
+
267
+ def convert_files_if_needed(extracted_foldrs, my_glob=my_glob, debug=False):
268
+ return {
269
+ url: list(sorted(set(convert_file_if_needed(f, debug) for f in sorted(set(my_glob(folder))))))
270
+ for url, folder in extracted_foldrs.items()
271
+ }
272
+
273
+ def match_patt(file_path, file_pattern, src, tgt, lang):
274
+ return file_pattern.format(src=src, tgt=tgt, lang=lang) in file_path
275
+
276
+ def match_patts(file_path, file_patterns, src, tgt, lang):
277
+ for file_pattern in file_patterns:
278
+ params = { k: v for k, v in [('src', src), ('tgt', tgt), ('lang', lang)] if k in file_pattern}
279
+ matching = file_pattern.format(**params)
280
+
281
+ if isinstance(file_pattern, tuple):
282
+ pattern, directions = file_pattern
283
+ if f'{src}-{tgt}' in directions and matching in file_path:
284
+ return True
285
+ else:
286
+ if matching in file_path:
287
+ return True
288
+ return False
289
+
290
+ def extracted_glob(extracted_folder, file_patterns, src, tgt, lang):
291
+ def get_matching_pattern(file_pattern):
292
+ params = {
293
+ k: v
294
+ for k, v in [('src', src), ('tgt', tgt), ('lang', lang)]
295
+ if '{' + k + '}' in file_pattern
296
+ }
297
+ file_pattern = re.sub(r'{src:(.*?)}', r'\1' if lang == src else '', file_pattern)
298
+ file_pattern = re.sub(r'{tgt:(.*?)}', r'\1' if lang == tgt else '', file_pattern)
299
+ file_pattern = file_pattern.format(**params)
300
+ return file_pattern
301
+ for file_pattern in file_patterns:
302
+ if isinstance(file_pattern, tuple):
303
+ file_pattern, lang_pairs = file_pattern
304
+ if f'{src}-{tgt}' not in lang_pairs:
305
+ continue
306
+ # print('working on pattern: ', file_pattern, lang_pairs )
307
+ matching_pattern = get_matching_pattern(file_pattern)
308
+ if matching_pattern is None:
309
+ continue
310
+ glob_patterns = f'{extracted_folder}/{matching_pattern}'
311
+ # print('glob_patterns: ', glob_patterns)
312
+ for f in glob.glob(glob_patterns):
313
+ yield f
314
+
315
+ # for debug usage
316
+ def all_extracted_files(split, src, tgt, extracted_folders, split_urls):
317
+ def get_url(url):
318
+ if isinstance(url, tuple):
319
+ url, downloaded_file = url
320
+ return url
321
+ return [
322
+ f
323
+ for url in split_urls
324
+ for f in my_glob(extracted_folders[str(get_url(url))])
325
+ ]
326
+
327
+ def concat_files(split, src, tgt, extracted_folders, split_urls, path_patterns, to_folder, debug=False):
328
+ # if debug:
329
+ # print('extracted files to be filtered by patterns: ',
330
+ # '\n\t'.join(sorted(all_extracted_files(split, src, tgt, extracted_folders, split_urls))))
331
+ for lang in [src, tgt]:
332
+ to_file = f'{to_folder}/{split}.{src}-{tgt}.{lang}'
333
+ s_src, s_tgt, s_lang = src.split('_')[0], tgt.split('_')[0], lang.split('_')[0]
334
+ files = []
335
+ for url in split_urls:
336
+ if isinstance(url, tuple):
337
+ url, downloaded_file = url
338
+ if str(url) not in extracted_folders:
339
+ print(f'warning: {url} not in extracted files')
340
+ for extracted_file in set(
341
+ extracted_glob(
342
+ extracted_folders[str(url)], path_patterns,
343
+ s_src, s_tgt, s_lang)):
344
+ files.append(extracted_file)
345
+ if len(files) == 0:
346
+ print('warning: ', f'No files found for split {to_file}')
347
+ continue
348
+ files = sorted(set(files))
349
+ print(f'concating {len(files)} files into {to_file}')
350
+ cmd = ['cat'] + [f'"{f}"' for f in files] + [f'>{to_file}']
351
+ cmd = " ".join(cmd)
352
+ call(cmd, debug=debug)
353
+
354
+ UTILS = os.path.join(pathlib.Path(__file__).parent, 'utils')
355
+ LID_MODEL = f'{download_to}/lid.176.bin'
356
+ LID_MULTI = f'{UTILS}/fasttext_multi_filter.py'
357
+
358
+ def lid_filter(split, src, tgt, from_folder, to_folder, debug=False):
359
+ if not os.path.exists(LID_MODEL):
360
+ call(f'wget -nc https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -O {LID_MODEL}')
361
+ from_prefix = f'{from_folder}/{split}.{src}-{tgt}'
362
+ to_prefix = f'{to_folder}/{split}.{src}-{tgt}'
363
+ if os.path.exists(f'{from_prefix}.{src}') and os.path.exists(f'{from_prefix}.{tgt}'):
364
+ s_src, s_tgt = src.split('_')[0], tgt.split('_')[0]
365
+ cmd = (
366
+ f'python {LID_MULTI} --model {LID_MODEL} --inputs {from_prefix}.{src} {from_prefix}.{tgt} '
367
+ f'--langs {s_src} {s_tgt} --outputs {to_prefix}.{src} {to_prefix}.{tgt}'
368
+ )
369
+ print(f'filtering {from_prefix}')
370
+ call(cmd, debug=debug)
371
+
372
+ def concat_into_splits(dl_dataset, src, tgt, extracted_folders, to_folder, debug):
373
+ to_folder_tmp = f"{to_folder}_tmp"
374
+ os.makedirs(to_folder_tmp, exist_ok=True)
375
+ concat_files('train', src, tgt,
376
+ extracted_folders,
377
+ split_urls=dl_dataset.train_urls,
378
+ path_patterns=dl_dataset.train_files_patterns,
379
+ to_folder=to_folder_tmp, debug=debug)
380
+ lid_filter('train', src, tgt, to_folder_tmp, to_folder, debug)
381
+
382
+ concat_files('valid', src, tgt,
383
+ extracted_folders,
384
+ split_urls=dl_dataset.valid_urls,
385
+ path_patterns=dl_dataset.valid_files_patterns,
386
+ to_folder=to_folder, debug=debug)
387
+ concat_files('test', src, tgt,
388
+ extracted_folders,
389
+ split_urls=dl_dataset.test_urls,
390
+ path_patterns=dl_dataset.test_files_patterns,
391
+ to_folder=to_folder, debug=debug)
392
+
393
+
394
+ def download_multi(dl_folder, extract_folder, urls, num_processes=8, debug=False):
395
+ pool = mp.Pool(processes=num_processes)
396
+ download_f = partial(download_a_url, dl_folder)
397
+ downloaded_files = list(pool.imap_unordered(download_f, urls))
398
+ pool.close()
399
+ pool.join()
400
+
401
+ BLEU_REGEX = re.compile("^BLEU\\S* = (\\S+) ")
402
+ def run_eval_bleu(cmd):
403
+ output = check_output(cmd, shell=True, stderr=subprocess.STDOUT).decode("utf-8").strip()
404
+ print(output)
405
+ bleu = -1.0
406
+ for line in output.strip().split('\n'):
407
+ m = BLEU_REGEX.search(line)
408
+ if m is not None:
409
+ bleu = m.groups()[0]
410
+ bleu = float(bleu)
411
+ break
412
+ return bleu
413
+
414
+ def check_wmt_test_bleu(raw_folder, wmt_lang_pairs):
415
+ not_matchings = []
416
+ for wmt, src_tgts in wmt_lang_pairs:
417
+ for src_tgt in src_tgts:
418
+ print(f'checking test bleus for: {src_tgt} at {wmt}')
419
+ src, tgt = src_tgt.split('-')
420
+ ssrc, stgt = src[:2], tgt[:2]
421
+ if os.path.exists(f'{raw_folder}/test.{tgt}-{src}.{src}'):
422
+ # reversed direction may have different test set
423
+ test_src = f'{raw_folder}/test.{tgt}-{src}.{src}'
424
+ else:
425
+ test_src = f'{raw_folder}/test.{src}-{tgt}.{src}'
426
+ cmd1 = f'cat {test_src} | sacrebleu -t "{wmt}" -l {stgt}-{ssrc}; [ $? -eq 0 ] || echo ""'
427
+ test_tgt = f'{raw_folder}/test.{src}-{tgt}.{tgt}'
428
+ cmd2 = f'cat {test_tgt} | sacrebleu -t "{wmt}" -l {ssrc}-{stgt}; [ $? -eq 0 ] || echo ""'
429
+ bleu1 = run_eval_bleu(cmd1)
430
+ if bleu1 != 100.0:
431
+ not_matchings.append(f'{wmt}:{src_tgt} source side not matching: {test_src}')
432
+ bleu2 = run_eval_bleu(cmd2)
433
+ if bleu2 != 100.0:
434
+ not_matchings.append(f'{wmt}:{src_tgt} target side not matching: {test_tgt}')
435
+ return not_matchings
436
+
437
+ def download_and_extract(
438
+ to_folder, lang_pairs, dl_dataset,
439
+ to_manually_download_urls,
440
+ completed_urls={}, completed_extraction={},
441
+ debug=False):
442
+
443
+ dl_folder = f'{to_folder}/downloads'
444
+ extract_folder = f'{to_folder}/extracted'
445
+ raw_folder = f'{to_folder}/raw'
446
+ lid_filtered = f'{to_folder}/lid_filtered'
447
+
448
+ os.makedirs(extract_folder, exist_ok=True)
449
+ os.makedirs(raw_folder, exist_ok=True)
450
+ os.makedirs(lid_filtered, exist_ok=True)
451
+
452
+
453
+ to_be_manually_dowloaded = check_need_manual_downalod(dl_folder, to_manually_download_urls)
454
+
455
+ completed_urls = download_dataset(
456
+ dl_folder, dl_dataset, completed_urls)
457
+ if debug:
458
+ print('completed urls: ', completed_urls)
459
+
460
+
461
+ extracted_folders = extract_all_files(
462
+ completed_urls,
463
+ extract_folder=extract_folder,
464
+ completed_extraction=completed_extraction,
465
+ debug=debug)
466
+ if debug:
467
+ print('download files have been extracted to folders: ', extracted_folders)
468
+
469
+ converted_files = convert_files_if_needed(extracted_folders, debug=False)
470
+ for src_tgt in lang_pairs:
471
+ print(f'working on {dl_dataset.name}: {src_tgt}')
472
+ src, tgt = src_tgt.split('-')
473
+ concat_into_splits(dl_dataset,
474
+ src=src, tgt=tgt,
475
+ extracted_folders=extracted_folders,
476
+ to_folder=raw_folder, debug=debug)
477
+ print('completed data into: ', raw_folder)
478
+
479
+ def download_czang16(download_to, username=None):
480
+ wgets = [
481
+ f'wget --user={username} --password=czeng -P {download_to} http://ufallab.ms.mff.cuni.cz/~bojar/czeng16-data/data-plaintext-format.{i}.tar'
482
+ for i in range(10)]
483
+ cmds = []
484
+ for i, cmd in enumerate(wgets):
485
+ filename = f'{download_to}/data-plaintext-format.{i}.tar'
486
+ if os.path.exists(filename):
487
+ print(f'{filename} has already been downloaded; so skip')
488
+ continue
489
+ cmds.append(cmd)
490
+ if cmds and username is None:
491
+ raise ValueError('No czeng username is given; please register at http://ufal.mff.cuni.cz/czeng/czeng16 to obtain a username for downloading')
492
+ for cmd in cmds:
493
+ call(cmd)
494
+ print('done with downloading czeng1.6')
495
+
496
+ def download_czeng17_script(download_to, extract_folder, debug=False):
497
+ url = 'http://ufal.mff.cuni.cz/czeng/download.php?f=convert_czeng16_to_17.pl.zip'
498
+ filename = f'{download_to}/convert_czeng16_to_17.pl.zip'
499
+ extract_to = f'{extract_folder}/{get_extract_name(filename)}'
500
+ script_path = f'{extract_to}/convert_czeng16_to_17.pl'
501
+
502
+ if not os.path.exists(script_path):
503
+ wget.download(url, filename, bar=bar_custom)
504
+ extract_to = extract_file(f'{download_to}/convert_czeng16_to_17.pl.zip', extract_folder, get_extract_name=get_extract_name, debug=debug)
505
+ return script_path
506
+
507
+ czeng17_script_path = ""
508
+ def convert2czeng17(file, debug):
509
+ en_file = f'{file}.en'
510
+ cs_file = f'{file}.cs'
511
+
512
+ if not os.path.exists(en_file) or not os.path.exists(cs_file):
513
+ cs_cmd = f'cat {file} | perl {czeng17_script_path} | cut -f3 > {cs_file}'
514
+ en_cmd = f'cat {file} | perl {czeng17_script_path} | cut -f4 > {en_file}'
515
+ call(cs_cmd, debug)
516
+ call(en_cmd, debug)
517
+ else:
518
+ print(f'already extracted: {en_file} and {cs_file}')
519
+ return file
520
+
521
+ def extract_czeng17(extract_folder, debug=False):
522
+ url = 'http://ufal.mff.cuni.cz/czeng/download.php?f=convert_czeng16_to_17.pl.zip'
523
+ filename = f'{download_to}/convert_czeng16_to_17.pl.zip'
524
+ extract_to = f'{extract_folder}/{get_extract_name(filename)}'
525
+ script_path = f'{extract_to}/convert_czeng16_to_17.pl'
526
+
527
+ if not os.path.exists(script_path):
528
+ wget.download(url, filename, bar=bar_custom)
529
+ extract_to = extract_file(f'{download_to}/convert_czeng16_to_17.pl.zip', extract_folder, get_extract_name=get_extract_name, debug=debug)
530
+ return script_path
531
+
532
+ #########
533
+ # definitions of wmt data sources
534
+ # for es-en
535
+ # Punctuation in the official test sets will be encoded with ASCII characters (not complex Unicode characters) as much as possible. You may want to normalize your system's output before submission. You are able to use a rawer version of the test sets that does not have this normalization.
536
+ # script to normalize punctuation: http://www.statmt.org/wmt11/normalize-punctuation.perl
537
+ wmt13_es_en = DLDataset(
538
+ name='wmt13_es-en',
539
+ train_urls=[
540
+ 'http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz',
541
+ 'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz',
542
+ 'http://www.statmt.org/wmt13/training-parallel-un.tgz',
543
+ 'http://www.statmt.org/wmt13/training-parallel-nc-v8.tgz',
544
+ ],
545
+ valid_urls=[
546
+ ('http://www.statmt.org/wmt13/dev.tgz', 'wmt13_dev.tgz')
547
+ ],
548
+ test_urls=[
549
+ ('http://www.statmt.org/wmt13/test.tgz', 'wmt13_test.tgz')
550
+ ],
551
+ train_files_patterns=[
552
+ ('*/europarl-v7.{src}-{tgt}.{lang}', ['es-en']),
553
+ ('*commoncrawl.{src}-{tgt}.{lang}', ['es-en']),
554
+ ('*/news-commentary-v8.{src}-{tgt}.{lang}', ['es-en']),
555
+ ('un/*undoc.2000.{src}-{tgt}.{lang}', ['es-en']),
556
+ ] ,
557
+ valid_files_patterns=[
558
+ ('dev/newstest2012.{lang}', ['es-en'])
559
+ ],
560
+ test_files_patterns=[
561
+ ('test/newstest*.{lang}', ['es-en'])
562
+ ],
563
+ )
564
+
565
+ wmt14_de_fr_en = DLDataset(
566
+ name='wmt14_de_fr_en',
567
+ train_urls=[
568
+ 'http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz',
569
+ 'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz',
570
+ 'http://www.statmt.org/wmt13/training-parallel-un.tgz',
571
+ 'http://www.statmt.org/wmt14/training-parallel-nc-v9.tgz',
572
+ ('http://www.statmt.org/wmt10/training-giga-fren.tar', 'training-giga-fren.gz.tar'), # it is actually a gz.tar
573
+ ],
574
+ valid_urls=[
575
+ ('http://www.statmt.org/wmt14/dev.tgz', 'wmt14_dev.tgz'),
576
+ ],
577
+ test_urls=[
578
+ ('http://www.statmt.org/wmt14/test-full.tgz', 'wmt14_test_full.tgz'), # cleaned test sets
579
+ ],
580
+ train_files_patterns=[
581
+ ('*/europarl-v7.{src}-{tgt}.{lang}', ['fr-en', 'de-en']),
582
+ ('*commoncrawl.{src}-{tgt}.{lang}', ['fr-en', 'de-en']),
583
+ ('*/*news-commentary-v9.{src}-{tgt}.{lang}', ['fr-en', 'de-en']),
584
+ ('un/undoc.2000.{src}-{tgt}.{lang}', ['fr-en']),
585
+ ('*giga-{src}{tgt}*{lang}', ['fr-en'])
586
+ ],
587
+ valid_files_patterns=[
588
+ ('dev/newstest2013.{lang}', ['fr-en', 'de-en'])
589
+ ],
590
+ test_files_patterns=[
591
+ ('test-full/newstest*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['en-de', 'de-en', 'fr-en', 'en-fr']),
592
+ ],
593
+ )
594
+
595
+ # pip install git+https://github.com/amake/tmx2corpus.git
596
+ wmt16_ro_en = DLDataset(
597
+ name='wmt16_ro-en',
598
+ train_urls=[
599
+ ('http://data.statmt.org/wmt16/translation-task/training-parallel-ep-v8.tgz', 'wmt16_training-parallel-ep-v8.tgz'),
600
+ ('http://opus.nlpl.eu/download.php?f=SETIMES/v2/tmx/en-ro.tmx.gz', 'en-ro.tmx.gz'),
601
+ ],
602
+ valid_urls=[
603
+ ('http://data.statmt.org/wmt16/translation-task/dev-romanian-updated.tgz', 'wmt16_dev.tgz')
604
+ ],
605
+ test_urls=[
606
+ ('http://data.statmt.org/wmt16/translation-task/test.tgz', 'wmt16_test.tgz')
607
+ ],
608
+ train_files_patterns=[
609
+ ('*/*europarl-v8.{src}-{tgt}.{lang}', ['ro-en']),
610
+ ('bitext.{lang}', ['ro-en']) #setimes from tmux
611
+ ] ,
612
+ valid_files_patterns=[
613
+ ('dev/newsdev2016*{src}{tgt}*.{lang}', ['ro-en', 'ro-en'])
614
+ ],
615
+ test_files_patterns=[
616
+ ('test/newstest*{src}{tgt}*.{lang}', ['ro-en', 'en-ro'])
617
+ ],
618
+ )
619
+
620
+ cwmt_wmt_instruction = 'cwmt download instruction at: http://nlp.nju.edu.cn/cwmt-wmt'
621
+ wmt17_fi_lv_tr_zh_en_manual_downloads = [
622
+ # fake urls to have unique keys for the data
623
+ ( ('http://nlp.nju.edu.cn/cwmt-wmt/CASIA2015.zip', 'CASIA2015.zip'), cwmt_wmt_instruction),
624
+ ( ('http://nlp.nju.edu.cn/cwmt-wmt/CASICT2011.zip', 'CASICT2011.zip'), cwmt_wmt_instruction),
625
+ ( ('http://nlp.nju.edu.cn/cwmt-wmt/CASICT2015.zip', 'CASICT2015.zip'), cwmt_wmt_instruction),
626
+ ( ('http://nlp.nju.edu.cn/cwmt-wmt/Datum2015.zip', 'Datum2015.zip'), cwmt_wmt_instruction),
627
+ ( ('http://nlp.nju.edu.cn/cwmt-wmt/Datum2017.zip', 'Datum2017.zip'), cwmt_wmt_instruction),
628
+ ( ('http://nlp.nju.edu.cn/cwmt-wmt/NEU2017.zip', 'NEU2017.zip'), cwmt_wmt_instruction),
629
+ ]
630
+ wmt17_fi_lv_tr_zh_en = DLDataset(
631
+ name='wmt17_fi_lv_tr_zh_en',
632
+ train_urls=[
633
+ ('http://data.statmt.org/wmt17/translation-task/training-parallel-ep-v8.tgz', 'wmt17_training-parallel-ep-v8.tgz'),
634
+ 'http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz',
635
+ 'http://www.statmt.org/wmt15/wiki-titles.tgz',
636
+ ('http://opus.nlpl.eu/download.php?f=SETIMES/v2/tmx/en-tr.tmx.gz', 'en-tr.tmx.gz'),
637
+ ('http://data.statmt.org/wmt17/translation-task/rapid2016.tgz', 'wmt17_rapid2016.tgz'),
638
+ 'http://data.statmt.org/wmt17/translation-task/leta.v1.tgz',
639
+ 'http://data.statmt.org/wmt17/translation-task/dcep.lv-en.v1.tgz',
640
+ 'http://data.statmt.org/wmt17/translation-task/books.lv-en.v1.tgz',
641
+ (('https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.00',
642
+ 'https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.01',), 'UNv1.0.en-zh.tar.gz'),
643
+ #manually download files:
644
+ ('http://nlp.nju.edu.cn/cwmt-wmt/CASIA2015.zip', 'CASIA2015.zip'),
645
+ ('http://nlp.nju.edu.cn/cwmt-wmt/CASICT2011.zip', 'CASICT2011.zip'),
646
+ ('http://nlp.nju.edu.cn/cwmt-wmt/CASICT2015.zip', 'CASICT2015.zip'),
647
+ ('http://nlp.nju.edu.cn/cwmt-wmt/Datum2015.zip', 'Datum2015.zip'),
648
+ ('http://nlp.nju.edu.cn/cwmt-wmt/Datum2017.zip', 'Datum2017.zip'),
649
+ ('http://nlp.nju.edu.cn/cwmt-wmt/NEU2017.zip', 'NEU2017.zip'),
650
+ ],
651
+ valid_urls=[
652
+ ('http://data.statmt.org/wmt17/translation-task/dev.tgz', 'wmt17_dev.tgz'),
653
+ ],
654
+ test_urls=[
655
+ #NEW: Improved translations for zh test sets
656
+ ('http://data.statmt.org/wmt17/translation-task/test-update-1.tgz', 'wmt17_test_zh_en.tgz'),
657
+ ('http://data.statmt.org/wmt17/translation-task/test.tgz', 'wmt17_test_others.tgz')
658
+ ],
659
+ train_files_patterns=[
660
+ ('casict*/cas*{src:ch}{tgt:en}.txt', ['zh-en', 'zh-en'] ),
661
+ ('casia*/cas*{src:ch}{tgt:en}.txt', ['zh-en', 'zh-en'] ),
662
+ ('dataum*/Book*{src:cn}{tgt:en}.txt', ['zh-en', 'zh-en']),
663
+ ('neu*/NEU*{src:cn}{tgt:en}.txt', ['zh-en', 'zh-en'] ),
664
+ ('*/*UNv1.0.en-zh.{src:zh}{tgt:en}', ['zh-en']),
665
+ ('training/*news-commentary-v12.{src}-{tgt}.{lang}', ['zh-en', ]),
666
+
667
+ ('*/*europarl-v8.{src}-{tgt}.{lang}', ['fi-en', 'lv-en']),
668
+ ('wiki/fi-en/titles.{src}-{tgt}.{lang}', ['fi-en', ]),
669
+ ('rapid2016.{tgt}-{src}.{lang}', ['fi-en', 'lv-en']),
670
+ ('*/leta.{lang}', ['lv-en']),
671
+ ('*/dcep.{lang}', ['lv-en']),
672
+ ('*/farewell.{lang}', ['lv-en']),
673
+ ('bitext.{lang}', ['tr-en']),
674
+ ] ,
675
+ valid_files_patterns=[
676
+ ('dev/newsdev2017*{src}{tgt}-{src:src}{tgt:ref}.{lang}',
677
+ [
678
+ 'fi-en', 'lv-en', 'tr-en', 'zh-en',
679
+ 'en-fi', 'en-lv', 'en-tr', 'en-zh'
680
+ ]),
681
+ ('dev/newstest2016*{src}{tgt}-{src:src}{tgt:ref}.{lang}',
682
+ [
683
+ 'fi-en', 'tr-en',
684
+ 'en-fi', 'en-tr',
685
+ ]),
686
+ ],
687
+ test_files_patterns=[
688
+ ('test/newstest2017-{src}{tgt}-{src:src}{tgt:ref}.{lang}',
689
+ [
690
+ 'fi-en', 'lv-en', 'tr-en',
691
+ 'en-fi', 'en-lv', 'en-tr',
692
+ ]),
693
+ ('newstest2017-{src}{tgt}-{src:src}{tgt:ref}.{lang}',
694
+ [
695
+ 'zh-en',
696
+ 'en-zh'
697
+ ]),
698
+ ],
699
+ )
700
+
701
+ czeng_instruction = 'download instruction at: http://ufal.mff.cuni.cz/czeng/czeng16'
702
+ #alternative: use the prepared data but detokenize it?
703
+ wmt18_cs_et_en_manual_downloads = [
704
+ #for cs, need to register and download; Register and download CzEng 1.6.
705
+ #Better results can be obtained by using a subset of sentences, released under a new version name CzEng 1.7.
706
+ # ((f'http://ufallab.ms.mff.cuni.cz/~bojar/czeng16-data/data-plaintext-format.{i}.tar',
707
+ # f'data-plaintext-format.{i}.tar'), czeng_instruction)
708
+ # for i in range(10)
709
+ ]
710
+
711
+ wmt18_cs_et_en = DLDataset(
712
+ name='wmt18_cs_et_en',
713
+ train_urls=[
714
+ 'http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz',
715
+ 'http://data.statmt.org/wmt18/translation-task/training-parallel-ep-v8.tgz',
716
+ 'https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-cs.zipporah0-dedup-clean.tgz',
717
+ 'https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-et.zipporah0-dedup-clean.tgz',
718
+ 'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz',
719
+ 'http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz',
720
+ ('http://data.statmt.org/wmt18/translation-task/rapid2016.tgz', 'wmt18_rapid2016.tgz'),
721
+ # (tuple(
722
+ # (f'http://ufallab.ms.mff.cuni.cz/~bojar/czeng16-data/data-plaintext-format.{i}.tar',
723
+ # f'data-plaintext-format.{i}.tar')
724
+ # for i in range(10)
725
+ # ),
726
+ # 'czeng16_data_plaintext.gz.tar'),
727
+ ],
728
+ valid_urls=[
729
+ ('http://data.statmt.org/wmt18/translation-task/dev.tgz', 'wmt18_dev.tgz'),
730
+ ],
731
+ test_urls=[
732
+ ('http://data.statmt.org/wmt18/translation-task/test.tgz', 'wmt18_test.tgz'),
733
+ ],
734
+ train_files_patterns=[
735
+ # ('*/*europarl-v7.{src}-{tgt}.{lang}', ['cs-en']),
736
+ ('*/*europarl-v8.{src}-{tgt}.{lang}', ['et-en']),
737
+ # ('*paracrawl-release1.{tgt}-{src}.zipporah0-dedup-clean.{lang}', ['cs-en', 'et-en']),
738
+ ('*paracrawl-release1.{tgt}-{src}.zipporah0-dedup-clean.{lang}', ['et-en']),
739
+ # ('*commoncrawl.{src}-{tgt}.{lang}', ['cs-en']),
740
+ # ('*/news-commentary-v13.{src}-{tgt}.{lang}', ['cs-en']),
741
+ # ('data.plaintext-format/*train.{lang}', ['cs-en']),
742
+ ('rapid2016.{tgt}-{src}.{lang}', ['et-en']),
743
+ ] ,
744
+ valid_files_patterns=[
745
+ ('dev/newsdev2018*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['et-en']),
746
+ # ('dev/newstest2017*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['cs-en'])
747
+ ],
748
+ test_files_patterns=[
749
+ ('test/newstest2018-{src}{tgt}-{src:src}{tgt:ref}.{lang}',
750
+ # ['cs-en', 'et-en']),
751
+ ['et-en']),
752
+ ]
753
+ )
754
+
755
+ ru_en_yandex_instruction = 'Yandex Corpus download instruction at: https://translate.yandex.ru/corpus?lang=en'
756
+ wmt19_ru_gu_kk_lt_manual_downloads = [
757
+ (('https://translate.yandex.ru/corpus?lang=en', 'wmt19_1mcorpus.zip'), ru_en_yandex_instruction)
758
+ ]
759
+ wmt19_ru_gu_kk_lt = DLDataset(
760
+ name='wmt19_ru_gu_kk_lt',
761
+ train_urls=[
762
+ 'http://www.statmt.org/europarl/v9/training/europarl-v9.lt-en.tsv.gz',
763
+ 'https://s3.amazonaws.com/web-language-models/paracrawl/release3/en-lt.bicleaner07.tmx.gz',
764
+ 'https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz',
765
+ 'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz',
766
+ 'http://data.statmt.org/news-commentary/v14/training/news-commentary-v14-wmt19.en-kk.tsv.gz',
767
+ 'http://data.statmt.org/news-commentary/v14/training/news-commentary-v14.en-ru.tsv.gz',
768
+ 'http://data.statmt.org/wikititles/v1/wikititles-v1.kk-en.tsv.gz',
769
+ 'http://data.statmt.org/wikititles/v1/wikititles-v1.ru-en.tsv.gz',
770
+ 'http://data.statmt.org/wikititles/v1/wikititles-v1.kk-en.tsv.gz',
771
+ 'http://data.statmt.org/wikititles/v1/wikititles-v1.lt-en.tsv.gz',
772
+ 'http://data.statmt.org/wikititles/v1/wikititles-v1.gu-en.tsv.gz',
773
+ (('https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.00',
774
+ 'https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.01',
775
+ 'https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.02',),
776
+ 'wmt19_UNv1.0.en-ru.tar.gz'),
777
+ 'https://tilde-model.s3-eu-west-1.amazonaws.com/rapid2016.en-lt.tmx.zip',
778
+ ('https://translate.yandex.ru/corpus?lang=en', 'wmt19_1mcorpus.zip'),
779
+ ],
780
+ valid_urls=[
781
+ ('http://data.statmt.org/wmt19/translation-task/dev.tgz', 'wmt19_dev.tgz'),
782
+ ],
783
+ test_urls=[
784
+ ('http://data.statmt.org/wmt19/translation-task/test.tgz', 'wmt19_test.tgz'),
785
+ ],
786
+ train_files_patterns=[
787
+ ('*europarl-v9.{src}-{tgt}.tsv.{lang}', ['lt-en']),
788
+ #paracrawl
789
+ ('*paracrawl-release1.{tgt}-{src}.zipporah0-dedup-clean.{lang}', ['ru-en']),
790
+ ('bitext.{lang}', ['lt-en',]),
791
+ ('*commoncrawl.{src}-{tgt}.{lang}', ['ru-en',]),
792
+ ('*news-commentary-v14-wmt19.{tgt}-{src}.tsv.{lang}', ['kk-en', ]),
793
+ ('*news-commentary-v14.{tgt}-{src}.tsv.{lang}', ['ru-en']),
794
+ #yandex
795
+ ('corpus.{tgt}_{src}.1m.{lang}', ['ru-en']),
796
+ ('wikititles_v1_wikititles-v1.{src}-{tgt}.tsv.{lang}', ['ru-en', 'kk-en', 'lt-en', 'gu-en']),
797
+ ('*/UNv1.0.{tgt}-{src}.{lang}', ['ru-en']),
798
+ #rapid
799
+ ('bitext.{lang}', ['lt-en'])
800
+ ],
801
+ valid_files_patterns=[
802
+ ('dev/newsdev2019*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['gu-en', 'kk-en', 'lt-en']),
803
+ ('dev/newstest2018*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['ru-en']),
804
+ ],
805
+ test_files_patterns=[
806
+ ('sgm/newstest2019-{src}{tgt}-{src:src}{tgt:ref}.{lang}',
807
+ ['ru-en', 'gu-en', 'kk-en', 'lt-en', 'en-ru', 'en-gu', 'en-kk', 'en-lt']),
808
+ ]
809
+ )
810
+
811
+
812
+ #########
813
+
814
+ if __name__ == "__main__":
815
+ # speed up the downloads with multiprocessing
816
+ dl_folder = f'{to_data_path}/downloads'
817
+ extract_folder = f'{to_data_path}/extracted'
818
+
819
+ urls = [
820
+ url
821
+ for dataset in [wmt13_es_en, wmt14_de_fr_en, wmt16_ro_en, wmt18_cs_et_en, wmt19_ru_gu_kk_lt]
822
+ for urls in [dataset.train_urls, dataset.valid_urls, dataset.test_urls]
823
+ for url in urls
824
+ ]
825
+ urls = set(urls)
826
+ download_multi(dl_folder, extract_folder, urls, num_processes=8, debug=True)
827
+
828
+ # check manual downloads
829
+ to_manually_download_urls = (
830
+ wmt17_fi_lv_tr_zh_en_manual_downloads + wmt18_cs_et_en_manual_downloads + wmt19_ru_gu_kk_lt_manual_downloads
831
+ )
832
+ to_be_manually_dowloaded = check_need_manual_downalod(dl_folder, to_manually_download_urls)
833
+ if len(to_be_manually_dowloaded) > 0:
834
+ print('Missing files that need to be downloaded manually; stop the process now.')
835
+ exit(-1)
836
+
837
+ completed_urls = {}
838
+ completed_extraction = {}
839
+ def work_on_wmt(directions, wmt_data):
840
+ download_and_extract(
841
+ to_data_path,
842
+ directions,
843
+ wmt_data,
844
+ to_manually_download_urls=to_manually_download_urls,
845
+ completed_urls=completed_urls, completed_extraction=completed_extraction, debug=True)
846
+
847
+ work_on_wmt(
848
+ ['es_XX-en_XX'],
849
+ wmt13_es_en,)
850
+ work_on_wmt(
851
+ [
852
+ 'fr_XX-en_XX', 'en_XX-fr_XX',
853
+ # 'en_XX-de_DE', 'de_DE-en_XX',
854
+ ],
855
+ wmt14_de_fr_en,)
856
+ work_on_wmt(
857
+ ['ro_RO-en_XX', 'en_XX-ro_RO'],
858
+ wmt16_ro_en,)
859
+ work_on_wmt(
860
+ [
861
+ # 'zh_CN-en_XX',
862
+ 'lv_LV-en_XX', 'fi_FI-en_XX', 'tr_TR-en_XX',
863
+ #in case the reversed directions have different train/valid/test data
864
+ # 'en_XX-zh_CN',
865
+ 'en_XX-lv_LV', 'en_XX-fi_FI', 'en_XX-tr_TR',
866
+ ],
867
+ wmt17_fi_lv_tr_zh_en, )
868
+ # czeng17_script_path = download_czeng17_script(download_to, extract_to, debug=False)
869
+ # cz_username = None
870
+ work_on_wmt(
871
+ [
872
+ # 'cs_CZ-en_XX',
873
+ 'et_EE-en_XX'],
874
+ wmt18_cs_et_en,)
875
+ work_on_wmt(
876
+ [
877
+ # 'ru_RU-en_XX', 'en_XX-ru_RU',
878
+ 'gu_IN-en_XX', 'kk_KZ-en_XX', 'lt_LT-en_XX',
879
+ #in case the reversed directions have different train/valid/test data
880
+ 'en_XX-gu_IN', 'en_XX-kk_KZ', 'en_XX-lt_LT'
881
+ ],
882
+ wmt19_ru_gu_kk_lt,)
883
+
884
+ not_matching = check_wmt_test_bleu(
885
+ f'{to_data_path}/raw',
886
+ [
887
+ ('wmt13', ['es_XX-en_XX']),
888
+ ('wmt14/full', ['fr_XX-en_XX',]),
889
+ ('wmt16', ['ro_RO-en_XX',]),
890
+ # ('wmt17/improved', ['zh_CN-en_XX']),
891
+ ('wmt17', [ 'lv_LV-en_XX', 'fi_FI-en_XX', 'tr_TR-en_XX']),
892
+ ('wmt18', ['cs_CZ-en_XX', 'et_EE-en_XX']),
893
+ ('wmt19', ['gu_IN-en_XX', 'kk_KZ-en_XX', 'lt_LT-en_XX']),
894
+ #'ru_RU-en_XX',
895
+ ]
896
+ )
897
+ if len(not_matching) > 0:
898
+ print('the following datasets do not have matching test datasets:\n\t', '\n\t'.join(not_matching))
899
+
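For reference, the {src:...} / {tgt:...} markers that appear throughout the file patterns above are expanded by extracted_glob only on the matching side of the language pair, while the plain {src}/{tgt}/{lang} placeholders are filled in directly. The snippet below is a minimal, self-contained sketch of that expansion; the helper name expand_pattern is introduced only for this illustration and is not part of the script.

import re

def expand_pattern(file_pattern, src, tgt, lang):
    # mirrors get_matching_pattern inside extracted_glob: keep the text in
    # {src:...} only when lang is the source side, {tgt:...} only when lang is
    # the target side, then fill in the plain {src}/{tgt}/{lang} placeholders
    params = {
        k: v
        for k, v in [('src', src), ('tgt', tgt), ('lang', lang)]
        if '{' + k + '}' in file_pattern
    }
    file_pattern = re.sub(r'{src:(.*?)}', r'\1' if lang == src else '', file_pattern)
    file_pattern = re.sub(r'{tgt:(.*?)}', r'\1' if lang == tgt else '', file_pattern)
    return file_pattern.format(**params)

# the wmt14 test pattern resolves differently for the source and reference sides of fr-en:
pattern = 'test-full/newstest*{src}{tgt}-{src:src}{tgt:ref}.{lang}'
print(expand_pattern(pattern, 'fr', 'en', 'fr'))  # test-full/newstest*fren-src.fr
print(expand_pattern(pattern, 'fr', 'en', 'en'))  # test-full/newstest*fren-ref.en

The expanded strings are then globbed under each extracted folder by concat_files, which is how the source and reference sides of the sgm-derived test files end up in separate concatenated outputs.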
fairseq/examples/multilingual/data_scripts/download_wmt20.sh ADDED
@@ -0,0 +1,547 @@
1
+ #!/bin/bash
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ if [ -z $WORKDIR_ROOT ] ;
9
+ then
10
+ echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exiting..."
11
+ exit
12
+ fi
13
+
14
+
15
+
16
+ set -x -e
17
+
18
+ # TODO update the workdir and dest dir name
19
+ # put fasttext model
20
+ WORKDIR=$WORKDIR_ROOT
21
+ # put intermediate files
22
+ TMP_DIR=$WORKDIR_ROOT/tmp/tmp_wmt20_lowres_download
23
+ # output {train,valid,test} files to dest
24
+ DEST=$WORKDIR_ROOT/ML50/raw
25
+
26
+ UTILS=$PWD/utils
27
+
28
+ # per dataset locations
29
+ COMMONCRAWL_DIR=$TMP_DIR/commoncrawl
30
+ YANDEX_CORPUS=$WORKDIR_ROOT/wmt20/official/ru/yandex/1mcorpus.zip
31
+ # unzipped
32
+ CZENG_CORPUS=$WORKDIR_ROOT/wmt20/official/cs/czeng/czeng20-train
33
+ CCMT_DIR=$WORKDIR_ROOT/wmt20/official/zh/ccmt/parallel
34
+
35
+ download_and_select() {
36
+ SUBFOLDER=$1
37
+ URL=$2
38
+ UNCOMPRESS_CMD=$3
39
+ LANG=$4
40
+ INPUT_FILEPATH=$5
41
+ if [[ $# -gt 5 ]]; then
42
+ LANG_COL=$6
43
+ EN_COL=$7
44
+ fi
45
+
46
+ mkdir -p $SUBFOLDER
47
+ cd $SUBFOLDER
48
+ wget -nc --content-disposition $URL
49
+ $UNCOMPRESS_CMD
50
+
51
+ if [[ $# -gt 5 ]]; then
52
+ cut -f$LANG_COL $INPUT_FILEPATH > $INPUT_FILEPATH.$LANG
53
+ cut -f$EN_COL $INPUT_FILEPATH > $INPUT_FILEPATH.en
54
+ fi
55
+ cd ..
56
+
57
+ ln -sf $SUBFOLDER/$INPUT_FILEPATH.$LANG $SUBFOLDER.$LANG
58
+ ln -sf $SUBFOLDER/$INPUT_FILEPATH.en $SUBFOLDER.en
59
+ }
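+ # Usage note for download_and_select (derived from the function above):
+ #   download_and_select SUBFOLDER URL UNCOMPRESS_CMD LANG INPUT_FILEPATH [LANG_COL EN_COL]
+ # SUBFOLDER is the directory the corpus is downloaded and unpacked into, and also the prefix of
+ # the SUBFOLDER.$LANG / SUBFOLDER.en symlinks picked up by the later "find ... | xargs cat" step.
+ # UNCOMPRESS_CMD runs inside SUBFOLDER; INPUT_FILEPATH is the extracted bitext path relative to
+ # SUBFOLDER, so the symlinks point at INPUT_FILEPATH.$LANG and INPUT_FILEPATH.en.
+ # When LANG_COL / EN_COL are given, INPUT_FILEPATH is treated as a tsv and the two columns are
+ # cut into those files first (e.g. prepare_ja passes 2 1 for news-commentary-v15.en-ja.tsv,
+ # which stores English in column 1 and Japanese in column 2).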
60
+
61
+ prepare_lid() {
62
+ pip install fasttext
63
+
64
+ # TODO specify global workdir
65
+ MODEL=$WORKDIR/fasttext/lid.176.bin
66
+ LID_MULTI=$UTILS/fasttext_multi_filter.py
67
+
68
+ if [ ! -f "$MODEL" ]; then
69
+ echo "downloading fasttext lid model..."
70
+ mkdir -p $WORKDIR/fasttext
71
+ wget -nc https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -O $MODEL
72
+ fi
73
+ }
74
+
75
+ prepare_moses() {
76
+ pushd $UTILS
77
+ echo 'Cloning Moses github repository (for tokenization scripts)...'
78
+ git clone https://github.com/moses-smt/mosesdecoder.git
79
+ popd
80
+ }
81
+
82
+ lid_filter() {
83
+ # TODO specify global workdir
84
+ MODEL=$WORKDIR/fasttext/lid.176.bin
85
+ LID_MULTI=$UTILS/fasttext_multi_filter.py
86
+
87
+ prepare_lid
88
+
89
+ SRC=$1
90
+ SRC_FILE=$2
91
+ SRC_OUTPUT=$3
92
+ TGT=$4
93
+ TGT_FILE=$5
94
+ TGT_OUTPUT=$6
95
+ python $LID_MULTI --model $MODEL --inputs $SRC_FILE $TGT_FILE --langs $SRC $TGT --outputs $SRC_OUTPUT $TGT_OUTPUT
96
+ }
97
+
98
+ prepare_ja_ted() {
99
+ mkdir -p ted
100
+ cd ted
101
+
102
+ wget -nc https://wit3.fbk.eu/archive/2017-01-trnted//texts/en/ja/en-ja.tgz
103
+ tar -zxvf en-ja.tgz
104
+ cat en-ja/train.tags.en-ja.en | grep -v -P "^[ ]*\<" | sed 's/^[ \t]*//g' | sed 's/[ \t]*$//g' > en-ja/train.en-ja.en
105
+ cat en-ja/train.tags.en-ja.ja | grep -v -P "^[ ]*\<" | sed 's/^[ \t]*//g' | sed 's/[ \t]*$//g' > en-ja/train.en-ja.ja
106
+
107
+ cd ..
108
+ ln -sf ted/en-ja/train.en-ja.ja ted.ja
109
+ ln -sf ted/en-ja/train.en-ja.en ted.en
110
+ }
111
+
112
+ prepare_ja() {
113
+ OUTPUT_DIR=$TMP_DIR/ja
114
+ mkdir -p $OUTPUT_DIR
115
+ cd $OUTPUT_DIR
116
+
117
+ download_and_select paracrawl "http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/release/2.0/bitext/en-ja.tar.gz" "tar -zxvf en-ja.tar.gz" ja en-ja/en-ja.bicleaner05.txt 4 3 &
118
+ download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.en-ja.tsv.gz" "gunzip -f news-commentary-v15.en-ja.tsv.gz" ja news-commentary-v15.en-ja.tsv 2 1 &
119
+ download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.ja-en.tsv.gz" "gunzip -f wikititles-v2.ja-en.tsv.gz" ja wikititles-v2.ja-en.tsv 1 2 &
120
+ download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-ja.langid.tsv.gz" "gunzip -f WikiMatrix.v1.en-ja.langid.tsv.gz" ja WikiMatrix.v1.en-ja.langid.tsv 3 2 &
121
+ download_and_select subtitle "https://nlp.stanford.edu/projects/jesc/data/split.tar.gz" "tar -zxvf split.tar.gz" ja split/train 2 1 &
122
+ download_and_select kftt "http://www.phontron.com/kftt/download/kftt-data-1.0.tar.gz" "tar -zxvf kftt-data-1.0.tar.gz" ja kftt-data-1.0/data/orig/kyoto-train &
123
+
124
+ prepare_ja_ted &
125
+
126
+ # ted data needs to
127
+
128
+ wait
129
+
130
+ # remove previous results
131
+ rm -f all.??
132
+ find ./ -maxdepth 1 -name "*.ja" | sort -V | xargs cat > all.ja
133
+ find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
134
+ lid_filter ja all.ja $DEST/train.ja_XX-en_XX.ja_XX en all.en $DEST/train.ja_XX-en_XX.en_XX
135
+ }
136
+
137
+ prepare_ta() {
138
+ OUTPUT_DIR=$TMP_DIR/ta
139
+ mkdir -p $OUTPUT_DIR
140
+ cd $OUTPUT_DIR
141
+
142
+ download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.ta-en.tsv.gz" "gunzip -f wikititles-v2.ta-en.tsv.gz" ta wikititles-v2.ta-en.tsv 1 2 &
143
+ download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-ta.langid.tsv.gz" "gunzip -f WikiMatrix.v1.en-ta.langid.tsv.gz" ta WikiMatrix.v1.en-ta.langid.tsv 3 2 &
144
+ download_and_select pmindia "http://data.statmt.org/pmindia/v1/parallel/pmindia.v1.ta-en.tsv" "" ta pmindia.v1.ta-en.tsv 2 1 &
145
+ download_and_select tanzil "https://object.pouta.csc.fi/OPUS-Tanzil/v1/moses/en-ta.txt.zip" "unzip en-ta.txt.zip" ta Tanzil.en-ta &
146
+ download_and_select pib "http://preon.iiit.ac.in/~jerin/resources/datasets/pib-v0.tar" "tar -xvf pib-v0.tar" ta pib/en-ta/train &
147
+ download_and_select mkb "http://preon.iiit.ac.in/~jerin/resources/datasets/mkb-v0.tar" "tar -xvf mkb-v0.tar" ta mkb/en-ta/mkb &
148
+ download_and_select ufal "http://ufal.mff.cuni.cz/~ramasamy/parallel/data/v2/en-ta-parallel-v2.tar.gz" "tar -zxvf en-ta-parallel-v2.tar.gz" ta en-ta-parallel-v2/corpus.bcn.train &
149
+
150
+ wait
151
+
152
+ # need special handling for nlpc
153
+ mkdir -p nlpc
154
+ cd nlpc
155
+ wget -nc https://raw.githubusercontent.com/nlpc-uom/English-Tamil-Parallel-Corpus/master/En-Ta%20Corpus/En-Ta%20English.txt
156
+ wget -nc https://github.com/nlpc-uom/English-Tamil-Parallel-Corpus/raw/master/En-Ta%20Corpus/En-Ta%20Tamil.txt
157
+ tail -n +4 "En-Ta English.txt" > en-ta.en
158
+ tail -n +4 "En-Ta Tamil.txt" > en-ta.ta
159
+ cd ..
160
+ ln -sf nlpc/en-ta.en nlpc.en
161
+ ln -sf nlpc/en-ta.ta nlpc.ta
162
+
163
+ # remove previous results
164
+ rm -f all.??
165
+ find ./ -maxdepth 1 -name "*.ta" | sort -V | xargs cat > all.ta
166
+ find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
167
+ lid_filter ta all.ta $DEST/train.ta_IN-en_XX.ta_IN en all.en $DEST/train.ta_IN-en_XX.en_XX
168
+ }
169
+
170
+ prepare_iu() {
171
+ OUTPUT_DIR=$TMP_DIR/iu
172
+ mkdir -p $OUTPUT_DIR
173
+ cd $OUTPUT_DIR
174
+
175
+ download_and_select nh "https://nrc-digital-repository.canada.ca/eng/view/dataset/?id=c7e34fa7-7629-43c2-bd6d-19b32bf64f60" "tar -zxvf Nunavut-Hansard-Inuktitut-English-Parallel-Corpus-3.0.1.tgz" iu Nunavut-Hansard-Inuktitut-English-Parallel-Corpus-3.0/NunavutHansard > /dev/null &
176
+ download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.iu-en.tsv.gz" "gunzip -f wikititles-v2.iu-en.tsv.gz" iu wikititles-v2.iu-en.tsv 1 2 &
177
+
178
+ wait
179
+
180
+ # remove previous results
181
+ rm -f all.??
182
+ find ./ -maxdepth 1 -name "*.iu" | sort -V | xargs cat | nh/Nunavut-Hansard-Inuktitut-English-Parallel-Corpus-3.0/scripts/normalize-iu-spelling.pl > all.iu
183
+ find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
184
+ paste all.iu all.en | awk -F $'\t' '$1!=""&&$2!=""' > all.iuen
185
+ cut -f1 all.iuen > $DEST/train.iu_CA-en_XX.iu_CA
186
+ cut -f2 all.iuen > $DEST/train.iu_CA-en_XX.en_XX
187
+ }
188
+
189
+ prepare_km() {
190
+ OUTPUT_DIR=$TMP_DIR/km
191
+ mkdir -p $OUTPUT_DIR
192
+ cd $OUTPUT_DIR
193
+
194
+ download_and_select paracrawl "http://data.statmt.org/wmt20/translation-task/ps-km/wmt20-sent.en-km.xz" "unxz wmt20-sent.en-km.xz" km wmt20-sent.en-km 2 1 &
195
+
196
+ # km-parallel has multiple sets, concat all of them together
197
+ mkdir -p opus
198
+ cd opus
199
+ wget -nc "http://data.statmt.org/wmt20/translation-task/ps-km/km-parallel.tgz"
200
+ tar -zxvf km-parallel.tgz
201
+ find ./km-parallel -maxdepth 1 -name "*.km" | sort -V | xargs cat > opus.km
202
+ find ./km-parallel -maxdepth 1 -name "*.en" | sort -V | xargs cat > opus.en
203
+ cd ..
204
+ ln -sf opus/opus.km .
205
+ ln -sf opus/opus.en .
206
+
207
+ wait
208
+
209
+ # remove previous results
210
+ rm -f all.??
211
+ find ./ -maxdepth 1 -name "*.km" | sort -V | xargs cat > all.km
212
+ find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
213
+ lid_filter km all.km $DEST/train.km_KH-en_XX.km_KH en all.en $DEST/train.km_KH-en_XX.en_XX
214
+ }
215
+
216
+ prepare_ps() {
217
+ OUTPUT_DIR=$TMP_DIR/ps
218
+ mkdir -p $OUTPUT_DIR
219
+ cd $OUTPUT_DIR
220
+
221
+ download_and_select paracrawl "http://data.statmt.org/wmt20/translation-task/ps-km/wmt20-sent.en-ps.xz" "unxz wmt20-sent.en-ps.xz" ps wmt20-sent.en-ps 2 1 &
222
+ download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.ps-en.tsv.gz" "gunzip -f wikititles-v2.ps-en.tsv.gz" ps wikititles-v2.ps-en.tsv 1 2 &
223
+ # ps-parallel has multiple sets, concat all of them together
224
+ mkdir -p opus
225
+ cd opus
226
+ wget -nc "http://data.statmt.org/wmt20/translation-task/ps-km/ps-parallel.tgz"
227
+ tar -zxvf ps-parallel.tgz
228
+ find ./ps-parallel -maxdepth 1 -name "*.ps" | sort -V | xargs cat > opus.ps
229
+ find ./ps-parallel -maxdepth 1 -name "*.en" | sort -V | xargs cat > opus.en
230
+ cd ..
231
+ ln -sf opus/opus.ps opus.ps
232
+ ln -sf opus/opus.en opus.en
233
+
234
+ wait
235
+
236
+ # remove previous results
237
+ rm -f all.??
238
+ find ./ -maxdepth 1 -name "*.ps" | sort -V | xargs cat > all.ps
239
+ find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
240
+ lid_filter ps all.ps $DEST/train.ps_AF-en_XX.ps_AF en all.en $DEST/train.ps_AF-en_XX.en_XX
241
+ }
242
+
243
+ download_commoncrawl() {
244
+ mkdir -p $COMMONCRAWL_DIR
245
+ cd $COMMONCRAWL_DIR
246
+
247
+ wget -nc "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz"
248
+ tar -zxvf training-parallel-commoncrawl.tgz
249
+ }
250
+ link_commoncrawl() {
251
+ LANG=$1
252
+ ln -sf $COMMONCRAWL_DIR/commoncrawl.$LANG-en.en commoncrawl.en
253
+ ln -sf $COMMONCRAWL_DIR/commoncrawl.$LANG-en.$LANG commoncrawl.$LANG
254
+ }
255
+
256
+ strip_xlf() {
257
+ INPUT_FILE=$1
258
+ SRC=$2
259
+ TGT=$3
260
+ grep '<source xml:lang=' $INPUT_FILE | sed 's/^<[^<>]*>//g' | sed 's/<[^<>]*>$//g' > $INPUT_FILE.$SRC
261
+ grep '<target xml:lang=' $INPUT_FILE | sed 's/^<[^<>]*>//g' | sed 's/<[^<>]*>$//g' > $INPUT_FILE.$TGT
262
+ }
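+ # Illustration of what strip_xlf extracts, assuming the RAPID xlf keeps each segment on one line
+ # (the German/English strings below are made-up examples):
+ #   <source xml:lang="de">Guten Tag</source>  ->  Guten Tag   (written to $INPUT_FILE.$SRC)
+ #   <target xml:lang="en">Good day</target>   ->  Good day    (written to $INPUT_FILE.$TGT)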
263
+
264
+ download_and_process_tilde() {
265
+ URL=$1
266
+ UNCOMPRESS_CMD=$2
267
+ FILENAME=$3
268
+ LANG=$4
269
+ PROCESS_CMD=$5
270
+
271
+ mkdir -p tilde
272
+ cd tilde
273
+ wget -nc $URL
274
+ $UNCOMPRESS_CMD
275
+ echo "executing cmd"
276
+ echo $PROCESS_CMD
277
+ $PROCESS_CMD
278
+ cd ..
279
+ ln -sf tilde/$FILENAME.$LANG tilde.$LANG
280
+ ln -sf tilde/$FILENAME.en tilde.en
281
+ }
282
+
283
+ prepare_cs() {
284
+ OUTPUT_DIR=$TMP_DIR/cs
285
+ mkdir -p $OUTPUT_DIR
286
+ cd $OUTPUT_DIR
287
+
288
+ #download_and_select europarl "http://www.statmt.org/europarl/v10/training/europarl-v10.cs-en.tsv.gz" "gunzip europarl-v10.cs-en.tsv.gz" cs europarl-v10.cs-en.tsv 1 2 &
289
+ #download_and_select paracrawl "https://s3.amazonaws.com/web-language-models/paracrawl/release5.1/en-cs.txt.gz" "gunzip en-cs.txt.gz" cs en-cs.txt 2 1 &
290
+ #link_commoncrawl cs
291
+ #download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.cs-en.tsv.gz" "gunzip news-commentary-v15.cs-en.tsv.gz" cs news-commentary-v15.cs-en.tsv 1 2 &
292
+ #download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.cs-en.tsv.gz" "gunzip wikititles-v2.cs-en.tsv.gz" cs wikititles-v2.cs-en.tsv 1 2 &
293
+ #download_and_process_tilde "http://data.statmt.org/wmt20/translation-task/rapid/RAPID_2019.cs-en.xlf.gz" "gunzip RAPID_2019.cs-en.xlf.gz" RAPID_2019.cs-en.xlf cs "strip_xlf RAPID_2019.cs-en.xlf cs en" &
294
+ #download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.cs-en.langid.tsv.gz" "gunzip WikiMatrix.v1.cs-en.langid.tsv.gz" cs WikiMatrix.v1.cs-en.langid.tsv 2 3 &
295
+
296
+ #wait
297
+
298
+ # remove previous results
299
+ #rm -f all.??
300
+ #find ./ -maxdepth 1 -name "*.cs" | sort -V | xargs cat > all.cs
301
+ #find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
302
+ if [ ! -f "$CZENG_CORPUS" ] ;
303
+ then
304
+ echo "Please download the CzEng corpus manually and place it at $CZENG_CORPUS. Exiting..."
305
+ exit
306
+ fi
307
+ cat $CZENG_CORPUS | sed '/^$/d' | cut -f5 > all.cs
308
+ cat $CZENG_CORPUS | sed '/^$/d' | cut -f6 > all.en
309
+
310
+ lid_filter cs all.cs $DEST/train.cs_CZ-en_XX.cs_CZ en all.en $DEST/train.cs_CZ-en_XX.en_XX
311
+ }
312
+
313
+ prepare_de() {
314
+ OUTPUT_DIR=$TMP_DIR/de
315
+ mkdir -p $OUTPUT_DIR
316
+ cd $OUTPUT_DIR
317
+
318
+ download_and_select europarl "http://www.statmt.org/europarl/v10/training/europarl-v10.de-en.tsv.gz" "gunzip europarl-v10.de-en.tsv.gz" de europarl-v10.de-en.tsv 1 2 &
319
+ download_and_select paracrawl "https://s3.amazonaws.com/web-language-models/paracrawl/release5.1/en-de.txt.gz" "gunzip en-de.txt.gz" de en-de.txt 2 1 &
320
+ link_commoncrawl de
321
+ download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.de-en.tsv.gz" "gunzip news-commentary-v15.de-en.tsv.gz" de news-commentary-v15.de-en.tsv 1 2 &
322
+ download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.de-en.tsv.gz" "gunzip wikititles-v2.de-en.tsv.gz" de wikititles-v2.de-en.tsv 1 2 &
323
+ download_and_process_tilde "http://data.statmt.org/wmt20/translation-task/rapid/RAPID_2019.de-en.xlf.gz" "gunzip RAPID_2019.de-en.xlf.gz" RAPID_2019.de-en.xlf de "strip_xlf RAPID_2019.de-en.xlf de en" &
324
+ download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.de-en.langid.tsv.gz" "gunzip WikiMatrix.v1.de-en.langid.tsv.gz" de WikiMatrix.v1.de-en.langid.tsv 2 3 &
325
+
326
+ wait
327
+
328
+ # remove previous results
329
+ rm -f all.??
330
+ find ./ -maxdepth 1 -name "*.de" | sort -V | xargs cat > all.de
331
+ find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
332
+ lid_filter de all.de $DEST/train.de_DE-en_XX.de_DE en all.en $DEST/train.de_DE-en_XX.en_XX
333
+ }
334
+
335
+ prepare_tmx() {
336
+ TMX_FILE=$1
337
+ git clone https://github.com/amake/TMX2Corpus $UTILS/tmx2corpus
338
+ pip install tinysegmenter
339
+
340
+ python $UTILS/tmx2corpus/tmx2corpus.py $TMX_FILE
341
+ }
342
+
343
+ prepare_pl() {
344
+ OUTPUT_DIR=$TMP_DIR/pl
345
+ mkdir -p $OUTPUT_DIR
346
+ cd $OUTPUT_DIR
347
+
348
+ # download_and_select europarl "http://www.statmt.org/europarl/v10/training/europarl-v10.pl-en.tsv.gz" "gunzip europarl-v10.pl-en.tsv.gz" pl europarl-v10.pl-en.tsv 1 2 &
349
+ # download_and_select paracrawl "https://s3.amazonaws.com/web-language-models/paracrawl/release5.1/en-pl.txt.gz" "gunzip en-pl.txt.gz" pl en-pl.txt 2 1 &
350
+ # download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.pl-en.tsv.gz" "gunzip wikititles-v2.pl-en.tsv.gz" pl wikititles-v2.pl-en.tsv 1 2 &
351
+ download_and_select tilde "https://tilde-model.s3-eu-west-1.amazonaws.com/rapid2019.en-pl.tmx.zip" "gunzip rapid2019.en-pl.tmx.zip" bitext pl "prepare_tmx RAPID_2019.UNIQUE.en-pl.tmx" &
352
+ # download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-pl.langid.tsv.gz" "gunzip WikiMatrix.v1.en-pl.langid.tsv.gz" pl WikiMatrix.v1.en-pl.langid.tsv 3 2 &
353
+
354
+ wait
355
+
356
+ # remove previous results
357
+ rm -f all.??
358
+ find ./ -maxdepth 1 -name "*.pl" | sort -V | xargs cat > all.pl
359
+ find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
360
+ lid_filter pl all.pl $DEST/train.pl_PL-en_XX.pl_PL en all.en $DEST/train.pl_PL-en_XX.en_XX
361
+ }
362
+
363
+ prepare_uncorpus() {
364
+ URLS=$1
365
+ FILES=$2
366
+
367
+ mkdir -p uncorpus
368
+ cd uncorpus
369
+
370
+ for URL in $URLS; do
371
+ wget -nc $URL
372
+ done
373
+ cat $FILES > uncorpus.tar.gz
374
+ tar -zxvf uncorpus.tar.gz
375
+
376
+ cd ..
377
+ ln -sf uncorpus/en-$LANG/UNv1.0.en-$LANG.$LANG uncorpus.$LANG
378
+ ln -sf uncorpus/en-$LANG/UNv1.0.en-$LANG.en uncorpus.en
379
+ }
380
+
381
+ prepare_yandex() {
382
+ mkdir -p yandex
383
+ cd yandex
384
+ unzip $YANDEX_CORPUS -d ./
385
+ cd ..
386
+ ln -s yandex/corpus.en_ru.1m.en yandex.en
387
+ ln -s yandex/corpus.en_ru.1m.ru yandex.ru
388
+ }
389
+
390
+ prepare_ru() {
391
+ OUTPUT_DIR=$TMP_DIR/ru
392
+ mkdir -p $OUTPUT_DIR
393
+ cd $OUTPUT_DIR
394
+
395
+ download_and_select paracrawl "https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz" "tar -zxvf paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz" ru paracrawl-release1.en-ru.zipporah0-dedup-clean &
396
+ link_commoncrawl ru
397
+ download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.en-ru.tsv.gz" "gunzip news-commentary-v15.en-ru.tsv.gz" ru news-commentary-v15.en-ru.tsv 2 1 &
398
+ prepare_yandex &
399
+ download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.ru-en.tsv.gz" "gunzip wikititles-v2.ru-en.tsv.gz" ru wikititles-v2.ru-en.tsv 1 2 &
400
+ prepare_uncorpus "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.00 https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.01 https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.02" "UNv1.0.en-ru.tar.gz.00 UNv1.0.en-ru.tar.gz.01 UNv1.0.en-ru.tar.gz.02" &
401
+ download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-ru.langid.tsv.gz" "gunzip WikiMatrix.v1.en-ru.langid.tsv.gz" ru WikiMatrix.v1.en-ru.langid.tsv 3 2 &
402
+
403
+ wait
404
+
405
+ # remove previous results
406
+ rm -f all.??
407
+ find ./ -maxdepth 1 -name "*.ru" | sort -V | xargs cat > all.ru
408
+ find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
409
+ lid_filter ru all.ru $DEST/train.ru_RU-en_XX.ru_RU en all.en $DEST/train.ru_RU-en_XX.en_XX
410
+ }
411
+
412
+ prepare_ccmt() {
413
+ mkdir -p ccmt
414
+ cd ccmt
415
+ # assume ccmt data is already unzipped under CCMT_DIR folder
416
+ cat $CCMT_DIR/datum2017/Book*_cn.txt | sed 's/ //g' > datum2017.detok.zh
417
+ cat $CCMT_DIR/datum2017/Book*_en.txt > datum2017.detok.en
418
+ cat $CCMT_DIR/casict2011/casict-A_ch.txt $CCMT_DIR/casict2011/casict-B_ch.txt $CCMT_DIR/casict2015/casict2015_ch.txt $CCMT_DIR/datum2015/datum_ch.txt $CCMT_DIR/neu2017/NEU_cn.txt datum2017.detok.zh > ccmt.zh
419
+ cat $CCMT_DIR/casict2011/casict-A_en.txt $CCMT_DIR/casict2011/casict-B_en.txt $CCMT_DIR/casict2015/casict2015_en.txt $CCMT_DIR/datum2015/datum_en.txt $CCMT_DIR/neu2017/NEU_en.txt datum2017.detok.en > ccmt.en
420
+ cd ..
421
+ ln -sf ccmt/ccmt.zh ccmt.zh
422
+ ln -sf ccmt/ccmt.en ccmt.en
423
+ }
424
+
425
+ prepare_zh() {
426
+ OUTPUT_DIR=$TMP_DIR/zh
427
+ mkdir -p $OUTPUT_DIR
428
+ cd $OUTPUT_DIR
429
+
430
+ download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.en-zh.tsv.gz" "gunzip news-commentary-v15.en-zh.tsv.gz" zh news-commentary-v15.en-zh.tsv 2 1 &
431
+ download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.zh-en.tsv.gz" "gunzip wikititles-v2.zh-en.tsv.gz" zh wikititles-v2.zh-en.tsv 1 2 &
432
+ prepare_uncorpus "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.00 https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.01" "UNv1.0.en-zh.tar.gz.00 UNv1.0.en-zh.tar.gz.01" &
433
+ prepare_ccmt &
434
+ download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-zh.langid.tsv.gz" "gunzip WikiMatrix.v1.en-zh.langid.tsv.gz" zh WikiMatrix.v1.en-zh.langid.tsv 3 2 &
435
+
436
+ wait
437
+
438
+ # remove previous results
439
+ rm -f all.??
440
+ find ./ -maxdepth 1 -name "*.zh" | sort -V | xargs cat > all.zh
441
+ find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
442
+ lid_filter zh all.zh $DEST/train.zh_CN-en_XX.zh_CN en all.en $DEST/train.zh_CN-en_XX.en_XX
443
+ }
444
+
445
+ prepare_tests() {
446
+ OUTPUT_DIR=$TMP_DIR
447
+ mkdir -p $OUTPUT_DIR
448
+ cd $OUTPUT_DIR
449
+ wget -nc http://data.statmt.org/wmt20/translation-task/dev.tgz
450
+ tar -zxvf dev.tgz
451
+ cd dev
452
+
453
+ cat newsdev2020-jaen-src.ja.sgm | $UTILS/strip_sgm.sh > newsdev2020-jaen.ja
454
+ cat newsdev2020-jaen-ref.en.sgm | $UTILS/strip_sgm.sh > newsdev2020-jaen.en
455
+ split newsdev2020-jaen.ja -a 0 -n r/1/2 > $DEST/valid.ja_XX-en_XX.ja_XX
456
+ split newsdev2020-jaen.en -a 0 -n r/1/2 > $DEST/valid.ja_XX-en_XX.en_XX
457
+ split newsdev2020-jaen.ja -a 0 -n r/2/2 > $DEST/test.ja_XX-en_XX.ja_XX
458
+ split newsdev2020-jaen.en -a 0 -n r/2/2 > $DEST/test.ja_XX-en_XX.en_XX
459
+
460
+ cat newsdev2020-iuen-src.iu.sgm | strip_sgm.sh > newsdev2020-iuen.iu
461
+ cat newsdev2020-iuen-ref.en.sgm | strip_sgm.sh > newsdev2020-iuen.en
462
+ split newsdev2020-iuen.iu -a 0 -n r/1/2 > $DEST/valid.iu_CA-en_XX.iu_CA
463
+ split newsdev2020-iuen.en -a 0 -n r/1/2 > $DEST/valid.iu_CA-en_XX.en_XX
464
+ split newsdev2020-iuen.iu -a 0 -n r/2/2 > $DEST/test.iu_CA-en_XX.iu_CA
465
+ split newsdev2020-iuen.en -a 0 -n r/2/2 > $DEST/test.iu_CA-en_XX.en_XX
466
+
467
+ cat newsdev2020-taen-src.ta.sgm | strip_sgm.sh > newsdev2020-taen.ta
468
+ cat newsdev2020-taen-ref.en.sgm | strip_sgm.sh > newsdev2020-taen.en
469
+ split newsdev2020-taen.ta -a 0 -n r/1/2 > $DEST/valid.ta_IN-en_XX.ta_IN
470
+ split newsdev2020-taen.en -a 0 -n r/1/2 > $DEST/valid.ta_IN-en_XX.en_XX
471
+ split newsdev2020-taen.ta -a 0 -n r/2/2 > $DEST/test.ta_IN-en_XX.ta_IN
472
+ split newsdev2020-taen.en -a 0 -n r/2/2 > $DEST/test.ta_IN-en_XX.en_XX
473
+
474
+ cp wikipedia.dev.km-en.km $DEST/valid.km_KH-en_XX.km_KH
475
+ cp wikipedia.dev.km-en.en $DEST/valid.km_KH-en_XX.en_XX
476
+ cp wikipedia.devtest.km-en.km $DEST/test.km_KH-en_XX.km_KH
477
+ cp wikipedia.devtest.km-en.en $DEST/test.km_KH-en_XX.en_XX
478
+
479
+ cp wikipedia.dev.ps-en.ps $DEST/valid.ps_AF-en_XX.ps_AF
480
+ cp wikipedia.dev.ps-en.en $DEST/valid.ps_AF-en_XX.en_XX
481
+ cp wikipedia.devtest.ps-en.ps $DEST/test.ps_AF-en_XX.ps_AF
482
+ cp wikipedia.devtest.ps-en.en $DEST/test.ps_AF-en_XX.en_XX
483
+
484
+ cat newsdev2020-plen-src.pl.sgm | strip_sgm.sh > newsdev2020-plen.pl
485
+ cat newsdev2020-plen-ref.en.sgm | strip_sgm.sh > newsdev2020-plen.en
486
+ split newsdev2020-plen.pl -a 0 -n r/1/2 > $DEST/valid.pl_PL-en_XX.pl_PL
487
+ split newsdev2020-plen.en -a 0 -n r/1/2 > $DEST/valid.pl_PL-en_XX.en_XX
488
+ split newsdev2020-plen.pl -a 0 -n r/2/2 > $DEST/test.pl_PL-en_XX.pl_PL
489
+ split newsdev2020-plen.en -a 0 -n r/2/2 > $DEST/test.pl_PL-en_XX.en_XX
490
+
491
+ cat newstest2018-encs-src.en.sgm | strip_sgm.sh > $DEST/valid.en_XX-cs_CZ.en_XX
492
+ cat newstest2018-encs-ref.cs.sgm | strip_sgm.sh > $DEST/valid.en_XX-cs_CZ.cs_CZ
493
+ cat newstest2019-encs-src.en.sgm | strip_sgm.sh > $DEST/test.en_XX-cs_CZ.en_XX
494
+ cat newstest2019-encs-ref.cs.sgm | strip_sgm.sh > $DEST/test.en_XX-cs_CZ.cs_CZ
495
+
496
+ cat newstest2018-deen-src.de.sgm | strip_sgm.sh > $DEST/valid.de_DE-en_XX.de_DE
497
+ cat newstest2018-deen-ref.en.sgm | strip_sgm.sh > $DEST/valid.de_DE-en_XX.en_XX
498
+ cat newstest2018-ende-src.en.sgm | strip_sgm.sh > $DEST/valid.en_XX-de_DE.en_XX
499
+ cat newstest2018-ende-ref.de.sgm | strip_sgm.sh > $DEST/valid.en_XX-de_DE.de_DE
500
+ cat newstest2019-deen-src.de.sgm | strip_sgm.sh > $DEST/test.de_DE-en_XX.de_DE
501
+ cat newstest2019-deen-ref.en.sgm | strip_sgm.sh > $DEST/test.de_DE-en_XX.en_XX
502
+ cat newstest2019-ende-src.en.sgm | strip_sgm.sh > $DEST/test.en_XX-de_DE.en_XX
503
+ cat newstest2019-ende-ref.de.sgm | strip_sgm.sh > $DEST/test.en_XX-de_DE.de_DE
504
+
505
+ cat newstest2018-ruen-src.ru.sgm | strip_sgm.sh > $DEST/valid.ru_RU-en_XX.ru_RU
506
+ cat newstest2018-ruen-ref.en.sgm | strip_sgm.sh > $DEST/valid.ru_RU-en_XX.en_XX
507
+ cat newstest2018-enru-src.en.sgm | strip_sgm.sh > $DEST/valid.en_XX-ru_RU.en_XX
508
+ cat newstest2018-enru-ref.ru.sgm | strip_sgm.sh > $DEST/valid.en_XX-ru_RU.ru_RU
509
+ cat newstest2019-ruen-src.ru.sgm | strip_sgm.sh > $DEST/test.ru_RU-en_XX.ru_RU
510
+ cat newstest2019-ruen-ref.en.sgm | strip_sgm.sh > $DEST/test.ru_RU-en_XX.en_XX
511
+ cat newstest2019-enru-src.en.sgm | strip_sgm.sh > $DEST/test.en_XX-ru_RU.en_XX
512
+ cat newstest2019-enru-ref.ru.sgm | strip_sgm.sh > $DEST/test.en_XX-ru_RU.ru_RU
513
+
514
+ cat newstest2018-zhen-src.zh.sgm | strip_sgm.sh > $DEST/valid.zh_CN-en_XX.zh_CN
515
+ cat newstest2018-zhen-ref.en.sgm | strip_sgm.sh > $DEST/valid.zh_CN-en_XX.en_XX
516
+ cat newstest2018-enzh-src.en.sgm | strip_sgm.sh > $DEST/valid.en_XX-zh_CN.en_XX
517
+ cat newstest2018-enzh-ref.zh.sgm | strip_sgm.sh > $DEST/valid.en_XX-zh_CN.zh_CN
518
+ cat newstest2019-zhen-src.zh.sgm | strip_sgm.sh > $DEST/test.zh_CN-en_XX.zh_CN
519
+ cat newstest2019-zhen-ref.en.sgm | strip_sgm.sh > $DEST/test.zh_CN-en_XX.en_XX
520
+ cat newstest2019-enzh-src.en.sgm | strip_sgm.sh > $DEST/test.en_XX-zh_CN.en_XX
521
+ cat newstest2019-enzh-ref.zh.sgm | strip_sgm.sh > $DEST/test.en_XX-zh_CN.zh_CN
522
+ }
523
+
524
+ mkdir -p $DEST
525
+
526
+ prepare_lid
527
+ prepare_moses
528
+ download_commoncrawl
529
+
530
+ prepare_ja &
531
+ prepare_ta &
532
+ prepare_km &
533
+ prepare_ps &
534
+ prepare_iu &
535
+ prepare_cs &
536
+ prepare_de &
537
+ prepare_pl &
538
+ prepare_ru &
539
+ prepare_zh &
540
+
541
+ # prepare valid/test set
542
+ prepare_tests &
543
+
544
+ # wait
545
+
546
+ # TODO remove intermediate files
547
+ # rm -rf $TMP_DIR
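The Russian and Chinese blocks above depend on two corpora that have to be obtained manually (the Yandex 1M corpus zip and the unpacked CCMT data). A minimal invocation sketch under those assumptions; every path below is a placeholder, and WORKDIR_ROOT is assumed to be needed here only because the sibling data scripts check for it:

    # placeholders, not part of the repository
    export WORKDIR_ROOT=/path/to/workdir          # assumed: shared working directory for the data scripts
    export YANDEX_CORPUS=/path/to/1mcorpus.zip    # manually downloaded Yandex en-ru 1M corpus
    export CCMT_DIR=/path/to/ccmt                 # CCMT data, already unzipped as the script assumes
    bash download_wmt20.sh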
fairseq/examples/multilingual/data_scripts/preprocess_ML50_v1.sh ADDED
@@ -0,0 +1,27 @@
1
+ #!/bin/bash
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ if [ -z "$WORKDIR_ROOT" ] ;
9
+ then
10
+ echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
11
+ exit 1
12
+ fi
13
+
14
+ if [ -z "$SPM_PATH" ] ;
15
+ then
16
+ echo "Please install sentence piecence from https://github.com/google/sentencepiece and set SPM_PATH pointing to the installed spm_encode.py. Exitting..."
17
+ exit 1
18
+ fi
19
+
20
+ ML50=${WORKDIR_ROOT}/ML50
21
+
22
+ mkdir -p $ML50/dedup
23
+ mkdir -p $ML50/clean
24
+
25
+ python ./dedup_all.py --from-folder $ML50/raw --to-folder $ML50/dedup
26
+ python ./remove_valid_test_in_train.py --from-folder $ML50/dedup --to-folder $ML50/clean
27
+ python ./binarize.py --raw-folder $ML50/clean
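A minimal sketch of running the three-stage pipeline above (dedup, remove valid/test overlap from train, binarize), assuming the raw ML50 corpora already sit under $WORKDIR_ROOT/ML50/raw; both exported paths are placeholders:

    export WORKDIR_ROOT=/path/to/workdir                   # must contain ML50/raw (placeholder)
    export SPM_PATH=/path/to/sentencepiece/spm_encode.py   # placeholder
    bash preprocess_ML50_v1.sh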
fairseq/examples/multilingual/data_scripts/remove_valid_test_in_train.py ADDED
@@ -0,0 +1,290 @@
1
+ import os, sys
2
+ import glob, itertools
3
+ import pandas as pd
4
+
5
+ WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)
6
+
7
+ if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
8
+ print('Please specify your working directory root in the OS environment variable WORKDIR_ROOT. Exiting...')
9
+ sys.exit(-1)
10
+
11
+
12
+ def load_langs(path):
13
+ with open(path) as fr:
14
+ langs = [l.strip() for l in fr]
15
+ return langs
16
+
17
+
18
+
19
+ def load_sentences(raw_data, split, direction):
20
+ src, tgt = direction.split('-')
21
+ src_path = f"{raw_data}/{split}.{direction}.{src}"
22
+ tgt_path = f"{raw_data}/{split}.{direction}.{tgt}"
23
+ if os.path.exists(src_path) and os.path.exists(tgt_path):
24
+ return [(src, open(src_path).read().splitlines()), (tgt, open(tgt_path).read().splitlines())]
25
+ else:
26
+ return []
27
+
28
+ def swap_direction(d):
29
+ src, tgt = d.split('-')
30
+ return f'{tgt}-{src}'
31
+
32
+ def get_all_test_data(raw_data, directions, split='test'):
33
+ test_data = [
34
+ x
35
+ for dd in directions
36
+ for d in [dd, swap_direction(dd)]
37
+ for x in load_sentences(raw_data, split, d)
38
+ ]
39
+ # all_test_data = {s for _, d in test_data for s in d}
40
+ all_test_data = {}
41
+ for lang, d in test_data:
42
+ for s in d:
43
+ s = s.strip()
44
+ lgs = all_test_data.get(s, set())
45
+ lgs.add(lang)
46
+ all_test_data[s] = lgs
47
+ return all_test_data, test_data
48
+
49
+ def check_train_sentences(raw_data, direction, all_test_data, mess_up_train={}):
50
+ src, tgt = direction.split('-')
51
+ tgt_path = f"{raw_data}/train.{direction}.{tgt}"
52
+ src_path = f"{raw_data}/train.{direction}.{src}"
53
+ print(f'check training data in {raw_data}/train.{direction}')
54
+ size = 0
55
+ if not os.path.exists(tgt_path) or not os.path.exists(src_path):
56
+ return mess_up_train, size
57
+ with open(src_path) as f, open(tgt_path) as g:
58
+ for src_line, tgt_line in zip(f, g):
59
+ s = src_line.strip()
60
+ t = tgt_line.strip()
61
+ size += 1
62
+ if s in all_test_data:
63
+ langs = mess_up_train.get(s, set())
64
+ langs.add(direction)
65
+ mess_up_train[s] = langs
66
+ if t in all_test_data:
67
+ langs = mess_up_train.get(t, set())
68
+ langs.add(direction)
69
+ mess_up_train[t] = langs
70
+ return mess_up_train, size
71
+
72
+ def check_train_all(raw_data, directions, all_test_data):
73
+ mess_up_train = {}
74
+ data_sizes = {}
75
+ for direction in directions:
76
+ _, size = check_train_sentences(raw_data, direction, all_test_data, mess_up_train)
77
+ data_sizes[direction] = size
78
+ return mess_up_train, data_sizes
79
+
80
+ def count_train_in_other_set(mess_up_train):
81
+ train_in_others = [(direction, s) for s, directions in mess_up_train.items() for direction in directions]
82
+ counts = {}
83
+ for direction, s in train_in_others:
84
+ counts[direction] = counts.get(direction, 0) + 1
85
+ return counts
86
+
87
+ def train_size_if_remove_in_otherset(data_sizes, mess_up_train):
88
+ counts_in_other = count_train_in_other_set(mess_up_train)
89
+ remain_sizes = []
90
+ for direction, count in counts_in_other.items():
91
+ remain_sizes.append((direction, data_sizes[direction] - count, data_sizes[direction], count, 100 * count / data_sizes[direction] ))
92
+ return remain_sizes
93
+
94
+
95
+ def remove_messed_up_sentences(raw_data, direction, mess_up_train, mess_up_train_pairs, corrected_langs):
96
+ split = 'train'
97
+ src_lang, tgt_lang = direction.split('-')
98
+
99
+ tgt = f"{raw_data}/{split}.{direction}.{tgt_lang}"
100
+ src = f"{raw_data}/{split}.{direction}.{src_lang}"
101
+ print(f'working on {direction}: ', src, tgt)
102
+ if not os.path.exists(tgt) or not os.path.exists(src) :
103
+ return 0, 0  # missing files: report zero size so the caller's unpacking and size check still work
104
+
105
+ corrected_tgt = f"{to_folder}/{split}.{direction}.{tgt_lang}"
106
+ corrected_src = f"{to_folder}/{split}.{direction}.{src_lang}"
107
+ line_num = 0
108
+ keep_num = 0
109
+ with open(src, encoding='utf8',) as fsrc, \
110
+ open(tgt, encoding='utf8',) as ftgt, \
111
+ open(corrected_src, 'w', encoding='utf8') as fsrc_corrected, \
112
+ open(corrected_tgt, 'w', encoding='utf8') as ftgt_corrected:
113
+ for s, t in zip(fsrc, ftgt):
114
+ s = s.strip()
115
+ t = t.strip()
116
+ if t not in mess_up_train \
117
+ and s not in mess_up_train \
118
+ and (s, t) not in mess_up_train_pairs \
119
+ and (t, s) not in mess_up_train_pairs:
120
+ corrected_langs.add(direction)
121
+ print(s, file=fsrc_corrected)
122
+ print(t, file=ftgt_corrected)
123
+ keep_num += 1
124
+ line_num += 1
125
+ if line_num % 1000 == 0:
126
+ print(f'completed {line_num} lines', end='\r')
127
+ return line_num, keep_num
128
+
129
+ ##########
130
+
131
+
132
+ def merge_valid_test_messup(mess_up_train_valid, mess_up_train_test):
133
+ merged_mess = []
134
+ for s in set(list(mess_up_train_valid.keys()) + list(mess_up_train_test.keys())):
135
+ if not s:
136
+ continue
137
+ valid = mess_up_train_valid.get(s, set())
138
+ test = mess_up_train_test.get(s, set())
139
+ merged_mess.append((s, valid | test))
140
+ return dict(merged_mess)
141
+
142
+
143
+
144
+ #########
145
+ def check_train_pairs(raw_data, direction, all_test_data, mess_up_train={}):
146
+ src, tgt = direction.split('-')
147
+ #a hack; TODO: check the reversed directions
148
+ path1 = f"{raw_data}/train.{src}-{tgt}.{src}"
149
+ path2 = f"{raw_data}/train.{src}-{tgt}.{tgt}"
150
+ if not os.path.exists(path1) or not os.path.exists(path2) :
151
+ return
152
+
153
+ with open(path1) as f1, open(path2) as f2:
154
+ for src_line, tgt_line in zip(f1, f2):
155
+ s = src_line.strip()
156
+ t = tgt_line.strip()
157
+ if (s, t) in all_test_data or (t, s) in all_test_data:
158
+ langs = mess_up_train.get( (s, t), set())
159
+ langs.add(src)
160
+ langs.add(tgt)
161
+ mess_up_train[(s, t)] = langs
162
+
163
+
164
+ def load_pairs(raw_data, split, direction):
165
+ src, tgt = direction.split('-')
166
+ src_f = f"{raw_data}/{split}.{direction}.{src}"
167
+ tgt_f = f"{raw_data}/{split}.{direction}.{tgt}"
168
+ if tgt != 'en_XX':
169
+ src_f, tgt_f = tgt_f, src_f
170
+ if os.path.exists(src_f) and os.path.exists(tgt_f):
171
+ return list(zip(open(src_f).read().splitlines(),
172
+ open(tgt_f).read().splitlines(),
173
+ ))
174
+ else:
175
+ return []
176
+
177
+ # skip_langs = ['cs_CZ', 'en_XX', 'tl_XX', 'tr_TR']
178
+ def get_messed_up_test_pairs(split, directions):
179
+ test_pairs = [
180
+ (d, load_pairs(raw_data, split, d))
181
+ for d in directions
182
+ ]
183
+ # all_test_data = {s for _, d in test_data for s in d}
184
+ all_test_pairs = {}
185
+ for direction, d in test_pairs:
186
+ src, tgt = direction.split('-')
187
+ for s in d:
188
+ langs = all_test_pairs.get(s, set())
189
+ langs.add(src)
190
+ langs.add(tgt)
191
+ all_test_pairs[s] = langs
192
+ mess_up_train_pairs = {}
193
+ for direction in directions:
194
+ check_train_pairs(raw_data, direction, all_test_pairs, mess_up_train_pairs)
195
+ return all_test_pairs, mess_up_train_pairs
196
+
197
+
198
+
199
+ if __name__ == "__main__":
200
+ #######
201
+ import argparse
202
+ parser = argparse.ArgumentParser()
203
+ parser.add_argument(
204
+ '--from-folder',
205
+ required=True,
206
+ type=str)
207
+ parser.add_argument(
208
+ '--to-folder',
209
+ required=True,
210
+ type=str)
211
+ parser.add_argument(
212
+ '--directions',
213
+ default=None,
214
+ type=str)
215
+
216
+
217
+ args = parser.parse_args()
218
+ raw_data = args.from_folder
219
+ to_folder = args.to_folder
220
+ os.makedirs(to_folder, exist_ok=True)
221
+
222
+ if args.directions:
223
+ directions = args.directions.split(',')
224
+ else:
225
+ raw_files = itertools.chain(
226
+ glob.glob(f'{raw_data}/train*'),
227
+ glob.glob(f'{raw_data}/valid*'),
228
+ glob.glob(f'{raw_data}/test*'),
229
+ )
230
+ directions = sorted(set(os.path.split(file_path)[-1].split('.')[1] for file_path in raw_files))
231
+ print('working on directions: ', directions)
232
+
233
+ ##########
234
+
235
+
236
+
237
+ all_test_data, test_data = get_all_test_data(raw_data, directions, 'test')
238
+ print('==loaded test data==')
239
+ all_valid_data, valid_data = get_all_test_data(raw_data, directions, 'valid')
240
+ print('==loaded valid data==')
241
+ all_valid_test_data = merge_valid_test_messup(all_test_data, all_valid_data)
242
+ mess_up_train, data_sizes = check_train_all(raw_data, directions, all_valid_test_data)
243
+ print('training sentences overlapping with valid/test data:', len(mess_up_train))
244
+ data_situation = train_size_if_remove_in_otherset(data_sizes, mess_up_train)
245
+ df = pd.DataFrame(data_situation, columns=['direction', 'train_size_after_remove', 'orig_size', 'num_to_remove', 'remove_percent'])
246
+ df = df.sort_values('remove_percent', ascending=False)
247
+ df.to_csv(f'{raw_data}/clean_summary.tsv', sep='\t')
248
+ print(f'projected data clean summary in: {raw_data}/clean_summary.tsv')
249
+
250
+ # correct the dataset:
251
+ all_test_pairs, mess_up_test_train_pairs = get_messed_up_test_pairs('test', directions)
252
+ all_valid_pairs, mess_up_valid_train_pairs = get_messed_up_test_pairs('valid', directions)
253
+
254
+ all_messed_pairs = set(mess_up_test_train_pairs.keys()).union(set(mess_up_valid_train_pairs.keys()))
255
+ corrected_directions = set()
256
+
257
+ real_data_situation = []
258
+ for direction in directions:
259
+ org_size, new_size = remove_messed_up_sentences(raw_data, direction, mess_up_train, all_messed_pairs, corrected_directions)
260
+ if org_size == 0:
261
+ print(f"{direction} has size 0")
262
+ continue
263
+ real_data_situation.append(
264
+ (direction, new_size, org_size, org_size - new_size, (org_size - new_size) / org_size * 100)
265
+ )
266
+ print('corrected directions: ', corrected_directions)
267
+ df = pd.DataFrame(real_data_situation, columns=['direction', 'train_size_after_remove', 'orig_size', 'num_to_remove', 'remove_percent'])
268
+ df = df.sort_values('remove_percent', ascending=False)
269
+ df.to_csv(f'{raw_data}/actual_clean_summary.tsv', sep='\t')
270
+ print(f'actual data clean summary (which can be different from the projected one because of duplications) in: {raw_data}/actual_clean_summary.tsv')
271
+
272
+ import shutil
273
+ for direction in directions:
274
+ src_lang, tgt_lang = direction.split('-')
275
+ for split in ['train', 'valid', 'test']:
276
+ # copying valid, test and uncorrected train
277
+ if direction in corrected_directions and split == 'train':
278
+ continue
279
+ tgt = f"{raw_data}/{split}.{direction}.{tgt_lang}"
280
+ src = f"{raw_data}/{split}.{direction}.{src_lang}"
281
+ if not (os.path.exists(src) and os.path.exists(tgt)):
282
+ continue
283
+ corrected_tgt = f"{to_folder}/{split}.{direction}.{tgt_lang}"
284
+ corrected_src = f"{to_folder}/{split}.{direction}.{src_lang}"
285
+ print(f'copying {src} to {corrected_src}')
286
+ shutil.copyfile(src, corrected_src)
287
+ print(f'copying {tgt} to {corrected_tgt}')
288
+ shutil.copyfile(tgt, corrected_tgt)
289
+
290
+ print('completed')
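The script can also be run on its own; a hedged example with placeholder folders. The --directions flag is optional and, as in the code above, directions are otherwise inferred from the train/valid/test file names:

    export WORKDIR_ROOT=/path/to/workdir     # required by the sanity check at the top of the script (placeholder)
    python remove_valid_test_in_train.py \
        --from-folder /path/to/ML50/dedup \
        --to-folder /path/to/ML50/clean \
        --directions cs_CZ-en_XX,de_DE-en_XX   # optional; comma-separated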
fairseq/examples/multilingual/data_scripts/requirement.txt ADDED
@@ -0,0 +1,2 @@
1
+ wget
2
+ pandas
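The file lists the two Python helpers used by these data scripts and can be installed in the usual way (sketch):

    pip install -r requirement.txt   # installs wget and pandas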
fairseq/examples/multilingual/data_scripts/utils/dedup.py ADDED
@@ -0,0 +1,41 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ import argparse
8
+
9
+ def deup(src_file, tgt_file, src_file_out, tgt_file_out):
10
+ seen = set()
11
+ dup_count = 0
12
+ with open(src_file, encoding='utf-8') as fsrc, \
13
+ open(tgt_file, encoding='utf-8') as ftgt, \
14
+ open(src_file_out, 'w', encoding='utf-8') as fsrc_out, \
15
+ open(tgt_file_out, 'w', encoding='utf-8') as ftgt_out:
16
+ for s, t in zip(fsrc, ftgt):
17
+ if (s, t) not in seen:
18
+ fsrc_out.write(s)
19
+ ftgt_out.write(t)
20
+ seen.add((s, t))
21
+ else:
22
+ dup_count += 1
23
+ print(f'number of duplicated pairs removed: {dup_count}')
24
+
25
+
26
+ def main():
27
+ parser = argparse.ArgumentParser()
28
+ parser.add_argument("--src-file", type=str, required=True,
29
+ help="src file")
30
+ parser.add_argument("--tgt-file", type=str, required=True,
31
+ help="tgt file")
32
+ parser.add_argument("--src-file-out", type=str, required=True,
33
+ help="src ouptut file")
34
+ parser.add_argument("--tgt-file-out", type=str, required=True,
35
+ help="tgt ouput file")
36
+ args = parser.parse_args()
37
+ deup(args.src_file, args.tgt_file, args.src_file_out, args.tgt_file_out)
38
+
39
+
40
+ if __name__ == "__main__":
41
+ main()
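A usage sketch for the pairwise de-duplication helper; the flag names match the argparse definitions above and the file paths are placeholders:

    python utils/dedup.py \
        --src-file train.cs_CZ-en_XX.cs_CZ \
        --tgt-file train.cs_CZ-en_XX.en_XX \
        --src-file-out dedup/train.cs_CZ-en_XX.cs_CZ \
        --tgt-file-out dedup/train.cs_CZ-en_XX.en_XX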
fairseq/examples/multilingual/data_scripts/utils/fasttext_multi_filter.py ADDED
@@ -0,0 +1,63 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+
7
+ #!/bin/python
8
+
9
+ import fasttext
10
+ from multiprocessing import Pool
11
+ import contextlib
12
+ import sys
13
+ import argparse
14
+ from functools import partial
15
+ import io
16
+
17
+ model = None
18
+ def init(model_path):
19
+ global model
20
+ model = fasttext.load_model(model_path)
21
+
22
+ def pred(lines):
23
+ return lines, [model.predict(line.strip())[0][0][9:] for line in lines]
24
+
25
+ def main():
26
+ parser = argparse.ArgumentParser()
27
+ parser.add_argument("--model", type=str, required=True,
28
+ help="model to load")
29
+ parser.add_argument("--inputs", nargs="+", default=['-'],
30
+ help="input files to filter")
31
+ parser.add_argument("--langs", nargs="+", required=True,
32
+ help="lang ids of each input file")
33
+ parser.add_argument("--outputs", nargs="+", default=['-'],
34
+ help="path to save lid filtered outputs")
35
+ parser.add_argument("--num-workers", type=int, metavar="N", default=10,
36
+ help="number of processes in parallel")
37
+ args = parser.parse_args()
38
+
39
+ assert len(args.inputs) == len(args.langs) and len(args.inputs) == len(args.outputs)
40
+
41
+ with contextlib.ExitStack() as stack:
42
+ inputs = [
43
+ stack.enter_context(open(input, "r", encoding="utf-8", newline="\n", errors="replace"))
44
+ if input != "-" else io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors="replace")
45
+ for input in args.inputs
46
+ ]
47
+ outputs = [
48
+ stack.enter_context(open(output, "w", encoding="utf-8", newline="\n"))
49
+ if output != "-" else sys.stdout
50
+ for output in args.outputs
51
+ ]
52
+ with Pool(args.num_workers, initializer=partial(init, args.model)) as p:
53
+ skip_cnt = 0
54
+ for lines, preds in p.imap(pred, list(zip(*inputs)), chunksize=500):
55
+ if not all(a == b for a, b in zip(preds, args.langs)):
56
+ skip_cnt += 1
57
+ continue
58
+ for line, output_h in zip(lines, outputs):
59
+ print(line.strip(), file=output_h)
60
+ print(f"Skipped {skip_cnt} lines.")
61
+
62
+ if __name__ == "__main__":
63
+ main()
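A hedged sketch of filtering one parallel pair with the script above: a line pair is kept only when every prediction matches the expected fastText language code. lid.176.bin refers to the publicly released fastText LID model; the file names are placeholders:

    python utils/fasttext_multi_filter.py \
        --model lid.176.bin \
        --inputs all.de all.en \
        --langs de en \
        --outputs filtered.de filtered.en \
        --num-workers 10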
fairseq/examples/multilingual/data_scripts/utils/strip_sgm.sh ADDED
@@ -0,0 +1 @@
1
+ grep "seg id" | sed 's/<seg id="[0-9]\+">//g' | sed 's/<\/seg>//g'
fairseq/examples/multilingual/finetune_multilingual_model.sh ADDED
@@ -0,0 +1,32 @@
1
+ #!/bin/bash
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ path_2_data=$1 # <path to data> which contains binarized data for each directions
9
+ lang_list=$2 # <path to a file which contains a list of languages separated by new lines>
10
+ lang_pairs=$3 #a list language pairs to train multilingual models, e.g. "en-fr,en-cs,fr-en,cs-en"
11
+ # pretrained can be an mBART pretrained model as well
12
+ pretrained_model=$4 #<path to a pretrained model>
13
+
14
+
15
+ fairseq-train "$path_2_data" \
16
+ --encoder-normalize-before --decoder-normalize-before \
17
+ --arch transformer --layernorm-embedding \
18
+ --task translation_multi_simple_epoch \
19
+ --finetune-from-model "$pretrained_model" \
20
+ --sampling-method "temperature" \
21
+ --sampling-temperature "1.5" \
22
+ --encoder-langtok "src" \
23
+ --decoder-langtok \
24
+ --lang-dict "$lang_list" \
25
+ --lang-pairs "$lang_pairs" \
26
+ --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \
27
+ --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \
28
+ --lr-scheduler inverse_sqrt --lr 3e-05 --warmup-updates 2500 --max-update 40000 \
29
+ --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \
30
+ --max-tokens 1024 --update-freq 2 \
31
+ --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \
32
+ --seed 222 --log-format simple --log-interval 2
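An invocation sketch matching the four positional parameters documented at the top of the script; all paths are placeholders:

    bash finetune_multilingual_model.sh \
        /path/to/binarized_data \
        /path/to/lang_list.txt \
        "en-fr,en-cs,fr-en,cs-en" \
        /path/to/mbart/model.pt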
fairseq/examples/multilingual/multilingual_fairseq_gen.sh ADDED
@@ -0,0 +1,26 @@
1
+ #!/bin/bash
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ lang_pairs="en-fr,en-cs,fr-en,cs-en"
9
+ path_2_data=$1 # <path to data>
10
+ lang_list=$2 # <path to a file which contains a list of languages separated by new lines>
11
+ model=$3 # <path to a trained model>
12
+ source_lang=cs
13
+ target_lang=en
14
+
15
+ fairseq-generate "$path_2_data" \
16
+ --path "$model" \
17
+ --task translation_multi_simple_epoch \
18
+ --gen-subset test \
19
+ --source-lang "$source_lang" \
20
+ --target-lang "$target_lang" \
21
+ --sacrebleu --remove-bpe 'sentencepiece' \
22
+ --batch-size 32 \
23
+ --encoder-langtok "src" \
24
+ --decoder-langtok \
25
+ --lang-dict "$lang_list" \
26
+ --lang-pairs "$lang_pairs"
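The generation script hard-codes cs-to-en decoding and the en-fr/en-cs language pairs, so only the three positional paths are needed; a sketch with placeholder paths:

    bash multilingual_fairseq_gen.sh \
        /path/to/binarized_data \
        /path/to/lang_list.txt \
        /path/to/checkpoint_best.pt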