imdbo committed on
Commit
d6361d4
1 Parent(s): 46e5ce9

remove comments and replace original paths to corpora with generic paths

Browse files
Files changed (1) hide show
  1. bpe-gl-en_emb.yaml +21 -38
bpe-gl-en_emb.yaml CHANGED
@@ -8,63 +8,48 @@ overwrite: True
8
  # Corpus opts:
9
  data:
10
  europarl:
11
- path_src: ../DGTcorpora_tokenized/en_gl/europarl/partitions/en_train.txt
12
- path_tgt: ../DGTcorpora_tokenized/en_gl/europarl/partitions/gl_train.txt
13
  transforms: [bpe, filtertoolong]
14
  weight: 120
15
  opensub:
16
- path_tgt: ../DGTcorpora_tokenized/en_gl/opensub/partitions/en_train.txt
17
- path_src: ../DGTcorpora_tokenized/en_gl/opensub/partitions/gl_train.txt
18
  transforms: [bpe, filtertoolong]
19
  weight: 152
20
  opus:
21
- path_tgt: ../DGTcorpora_tokenized/en_gl/opus/partitions/en_train.txt
22
- path_src: ../DGTcorpora_tokenized/en_gl/opus/partitions/gl_train.txt
23
  transforms: [bpe, filtertoolong]
24
  weight: 160
25
  ted2020:
26
- path_tgt: ../DGTcorpora_tokenized/en_gl/ted2020/partitions/en_train.txt
27
- path_src: ../DGTcorpora_tokenized/en_gl/ted2020/partitions/gl_train.txt
28
  transforms: [bpe, filtertoolong]
29
  weight: 10
30
  corgaback:
31
- path_tgt: ../DGTcorpora_tokenized/en_gl/corgaback/partitions/en_train.txt
32
- path_src: ../DGTcorpora_tokenized/en_gl/corgaback/partitions/gl_train.txt
33
  transforms: [bpe, filtertoolong]
34
  weight: 15
35
  ccmatrix:
36
- path_tgt: ../DGTcorpora_tokenized/en_gl/ccmatrix/en_tok_dbo.txt
37
- path_src: ../DGTcorpora_tokenized/en_gl/ccmatrix/gl_tok_dbo.txt
38
  transforms: [bpe, filtertoolong]
39
- weight: 380 ##75 ## 25000000/13000000 = 2; 760/2 = 380 * 5 = 1900 (380/5=75)
40
  wikimatrix:
41
- path_tgt: ../DGTcorpora_tokenized/en_gl/wikimatrix/en.txt
42
- path_src: ../DGTcorpora_tokenized/en_gl/wikimatrix/gl.txt
43
  transforms: [bpe, filtertoolong]
44
- weight: 70 #25000000/450000 = 55 ; 760/55 = 14 ; 14 * 5 = 70
45
  cluvi:
46
- path_tgt: ../DGTcorpora_tokenized/en_gl/cluvi/en.txt
47
- path_src: ../DGTcorpora_tokenized/en_gl/cluvi/gl.txt
48
  transforms: [bpe, filtertoolong]
49
- weight: 70 #25000000/295000 = 84 ; 760/84 = 9 ; 9 * 10 = 90
50
- #wikimedia:
51
- # path_tgt: ../DGTcorpora_tokenized/en_gl/wikimedia/en.txt
52
- #path_src: ../DGTcorpora_tokenized/en_gl/wikimedia/gl.txt
53
- #transforms: [bpe, filtertoolong]
54
- #weight: 4
55
- # xlent:
56
- #path_tgt: ../DGTcorpora_tokenized/en_gl/xlent/en.txt
57
- #path_src: ../DGTcorpora_tokenized/en_gl/xlent/gl.txt
58
- #transforms: [bpe, filtertoolong]
59
- #weight: 50 #25000000/1600000=15; 760/15=50
60
- #linux:
61
- #path_tgt: ../DGTcorpora_tokenized/en_gl/linux/en.txt
62
- #path_src: ../DGTcorpora_tokenized/en_gl/linux/gl.txt
63
- #transforms: [bpe, filtertoolong]
64
- #weight: 20 #25000000/150000=166; 760/166=5 * 5 = 20
65
  valid:
66
- path_tgt: ../DGTcorpora_tokenized/en_gl/partitions/all-en_valid.txt
67
- path_src: ../DGTcorpora_tokenized/en_gl/partitions/all-gl_valid.txt
68
  transforms: [bpe, filtertoolong]
69
 
70
  ### Transform related opts:
@@ -73,8 +58,6 @@ tgt_subword_model: ./bpe/en.code
73
  src_subword_model: ./bpe/gl.code
74
  src_subword_vocab: ./run/bpe.vocab.src
75
  tgt_subword_vocab: ./run/bpe.vocab.tgt
76
- #tgt_subword_model: ../sentencepiece/en-gl/en.sp.model
77
- #src_subword_model: ../sentencepiece/en-gl/gl.sp.model
78
  src_subword_type: bpe
79
  tgt_subord_type: bpe
80
 
 
8
  # Corpus opts:
9
  data:
10
  europarl:
11
+ path_tgt: corpora/europarl/partitions/en_train.txt
12
+ path_src: corpora/europarl/partitions/gl_train.txt
13
  transforms: [bpe, filtertoolong]
14
  weight: 120
15
  opensub:
16
+ path_tgt: corpora/opensub/partitions/en_train.txt
17
+ path_src: corpora/opensub/partitions/gl_train.txt
18
  transforms: [bpe, filtertoolong]
19
  weight: 152
20
  opus:
21
+ path_tgt: corpora/opus/partitions/en_train.txt
22
+ path_src: corpora/opus/partitions/gl_train.txt
23
  transforms: [bpe, filtertoolong]
24
  weight: 160
25
  ted2020:
26
+ path_tgt: corpora/ted2020/partitions/en_train.txt
27
+ path_src: corpora/ted2020/partitions/gl_train.txt
28
  transforms: [bpe, filtertoolong]
29
  weight: 10
30
  corgaback:
31
+ path_tgt: corpora/corgaback/partitions/en_train.txt
32
+ path_src: corpora/corgaback/partitions/gl_train.txt
33
  transforms: [bpe, filtertoolong]
34
  weight: 15
35
  ccmatrix:
36
+ path_tgt: corpora/ccmatrix/en_tok_dbo.txt
37
+ path_src: corpora/ccmatrix/gl_tok_dbo.txt
38
  transforms: [bpe, filtertoolong]
39
+ weight: 380
40
  wikimatrix:
41
+ path_tgt: corpora/wikimatrix/en.txt
42
+ path_src: corpora/wikimatrix/gl.txt
43
  transforms: [bpe, filtertoolong]
44
+ weight: 70
45
  cluvi:
46
+ path_tgt: corpora/cluvi/en.txt
47
+ path_src: corpora/cluvi/gl.txt
48
  transforms: [bpe, filtertoolong]
49
+ weight: 70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  valid:
51
+ path_tgt: corpora/partitions/all-en_valid.txt
52
+ path_src: corpora/partitions/all-gl_valid.txt
53
  transforms: [bpe, filtertoolong]
54
 
55
  ### Transform related opts:
 
58
  src_subword_model: ./bpe/gl.code
59
  src_subword_vocab: ./run/bpe.vocab.src
60
  tgt_subword_vocab: ./run/bpe.vocab.tgt
 
 
61
  src_subword_type: bpe
62
  tgt_subword_type: bpe
63