imdbo committed on
Commit
c7b0dc8
1 Parent(s): 3e9f026

remove comments and replace original paths with generic paths

Browse files
Files changed (1) hide show
  1. bpe-gl-es_emb.yaml +37 -40
bpe-gl-es_emb.yaml CHANGED
@@ -1,77 +1,75 @@
1
 
2
  save_data: run
3
  ## Where the vocab(s) will be written
4
- src_vocab: run/vocab/es-gl/bpe.vocab.src
5
- tgt_vocab: run/vocab/es-gl/bpe.vocab.tgt
6
  overwrite: True
7
 
8
  # Corpus opts:
9
  data:
10
  europarl:
11
- path_tgt: ../DGTcorpora_tokenized/es_gz/europarl/partitions/es_train.txt
12
- path_src: ../DGTcorpora_tokenized/es_gz/europarl_translit/partitions/gl_train.txt
13
  transforms: [bpe, filtertoolong]
14
- weight: 120 #60 #120
15
  opensub:
16
- path_tgt: ../DGTcorpora_tokenized/es_gz/opensub/partitions/es_train.txt
17
- path_src: ../DGTcorpora_tokenized/es_gz/opensub_translit/partitions/gl_train.txt
18
  transforms: [bpe, filtertoolong]
19
- weight: 180 #900 #180
20
  dgt:
21
- path_tgt: ../DGTcorpora_tokenized/es_gz/dgt/partitions/es_train.txt
22
- path_src: ../DGTcorpora_tokenized/es_gz/dgt_translit/partitions/gl_train.txt
23
  transforms: [bpe, filtertoolong]
24
- weight: 18 #9 #18
25
  cluvi:
26
- path_tgt: ../DGTcorpora_tokenized/es_gz/cluvi/partitions/es_train.txt
27
- path_src: ../DGTcorpora_tokenized/es_gz/cluvi/partitions/gl_train.txt
28
  transforms: [bpe, filtertoolong]
29
- weight: 40 # 4 #40
30
  opensub-es-gl:
31
- path_tgt: ../DGTcorpora_tokenized/es_gz/opensub-es-gl/partitions/es_train.txt
32
- path_src: ../DGTcorpora_tokenized/es_gz/opensub-es-gl/partitions/gl_train.txt
33
  transforms: [bpe, filtertoolong]
34
- weight: 25 # 5 #25 #25
35
  ted2020:
36
- path_tgt: ../DGTcorpora_tokenized/es_gz/ted2020/partitions/es_train.txt
37
- path_src: ../DGTcorpora_tokenized/es_gz/ted2020/partitions/gl_train.txt
38
  transforms: [bpe, filtertoolong]
39
- weight: 10 # 1 #10 #10
40
  corgaback:
41
- path_tgt: ../DGTcorpora_tokenized/es_gz/corgaback/partitions/es_train.txt
42
- path_src: ../DGTcorpora_tokenized/es_gz/corgaback/partitions/gl_train.txt
43
  transforms: [bpe, filtertoolong]
44
- weight: 13 # 66 #14 #13
45
  ccmatrix:
46
- path_tgt: ../DGTcorpora_tokenized/es_gz/ccmatrix/es.txt
47
- path_src: ../DGTcorpora_tokenized/es_gz/ccmatrix/gl.txt
48
  transforms: [bpe, filtertoolong]
49
- weight: 180 ##como opensub, tamanho semelhante
50
  resto:
51
- path_tgt: ../DGTcorpora_tokenized/es_gz/resto/es.txt
52
- path_src: ../DGTcorpora_tokenized/es_gz/resto/gl.txt
53
  transforms: [bpe, filtertoolong]
54
- weight: 120 ##como europarl, tamanho semelhante
55
  opensub_2018:
56
- path_tgt: ../DGTcorpora_tokenized/es_gz/opensub_2018/es.txt
57
- path_src: ../DGTcorpora_tokenized/es_gz/opensub_2018/gl.txt
58
  transforms: [bpe, filtertoolong]
59
- weight: 25 #igual que opensub_es-gl
60
 
61
 
62
  valid:
63
- path_tgt: ../DGTcorpora_tokenized/es_gz/partitions/all-es_valid.txt
64
- path_src: ../DGTcorpora_tokenized/es_gz/partitions_translit/all-gl_valid.txt
65
  transforms: [bpe, filtertoolong]
66
 
67
  ### Transform related opts:
68
  #### Subword
69
  tgt_subword_model: ./bpe/es.code
70
  src_subword_model: ./bpe/gl.code
71
- tgt_subword_vocab: ./run/vocab/es-gl/bpe.vocab.src
72
- src_subword_vocab: ./run/vocab/es-gl/bpe.vocab.tgt
73
- #tgt_subword_model: ../sentencepiece/en-gl/en.sp.model
74
- #src_subword_model: ../sentencepiece/en-gl/gl.sp.model
75
  src_subword_type: bpe
76
  tgt_subword_type: bpe
77
 
@@ -88,7 +86,7 @@ src_embeddings: ../embeddings/gl.emb.txt
88
  embeddings_type: "word2vec"
89
 
90
  # word_vec_size needs to match the dimensions of the pretrained embeddings
91
- word_vec_size: 300
92
 
93
 
94
  #### Filter
@@ -146,7 +144,6 @@ enc_layers: 6
146
  dec_layers: 6
147
  heads: 8
148
  rnn_size: 512
149
- word_vec_size: 512
150
  transformer_ff: 2048
151
  dropout_steps: [0]
152
  dropout: [0.1]
 
1
 
2
  save_data: run
3
  ## Where the vocab(s) will be written
4
+ src_vocab: run/vocab/gl-es/bpe.vocab.src
5
+ tgt_vocab: run/vocab/gl-es/bpe.vocab.tgt
6
  overwrite: True
7
 
8
  # Corpus opts:
9
  data:
10
  europarl:
11
+ path_tgt: corpora/europarl/partitions/es_train.txt
12
+ path_src: corpora/europarl_translit/partitions/gl_train.txt
13
  transforms: [bpe, filtertoolong]
14
+ weight: 120
15
  opensub:
16
+ path_tgt: corpora/opensub/partitions/es_train.txt
17
+ path_src: corpora/opensub_translit/partitions/gl_train.txt
18
  transforms: [bpe, filtertoolong]
19
+ weight: 180
20
  dgt:
21
+ path_tgt: corpora/dgt/partitions/es_train.txt
22
+ path_src: corpora/dgt_translit/partitions/gl_train.txt
23
  transforms: [bpe, filtertoolong]
24
+ weight: 18
25
  cluvi:
26
+ path_tgt: corpora/cluvi/partitions/es_train.txt
27
+ path_src: corpora/cluvi/partitions/gl_train.txt
28
  transforms: [bpe, filtertoolong]
29
+ weight: 40
30
  opensub-es-gl:
31
+ path_tgt: corpora/opensub-es-gl/partitions/es_train.txt
32
+ path_src: corpora/opensub-es-gl/partitions/gl_train.txt
33
  transforms: [bpe, filtertoolong]
34
+ weight: 25
35
  ted2020:
36
+ path_tgt: corpora/ted2020/partitions/es_train.txt
37
+ path_src: corpora/ted2020/partitions/gl_train.txt
38
  transforms: [bpe, filtertoolong]
39
+ weight: 10
40
  corgaback:
41
+ path_tgt: corpora/corgaback/partitions/es_train.txt
42
+ path_src: corpora/corgaback/partitions/gl_train.txt
43
  transforms: [bpe, filtertoolong]
44
+ weight: 13
45
  ccmatrix:
46
+ path_tgt: corpora/ccmatrix/es.txt
47
+ path_src: corpora/ccmatrix/gl.txt
48
  transforms: [bpe, filtertoolong]
49
+ weight: 180
50
  resto:
51
+ path_tgt: corpora/resto/es.txt
52
+ path_src: corpora/resto/gl.txt
53
  transforms: [bpe, filtertoolong]
54
+ weight: 120
55
  opensub_2018:
56
+ path_tgt: corpora/opensub_2018/es.txt
57
+ path_src: corpora/opensub_2018/gl.txt
58
  transforms: [bpe, filtertoolong]
59
+ weight: 25
60
 
61
 
62
  valid:
63
+ path_tgt: corpora/partitions/all-es_valid.txt
64
+ path_src: corpora/partitions_translit/all-gl_valid.txt
65
  transforms: [bpe, filtertoolong]
66
 
67
  ### Transform related opts:
68
  #### Subword
69
  tgt_subword_model: ./bpe/es.code
70
  src_subword_model: ./bpe/gl.code
71
+ tgt_subword_vocab: ./run/vocab/gl-es/bpe.vocab.src
72
+ src_subword_vocab: ./run/vocab/gl-es/bpe.vocab.tgt
 
 
73
  src_subword_type: bpe
74
  tgt_subword_type: bpe
75
 
 
86
  embeddings_type: "word2vec"
87
 
88
  # word_vec_size needs to match the dimensions of the pretrained embeddings
89
+ word_vec_size: 512
90
 
91
 
92
  #### Filter
 
144
  dec_layers: 6
145
  heads: 8
146
  rnn_size: 512
 
147
  transformer_ff: 2048
148
  dropout_steps: [0]
149
  dropout: [0.1]