imdbo commited on
Commit
7bae329
1 Parent(s): 3c19b79

Create bpe-gl-es_emb.yaml

Browse files
Files changed (1) hide show
  1. bpe-gl-es_emb.yaml +155 -0
bpe-gl-es_emb.yaml ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ save_data: run
3
+ ## Where the vocab(s) will be written
4
+ src_vocab: run/vocab/es-gl/bpe.vocab.src
5
+ tgt_vocab: run/vocab/es-gl/bpe.vocab.tgt
6
+ overwrite: True
7
+
8
+ # Corpus opts:
9
+ data:
10
+ europarl:
11
+ path_tgt: ../DGTcorpora_tokenized/es_gz/europarl/partitions/es_train.txt
12
+ path_src: ../DGTcorpora_tokenized/es_gz/europarl_translit/partitions/gl_train.txt
13
+ transforms: [bpe, filtertoolong]
14
+ weight: 120 #60 #120
15
+ opensub:
16
+ path_tgt: ../DGTcorpora_tokenized/es_gz/opensub/partitions/es_train.txt
17
+ path_src: ../DGTcorpora_tokenized/es_gz/opensub_translit/partitions/gl_train.txt
18
+ transforms: [bpe, filtertoolong]
19
+ weight: 180 #900 #180
20
+ dgt:
21
+ path_tgt: ../DGTcorpora_tokenized/es_gz/dgt/partitions/es_train.txt
22
+ path_src: ../DGTcorpora_tokenized/es_gz/dgt_translit/partitions/gl_train.txt
23
+ transforms: [bpe, filtertoolong]
24
+ weight: 18 #9 #18
25
+ cluvi:
26
+ path_tgt: ../DGTcorpora_tokenized/es_gz/cluvi/partitions/es_train.txt
27
+ path_src: ../DGTcorpora_tokenized/es_gz/cluvi/partitions/gl_train.txt
28
+ transforms: [bpe, filtertoolong]
29
+ weight: 40 # 4 #40
30
+ opensub-es-gl:
31
+ path_tgt: ../DGTcorpora_tokenized/es_gz/opensub-es-gl/partitions/es_train.txt
32
+ path_src: ../DGTcorpora_tokenized/es_gz/opensub-es-gl/partitions/gl_train.txt
33
+ transforms: [bpe, filtertoolong]
34
+ weight: 25 # 5 #25 #25
35
+ ted2020:
36
+ path_tgt: ../DGTcorpora_tokenized/es_gz/ted2020/partitions/es_train.txt
37
+ path_src: ../DGTcorpora_tokenized/es_gz/ted2020/partitions/gl_train.txt
38
+ transforms: [bpe, filtertoolong]
39
+ weight: 10 # 1 #10 #10
40
+ corgaback:
41
+ path_tgt: ../DGTcorpora_tokenized/es_gz/corgaback/partitions/es_train.txt
42
+ path_src: ../DGTcorpora_tokenized/es_gz/corgaback/partitions/gl_train.txt
43
+ transforms: [bpe, filtertoolong]
44
+ weight: 13 # 66 #14 #13
45
+ ccmatrix:
46
+ path_tgt: ../DGTcorpora_tokenized/es_gz/ccmatrix/es.txt
47
+ path_src: ../DGTcorpora_tokenized/es_gz/ccmatrix/gl.txt
48
+ transforms: [bpe, filtertoolong]
49
+ weight: 180 ## same as opensub, similar corpus size
50
+ resto:
51
+ path_tgt: ../DGTcorpora_tokenized/es_gz/resto/es.txt
52
+ path_src: ../DGTcorpora_tokenized/es_gz/resto/gl.txt
53
+ transforms: [bpe, filtertoolong]
54
+ weight: 120 ## same as europarl, similar corpus size
55
+ opensub_2018:
56
+ path_tgt: ../DGTcorpora_tokenized/es_gz/opensub_2018/es.txt
57
+ path_src: ../DGTcorpora_tokenized/es_gz/opensub_2018/gl.txt
58
+ transforms: [bpe, filtertoolong]
59
+ weight: 25 # same as opensub-es-gl
60
+
61
+
62
+ valid:
63
+ path_tgt: ../DGTcorpora_tokenized/es_gz/partitions/all-es_valid.txt
64
+ path_src: ../DGTcorpora_tokenized/es_gz/partitions_translit/all-gl_valid.txt
65
+ transforms: [bpe, filtertoolong]
66
+
67
+ ### Transform related opts:
68
+ #### Subword
69
+ tgt_subword_model: ./bpe/es.code
70
+ src_subword_model: ./bpe/gl.code
71
+ tgt_subword_vocab: ./run/vocab/es-gl/bpe.vocab.tgt
72
+ src_subword_vocab: ./run/vocab/es-gl/bpe.vocab.src
73
+ #tgt_subword_model: ../sentencepiece/en-gl/en.sp.model
74
+ #src_subword_model: ../sentencepiece/en-gl/gl.sp.model
75
+ src_subword_type: bpe
76
+ tgt_subword_type: bpe
77
+
78
+ src_subword_nbest: 1
79
+ src_subword_alpha: 0.0
80
+ tgt_subword_nbest: 1
81
+ tgt_subword_alpha: 0.0
82
+
83
+ ##embeddings
84
+ tgt_embeddings: ../embeddings/es.emb.txt
85
+ src_embeddings: ../embeddings/gl.emb.txt
86
+
87
+ ## supported types: GloVe, word2vec
88
+ embeddings_type: "word2vec"
89
+
90
+ # word_vec_size need to match with the pretrained embeddings dimensions
91
+ word_vec_size: 300
92
+
93
+
94
+ #### Filter
95
+ src_seq_length: 150
96
+ tgt_seq_length: 150
97
+
98
+ # silently ignore empty lines in the data
99
+ skip_empty_level: silent
100
+
101
+
102
+
103
+ # General opts
104
+ save_model: run/model
105
+ keep_checkpoint: 50
106
+ save_checkpoint_steps: 10000
107
+ average_decay: 0.0005
108
+ seed: 1234
109
+ report_every: 1000
110
+ train_steps: 200000
111
+ valid_steps: 10000
112
+
113
+ # Batching
114
+ queue_size: 10000
115
+ bucket_size: 32768
116
+ world_size: 1
117
+ gpu_ranks: [0]
118
+ batch_type: "tokens"
119
+ #batch_size: 4096
120
+ batch_size: 8192
121
+ valid_batch_size: 64
122
+ batch_size_multiple: 1
123
+ max_generator_batches: 2
124
+ accum_count: [4]
125
+ accum_steps: [0]
126
+
127
+ # Optimization
128
+ model_dtype: "fp16"
129
+ optim: "adam"
130
+ learning_rate: 2
131
+ #learning_rate: 0.00005
132
+ warmup_steps: 8000
133
+ decay_method: "noam"
134
+ adam_beta2: 0.998
135
+ max_grad_norm: 0
136
+ label_smoothing: 0.1
137
+ param_init: 0
138
+ param_init_glorot: true
139
+ normalization: "tokens"
140
+
141
+ # Model
142
+ encoder_type: transformer
143
+ decoder_type: transformer
144
+ position_encoding: true
145
+ enc_layers: 6
146
+ dec_layers: 6
147
+ heads: 8
148
+ rnn_size: 512
149
+ word_vec_size: 512 # NOTE(review): duplicate key — overrides the 300 set above, which must match the 300-dim pretrained embeddings; confirm intended value
150
+ transformer_ff: 2048
151
+ dropout_steps: [0]
152
+ dropout: [0.1]
153
+ attention_dropout: [0.1]
154
+ share_decoder_embeddings: true
155
+ share_embeddings: false