File size: 4,842 Bytes
783cefa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
# General run settings: banner/citation switches, memory workspace, logging,
# random seed, signal handling and model file locations.
# NOTE(review): the key names look like Marian NMT training options - confirm
# against the Marian version that consumes this file.
authors: false
cite: false
build-info: ""
# NOTE(review): negative value; presumably interpreted relative to available
# device memory rather than as an absolute size - confirm.
workspace: -8000
log: train.log
log-level: info
log-time-zone: PST8PDT
quiet: false
quiet-translation: true
# Fixed seed value.
seed: 141414
check-nan: false
interpolate-env-vars: true
# Disabled, so the relative paths in this file (model, log, ...) are
# presumably resolved against the working directory - TODO confirm.
relative-paths: false
dump-config: ""
sigterm: save-and-exit
model: model_files/model.npz
pretrained-model: ""
ignore-model-config: false
# Core model architecture: model type, vocabulary and embedding sizes,
# encoder/decoder depth and embedding tying.
type: transformer
# Two vocabulary sizes, both 64000.
dim-vocabs:
  - 64000
  - 64000
dim-emb: 1024
factors-dim-emb: 0
factors-combine: sum
lemma-dependency: ""
lemma-dim-emb: 0
# NOTE(review): dim-rnn and the enc-cell/dec-cell keys look RNN-specific while
# `type` is transformer; presumably unused defaults - confirm.
dim-rnn: 1024
enc-type: bidirectional
enc-cell: gru
enc-cell-depth: 1
enc-depth: 6
dec-cell: gru
dec-cell-base-depth: 2
dec-cell-high-depth: 1
dec-depth: 6
skip: false
layer-normalization: false
right-left: false
# Empty list, written inline like transformer-tied-layers.
input-types: []
# Both tied-embeddings and tied-embeddings-all are enabled.
tied-embeddings: true
tied-embeddings-src: false
tied-embeddings-all: true
output-omit-bias: false
# Transformer-specific settings: attention heads, feed-forward and AAN sizes,
# pre/post-processing strings and bias/affine switches.
transformer-heads: 8
transformer-no-projection: false
transformer-rnn-projection: false
transformer-pool: false
# Encoder and decoder FFN widths are set to the same value (8192).
transformer-dim-ffn: 8192
transformer-decoder-dim-ffn: 8192
transformer-ffn-depth: 2
# NOTE(review): 0 here while transformer-ffn-depth is 2; presumably 0 means
# "fall back to the encoder value" - confirm.
transformer-decoder-ffn-depth: 0
transformer-ffn-activation: relu
# NOTE(review): the transformer-aan-* keys presumably only apply when
# transformer-decoder-autoreg selects an AAN variant; it is set to
# self-attention below - confirm.
transformer-dim-aan: 2048
transformer-aan-depth: 2
transformer-aan-activation: swish
transformer-aan-nogate: false
transformer-decoder-autoreg: self-attention
transformer-tied-layers: []
transformer-guided-alignment-layer: last
# Pre/post-processing operation strings; preprocess and postprocess-top are
# empty.
transformer-preprocess: ""
transformer-postprocess-emb: d
transformer-postprocess: dan
transformer-postprocess-top: ""
transformer-train-position-embeddings: false
transformer-depth-scaling: true
transformer-no-bias: false
transformer-no-affine: false
# BERT-style settings. The symbols are quoted because a plain scalar starting
# with `[` would be parsed as a flow sequence.
# NOTE(review): presumably unused for `type: transformer` - confirm.
bert-mask-symbol: "[MASK]"
bert-sep-symbol: "[SEP]"
bert-class-symbol: "[CLS]"
bert-masking-fraction: 0.15
bert-train-type-embeddings: true
bert-type-vocab-size: 2
# COMET-style settings.
# NOTE(review): presumably unused for `type: transformer` - confirm.
comet-final-sigmoid: false
comet-mix: false
comet-mix-norm: false
comet-dropout: 0.1
comet-mixup: 0
comet-mixup-reg: false
comet-pooler-ffn:
  - 2048
  - 1024
comet-prepend-zero: false
# Dropout rates: the rnn/src/trg and attention rates are 0; the transformer
# and transformer FFN rates are 0.1.
dropout-rnn: 0
dropout-src: 0
dropout-trg: 0
transformer-dropout: 0.1
transformer-dropout-attention: 0
transformer-dropout-ffn: 0.1
# Training objective and checkpoint overwrite policy.
cost-type: ce-sum
multi-loss-type: sum
unlikelihood-loss: false
overwrite: false
overwrite-checkpoint: true
no-reload: false

# Training data. The single train set is `stdin`, with tsv enabled and
# tsv-fields 2 below.
train-sets:
  - stdin
# Both vocabulary entries name the same file.
vocabs:
  - vocab
  - vocab
# Empty list, written inline like transformer-tied-layers.
sentencepiece-alphas: []
sentencepiece-options: ""
sentencepiece-max-lines: 2000000
no-spm-encode: false

# Stopping criteria and reporting/saving frequencies.
# NOTE(review): the suffixes (e, Mt, Gt) presumably denote epochs and
# mega/giga target tokens - confirm against the consumer's documentation.
after-epochs: 0
after-batches: 0
after: 40e
disp-freq: 100Mt
disp-first: 10
disp-label-counts: true
save-freq: 1Gt
logical-epoch:
  - 1Gt

# Sentence length limits and corpus handling.
max-length: 256
max-length-crop: false
tsv: true
tsv-fields: 2
shuffle: batches
no-restore-corpus: true
tempdir: /tmp
sqlite: ""
sqlite-drop: false

# Devices and synchronisation; two device ids are listed.
devices:
  - 0
  - 1
no-nccl: false
sharding: local
sync-freq: 200u
cpu-threads: 0

# Batch sizing.
mini-batch: 1000
mini-batch-words: 500000
mini-batch-fit: true
mini-batch-fit-step: 5
gradient-checkpointing: false
maxi-batch: 1000
maxi-batch-sort: trg
shuffle-in-ram: true
data-threads: 8
all-caps-every: 0
english-title-case-every: 0
mini-batch-words-ref: 0
mini-batch-warmup: 4000
mini-batch-track-lr: false
mini-batch-round-up: true
# Optimizer and learning-rate schedule.
optimizer: adam
# Floats in exponent notation are written with a decimal point in the
# mantissa (1.0e-08, not 1e-08) so that YAML 1.1 loaders such as PyYAML also
# resolve them as floats instead of strings. The numeric values are unchanged.
optimizer-params:
  - 0.9
  - 0.999
  - 1.0e-08
  - 0.01
optimizer-delay: 1
sync-sgd: true
learn-rate: 0.0005
lr-report: true
# lr-decay is 0.
# NOTE(review): presumably this makes the lr-decay-strategy/start/freq keys
# below inert - confirm.
lr-decay: 0
lr-decay-strategy: epoch+stalled
lr-decay-start:
  - 10
  - 1
lr-decay-freq: 50000
lr-decay-reset-optimizer: false
lr-decay-repeat-warmup: false
# The inverse-sqrt decay and the warmup use the same value (4000).
lr-decay-inv-sqrt:
  - 4000
lr-warmup: 4000
lr-warmup-start-rate: 0
lr-warmup-cycle: false
lr-warmup-at-reload: false
label-smoothing: 0.1
factor-weight: 1
clip-norm: 0
# Same value as before (0.001), spelled so every YAML loader reads a float.
exponential-smoothing: 1.0e-03
exponential-smoothing-replace-freq: 0
# Guided alignment and data weighting; alignment is `none` with weight 0.
guided-alignment: none
guided-alignment-cost: ce
guided-alignment-weight: 0
data-weighting: ""
data-weighting-type: sentence

# Pre-trained embedding options; no embedding vectors are listed.
embedding-vectors: []
embedding-normalization: false
embedding-fix-src: false
embedding-fix-trg: false

# Numeric precision and gradient handling.
precision:
  - float32
  - float32
# NOTE(review): values such as `256.f` are not valid YAML floats and load as
# strings; presumably the consumer parses them itself - left as-is, confirm.
cost-scaling:
  - 256.f
  - 10000
  - 1.f
  - 256.f
throw-on-divergence: []
custom-fallbacks: []
gradient-norm-average-window: 100
dynamic-gradient-scaling:
  - 2
  - log
check-gradient-nan: false
normalize-gradient: false
train-embedder-rank: []

# Quantization settings; quantize-bits is 0.
quantize-bits: 0
quantize-optimization-steps: 0
quantize-log-based: false
quantize-biases: false

# ULR settings; `ulr` is false.
ulr: false
ulr-query-vectors: ""
ulr-keys-vectors: ""
ulr-trainable-transformation: false
ulr-dim-emb: 0
ulr-dropout: 0
ulr-softmax-temperature: 1
# Validation: data set, frequency, metrics and early stopping.
valid-sets:
  - dev.en-de
valid-freq: 1Gt
# Four metrics are listed; early-stopping-on is `first`, and perplexity is the
# first entry.
valid-metrics:
  - perplexity
  - ce-mean-words
  - bleu
  - chrf
valid-reset-stalled: false
valid-reset-all: false
early-stopping: 40
early-stopping-epsilon:
  - 0
early-stopping-on: first

# Decoding settings.
# NOTE(review): presumably used when translating the validation set - confirm.
beam-size: 4
normalize: 1.0
max-length-factor: 3
word-penalty: 0.0
allow-unk: false
n-best: false
word-scores: false
valid-mini-batch: 32
valid-max-length: 1000

# External validation script (none configured) and validation outputs.
valid-script-path: ""
# Empty list, written inline like transformer-tied-layers.
valid-script-args: []
valid-translation-output: valid.trg.output
keep-best: true
valid-log: valid.log