rewicks committed on
Commit
a8875c7
1 Parent(s): 8a5255e

Upload model.npz.yml with huggingface_hub

Browse files
Files changed (1) hide show
  1. model.npz.yml +244 -0
model.npz.yml ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ authors: false
2
+ cite: false
3
+ build-info: ""
4
+ workspace: -8000
5
+ log: train.log
6
+ log-level: info
7
+ log-time-zone: PST8PDT
8
+ quiet: false
9
+ quiet-translation: true
10
+ seed: 141414
11
+ check-nan: false
12
+ interpolate-env-vars: true
13
+ relative-paths: false
14
+ dump-config: ""
15
+ sigterm: save-and-exit
16
+ model: model_files/model.npz
17
+ pretrained-model: ""
18
+ ignore-model-config: false
19
+ type: lm-transformer
20
+ dim-vocabs:
21
+ - 16000
22
+ dim-emb: 1024
23
+ factors-dim-emb: 0
24
+ factors-combine: sum
25
+ lemma-dependency: ""
26
+ lemma-dim-emb: 0
27
+ dim-rnn: 1024
28
+ enc-type: bidirectional
29
+ enc-cell: gru
30
+ enc-cell-depth: 1
31
+ enc-depth: 1
32
+ dec-cell: gru
33
+ dec-cell-base-depth: 2
34
+ dec-cell-high-depth: 1
35
+ dec-depth: 12
36
+ skip: false
37
+ layer-normalization: false
38
+ right-left: false
39
+ input-types:
40
+ []
41
+ tied-embeddings: true
42
+ tied-embeddings-src: false
43
+ tied-embeddings-all: true
44
+ output-omit-bias: true
45
+ transformer-heads: 8
46
+ transformer-no-projection: false
47
+ transformer-rnn-projection: false
48
+ transformer-pool: false
49
+ transformer-dim-ffn: 8192
50
+ transformer-decoder-dim-ffn: 8192
51
+ transformer-ffn-depth: 2
52
+ transformer-decoder-ffn-depth: 0
53
+ transformer-ffn-activation: relu
54
+ transformer-dim-aan: 2048
55
+ transformer-aan-depth: 2
56
+ transformer-aan-activation: swish
57
+ transformer-aan-nogate: false
58
+ transformer-decoder-autoreg: self-attention
59
+ transformer-tied-layers: []
60
+ transformer-guided-alignment-layer: last
61
+ transformer-preprocess: ""
62
+ transformer-postprocess-emb: d
63
+ transformer-postprocess: dan
64
+ transformer-postprocess-top: ""
65
+ transformer-train-position-embeddings: false
66
+ transformer-depth-scaling: true
67
+ transformer-no-bias: false
68
+ transformer-no-affine: false
69
+ bert-mask-symbol: "[MASK]"
70
+ bert-sep-symbol: "[SEP]"
71
+ bert-class-symbol: "[CLS]"
72
+ bert-masking-fraction: 0.15
73
+ bert-train-type-embeddings: true
74
+ bert-type-vocab-size: 2
75
+ comet-final-sigmoid: false
76
+ comet-mix: false
77
+ comet-mix-norm: false
78
+ comet-dropout: 0.1
79
+ comet-mixup: 0
80
+ comet-mixup-reg: false
81
+ comet-pooler-ffn:
82
+ - 2048
83
+ - 1024
84
+ comet-prepend-zero: false
85
+ dropout-rnn: 0
86
+ dropout-src: 0
87
+ dropout-trg: 0
88
+ transformer-dropout: 0.1
89
+ transformer-dropout-attention: 0
90
+ transformer-dropout-ffn: 0.1
91
+ cost-type: ce-sum
92
+ multi-loss-type: sum
93
+ unlikelihood-loss: false
94
+ overwrite: false
95
+ overwrite-checkpoint: true
96
+ no-reload: false
97
+ train-sets:
98
+ - stdin
99
+ vocabs:
100
+ - vocab
101
+ sentencepiece-alphas:
102
+ []
103
+ sentencepiece-options: ""
104
+ sentencepiece-max-lines: 2000000
105
+ no-spm-encode: false
106
+ after-epochs: 0
107
+ after-batches: 0
108
+ after: 40e
109
+ disp-freq: 100Mt
110
+ disp-first: 10
111
+ disp-label-counts: true
112
+ save-freq: 1Gt
113
+ logical-epoch:
114
+ - 1Gt
115
+ max-length: 256
116
+ max-length-crop: false
117
+ tsv: true
118
+ tsv-fields: 1
119
+ shuffle: batches
120
+ no-restore-corpus: true
121
+ tempdir: /tmp
122
+ sqlite: ""
123
+ sqlite-drop: false
124
+ devices:
125
+ - 0
126
+ - 1
127
+ no-nccl: false
128
+ sharding: local
129
+ sync-freq: 200u
130
+ cpu-threads: 0
131
+ mini-batch: 1000
132
+ mini-batch-words: 500000
133
+ mini-batch-fit: true
134
+ mini-batch-fit-step: 5
135
+ gradient-checkpointing: false
136
+ maxi-batch: 1000
137
+ maxi-batch-sort: trg
138
+ shuffle-in-ram: true
139
+ data-threads: 8
140
+ all-caps-every: 0
141
+ english-title-case-every: 0
142
+ mini-batch-words-ref: 0
143
+ mini-batch-warmup: 4000
144
+ mini-batch-track-lr: false
145
+ mini-batch-round-up: true
146
+ optimizer: adam
147
+ optimizer-params:
148
+ - 0.9
149
+ - 0.999
150
+ - 1e-08
151
+ - 0.01
152
+ optimizer-delay: 1
153
+ sync-sgd: true
154
+ learn-rate: 0.0005
155
+ lr-report: true
156
+ lr-decay: 0
157
+ lr-decay-strategy: epoch+stalled
158
+ lr-decay-start:
159
+ - 10
160
+ - 1
161
+ lr-decay-freq: 50000
162
+ lr-decay-reset-optimizer: false
163
+ lr-decay-repeat-warmup: false
164
+ lr-decay-inv-sqrt:
165
+ - 4000
166
+ lr-warmup: 4000
167
+ lr-warmup-start-rate: 0
168
+ lr-warmup-cycle: false
169
+ lr-warmup-at-reload: false
170
+ label-smoothing: 0.1
171
+ factor-weight: 1
172
+ clip-norm: 0
173
+ exponential-smoothing: 1e-3
174
+ exponential-smoothing-replace-freq: 0
175
+ guided-alignment: none
176
+ guided-alignment-cost: ce
177
+ guided-alignment-weight: 0
178
+ data-weighting: ""
179
+ data-weighting-type: sentence
180
+ embedding-vectors:
181
+ []
182
+ embedding-normalization: false
183
+ embedding-fix-src: false
184
+ embedding-fix-trg: false
185
+ precision:
186
+ - float32
187
+ - float32
188
+ cost-scaling:
189
+ - 256.f
190
+ - 10000
191
+ - 1.f
192
+ - 256.f
193
+ throw-on-divergence:
194
+ []
195
+ custom-fallbacks:
196
+ []
197
+ gradient-norm-average-window: 100
198
+ dynamic-gradient-scaling:
199
+ - 2
200
+ - log
201
+ check-gradient-nan: false
202
+ normalize-gradient: false
203
+ train-embedder-rank:
204
+ []
205
+ quantize-bits: 0
206
+ quantize-optimization-steps: 0
207
+ quantize-log-based: false
208
+ quantize-biases: false
209
+ ulr: false
210
+ ulr-query-vectors: ""
211
+ ulr-keys-vectors: ""
212
+ ulr-trainable-transformation: false
213
+ ulr-dim-emb: 0
214
+ ulr-dropout: 0
215
+ ulr-softmax-temperature: 1
216
+ valid-sets:
217
+ - dev.de
218
+ valid-freq: 1Gt
219
+ valid-metrics:
220
+ - perplexity
221
+ - ce-mean-words
222
+ - bleu
223
+ - chrf
224
+ valid-reset-stalled: false
225
+ valid-reset-all: false
226
+ early-stopping: 40
227
+ early-stopping-epsilon:
228
+ - 0
229
+ early-stopping-on: first
230
+ beam-size: 4
231
+ normalize: 1.0
232
+ max-length-factor: 3
233
+ word-penalty: 0.0
234
+ allow-unk: false
235
+ n-best: false
236
+ word-scores: false
237
+ valid-mini-batch: 32
238
+ valid-max-length: 1000
239
+ valid-script-path: ""
240
+ valid-script-args:
241
+ []
242
+ valid-translation-output: valid.trg.output
243
+ keep-best: true
244
+ valid-log: valid.log