Attila1011 commited on
Commit
75919ae
·
verified ·
1 Parent(s): d534866

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -73,3 +73,4 @@ checkpoints-v5.14-b/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=l
73
  checkpoints-v5.15/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
74
  checkpoints-v4.7/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
75
  checkpoints-v4.7/checkpoint-21504/eval_state.json filter=lfs diff=lfs merge=lfs -text
 
 
73
  checkpoints-v5.15/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
74
  checkpoints-v4.7/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
75
  checkpoints-v4.7/checkpoint-21504/eval_state.json filter=lfs diff=lfs merge=lfs -text
76
+ checkpoints-d1.0/checkpoint-10240/eval_state.json filter=lfs diff=lfs merge=lfs -text
checkpoints-d1.0/checkpoint-10240/ema.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3b9a5dc9dc3511476ab6222d6e0c2337cd6f41b1a4bf43d76ff882a5344ddf9
3
+ size 746712
checkpoints-d1.0/checkpoint-10240/eval_state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40749d102e3cefae47b7fec0b532da328049b8676ec9f0bcad23db44baaad36a
3
+ size 57273243
checkpoints-d1.0/checkpoint-10240/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcee6606e041187380cd9eb1089ad3c05577e5c9cadecead12a4aa1006bfdc59
3
+ size 24002016
checkpoints-d1.0/checkpoint-10240/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd42af311255ef416ee01cec5cce39743f02b8bdc0719ccfa035ec2e314c526f
3
+ size 1569995
checkpoints-d1.0/checkpoint-10240/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7eca8556f7ff9f000e8d9536da545feb9147b46a0d7fe4b514cc2dd1f481973a
3
+ size 14645
checkpoints-d1.0/checkpoint-10240/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ed087604ee25d5e61efeae79a040184b2282edbd9e4cd00e68e0b67564dec00
3
+ size 1383
checkpoints-d1.0/checkpoint-10240/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc2b020b76cf7752b91a7ef4cc51b906e83834b39b1307c5cbe62f6b9d4dc131
3
+ size 1465
checkpoints-d1.0/checkpoint-10240/trainer_state.json ADDED
@@ -0,0 +1,664 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.44131273299286744,
6
+ "eval_steps": 1024,
7
+ "global_step": 10240,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.011032818324821687,
14
+ "grad_norm": 0.20388168096542358,
15
+ "learning_rate": 0.000498046875,
16
+ "loss": 2.512310028076172,
17
+ "step": 256
18
+ },
19
+ {
20
+ "epoch": 0.022065636649643373,
21
+ "grad_norm": 0.3460715115070343,
22
+ "learning_rate": 0.000998046875,
23
+ "loss": 2.018148422241211,
24
+ "step": 512
25
+ },
26
+ {
27
+ "epoch": 0.03309845497446506,
28
+ "grad_norm": 0.5453425645828247,
29
+ "learning_rate": 0.000999688448778502,
30
+ "loss": 1.8114819526672363,
31
+ "step": 768
32
+ },
33
+ {
34
+ "epoch": 0.04413127329928675,
35
+ "grad_norm": 0.9051710367202759,
36
+ "learning_rate": 0.0009987492950653055,
37
+ "loss": 1.75458824634552,
38
+ "step": 1024
39
+ },
40
+ {
41
+ "epoch": 0.04413127329928675,
42
+ "eval_bleu": 0.9399749239907662,
43
+ "eval_cos_loss": 0.47715059564566054,
44
+ "eval_dec_loss": 0.10647435713885055,
45
+ "eval_loss": 1.73655072051579,
46
+ "eval_mse2_loss": 0.1675516524389863,
47
+ "eval_mse_loss": 1.3505616757407117,
48
+ "eval_rec_loss": 0.047009017791098624,
49
+ "eval_var_loss": 0.01723895594080501,
50
+ "flow/cos_sim": 0.522849405561683,
51
+ "flow/improvement_ratio": 0.8910323953323527,
52
+ "flow/mag_ratio_mean": 0.5448383128465112,
53
+ "flow/mag_ratio_std": 0.23550461588510826,
54
+ "step": 1024
55
+ },
56
+ {
57
+ "epoch": 0.04413127329928675,
58
+ "eval_bleu": 0.9399749239907662,
59
+ "eval_cos_loss": 0.47715059564566054,
60
+ "eval_dec_loss": 0.10647435713885055,
61
+ "eval_loss": 1.73655072051579,
62
+ "eval_mse2_loss": 0.1675516524389863,
63
+ "eval_mse_loss": 1.3505616757407117,
64
+ "eval_rec_loss": 0.047009017791098624,
65
+ "eval_runtime": 152.3582,
66
+ "eval_samples_per_second": 196.904,
67
+ "eval_steps_per_second": 3.078,
68
+ "eval_var_loss": 0.01723895594080501,
69
+ "flow/cos_sim": 0.522849405561683,
70
+ "flow/improvement_ratio": 0.8910323953323527,
71
+ "flow/mag_ratio_mean": 0.5448383128465112,
72
+ "flow/mag_ratio_std": 0.23550461588510826,
73
+ "step": 1024
74
+ },
75
+ {
76
+ "epoch": 0.05516409162410843,
77
+ "grad_norm": 0.5948837399482727,
78
+ "learning_rate": 0.0009971837136430763,
79
+ "loss": 1.732498049736023,
80
+ "step": 1280
81
+ },
82
+ {
83
+ "epoch": 0.06619690994893011,
84
+ "grad_norm": 0.6182620525360107,
85
+ "learning_rate": 0.0009949936708776692,
86
+ "loss": 1.7030788660049438,
87
+ "step": 1536
88
+ },
89
+ {
90
+ "epoch": 0.07722972827375181,
91
+ "grad_norm": 1.142866611480713,
92
+ "learning_rate": 0.0009921819174566252,
93
+ "loss": 1.7001720666885376,
94
+ "step": 1792
95
+ },
96
+ {
97
+ "epoch": 0.0882625465985735,
98
+ "grad_norm": 0.8506317734718323,
99
+ "learning_rate": 0.000988751984934317,
100
+ "loss": 1.6855676174163818,
101
+ "step": 2048
102
+ },
103
+ {
104
+ "epoch": 0.0882625465985735,
105
+ "eval_bleu": 0.9434877818736953,
106
+ "eval_cos_loss": 0.46131489716613217,
107
+ "eval_dec_loss": 0.09124524126659388,
108
+ "eval_loss": 1.6677722422553023,
109
+ "eval_mse2_loss": 0.15732173794812992,
110
+ "eval_mse_loss": 1.3088257922800874,
111
+ "eval_rec_loss": 0.047009017791098624,
112
+ "eval_var_loss": 0.01723895594080501,
113
+ "flow/cos_sim": 0.5386851041683002,
114
+ "flow/improvement_ratio": 0.8871048878250855,
115
+ "flow/mag_ratio_mean": 0.5622577369848548,
116
+ "flow/mag_ratio_std": 0.24945266208033573,
117
+ "step": 2048
118
+ },
119
+ {
120
+ "epoch": 0.0882625465985735,
121
+ "eval_bleu": 0.9434877818736953,
122
+ "eval_cos_loss": 0.46131489716613217,
123
+ "eval_dec_loss": 0.09124524126659388,
124
+ "eval_loss": 1.6677722422553023,
125
+ "eval_mse2_loss": 0.15732173794812992,
126
+ "eval_mse_loss": 1.3088257922800874,
127
+ "eval_rec_loss": 0.047009017791098624,
128
+ "eval_runtime": 150.2408,
129
+ "eval_samples_per_second": 199.68,
130
+ "eval_steps_per_second": 3.122,
131
+ "eval_var_loss": 0.01723895594080501,
132
+ "flow/cos_sim": 0.5386851041683002,
133
+ "flow/improvement_ratio": 0.8871048878250855,
134
+ "flow/mag_ratio_mean": 0.5622577369848548,
135
+ "flow/mag_ratio_std": 0.24945266208033573,
136
+ "step": 2048
137
+ },
138
+ {
139
+ "epoch": 0.09929536492339518,
140
+ "grad_norm": 0.7440093159675598,
141
+ "learning_rate": 0.0009847081812963268,
142
+ "loss": 1.6802997589111328,
143
+ "step": 2304
144
+ },
145
+ {
146
+ "epoch": 0.11032818324821686,
147
+ "grad_norm": 0.9319222569465637,
148
+ "learning_rate": 0.0009800555855486275,
149
+ "loss": 1.6744197607040405,
150
+ "step": 2560
151
+ },
152
+ {
153
+ "epoch": 0.12136100157303854,
154
+ "grad_norm": 0.8629500865936279,
155
+ "learning_rate": 0.0009748000413383664,
156
+ "loss": 1.6740639209747314,
157
+ "step": 2816
158
+ },
159
+ {
160
+ "epoch": 0.13239381989786023,
161
+ "grad_norm": 0.9893732666969299,
162
+ "learning_rate": 0.0009689481496142604,
163
+ "loss": 1.664785623550415,
164
+ "step": 3072
165
+ },
166
+ {
167
+ "epoch": 0.13239381989786023,
168
+ "eval_bleu": 0.9404068449586629,
169
+ "eval_cos_loss": 0.4534218231243874,
170
+ "eval_dec_loss": 0.10443712005824614,
171
+ "eval_loss": 1.6572931651621738,
172
+ "eval_mse2_loss": 0.1550013455850229,
173
+ "eval_mse_loss": 1.288264540212749,
174
+ "eval_rec_loss": 0.047009017791098624,
175
+ "eval_var_loss": 0.01723895594080501,
176
+ "flow/cos_sim": 0.5465781726816824,
177
+ "flow/improvement_ratio": 0.8946911343125138,
178
+ "flow/mag_ratio_mean": 0.5628405101517878,
179
+ "flow/mag_ratio_std": 0.24253392000315285,
180
+ "step": 3072
181
+ },
182
+ {
183
+ "epoch": 0.13239381989786023,
184
+ "eval_bleu": 0.9404068449586629,
185
+ "eval_cos_loss": 0.4534218231243874,
186
+ "eval_dec_loss": 0.10443712005824614,
187
+ "eval_loss": 1.6572931651621738,
188
+ "eval_mse2_loss": 0.1550013455850229,
189
+ "eval_mse_loss": 1.288264540212749,
190
+ "eval_rec_loss": 0.047009017791098624,
191
+ "eval_runtime": 153.5602,
192
+ "eval_samples_per_second": 195.363,
193
+ "eval_steps_per_second": 3.054,
194
+ "eval_var_loss": 0.01723895594080501,
195
+ "flow/cos_sim": 0.5465781726816824,
196
+ "flow/improvement_ratio": 0.8946911343125138,
197
+ "flow/mag_ratio_mean": 0.5628405101517878,
198
+ "flow/mag_ratio_std": 0.24253392000315285,
199
+ "step": 3072
200
+ },
201
+ {
202
+ "epoch": 0.14342663822268192,
203
+ "grad_norm": 0.9933224320411682,
204
+ "learning_rate": 0.0009625072603358231,
205
+ "loss": 1.6605451107025146,
206
+ "step": 3328
207
+ },
208
+ {
209
+ "epoch": 0.15445945654750362,
210
+ "grad_norm": 1.221793532371521,
211
+ "learning_rate": 0.0009554854632418371,
212
+ "loss": 1.6490036249160767,
213
+ "step": 3584
214
+ },
215
+ {
216
+ "epoch": 0.1654922748723253,
217
+ "grad_norm": 0.8394345045089722,
218
+ "learning_rate": 0.000947891577689663,
219
+ "loss": 1.649448275566101,
220
+ "step": 3840
221
+ },
222
+ {
223
+ "epoch": 0.176525093197147,
224
+ "grad_norm": 1.245514154434204,
225
+ "learning_rate": 0.0009397351415781539,
226
+ "loss": 1.6489267349243164,
227
+ "step": 4096
228
+ },
229
+ {
230
+ "epoch": 0.176525093197147,
231
+ "eval_bleu": 0.942338221101898,
232
+ "eval_cos_loss": 0.450266804585833,
233
+ "eval_dec_loss": 0.1002394597608048,
234
+ "eval_loss": 1.6427197324187517,
235
+ "eval_mse2_loss": 0.15268841918025697,
236
+ "eval_mse_loss": 1.280517189741643,
237
+ "eval_rec_loss": 0.047009017791098624,
238
+ "eval_var_loss": 0.01723895594080501,
239
+ "flow/cos_sim": 0.5497331933172018,
240
+ "flow/improvement_ratio": 0.8926444684010325,
241
+ "flow/mag_ratio_mean": 0.5659083429175907,
242
+ "flow/mag_ratio_std": 0.24314757854318314,
243
+ "step": 4096
244
+ },
245
+ {
246
+ "epoch": 0.176525093197147,
247
+ "eval_bleu": 0.942338221101898,
248
+ "eval_cos_loss": 0.450266804585833,
249
+ "eval_dec_loss": 0.1002394597608048,
250
+ "eval_loss": 1.6427197324187517,
251
+ "eval_mse2_loss": 0.15268841918025697,
252
+ "eval_mse_loss": 1.280517189741643,
253
+ "eval_rec_loss": 0.047009017791098624,
254
+ "eval_runtime": 152.0027,
255
+ "eval_samples_per_second": 197.365,
256
+ "eval_steps_per_second": 3.085,
257
+ "eval_var_loss": 0.01723895594080501,
258
+ "flow/cos_sim": 0.5497331933172018,
259
+ "flow/improvement_ratio": 0.8926444684010325,
260
+ "flow/mag_ratio_mean": 0.5659083429175907,
261
+ "flow/mag_ratio_std": 0.24314757854318314,
262
+ "step": 4096
263
+ },
264
+ {
265
+ "epoch": 0.18755791152196866,
266
+ "grad_norm": 1.0416312217712402,
267
+ "learning_rate": 0.000931026399368079,
268
+ "loss": 1.6447768211364746,
269
+ "step": 4352
270
+ },
271
+ {
272
+ "epoch": 0.19859072984679035,
273
+ "grad_norm": 1.1173036098480225,
274
+ "learning_rate": 0.0009217762892151117,
275
+ "loss": 1.6489276885986328,
276
+ "step": 4608
277
+ },
278
+ {
279
+ "epoch": 0.20962354817161205,
280
+ "grad_norm": 0.930402934551239,
281
+ "learning_rate": 0.0009119964292315354,
282
+ "loss": 1.6420283317565918,
283
+ "step": 4864
284
+ },
285
+ {
286
+ "epoch": 0.22065636649643372,
287
+ "grad_norm": 0.9209682941436768,
288
+ "learning_rate": 0.0009016991028939279,
289
+ "loss": 1.6357425451278687,
290
+ "step": 5120
291
+ },
292
+ {
293
+ "epoch": 0.22065636649643372,
294
+ "eval_bleu": 0.9431814239032791,
295
+ "eval_cos_loss": 0.4463651056991203,
296
+ "eval_dec_loss": 0.09822412853889755,
297
+ "eval_loss": 1.628653102846288,
298
+ "eval_mse2_loss": 0.14985301353529826,
299
+ "eval_mse_loss": 1.2716914730539708,
300
+ "eval_rec_loss": 0.047009017791098624,
301
+ "eval_var_loss": 0.01723895594080501,
302
+ "flow/cos_sim": 0.5536348958259453,
303
+ "flow/improvement_ratio": 0.8939987117293547,
304
+ "flow/mag_ratio_mean": 0.576050937048662,
305
+ "flow/mag_ratio_std": 0.2491114061397276,
306
+ "step": 5120
307
+ },
308
+ {
309
+ "epoch": 0.22065636649643372,
310
+ "eval_bleu": 0.9431814239032791,
311
+ "eval_cos_loss": 0.4463651056991203,
312
+ "eval_dec_loss": 0.09822412853889755,
313
+ "eval_loss": 1.628653102846288,
314
+ "eval_mse2_loss": 0.14985301353529826,
315
+ "eval_mse_loss": 1.2716914730539708,
316
+ "eval_rec_loss": 0.047009017791098624,
317
+ "eval_runtime": 150.4792,
318
+ "eval_samples_per_second": 199.363,
319
+ "eval_steps_per_second": 3.117,
320
+ "eval_var_loss": 0.01723895594080501,
321
+ "flow/cos_sim": 0.5536348958259453,
322
+ "flow/improvement_ratio": 0.8939987117293547,
323
+ "flow/mag_ratio_mean": 0.576050937048662,
324
+ "flow/mag_ratio_std": 0.2491114061397276,
325
+ "step": 5120
326
+ },
327
+ {
328
+ "epoch": 0.23168918482125542,
329
+ "grad_norm": 0.6372450590133667,
330
+ "learning_rate": 0.0008908972436151494,
331
+ "loss": 1.6375595331192017,
332
+ "step": 5376
333
+ },
334
+ {
335
+ "epoch": 0.2427220031460771,
336
+ "grad_norm": 0.833997368812561,
337
+ "learning_rate": 0.0008796044185000127,
338
+ "loss": 1.6372840404510498,
339
+ "step": 5632
340
+ },
341
+ {
342
+ "epoch": 0.2537548214708988,
343
+ "grad_norm": 0.6318811178207397,
344
+ "learning_rate": 0.0008678348113050368,
345
+ "loss": 1.628332257270813,
346
+ "step": 5888
347
+ },
348
+ {
349
+ "epoch": 0.26478763979572045,
350
+ "grad_norm": 0.7464238405227661,
351
+ "learning_rate": 0.0008556032046236897,
352
+ "loss": 1.6342945098876953,
353
+ "step": 6144
354
+ },
355
+ {
356
+ "epoch": 0.26478763979572045,
357
+ "eval_bleu": 0.941904927802284,
358
+ "eval_cos_loss": 0.44474837520737637,
359
+ "eval_dec_loss": 0.102102437767504,
360
+ "eval_loss": 1.628202569764306,
361
+ "eval_mse2_loss": 0.15077992649411343,
362
+ "eval_mse_loss": 1.266597391954109,
363
+ "eval_rec_loss": 0.047009017791098624,
364
+ "eval_var_loss": 0.01723895594080501,
365
+ "flow/cos_sim": 0.5552516249197124,
366
+ "flow/improvement_ratio": 0.8912770662988935,
367
+ "flow/mag_ratio_mean": 0.5855364076364269,
368
+ "flow/mag_ratio_std": 0.25087345015011364,
369
+ "step": 6144
370
+ },
371
+ {
372
+ "epoch": 0.26478763979572045,
373
+ "eval_bleu": 0.941904927802284,
374
+ "eval_cos_loss": 0.44474837520737637,
375
+ "eval_dec_loss": 0.102102437767504,
376
+ "eval_loss": 1.628202569764306,
377
+ "eval_mse2_loss": 0.15077992649411343,
378
+ "eval_mse_loss": 1.266597391954109,
379
+ "eval_rec_loss": 0.047009017791098624,
380
+ "eval_runtime": 151.0104,
381
+ "eval_samples_per_second": 198.662,
382
+ "eval_steps_per_second": 3.106,
383
+ "eval_var_loss": 0.01723895594080501,
384
+ "flow/cos_sim": 0.5552516249197124,
385
+ "flow/improvement_ratio": 0.8912770662988935,
386
+ "flow/mag_ratio_mean": 0.5855364076364269,
387
+ "flow/mag_ratio_std": 0.25087345015011364,
388
+ "step": 6144
389
+ },
390
+ {
391
+ "epoch": 0.2758204581205422,
392
+ "grad_norm": 1.1199694871902466,
393
+ "learning_rate": 0.000842924961319492,
394
+ "loss": 1.6266489028930664,
395
+ "step": 6400
396
+ },
397
+ {
398
+ "epoch": 0.28685327644536385,
399
+ "grad_norm": 0.8668828010559082,
400
+ "learning_rate": 0.0008298160052303045,
401
+ "loss": 1.62454092502594,
402
+ "step": 6656
403
+ },
404
+ {
405
+ "epoch": 0.2978860947701855,
406
+ "grad_norm": 0.8108460307121277,
407
+ "learning_rate": 0.0008162928011680314,
408
+ "loss": 1.624453067779541,
409
+ "step": 6912
410
+ },
411
+ {
412
+ "epoch": 0.30891891309500724,
413
+ "grad_norm": 0.8465085625648499,
414
+ "learning_rate": 0.000802372334238864,
415
+ "loss": 1.6209194660186768,
416
+ "step": 7168
417
+ },
418
+ {
419
+ "epoch": 0.30891891309500724,
420
+ "eval_bleu": 0.942268661485615,
421
+ "eval_cos_loss": 0.4415781778186115,
422
+ "eval_dec_loss": 0.09969639404813872,
423
+ "eval_loss": 1.61412800298825,
424
+ "eval_mse2_loss": 0.14795255090699774,
425
+ "eval_mse_loss": 1.2580732640935415,
426
+ "eval_rec_loss": 0.047009017791098624,
427
+ "eval_var_loss": 0.01723895594080501,
428
+ "flow/cos_sim": 0.5584218213553114,
429
+ "flow/improvement_ratio": 0.8916484548338949,
430
+ "flow/mag_ratio_mean": 0.5767976182864419,
431
+ "flow/mag_ratio_std": 0.2534670445011623,
432
+ "step": 7168
433
+ },
434
+ {
435
+ "epoch": 0.30891891309500724,
436
+ "eval_bleu": 0.942268661485615,
437
+ "eval_cos_loss": 0.4415781778186115,
438
+ "eval_dec_loss": 0.09969639404813872,
439
+ "eval_loss": 1.61412800298825,
440
+ "eval_mse2_loss": 0.14795255090699774,
441
+ "eval_mse_loss": 1.2580732640935415,
442
+ "eval_rec_loss": 0.047009017791098624,
443
+ "eval_runtime": 150.313,
444
+ "eval_samples_per_second": 199.584,
445
+ "eval_steps_per_second": 3.12,
446
+ "eval_var_loss": 0.01723895594080501,
447
+ "flow/cos_sim": 0.5584218213553114,
448
+ "flow/improvement_ratio": 0.8916484548338949,
449
+ "flow/mag_ratio_mean": 0.5767976182864419,
450
+ "flow/mag_ratio_std": 0.2534670445011623,
451
+ "step": 7168
452
+ },
453
+ {
454
+ "epoch": 0.3199517314198289,
455
+ "grad_norm": 1.8415089845657349,
456
+ "learning_rate": 0.0007880720885100349,
457
+ "loss": 1.6192532777786255,
458
+ "step": 7424
459
+ },
460
+ {
461
+ "epoch": 0.3309845497446506,
462
+ "grad_norm": 0.7575666904449463,
463
+ "learning_rate": 0.0007734100250498788,
464
+ "loss": 1.6192028522491455,
465
+ "step": 7680
466
+ },
467
+ {
468
+ "epoch": 0.3420173680694723,
469
+ "grad_norm": 1.108810544013977,
470
+ "learning_rate": 0.000758404559368781,
471
+ "loss": 1.614426851272583,
472
+ "step": 7936
473
+ },
474
+ {
475
+ "epoch": 0.353050186394294,
476
+ "grad_norm": 1.224976897239685,
477
+ "learning_rate": 0.0007430745382893488,
478
+ "loss": 1.612691879272461,
479
+ "step": 8192
480
+ },
481
+ {
482
+ "epoch": 0.353050186394294,
483
+ "eval_bleu": 0.9417854724917855,
484
+ "eval_cos_loss": 0.4400326657905253,
485
+ "eval_dec_loss": 0.1005224303852743,
486
+ "eval_loss": 1.6120708153954446,
487
+ "eval_mse2_loss": 0.14864133708258429,
488
+ "eval_mse_loss": 1.254655804715431,
489
+ "eval_rec_loss": 0.047009017791098624,
490
+ "eval_var_loss": 0.01723895594080501,
491
+ "flow/cos_sim": 0.5599673331927643,
492
+ "flow/improvement_ratio": 0.8927549306771903,
493
+ "flow/mag_ratio_mean": 0.5825493686488951,
494
+ "flow/mag_ratio_std": 0.25123427366651196,
495
+ "step": 8192
496
+ },
497
+ {
498
+ "epoch": 0.353050186394294,
499
+ "eval_bleu": 0.9417854724917855,
500
+ "eval_cos_loss": 0.4400326657905253,
501
+ "eval_dec_loss": 0.1005224303852743,
502
+ "eval_loss": 1.6120708153954446,
503
+ "eval_mse2_loss": 0.14864133708258429,
504
+ "eval_mse_loss": 1.254655804715431,
505
+ "eval_rec_loss": 0.047009017791098624,
506
+ "eval_runtime": 153.0586,
507
+ "eval_samples_per_second": 196.003,
508
+ "eval_steps_per_second": 3.064,
509
+ "eval_var_loss": 0.01723895594080501,
510
+ "flow/cos_sim": 0.5599673331927643,
511
+ "flow/improvement_ratio": 0.8927549306771903,
512
+ "flow/mag_ratio_mean": 0.5825493686488951,
513
+ "flow/mag_ratio_std": 0.25123427366651196,
514
+ "step": 8192
515
+ },
516
+ {
517
+ "epoch": 0.36408300471911564,
518
+ "grad_norm": 1.2766138315200806,
519
+ "learning_rate": 0.0007274392162748551,
520
+ "loss": 1.6162679195404053,
521
+ "step": 8448
522
+ },
523
+ {
524
+ "epoch": 0.3751158230439373,
525
+ "grad_norm": 0.862872302532196,
526
+ "learning_rate": 0.000711518231245687,
527
+ "loss": 1.6088062524795532,
528
+ "step": 8704
529
+ },
530
+ {
531
+ "epoch": 0.38614864136875904,
532
+ "grad_norm": 0.7975575923919678,
533
+ "learning_rate": 0.0006953315799141723,
534
+ "loss": 1.6033779382705688,
535
+ "step": 8960
536
+ },
537
+ {
538
+ "epoch": 0.3971814596935807,
539
+ "grad_norm": 1.822509765625,
540
+ "learning_rate": 0.0006788995926687669,
541
+ "loss": 1.6062895059585571,
542
+ "step": 9216
543
+ },
544
+ {
545
+ "epoch": 0.3971814596935807,
546
+ "eval_bleu": 0.9410773493346376,
547
+ "eval_cos_loss": 0.4370425479498499,
548
+ "eval_dec_loss": 0.10427019556861188,
549
+ "eval_loss": 1.6075624590997757,
550
+ "eval_mse2_loss": 0.1481303899272927,
551
+ "eval_mse_loss": 1.247209641470838,
552
+ "eval_rec_loss": 0.047009017791098624,
553
+ "eval_var_loss": 0.01723895594080501,
554
+ "flow/cos_sim": 0.5629574498896406,
555
+ "flow/improvement_ratio": 0.8962287469459241,
556
+ "flow/mag_ratio_mean": 0.5783110863364327,
557
+ "flow/mag_ratio_std": 0.2480927841432059,
558
+ "step": 9216
559
+ },
560
+ {
561
+ "epoch": 0.3971814596935807,
562
+ "eval_bleu": 0.9410773493346376,
563
+ "eval_cos_loss": 0.4370425479498499,
564
+ "eval_dec_loss": 0.10427019556861188,
565
+ "eval_loss": 1.6075624590997757,
566
+ "eval_mse2_loss": 0.1481303899272927,
567
+ "eval_mse_loss": 1.247209641470838,
568
+ "eval_rec_loss": 0.047009017791098624,
569
+ "eval_runtime": 159.7751,
570
+ "eval_samples_per_second": 187.764,
571
+ "eval_steps_per_second": 2.935,
572
+ "eval_var_loss": 0.01723895594080501,
573
+ "flow/cos_sim": 0.5629574498896406,
574
+ "flow/improvement_ratio": 0.8962287469459241,
575
+ "flow/mag_ratio_mean": 0.5783110863364327,
576
+ "flow/mag_ratio_std": 0.2480927841432059,
577
+ "step": 9216
578
+ },
579
+ {
580
+ "epoch": 0.4082142780184024,
581
+ "grad_norm": 1.1795552968978882,
582
+ "learning_rate": 0.0006622429080391422,
583
+ "loss": 1.6098705530166626,
584
+ "step": 9472
585
+ },
586
+ {
587
+ "epoch": 0.4192470963432241,
588
+ "grad_norm": 0.8205899000167847,
589
+ "learning_rate": 0.0006453824467742515,
590
+ "loss": 1.6050623655319214,
591
+ "step": 9728
592
+ },
593
+ {
594
+ "epoch": 0.43027991466804577,
595
+ "grad_norm": 0.6470943093299866,
596
+ "learning_rate": 0.0006283393855659275,
597
+ "loss": 1.61065673828125,
598
+ "step": 9984
599
+ },
600
+ {
601
+ "epoch": 0.44131273299286744,
602
+ "grad_norm": 0.9093553423881531,
603
+ "learning_rate": 0.0006111351304510173,
604
+ "loss": 1.6007680892944336,
605
+ "step": 10240
606
+ },
607
+ {
608
+ "epoch": 0.44131273299286744,
609
+ "eval_bleu": 0.9417572640186486,
610
+ "eval_cos_loss": 0.4365569194242644,
611
+ "eval_dec_loss": 0.10090916226905927,
612
+ "eval_loss": 1.6032250524838088,
613
+ "eval_mse2_loss": 0.14803322787478027,
614
+ "eval_mse_loss": 1.2463789934288465,
615
+ "eval_rec_loss": 0.047009017791098624,
616
+ "eval_var_loss": 0.01723895594080501,
617
+ "flow/cos_sim": 0.5634430805757356,
618
+ "flow/improvement_ratio": 0.8962225704304954,
619
+ "flow/mag_ratio_mean": 0.5846807780042131,
620
+ "flow/mag_ratio_std": 0.25302369241267125,
621
+ "step": 10240
622
+ },
623
+ {
624
+ "epoch": 0.44131273299286744,
625
+ "eval_bleu": 0.9417572640186486,
626
+ "eval_cos_loss": 0.4365569194242644,
627
+ "eval_dec_loss": 0.10090916226905927,
628
+ "eval_loss": 1.6032250524838088,
629
+ "eval_mse2_loss": 0.14803322787478027,
630
+ "eval_mse_loss": 1.2463789934288465,
631
+ "eval_rec_loss": 0.047009017791098624,
632
+ "eval_runtime": 152.6772,
633
+ "eval_samples_per_second": 196.493,
634
+ "eval_steps_per_second": 3.072,
635
+ "eval_var_loss": 0.01723895594080501,
636
+ "flow/cos_sim": 0.5634430805757356,
637
+ "flow/improvement_ratio": 0.8962225704304954,
638
+ "flow/mag_ratio_mean": 0.5846807780042131,
639
+ "flow/mag_ratio_std": 0.25302369241267125,
640
+ "step": 10240
641
+ }
642
+ ],
643
+ "logging_steps": 256,
644
+ "max_steps": 23204,
645
+ "num_input_tokens_seen": 0,
646
+ "num_train_epochs": 1,
647
+ "save_steps": 1024,
648
+ "stateful_callbacks": {
649
+ "TrainerControl": {
650
+ "args": {
651
+ "should_epoch_stop": false,
652
+ "should_evaluate": false,
653
+ "should_log": false,
654
+ "should_save": true,
655
+ "should_training_stop": false
656
+ },
657
+ "attributes": {}
658
+ }
659
+ },
660
+ "total_flos": 0.0,
661
+ "train_batch_size": 64,
662
+ "trial_name": null,
663
+ "trial_params": null
664
+ }
checkpoints-d1.0/checkpoint-10240/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61a16bb839f687414b8e48611327c4b9cfddeefe38c031ca70808f9a97c476b7
3
+ size 5137