Attila1011 commited on
Commit
24b1806
·
verified ·
1 Parent(s): 651c1b1

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -37,3 +37,4 @@ checkpoints-v5/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -t
37
  checkpoints-v5.1/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
38
  checkpoints-v5.2/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
39
  checkpoints-v5.3/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
 
 
37
  checkpoints-v5.1/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
38
  checkpoints-v5.2/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
39
  checkpoints-v5.3/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
40
+ checkpoints-v5.4/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
checkpoints-v5.4/checkpoint-12288/ema.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e150a75747789abb882bf148649092c42d97dd4c48ee2c3ba878de67cfece00
3
+ size 54599592
checkpoints-v5.4/checkpoint-12288/eval_state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7ff0bbcc2389184e4a41dac69d7d2bd1edf1f93c94226e4b6da2bec584f6a99
3
+ size 59276512
checkpoints-v5.4/checkpoint-12288/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68b8a9b6191fc8917f7179f63f12b88eef948e1d3d9a4dea33f7639d3c43e0d9
3
+ size 54599624
checkpoints-v5.4/checkpoint-12288/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb3c3757623ba5666b44c502aa2de9db04d737893bb7637ece37279f4fc7ba2b
3
+ size 76551435
checkpoints-v5.4/checkpoint-12288/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4089acccf675deef93a67b51b7910cde1de8218040fddc478779096a07fc1777
3
+ size 14645
checkpoints-v5.4/checkpoint-12288/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7f212b71e3880c4f7a41ce532777c9adc951337930ebe4c5060edb719687304
3
+ size 1383
checkpoints-v5.4/checkpoint-12288/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b460fd0709bd5e8feb60e761c3267494e2afe1bf2a6daad307d2660b2e698ce
3
+ size 1465
checkpoints-v5.4/checkpoint-12288/trainer_state.json ADDED
@@ -0,0 +1,490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.5295752795914409,
6
+ "eval_steps": 1024,
7
+ "global_step": 12288,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.04413127329928675,
14
+ "grad_norm": 6.4686360359191895,
15
+ "learning_rate": 9.990234375e-05,
16
+ "loss": 14.04021167755127,
17
+ "step": 1024
18
+ },
19
+ {
20
+ "epoch": 0.04413127329928675,
21
+ "eval_bleu": 0.08472661559124553,
22
+ "eval_ce_clean_loss": 3.6515917010398815,
23
+ "eval_ce_pred_loss": 6.157996057701517,
24
+ "eval_flow_cos_loss": 0.38175737285919026,
25
+ "eval_flow_mse_loss": 1.2173780876436213,
26
+ "eval_loss": 9.275006284083386,
27
+ "flow/cos_sim": 0.6182426629798499,
28
+ "flow/improvement_ratio": 0.994758785787676,
29
+ "flow/mag_ratio_mean": 0.6230432560194784,
30
+ "flow/mag_ratio_std": 0.0744406624294039,
31
+ "step": 1024
32
+ },
33
+ {
34
+ "epoch": 0.04413127329928675,
35
+ "eval_bleu": 0.08472661559124553,
36
+ "eval_ce_clean_loss": 3.6515917010398815,
37
+ "eval_ce_pred_loss": 6.157996057701517,
38
+ "eval_flow_cos_loss": 0.38175737285919026,
39
+ "eval_flow_mse_loss": 1.2173780876436213,
40
+ "eval_loss": 9.275006284083386,
41
+ "eval_runtime": 212.9025,
42
+ "eval_samples_per_second": 140.91,
43
+ "eval_steps_per_second": 2.203,
44
+ "flow/cos_sim": 0.6182426629798499,
45
+ "flow/improvement_ratio": 0.994758785787676,
46
+ "flow/mag_ratio_mean": 0.6230432560194784,
47
+ "flow/mag_ratio_std": 0.0744406624294039,
48
+ "step": 1024
49
+ },
50
+ {
51
+ "epoch": 0.0882625465985735,
52
+ "grad_norm": 2.9631969928741455,
53
+ "learning_rate": 9.9476028157316e-05,
54
+ "loss": 6.443160057067871,
55
+ "step": 2048
56
+ },
57
+ {
58
+ "epoch": 0.0882625465985735,
59
+ "eval_bleu": 0.26560929308857356,
60
+ "eval_ce_clean_loss": 0.681954366947288,
61
+ "eval_ce_pred_loss": 4.135940641228324,
62
+ "eval_flow_cos_loss": 0.3561113254347844,
63
+ "eval_flow_mse_loss": 1.1538333908072922,
64
+ "eval_loss": 4.8199739954364835,
65
+ "flow/cos_sim": 0.6438886908325814,
66
+ "flow/improvement_ratio": 0.9941694777149127,
67
+ "flow/mag_ratio_mean": 0.6341400431163275,
68
+ "flow/mag_ratio_std": 0.08245640958168868,
69
+ "step": 2048
70
+ },
71
+ {
72
+ "epoch": 0.0882625465985735,
73
+ "eval_bleu": 0.26560929308857356,
74
+ "eval_ce_clean_loss": 0.681954366947288,
75
+ "eval_ce_pred_loss": 4.135940641228324,
76
+ "eval_flow_cos_loss": 0.3561113254347844,
77
+ "eval_flow_mse_loss": 1.1538333908072922,
78
+ "eval_loss": 4.8199739954364835,
79
+ "eval_runtime": 208.0696,
80
+ "eval_samples_per_second": 144.183,
81
+ "eval_steps_per_second": 2.254,
82
+ "flow/cos_sim": 0.6438886908325814,
83
+ "flow/improvement_ratio": 0.9941694777149127,
84
+ "flow/mag_ratio_mean": 0.6341400431163275,
85
+ "flow/mag_ratio_std": 0.08245640958168868,
86
+ "step": 2048
87
+ },
88
+ {
89
+ "epoch": 0.13239381989786023,
90
+ "grad_norm": 2.705171823501587,
91
+ "learning_rate": 9.7915094488941e-05,
92
+ "loss": 4.337810516357422,
93
+ "step": 3072
94
+ },
95
+ {
96
+ "epoch": 0.13239381989786023,
97
+ "eval_bleu": 0.32689116234163157,
98
+ "eval_ce_clean_loss": 0.23400198015322818,
99
+ "eval_ce_pred_loss": 3.659708074415162,
100
+ "eval_flow_cos_loss": 0.3280593024006785,
101
+ "eval_flow_mse_loss": 1.0939402854773028,
102
+ "eval_loss": 3.9717527071296024,
103
+ "flow/cos_sim": 0.6719407138031429,
104
+ "flow/improvement_ratio": 0.9939516864097449,
105
+ "flow/mag_ratio_mean": 0.6437487534876826,
106
+ "flow/mag_ratio_std": 0.09126702068584051,
107
+ "step": 3072
108
+ },
109
+ {
110
+ "epoch": 0.13239381989786023,
111
+ "eval_bleu": 0.32689116234163157,
112
+ "eval_ce_clean_loss": 0.23400198015322818,
113
+ "eval_ce_pred_loss": 3.659708074415162,
114
+ "eval_flow_cos_loss": 0.3280593024006785,
115
+ "eval_flow_mse_loss": 1.0939402854773028,
116
+ "eval_loss": 3.9717527071296024,
117
+ "eval_runtime": 211.6943,
118
+ "eval_samples_per_second": 141.714,
119
+ "eval_steps_per_second": 2.215,
120
+ "flow/cos_sim": 0.6719407138031429,
121
+ "flow/improvement_ratio": 0.9939516864097449,
122
+ "flow/mag_ratio_mean": 0.6437487534876826,
123
+ "flow/mag_ratio_std": 0.09126702068584051,
124
+ "step": 3072
125
+ },
126
+ {
127
+ "epoch": 0.176525093197147,
128
+ "grad_norm": 4.605835437774658,
129
+ "learning_rate": 9.534693146185996e-05,
130
+ "loss": 3.8106541633605957,
131
+ "step": 4096
132
+ },
133
+ {
134
+ "epoch": 0.176525093197147,
135
+ "eval_bleu": 0.3496092396585298,
136
+ "eval_ce_clean_loss": 0.1174579276554366,
137
+ "eval_ce_pred_loss": 3.4018036912499205,
138
+ "eval_flow_cos_loss": 0.308671236038208,
139
+ "eval_flow_mse_loss": 1.0620912140620542,
140
+ "eval_loss": 3.6379795023627373,
141
+ "flow/cos_sim": 0.6913287766706715,
142
+ "flow/improvement_ratio": 0.99417618164884,
143
+ "flow/mag_ratio_mean": 0.6605991765634337,
144
+ "flow/mag_ratio_std": 0.09435669084919542,
145
+ "step": 4096
146
+ },
147
+ {
148
+ "epoch": 0.176525093197147,
149
+ "eval_bleu": 0.3496092396585298,
150
+ "eval_ce_clean_loss": 0.1174579276554366,
151
+ "eval_ce_pred_loss": 3.4018036912499205,
152
+ "eval_flow_cos_loss": 0.308671236038208,
153
+ "eval_flow_mse_loss": 1.0620912140620542,
154
+ "eval_loss": 3.6379795023627373,
155
+ "eval_runtime": 212.2533,
156
+ "eval_samples_per_second": 141.341,
157
+ "eval_steps_per_second": 2.21,
158
+ "flow/cos_sim": 0.6913287766706715,
159
+ "flow/improvement_ratio": 0.99417618164884,
160
+ "flow/mag_ratio_mean": 0.6605991765634337,
161
+ "flow/mag_ratio_std": 0.09435669084919542,
162
+ "step": 4096
163
+ },
164
+ {
165
+ "epoch": 0.22065636649643372,
166
+ "grad_norm": 1.91041898727417,
167
+ "learning_rate": 9.18264920723673e-05,
168
+ "loss": 3.563469886779785,
169
+ "step": 5120
170
+ },
171
+ {
172
+ "epoch": 0.22065636649643372,
173
+ "eval_bleu": 0.3655524466235782,
174
+ "eval_ce_clean_loss": 0.0654882211913305,
175
+ "eval_ce_pred_loss": 3.236920855446919,
176
+ "eval_flow_cos_loss": 0.2889146146489613,
177
+ "eval_flow_mse_loss": 1.036446757662271,
178
+ "eval_loss": 3.4400081954785247,
179
+ "flow/cos_sim": 0.711085402508026,
180
+ "flow/improvement_ratio": 0.994324432888519,
181
+ "flow/mag_ratio_mean": 0.6904204569137427,
182
+ "flow/mag_ratio_std": 0.09082355081780887,
183
+ "step": 5120
184
+ },
185
+ {
186
+ "epoch": 0.22065636649643372,
187
+ "eval_bleu": 0.3655524466235782,
188
+ "eval_ce_clean_loss": 0.0654882211913305,
189
+ "eval_ce_pred_loss": 3.236920855446919,
190
+ "eval_flow_cos_loss": 0.2889146146489613,
191
+ "eval_flow_mse_loss": 1.036446757662271,
192
+ "eval_loss": 3.4400081954785247,
193
+ "eval_runtime": 212.6957,
194
+ "eval_samples_per_second": 141.047,
195
+ "eval_steps_per_second": 2.205,
196
+ "flow/cos_sim": 0.711085402508026,
197
+ "flow/improvement_ratio": 0.994324432888519,
198
+ "flow/mag_ratio_mean": 0.6904204569137427,
199
+ "flow/mag_ratio_std": 0.09082355081780887,
200
+ "step": 5120
201
+ },
202
+ {
203
+ "epoch": 0.26478763979572045,
204
+ "grad_norm": 4.8472394943237305,
205
+ "learning_rate": 8.74324003722993e-05,
206
+ "loss": 3.4198970794677734,
207
+ "step": 6144
208
+ },
209
+ {
210
+ "epoch": 0.26478763979572045,
211
+ "eval_bleu": 0.37708007455782677,
212
+ "eval_ce_clean_loss": 0.04180683485909439,
213
+ "eval_ce_pred_loss": 3.098344222060653,
214
+ "eval_flow_cos_loss": 0.276241691127769,
215
+ "eval_flow_mse_loss": 1.034245941430521,
216
+ "eval_loss": 3.3139541271144646,
217
+ "flow/cos_sim": 0.7237583064575439,
218
+ "flow/improvement_ratio": 0.9940337580658479,
219
+ "flow/mag_ratio_mean": 0.7007273408904005,
220
+ "flow/mag_ratio_std": 0.08993671732797806,
221
+ "step": 6144
222
+ },
223
+ {
224
+ "epoch": 0.26478763979572045,
225
+ "eval_bleu": 0.37708007455782677,
226
+ "eval_ce_clean_loss": 0.04180683485909439,
227
+ "eval_ce_pred_loss": 3.098344222060653,
228
+ "eval_flow_cos_loss": 0.276241691127769,
229
+ "eval_flow_mse_loss": 1.034245941430521,
230
+ "eval_loss": 3.3139541271144646,
231
+ "eval_runtime": 213.2822,
232
+ "eval_samples_per_second": 140.659,
233
+ "eval_steps_per_second": 2.199,
234
+ "flow/cos_sim": 0.7237583064575439,
235
+ "flow/improvement_ratio": 0.9940337580658479,
236
+ "flow/mag_ratio_mean": 0.7007273408904005,
237
+ "flow/mag_ratio_std": 0.08993671732797806,
238
+ "step": 6144
239
+ },
240
+ {
241
+ "epoch": 0.30891891309500724,
242
+ "grad_norm": 1.408441185951233,
243
+ "learning_rate": 8.22483558761947e-05,
244
+ "loss": 3.2942428588867188,
245
+ "step": 7168
246
+ },
247
+ {
248
+ "epoch": 0.30891891309500724,
249
+ "eval_bleu": 0.3835006770235858,
250
+ "eval_ce_clean_loss": 0.02844317371982819,
251
+ "eval_ce_pred_loss": 2.991236975452285,
252
+ "eval_flow_cos_loss": 0.26042752873414615,
253
+ "eval_flow_mse_loss": 1.0223680001332054,
254
+ "eval_loss": 3.209783913992615,
255
+ "flow/cos_sim": 0.7395724975732344,
256
+ "flow/improvement_ratio": 0.9952030264492482,
257
+ "flow/mag_ratio_mean": 0.7151250421111264,
258
+ "flow/mag_ratio_std": 0.08892434012534013,
259
+ "step": 7168
260
+ },
261
+ {
262
+ "epoch": 0.30891891309500724,
263
+ "eval_bleu": 0.3835006770235858,
264
+ "eval_ce_clean_loss": 0.02844317371982819,
265
+ "eval_ce_pred_loss": 2.991236975452285,
266
+ "eval_flow_cos_loss": 0.26042752873414615,
267
+ "eval_flow_mse_loss": 1.0223680001332054,
268
+ "eval_loss": 3.209783913992615,
269
+ "eval_runtime": 211.8993,
270
+ "eval_samples_per_second": 141.577,
271
+ "eval_steps_per_second": 2.213,
272
+ "flow/cos_sim": 0.7395724975732344,
273
+ "flow/improvement_ratio": 0.9952030264492482,
274
+ "flow/mag_ratio_mean": 0.7151250421111264,
275
+ "flow/mag_ratio_std": 0.08892434012534013,
276
+ "step": 7168
277
+ },
278
+ {
279
+ "epoch": 0.353050186394294,
280
+ "grad_norm": 1.3102294206619263,
281
+ "learning_rate": 7.638710244802891e-05,
282
+ "loss": 3.210357904434204,
283
+ "step": 8192
284
+ },
285
+ {
286
+ "epoch": 0.353050186394294,
287
+ "eval_bleu": 0.3993825003723074,
288
+ "eval_ce_clean_loss": 0.020586447313682102,
289
+ "eval_ce_pred_loss": 2.869530324234383,
290
+ "eval_flow_cos_loss": 0.2547679298213804,
291
+ "eval_flow_mse_loss": 1.038607116192897,
292
+ "eval_loss": 3.1315567335848615,
293
+ "flow/cos_sim": 0.7452320968672665,
294
+ "flow/improvement_ratio": 0.99444860372462,
295
+ "flow/mag_ratio_mean": 0.7234579237031021,
296
+ "flow/mag_ratio_std": 0.08825072420558441,
297
+ "step": 8192
298
+ },
299
+ {
300
+ "epoch": 0.353050186394294,
301
+ "eval_bleu": 0.3993825003723074,
302
+ "eval_ce_clean_loss": 0.020586447313682102,
303
+ "eval_ce_pred_loss": 2.869530324234383,
304
+ "eval_flow_cos_loss": 0.2547679298213804,
305
+ "eval_flow_mse_loss": 1.038607116192897,
306
+ "eval_loss": 3.1315567335848615,
307
+ "eval_runtime": 211.4546,
308
+ "eval_samples_per_second": 141.874,
309
+ "eval_steps_per_second": 2.218,
310
+ "flow/cos_sim": 0.7452320968672665,
311
+ "flow/improvement_ratio": 0.99444860372462,
312
+ "flow/mag_ratio_mean": 0.7234579237031021,
313
+ "flow/mag_ratio_std": 0.08825072420558441,
314
+ "step": 8192
315
+ },
316
+ {
317
+ "epoch": 0.3971814596935807,
318
+ "grad_norm": 2.7642617225646973,
319
+ "learning_rate": 6.997821756319211e-05,
320
+ "loss": 3.1443495750427246,
321
+ "step": 9216
322
+ },
323
+ {
324
+ "epoch": 0.3971814596935807,
325
+ "eval_bleu": 0.4094899145449865,
326
+ "eval_ce_clean_loss": 0.01607099493770902,
327
+ "eval_ce_pred_loss": 2.775586368178508,
328
+ "eval_flow_cos_loss": 0.24236023524549724,
329
+ "eval_flow_mse_loss": 1.0269770694694031,
330
+ "eval_loss": 3.046548553621337,
331
+ "flow/cos_sim": 0.7576397708229927,
332
+ "flow/improvement_ratio": 0.9949808230023902,
333
+ "flow/mag_ratio_mean": 0.7366276771020788,
334
+ "flow/mag_ratio_std": 0.09276079728023838,
335
+ "step": 9216
336
+ },
337
+ {
338
+ "epoch": 0.3971814596935807,
339
+ "eval_bleu": 0.4094899145449865,
340
+ "eval_ce_clean_loss": 0.01607099493770902,
341
+ "eval_ce_pred_loss": 2.775586368178508,
342
+ "eval_flow_cos_loss": 0.24236023524549724,
343
+ "eval_flow_mse_loss": 1.0269770694694031,
344
+ "eval_loss": 3.046548553621337,
345
+ "eval_runtime": 212.1106,
346
+ "eval_samples_per_second": 141.436,
347
+ "eval_steps_per_second": 2.211,
348
+ "flow/cos_sim": 0.7576397708229927,
349
+ "flow/improvement_ratio": 0.9949808230023902,
350
+ "flow/mag_ratio_mean": 0.7366276771020788,
351
+ "flow/mag_ratio_std": 0.09276079728023838,
352
+ "step": 9216
353
+ },
354
+ {
355
+ "epoch": 0.44131273299286744,
356
+ "grad_norm": 2.0153911113739014,
357
+ "learning_rate": 6.314377890922702e-05,
358
+ "loss": 3.072866201400757,
359
+ "step": 10240
360
+ },
361
+ {
362
+ "epoch": 0.44131273299286744,
363
+ "eval_bleu": 0.41923082806405243,
364
+ "eval_ce_clean_loss": 0.012997276724568371,
365
+ "eval_ce_pred_loss": 2.717693049516251,
366
+ "eval_flow_cos_loss": 0.23471591219719032,
367
+ "eval_flow_mse_loss": 1.0302080986088018,
368
+ "eval_loss": 3.00426946926727,
369
+ "flow/cos_sim": 0.7652840989230792,
370
+ "flow/improvement_ratio": 0.9956137638356386,
371
+ "flow/mag_ratio_mean": 0.74410386723496,
372
+ "flow/mag_ratio_std": 0.09413890032244644,
373
+ "step": 10240
374
+ },
375
+ {
376
+ "epoch": 0.44131273299286744,
377
+ "eval_bleu": 0.41923082806405243,
378
+ "eval_ce_clean_loss": 0.012997276724568371,
379
+ "eval_ce_pred_loss": 2.717693049516251,
380
+ "eval_flow_cos_loss": 0.23471591219719032,
381
+ "eval_flow_mse_loss": 1.0302080986088018,
382
+ "eval_loss": 3.00426946926727,
383
+ "eval_runtime": 210.9776,
384
+ "eval_samples_per_second": 142.195,
385
+ "eval_steps_per_second": 2.223,
386
+ "flow/cos_sim": 0.7652840989230792,
387
+ "flow/improvement_ratio": 0.9956137638356386,
388
+ "flow/mag_ratio_mean": 0.74410386723496,
389
+ "flow/mag_ratio_std": 0.09413890032244644,
390
+ "step": 10240
391
+ },
392
+ {
393
+ "epoch": 0.4854440062921542,
394
+ "grad_norm": 1.8962148427963257,
395
+ "learning_rate": 5.603332356428589e-05,
396
+ "loss": 3.02541184425354,
397
+ "step": 11264
398
+ },
399
+ {
400
+ "epoch": 0.4854440062921542,
401
+ "eval_bleu": 0.4169916279016593,
402
+ "eval_ce_clean_loss": 0.010806279969828596,
403
+ "eval_ce_pred_loss": 2.706889592508263,
404
+ "eval_flow_cos_loss": 0.22115590358212558,
405
+ "eval_flow_mse_loss": 1.007251028567235,
406
+ "eval_loss": 2.968168963755626,
407
+ "flow/cos_sim": 0.7788441080782713,
408
+ "flow/improvement_ratio": 0.9940794661863527,
409
+ "flow/mag_ratio_mean": 0.7585708880221157,
410
+ "flow/mag_ratio_std": 0.09570817299870285,
411
+ "step": 11264
412
+ },
413
+ {
414
+ "epoch": 0.4854440062921542,
415
+ "eval_bleu": 0.4169916279016593,
416
+ "eval_ce_clean_loss": 0.010806279969828596,
417
+ "eval_ce_pred_loss": 2.706889592508263,
418
+ "eval_flow_cos_loss": 0.22115590358212558,
419
+ "eval_flow_mse_loss": 1.007251028567235,
420
+ "eval_loss": 2.968168963755626,
421
+ "eval_runtime": 212.5141,
422
+ "eval_samples_per_second": 141.167,
423
+ "eval_steps_per_second": 2.207,
424
+ "flow/cos_sim": 0.7788441080782713,
425
+ "flow/improvement_ratio": 0.9940794661863527,
426
+ "flow/mag_ratio_mean": 0.7585708880221157,
427
+ "flow/mag_ratio_std": 0.09570817299870285,
428
+ "step": 11264
429
+ },
430
+ {
431
+ "epoch": 0.5295752795914409,
432
+ "grad_norm": 1.0911729335784912,
433
+ "learning_rate": 4.880324964674035e-05,
434
+ "loss": 2.9968912601470947,
435
+ "step": 12288
436
+ },
437
+ {
438
+ "epoch": 0.5295752795914409,
439
+ "eval_bleu": 0.4245546218369888,
440
+ "eval_ce_clean_loss": 0.009449388876037081,
441
+ "eval_ce_pred_loss": 2.6375575258787762,
442
+ "eval_flow_cos_loss": 0.21768109258939464,
443
+ "eval_flow_mse_loss": 1.0174772260285645,
444
+ "eval_loss": 2.927637138346365,
445
+ "flow/cos_sim": 0.7823189269505075,
446
+ "flow/improvement_ratio": 0.9946964611885136,
447
+ "flow/mag_ratio_mean": 0.7612636199892203,
448
+ "flow/mag_ratio_std": 0.09714165055103648,
449
+ "step": 12288
450
+ },
451
+ {
452
+ "epoch": 0.5295752795914409,
453
+ "eval_bleu": 0.4245546218369888,
454
+ "eval_ce_clean_loss": 0.009449388876037081,
455
+ "eval_ce_pred_loss": 2.6375575258787762,
456
+ "eval_flow_cos_loss": 0.21768109258939464,
457
+ "eval_flow_mse_loss": 1.0174772260285645,
458
+ "eval_loss": 2.927637138346365,
459
+ "eval_runtime": 211.2424,
460
+ "eval_samples_per_second": 142.017,
461
+ "eval_steps_per_second": 2.22,
462
+ "flow/cos_sim": 0.7823189269505075,
463
+ "flow/improvement_ratio": 0.9946964611885136,
464
+ "flow/mag_ratio_mean": 0.7612636199892203,
465
+ "flow/mag_ratio_std": 0.09714165055103648,
466
+ "step": 12288
467
+ }
468
+ ],
469
+ "logging_steps": 1024,
470
+ "max_steps": 23204,
471
+ "num_input_tokens_seen": 0,
472
+ "num_train_epochs": 1,
473
+ "save_steps": 1024,
474
+ "stateful_callbacks": {
475
+ "TrainerControl": {
476
+ "args": {
477
+ "should_epoch_stop": false,
478
+ "should_evaluate": false,
479
+ "should_log": false,
480
+ "should_save": true,
481
+ "should_training_stop": false
482
+ },
483
+ "attributes": {}
484
+ }
485
+ },
486
+ "total_flos": 0.0,
487
+ "train_batch_size": 64,
488
+ "trial_name": null,
489
+ "trial_params": null
490
+ }
checkpoints-v5.4/checkpoint-12288/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8469bbc81a2ba0be2f5b44007faafd15c75615abe30f4f4e56171816d31caa5b
3
+ size 5137