Attila1011 commited on
Commit
b4fc5c4
·
verified ·
1 Parent(s): 778c66e

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -57,3 +57,4 @@ checkpoints-v5.11-b/checkpoint-9216/eval_state.json filter=lfs diff=lfs merge=lf
57
  checkpoints-v5.11-c/checkpoint-4096/eval_state.json filter=lfs diff=lfs merge=lfs -text
58
  checkpoints-v5.11-c/checkpoint-10240/eval_state.json filter=lfs diff=lfs merge=lfs -text
59
  checkpoints-v5.12/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
 
 
57
  checkpoints-v5.11-c/checkpoint-4096/eval_state.json filter=lfs diff=lfs merge=lfs -text
58
  checkpoints-v5.11-c/checkpoint-10240/eval_state.json filter=lfs diff=lfs merge=lfs -text
59
  checkpoints-v5.12/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
60
+ checkpoints-v5.12-b/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
checkpoints-v5.12-b/checkpoint-12288/ema.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:becb8301744f02b3bee543f98e6c4560263bad84c3059c340c03ab3f90b1188f
3
+ size 55150648
checkpoints-v5.12-b/checkpoint-12288/eval_state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcd61a1b086998b238df88bfc881eaa8c6b0865b75312290e16a953736bcd25a
3
+ size 60506033
checkpoints-v5.12-b/checkpoint-12288/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:534b3a2eb69f22de17a88cc5758bf6547328c15f2dd780380ca608b9dcc89356
3
+ size 55150680
checkpoints-v5.12-b/checkpoint-12288/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8b8cb3dc1b35065cff4919488935ea316a8ede85090bcdb71cc7a904eb502b0
3
+ size 77725643
checkpoints-v5.12-b/checkpoint-12288/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ecb8eb105f441aa0b9d23a606d4d39b07260a74be26d231f328a9562372100c
3
+ size 14645
checkpoints-v5.12-b/checkpoint-12288/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d75c6d4696a144ff7e32ec64517c03d79b057231c7919118cd52c5ce4205b4e3
3
+ size 1383
checkpoints-v5.12-b/checkpoint-12288/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc7aee33185fd41553948a85c58129c8525a971860371a65bbab2d24b328e431
3
+ size 1465
checkpoints-v5.12-b/checkpoint-12288/trainer_state.json ADDED
@@ -0,0 +1,490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.5295752795914409,
6
+ "eval_steps": 1024,
7
+ "global_step": 12288,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.04413127329928675,
14
+ "grad_norm": 2.1699578762054443,
15
+ "learning_rate": 9.990234375e-05,
16
+ "loss": 14.694396018981934,
17
+ "step": 1024
18
+ },
19
+ {
20
+ "epoch": 0.04413127329928675,
21
+ "eval_bleu": 0.05128154542206175,
22
+ "eval_ce_clean_loss": 3.8575630203238935,
23
+ "eval_ce_pred_loss": 5.57112058545989,
24
+ "eval_flow_consistency_loss": 0.21214947685885277,
25
+ "eval_flow_mse_loss": 0.9729119348627672,
26
+ "eval_loss": 10.37152462574973,
27
+ "flow/cos_sim": 0.6056304070741129,
28
+ "flow/improvement_ratio": 0.9951175209809976,
29
+ "flow/mag_ratio_mean": 0.596425344821995,
30
+ "flow/mag_ratio_std": 0.07446127475451814,
31
+ "step": 1024
32
+ },
33
+ {
34
+ "epoch": 0.04413127329928675,
35
+ "eval_bleu": 0.05128154542206175,
36
+ "eval_ce_clean_loss": 3.8575630203238935,
37
+ "eval_ce_pred_loss": 5.57112058545989,
38
+ "eval_flow_consistency_loss": 0.21214947685885277,
39
+ "eval_flow_mse_loss": 0.9729119348627672,
40
+ "eval_loss": 10.37152462574973,
41
+ "eval_runtime": 210.6333,
42
+ "eval_samples_per_second": 142.428,
43
+ "eval_steps_per_second": 2.227,
44
+ "flow/cos_sim": 0.6056304070741129,
45
+ "flow/improvement_ratio": 0.9951175209809976,
46
+ "flow/mag_ratio_mean": 0.596425344821995,
47
+ "flow/mag_ratio_std": 0.07446127475451814,
48
+ "step": 1024
49
+ },
50
+ {
51
+ "epoch": 0.0882625465985735,
52
+ "grad_norm": 2.5218427181243896,
53
+ "learning_rate": 9.9476028157316e-05,
54
+ "loss": 8.70075798034668,
55
+ "step": 2048
56
+ },
57
+ {
58
+ "epoch": 0.0882625465985735,
59
+ "eval_bleu": 0.23857394758705833,
60
+ "eval_ce_clean_loss": 1.3269661532790422,
61
+ "eval_ce_pred_loss": 3.7169905786575286,
62
+ "eval_flow_consistency_loss": 0.17732289093516784,
63
+ "eval_flow_mse_loss": 1.097706873788,
64
+ "eval_loss": 7.496601121003694,
65
+ "flow/cos_sim": 0.5965837963354359,
66
+ "flow/improvement_ratio": 0.9938774880315704,
67
+ "flow/mag_ratio_mean": 0.5317577726042855,
68
+ "flow/mag_ratio_std": 0.06900304873614932,
69
+ "step": 2048
70
+ },
71
+ {
72
+ "epoch": 0.0882625465985735,
73
+ "eval_bleu": 0.23857394758705833,
74
+ "eval_ce_clean_loss": 1.3269661532790422,
75
+ "eval_ce_pred_loss": 3.7169905786575286,
76
+ "eval_flow_consistency_loss": 0.17732289093516784,
77
+ "eval_flow_mse_loss": 1.097706873788,
78
+ "eval_loss": 7.496601121003694,
79
+ "eval_runtime": 208.2982,
80
+ "eval_samples_per_second": 144.024,
81
+ "eval_steps_per_second": 2.252,
82
+ "flow/cos_sim": 0.5965837963354359,
83
+ "flow/improvement_ratio": 0.9938774880315704,
84
+ "flow/mag_ratio_mean": 0.5317577726042855,
85
+ "flow/mag_ratio_std": 0.06900304873614932,
86
+ "step": 2048
87
+ },
88
+ {
89
+ "epoch": 0.13239381989786023,
90
+ "grad_norm": 2.14751935005188,
91
+ "learning_rate": 9.791307026072513e-05,
92
+ "loss": 6.987759113311768,
93
+ "step": 3072
94
+ },
95
+ {
96
+ "epoch": 0.13239381989786023,
97
+ "eval_bleu": 0.3493089368796349,
98
+ "eval_ce_clean_loss": 0.5745006232881851,
99
+ "eval_ce_pred_loss": 3.0287138033014878,
100
+ "eval_flow_consistency_loss": 0.18727965287562373,
101
+ "eval_flow_mse_loss": 1.1055078414965793,
102
+ "eval_loss": 6.408606252690622,
103
+ "flow/cos_sim": 0.6098656960641906,
104
+ "flow/improvement_ratio": 0.9930039020235351,
105
+ "flow/mag_ratio_mean": 0.554409195238085,
106
+ "flow/mag_ratio_std": 0.08226730507701191,
107
+ "step": 3072
108
+ },
109
+ {
110
+ "epoch": 0.13239381989786023,
111
+ "eval_bleu": 0.3493089368796349,
112
+ "eval_ce_clean_loss": 0.5745006232881851,
113
+ "eval_ce_pred_loss": 3.0287138033014878,
114
+ "eval_flow_consistency_loss": 0.18727965287562373,
115
+ "eval_flow_mse_loss": 1.1055078414965793,
116
+ "eval_loss": 6.408606252690622,
117
+ "eval_runtime": 207.7037,
118
+ "eval_samples_per_second": 144.437,
119
+ "eval_steps_per_second": 2.258,
120
+ "flow/cos_sim": 0.6098656960641906,
121
+ "flow/improvement_ratio": 0.9930039020235351,
122
+ "flow/mag_ratio_mean": 0.554409195238085,
123
+ "flow/mag_ratio_std": 0.08226730507701191,
124
+ "step": 3072
125
+ },
126
+ {
127
+ "epoch": 0.176525093197147,
128
+ "grad_norm": 1.9934877157211304,
129
+ "learning_rate": 9.534693146185996e-05,
130
+ "loss": 6.277302265167236,
131
+ "step": 4096
132
+ },
133
+ {
134
+ "epoch": 0.176525093197147,
135
+ "eval_bleu": 0.4002615891374953,
136
+ "eval_ce_clean_loss": 0.30466608197958484,
137
+ "eval_ce_pred_loss": 2.713986599114912,
138
+ "eval_flow_consistency_loss": 0.19233520605416696,
139
+ "eval_flow_mse_loss": 1.1293878446001488,
140
+ "eval_loss": 5.949396448603062,
141
+ "flow/cos_sim": 0.6327927858590572,
142
+ "flow/improvement_ratio": 0.9936480792854895,
143
+ "flow/mag_ratio_mean": 0.5879036461366519,
144
+ "flow/mag_ratio_std": 0.10289596083131172,
145
+ "step": 4096
146
+ },
147
+ {
148
+ "epoch": 0.176525093197147,
149
+ "eval_bleu": 0.4002615891374953,
150
+ "eval_ce_clean_loss": 0.30466608197958484,
151
+ "eval_ce_pred_loss": 2.713986599114912,
152
+ "eval_flow_consistency_loss": 0.19233520605416696,
153
+ "eval_flow_mse_loss": 1.1293878446001488,
154
+ "eval_loss": 5.949396448603062,
155
+ "eval_runtime": 207.8349,
156
+ "eval_samples_per_second": 144.345,
157
+ "eval_steps_per_second": 2.257,
158
+ "flow/cos_sim": 0.6327927858590572,
159
+ "flow/improvement_ratio": 0.9936480792854895,
160
+ "flow/mag_ratio_mean": 0.5879036461366519,
161
+ "flow/mag_ratio_std": 0.10289596083131172,
162
+ "step": 4096
163
+ },
164
+ {
165
+ "epoch": 0.22065636649643372,
166
+ "grad_norm": 2.809199810028076,
167
+ "learning_rate": 9.18264920723673e-05,
168
+ "loss": 5.952817440032959,
169
+ "step": 5120
170
+ },
171
+ {
172
+ "epoch": 0.22065636649643372,
173
+ "eval_bleu": 0.42835581482136026,
174
+ "eval_ce_clean_loss": 0.1883118874482763,
175
+ "eval_ce_pred_loss": 2.513261323798694,
176
+ "eval_flow_consistency_loss": 0.2029014579268661,
177
+ "eval_flow_mse_loss": 1.1447690499108483,
178
+ "eval_loss": 5.668197899230762,
179
+ "flow/cos_sim": 0.6509825842721122,
180
+ "flow/improvement_ratio": 0.9934964183805339,
181
+ "flow/mag_ratio_mean": 0.6140208387934069,
182
+ "flow/mag_ratio_std": 0.12194133851764553,
183
+ "step": 5120
184
+ },
185
+ {
186
+ "epoch": 0.22065636649643372,
187
+ "eval_bleu": 0.42835581482136026,
188
+ "eval_ce_clean_loss": 0.1883118874482763,
189
+ "eval_ce_pred_loss": 2.513261323798694,
190
+ "eval_flow_consistency_loss": 0.2029014579268661,
191
+ "eval_flow_mse_loss": 1.1447690499108483,
192
+ "eval_loss": 5.668197899230762,
193
+ "eval_runtime": 209.54,
194
+ "eval_samples_per_second": 143.171,
195
+ "eval_steps_per_second": 2.238,
196
+ "flow/cos_sim": 0.6509825842721122,
197
+ "flow/improvement_ratio": 0.9934964183805339,
198
+ "flow/mag_ratio_mean": 0.6140208387934069,
199
+ "flow/mag_ratio_std": 0.12194133851764553,
200
+ "step": 5120
201
+ },
202
+ {
203
+ "epoch": 0.26478763979572045,
204
+ "grad_norm": 2.2502171993255615,
205
+ "learning_rate": 8.74324003722993e-05,
206
+ "loss": 5.733938217163086,
207
+ "step": 6144
208
+ },
209
+ {
210
+ "epoch": 0.26478763979572045,
211
+ "eval_bleu": 0.45224998228939733,
212
+ "eval_ce_clean_loss": 0.12550937678259827,
213
+ "eval_ce_pred_loss": 2.4016613040143238,
214
+ "eval_flow_consistency_loss": 0.20323555880009747,
215
+ "eval_flow_mse_loss": 1.1755171974838923,
216
+ "eval_loss": 5.540907144546509,
217
+ "flow/cos_sim": 0.6656121532800101,
218
+ "flow/improvement_ratio": 0.993739642822412,
219
+ "flow/mag_ratio_mean": 0.6285864149075328,
220
+ "flow/mag_ratio_std": 0.14186543162697668,
221
+ "step": 6144
222
+ },
223
+ {
224
+ "epoch": 0.26478763979572045,
225
+ "eval_bleu": 0.45224998228939733,
226
+ "eval_ce_clean_loss": 0.12550937678259827,
227
+ "eval_ce_pred_loss": 2.4016613040143238,
228
+ "eval_flow_consistency_loss": 0.20323555880009747,
229
+ "eval_flow_mse_loss": 1.1755171974838923,
230
+ "eval_loss": 5.540907144546509,
231
+ "eval_runtime": 210.3673,
232
+ "eval_samples_per_second": 142.608,
233
+ "eval_steps_per_second": 2.229,
234
+ "flow/cos_sim": 0.6656121532800101,
235
+ "flow/improvement_ratio": 0.993739642822412,
236
+ "flow/mag_ratio_mean": 0.6285864149075328,
237
+ "flow/mag_ratio_std": 0.14186543162697668,
238
+ "step": 6144
239
+ },
240
+ {
241
+ "epoch": 0.30891891309500724,
242
+ "grad_norm": 1.2597520351409912,
243
+ "learning_rate": 8.22483558761947e-05,
244
+ "loss": 5.561953544616699,
245
+ "step": 7168
246
+ },
247
+ {
248
+ "epoch": 0.30891891309500724,
249
+ "eval_bleu": 0.4687065651483432,
250
+ "eval_ce_clean_loss": 0.08836616801300537,
251
+ "eval_ce_pred_loss": 2.2620769802695397,
252
+ "eval_flow_consistency_loss": 0.2063061216238465,
253
+ "eval_flow_mse_loss": 1.1843450824334931,
254
+ "eval_loss": 5.343514618843095,
255
+ "flow/cos_sim": 0.6833345525300325,
256
+ "flow/improvement_ratio": 0.9944396457438276,
257
+ "flow/mag_ratio_mean": 0.6510747799487002,
258
+ "flow/mag_ratio_std": 0.15373703074861944,
259
+ "step": 7168
260
+ },
261
+ {
262
+ "epoch": 0.30891891309500724,
263
+ "eval_bleu": 0.4687065651483432,
264
+ "eval_ce_clean_loss": 0.08836616801300537,
265
+ "eval_ce_pred_loss": 2.2620769802695397,
266
+ "eval_flow_consistency_loss": 0.2063061216238465,
267
+ "eval_flow_mse_loss": 1.1843450824334931,
268
+ "eval_loss": 5.343514618843095,
269
+ "eval_runtime": 212.5372,
270
+ "eval_samples_per_second": 141.152,
271
+ "eval_steps_per_second": 2.207,
272
+ "flow/cos_sim": 0.6833345525300325,
273
+ "flow/improvement_ratio": 0.9944396457438276,
274
+ "flow/mag_ratio_mean": 0.6510747799487002,
275
+ "flow/mag_ratio_std": 0.15373703074861944,
276
+ "step": 7168
277
+ },
278
+ {
279
+ "epoch": 0.353050186394294,
280
+ "grad_norm": 2.1638965606689453,
281
+ "learning_rate": 7.639311770076283e-05,
282
+ "loss": 5.4200921058654785,
283
+ "step": 8192
284
+ },
285
+ {
286
+ "epoch": 0.353050186394294,
287
+ "eval_bleu": 0.4806818081298782,
288
+ "eval_ce_clean_loss": 0.06611348781536129,
289
+ "eval_ce_pred_loss": 2.205830120582825,
290
+ "eval_flow_consistency_loss": 0.21406505789075578,
291
+ "eval_flow_mse_loss": 1.2107092700024913,
292
+ "eval_loss": 5.302672486315404,
293
+ "flow/cos_sim": 0.6940237746309879,
294
+ "flow/improvement_ratio": 0.993346476732795,
295
+ "flow/mag_ratio_mean": 0.6572817430567386,
296
+ "flow/mag_ratio_std": 0.1605924148676492,
297
+ "step": 8192
298
+ },
299
+ {
300
+ "epoch": 0.353050186394294,
301
+ "eval_bleu": 0.4806818081298782,
302
+ "eval_ce_clean_loss": 0.06611348781536129,
303
+ "eval_ce_pred_loss": 2.205830120582825,
304
+ "eval_flow_consistency_loss": 0.21406505789075578,
305
+ "eval_flow_mse_loss": 1.2107092700024913,
306
+ "eval_loss": 5.302672486315404,
307
+ "eval_runtime": 210.6328,
308
+ "eval_samples_per_second": 142.428,
309
+ "eval_steps_per_second": 2.227,
310
+ "flow/cos_sim": 0.6940237746309879,
311
+ "flow/improvement_ratio": 0.993346476732795,
312
+ "flow/mag_ratio_mean": 0.6572817430567386,
313
+ "flow/mag_ratio_std": 0.1605924148676492,
314
+ "step": 8192
315
+ },
316
+ {
317
+ "epoch": 0.3971814596935807,
318
+ "grad_norm": 3.1213200092315674,
319
+ "learning_rate": 6.998470950469718e-05,
320
+ "loss": 5.320468425750732,
321
+ "step": 9216
322
+ },
323
+ {
324
+ "epoch": 0.3971814596935807,
325
+ "eval_bleu": 0.49321486760812644,
326
+ "eval_ce_clean_loss": 0.051587040851841855,
327
+ "eval_ce_pred_loss": 2.1326764727960517,
328
+ "eval_flow_consistency_loss": 0.20684453628973157,
329
+ "eval_flow_mse_loss": 1.2189766706179963,
330
+ "eval_loss": 5.198114048951725,
331
+ "flow/cos_sim": 0.7069450036040755,
332
+ "flow/improvement_ratio": 0.9930796729984568,
333
+ "flow/mag_ratio_mean": 0.6787635914044086,
334
+ "flow/mag_ratio_std": 0.16792897991280056,
335
+ "step": 9216
336
+ },
337
+ {
338
+ "epoch": 0.3971814596935807,
339
+ "eval_bleu": 0.49321486760812644,
340
+ "eval_ce_clean_loss": 0.051587040851841855,
341
+ "eval_ce_pred_loss": 2.1326764727960517,
342
+ "eval_flow_consistency_loss": 0.20684453628973157,
343
+ "eval_flow_mse_loss": 1.2189766706179963,
344
+ "eval_loss": 5.198114048951725,
345
+ "eval_runtime": 210.2128,
346
+ "eval_samples_per_second": 142.713,
347
+ "eval_steps_per_second": 2.231,
348
+ "flow/cos_sim": 0.7069450036040755,
349
+ "flow/improvement_ratio": 0.9930796729984568,
350
+ "flow/mag_ratio_mean": 0.6787635914044086,
351
+ "flow/mag_ratio_std": 0.16792897991280056,
352
+ "step": 9216
353
+ },
354
+ {
355
+ "epoch": 0.44131273299286744,
356
+ "grad_norm": 1.7769527435302734,
357
+ "learning_rate": 6.315061173955019e-05,
358
+ "loss": 5.260382652282715,
359
+ "step": 10240
360
+ },
361
+ {
362
+ "epoch": 0.44131273299286744,
363
+ "eval_bleu": 0.5003817272012132,
364
+ "eval_ce_clean_loss": 0.04119707541520407,
365
+ "eval_ce_pred_loss": 2.080879623447654,
366
+ "eval_flow_consistency_loss": 0.20481373865340055,
367
+ "eval_flow_mse_loss": 1.2196696361244868,
368
+ "eval_loss": 5.118794566786874,
369
+ "flow/cos_sim": 0.7191916523711768,
370
+ "flow/improvement_ratio": 0.993994485340647,
371
+ "flow/mag_ratio_mean": 0.6901472842515405,
372
+ "flow/mag_ratio_std": 0.17046165418650297,
373
+ "step": 10240
374
+ },
375
+ {
376
+ "epoch": 0.44131273299286744,
377
+ "eval_bleu": 0.5003817272012132,
378
+ "eval_ce_clean_loss": 0.04119707541520407,
379
+ "eval_ce_pred_loss": 2.080879623447654,
380
+ "eval_flow_consistency_loss": 0.20481373865340055,
381
+ "eval_flow_mse_loss": 1.2196696361244868,
382
+ "eval_loss": 5.118794566786874,
383
+ "eval_runtime": 209.7844,
384
+ "eval_samples_per_second": 143.004,
385
+ "eval_steps_per_second": 2.236,
386
+ "flow/cos_sim": 0.7191916523711768,
387
+ "flow/improvement_ratio": 0.993994485340647,
388
+ "flow/mag_ratio_mean": 0.6901472842515405,
389
+ "flow/mag_ratio_std": 0.17046165418650297,
390
+ "step": 10240
391
+ },
392
+ {
393
+ "epoch": 0.4854440062921542,
394
+ "grad_norm": 2.305457592010498,
395
+ "learning_rate": 5.604035379537632e-05,
396
+ "loss": 5.207396030426025,
397
+ "step": 11264
398
+ },
399
+ {
400
+ "epoch": 0.4854440062921542,
401
+ "eval_bleu": 0.5041758930303886,
402
+ "eval_ce_clean_loss": 0.03406135918203193,
403
+ "eval_ce_pred_loss": 2.032164579007163,
404
+ "eval_flow_consistency_loss": 0.1978874005107229,
405
+ "eval_flow_mse_loss": 1.2266178403073535,
406
+ "eval_loss": 5.049889673302168,
407
+ "flow/cos_sim": 0.7285718597582916,
408
+ "flow/improvement_ratio": 0.995131308081816,
409
+ "flow/mag_ratio_mean": 0.7023200071188432,
410
+ "flow/mag_ratio_std": 0.1736910315528353,
411
+ "step": 11264
412
+ },
413
+ {
414
+ "epoch": 0.4854440062921542,
415
+ "eval_bleu": 0.5041758930303886,
416
+ "eval_ce_clean_loss": 0.03406135918203193,
417
+ "eval_ce_pred_loss": 2.032164579007163,
418
+ "eval_flow_consistency_loss": 0.1978874005107229,
419
+ "eval_flow_mse_loss": 1.2266178403073535,
420
+ "eval_loss": 5.049889673302168,
421
+ "eval_runtime": 208.1179,
422
+ "eval_samples_per_second": 144.149,
423
+ "eval_steps_per_second": 2.254,
424
+ "flow/cos_sim": 0.7285718597582916,
425
+ "flow/improvement_ratio": 0.995131308081816,
426
+ "flow/mag_ratio_mean": 0.7023200071188432,
427
+ "flow/mag_ratio_std": 0.1736910315528353,
428
+ "step": 11264
429
+ },
430
+ {
431
+ "epoch": 0.5295752795914409,
432
+ "grad_norm": 1.614006757736206,
433
+ "learning_rate": 4.881032966918056e-05,
434
+ "loss": 5.130342483520508,
435
+ "step": 12288
436
+ },
437
+ {
438
+ "epoch": 0.5295752795914409,
439
+ "eval_bleu": 0.514033522638653,
440
+ "eval_ce_clean_loss": 0.028551021900607834,
441
+ "eval_ce_pred_loss": 2.0023169781861783,
442
+ "eval_flow_consistency_loss": 0.1769014176592898,
443
+ "eval_flow_mse_loss": 1.2329019299193995,
444
+ "eval_loss": 4.997204606212787,
445
+ "flow/cos_sim": 0.737011756088688,
446
+ "flow/improvement_ratio": 0.9937192791306388,
447
+ "flow/mag_ratio_mean": 0.707237169813754,
448
+ "flow/mag_ratio_std": 0.17565249913791095,
449
+ "step": 12288
450
+ },
451
+ {
452
+ "epoch": 0.5295752795914409,
453
+ "eval_bleu": 0.514033522638653,
454
+ "eval_ce_clean_loss": 0.028551021900607834,
455
+ "eval_ce_pred_loss": 2.0023169781861783,
456
+ "eval_flow_consistency_loss": 0.1769014176592898,
457
+ "eval_flow_mse_loss": 1.2329019299193995,
458
+ "eval_loss": 4.997204606212787,
459
+ "eval_runtime": 209.7397,
460
+ "eval_samples_per_second": 143.034,
461
+ "eval_steps_per_second": 2.236,
462
+ "flow/cos_sim": 0.737011756088688,
463
+ "flow/improvement_ratio": 0.9937192791306388,
464
+ "flow/mag_ratio_mean": 0.707237169813754,
465
+ "flow/mag_ratio_std": 0.17565249913791095,
466
+ "step": 12288
467
+ }
468
+ ],
469
+ "logging_steps": 1024,
470
+ "max_steps": 23204,
471
+ "num_input_tokens_seen": 0,
472
+ "num_train_epochs": 1,
473
+ "save_steps": 1024,
474
+ "stateful_callbacks": {
475
+ "TrainerControl": {
476
+ "args": {
477
+ "should_epoch_stop": false,
478
+ "should_evaluate": false,
479
+ "should_log": false,
480
+ "should_save": true,
481
+ "should_training_stop": false
482
+ },
483
+ "attributes": {}
484
+ }
485
+ },
486
+ "total_flos": 0.0,
487
+ "train_batch_size": 64,
488
+ "trial_name": null,
489
+ "trial_params": null
490
+ }
checkpoints-v5.12-b/checkpoint-12288/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8469bbc81a2ba0be2f5b44007faafd15c75615abe30f4f4e56171816d31caa5b
3
+ size 5137