Attila1011 commited on
Commit
4ddc7a5
·
verified ·
1 Parent(s): 75919ae

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -74,3 +74,4 @@ checkpoints-v5.15/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs
74
  checkpoints-v4.7/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
75
  checkpoints-v4.7/checkpoint-21504/eval_state.json filter=lfs diff=lfs merge=lfs -text
76
  checkpoints-d1.0/checkpoint-10240/eval_state.json filter=lfs diff=lfs merge=lfs -text
 
 
74
  checkpoints-v4.7/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
75
  checkpoints-v4.7/checkpoint-21504/eval_state.json filter=lfs diff=lfs merge=lfs -text
76
  checkpoints-d1.0/checkpoint-10240/eval_state.json filter=lfs diff=lfs merge=lfs -text
77
+ checkpoints-d1.1/checkpoint-11264/eval_state.json filter=lfs diff=lfs merge=lfs -text
checkpoints-d1.1/checkpoint-11264/ema.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23e63d448e361121d7872532d98f47e7e52a6fd6ce2a62032928dc596aa1a095
3
+ size 746712
checkpoints-d1.1/checkpoint-11264/eval_state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1277638d7496d99a91d95fa2f44fe42737fdbaba1aab4182cf93080471ea57b8
3
+ size 57292057
checkpoints-d1.1/checkpoint-11264/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94d0ceb939d87027a12516dec601816c2d8c459b94f6a24d7fcf0309372977b4
3
+ size 24002016
checkpoints-d1.1/checkpoint-11264/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd82bb557889ae3238f2b338685b12aeeeb1c01e3e6a66696e34878d50e2114e
3
+ size 1569995
checkpoints-d1.1/checkpoint-11264/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7cfc61462969f5ba350924264a7a5144a5ddf7261e3539e2bdf80a4f26cc8ee
3
+ size 14645
checkpoints-d1.1/checkpoint-11264/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04050294348d6e04878694718ea868fb71c948a4cc1b6bcc5bf74a159d253de6
3
+ size 1383
checkpoints-d1.1/checkpoint-11264/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccf022a1be483726c969dc5e04a5e0174513bbea4e2812e09e97b7093b3e4e2e
3
+ size 1465
checkpoints-d1.1/checkpoint-11264/trainer_state.json ADDED
@@ -0,0 +1,727 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.4854440062921542,
6
+ "eval_steps": 1024,
7
+ "global_step": 11264,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.011032818324821687,
14
+ "grad_norm": 0.10309942811727524,
15
+ "learning_rate": 0.000498046875,
16
+ "loss": 1.9074174165725708,
17
+ "step": 256
18
+ },
19
+ {
20
+ "epoch": 0.022065636649643373,
21
+ "grad_norm": 0.2910110056400299,
22
+ "learning_rate": 0.000998046875,
23
+ "loss": 1.5273144245147705,
24
+ "step": 512
25
+ },
26
+ {
27
+ "epoch": 0.03309845497446506,
28
+ "grad_norm": 0.3859289586544037,
29
+ "learning_rate": 0.000999688448778502,
30
+ "loss": 1.3800736665725708,
31
+ "step": 768
32
+ },
33
+ {
34
+ "epoch": 0.04413127329928675,
35
+ "grad_norm": 0.5722110867500305,
36
+ "learning_rate": 0.0009987492950653055,
37
+ "loss": 1.342606544494629,
38
+ "step": 1024
39
+ },
40
+ {
41
+ "epoch": 0.04413127329928675,
42
+ "eval_bleu": 0.9366650964401493,
43
+ "eval_cos_loss": 0.4710617309440174,
44
+ "eval_dec_loss": 0.11786629736169314,
45
+ "eval_loss": 1.3323029561845987,
46
+ "eval_mse2_loss": 0.1665979178824913,
47
+ "eval_mse_loss": 1.3323029561845987,
48
+ "eval_rec_loss": 0.047009017791098624,
49
+ "eval_var_loss": 0.01723895594080501,
50
+ "flow/cos_sim": 0.5289382661329404,
51
+ "flow/improvement_ratio": 0.8936813888010948,
52
+ "flow/mag_ratio_mean": 0.5435932263382462,
53
+ "flow/mag_ratio_std": 0.2489985737210906,
54
+ "step": 1024
55
+ },
56
+ {
57
+ "epoch": 0.04413127329928675,
58
+ "eval_bleu": 0.9366650964401493,
59
+ "eval_cos_loss": 0.4710617309440174,
60
+ "eval_dec_loss": 0.11786629736169314,
61
+ "eval_loss": 1.3323029561845987,
62
+ "eval_mse2_loss": 0.1665979178824913,
63
+ "eval_mse_loss": 1.3323029561845987,
64
+ "eval_rec_loss": 0.047009017791098624,
65
+ "eval_runtime": 157.3375,
66
+ "eval_samples_per_second": 190.673,
67
+ "eval_steps_per_second": 2.981,
68
+ "eval_var_loss": 0.01723895594080501,
69
+ "flow/cos_sim": 0.5289382661329404,
70
+ "flow/improvement_ratio": 0.8936813888010948,
71
+ "flow/mag_ratio_mean": 0.5435932263382462,
72
+ "flow/mag_ratio_std": 0.2489985737210906,
73
+ "step": 1024
74
+ },
75
+ {
76
+ "epoch": 0.05516409162410843,
77
+ "grad_norm": 0.6506242752075195,
78
+ "learning_rate": 0.0009971837136430763,
79
+ "loss": 1.3261979818344116,
80
+ "step": 1280
81
+ },
82
+ {
83
+ "epoch": 0.06619690994893011,
84
+ "grad_norm": 0.6324401497840881,
85
+ "learning_rate": 0.0009949936708776692,
86
+ "loss": 1.3123514652252197,
87
+ "step": 1536
88
+ },
89
+ {
90
+ "epoch": 0.07722972827375181,
91
+ "grad_norm": 1.1031574010849,
92
+ "learning_rate": 0.0009921819174566252,
93
+ "loss": 1.3050185441970825,
94
+ "step": 1792
95
+ },
96
+ {
97
+ "epoch": 0.0882625465985735,
98
+ "grad_norm": 0.762417733669281,
99
+ "learning_rate": 0.000988751984934317,
100
+ "loss": 1.3001574277877808,
101
+ "step": 2048
102
+ },
103
+ {
104
+ "epoch": 0.0882625465985735,
105
+ "eval_bleu": 0.938925796606621,
106
+ "eval_cos_loss": 0.4579503086330032,
107
+ "eval_dec_loss": 0.10506504188690867,
108
+ "eval_loss": 1.2970875999820766,
109
+ "eval_mse2_loss": 0.15707123614768229,
110
+ "eval_mse_loss": 1.2970875999820766,
111
+ "eval_rec_loss": 0.047009017791098624,
112
+ "eval_var_loss": 0.01723895594080501,
113
+ "flow/cos_sim": 0.5420496905409197,
114
+ "flow/improvement_ratio": 0.8918823948038667,
115
+ "flow/mag_ratio_mean": 0.5503126610316702,
116
+ "flow/mag_ratio_std": 0.25175602854823254,
117
+ "step": 2048
118
+ },
119
+ {
120
+ "epoch": 0.0882625465985735,
121
+ "eval_bleu": 0.938925796606621,
122
+ "eval_cos_loss": 0.4579503086330032,
123
+ "eval_dec_loss": 0.10506504188690867,
124
+ "eval_loss": 1.2970875999820766,
125
+ "eval_mse2_loss": 0.15707123614768229,
126
+ "eval_mse_loss": 1.2970875999820766,
127
+ "eval_rec_loss": 0.047009017791098624,
128
+ "eval_runtime": 151.9416,
129
+ "eval_samples_per_second": 197.444,
130
+ "eval_steps_per_second": 3.087,
131
+ "eval_var_loss": 0.01723895594080501,
132
+ "flow/cos_sim": 0.5420496905409197,
133
+ "flow/improvement_ratio": 0.8918823948038667,
134
+ "flow/mag_ratio_mean": 0.5503126610316702,
135
+ "flow/mag_ratio_std": 0.25175602854823254,
136
+ "step": 2048
137
+ },
138
+ {
139
+ "epoch": 0.09929536492339518,
140
+ "grad_norm": 0.39165085554122925,
141
+ "learning_rate": 0.0009847081812963268,
142
+ "loss": 1.2909460067749023,
143
+ "step": 2304
144
+ },
145
+ {
146
+ "epoch": 0.11032818324821686,
147
+ "grad_norm": 0.6050369739532471,
148
+ "learning_rate": 0.0009800555855486275,
149
+ "loss": 1.291382908821106,
150
+ "step": 2560
151
+ },
152
+ {
153
+ "epoch": 0.12136100157303854,
154
+ "grad_norm": 0.6340572237968445,
155
+ "learning_rate": 0.0009748000413383664,
156
+ "loss": 1.2860350608825684,
157
+ "step": 2816
158
+ },
159
+ {
160
+ "epoch": 0.13239381989786023,
161
+ "grad_norm": 0.8046131134033203,
162
+ "learning_rate": 0.0009689481496142604,
163
+ "loss": 1.2806360721588135,
164
+ "step": 3072
165
+ },
166
+ {
167
+ "epoch": 0.13239381989786023,
168
+ "eval_bleu": 0.9365596012238808,
169
+ "eval_cos_loss": 0.4510079253075728,
170
+ "eval_dec_loss": 0.1170106883853801,
171
+ "eval_loss": 1.2785198518208094,
172
+ "eval_mse2_loss": 0.15482012001372603,
173
+ "eval_mse_loss": 1.2785198518208094,
174
+ "eval_rec_loss": 0.047009017791098624,
175
+ "eval_var_loss": 0.01723895594080501,
176
+ "flow/cos_sim": 0.5489920710703966,
177
+ "flow/improvement_ratio": 0.895310169598187,
178
+ "flow/mag_ratio_mean": 0.5600611698398712,
179
+ "flow/mag_ratio_std": 0.2589119763326035,
180
+ "step": 3072
181
+ },
182
+ {
183
+ "epoch": 0.13239381989786023,
184
+ "eval_bleu": 0.9365596012238808,
185
+ "eval_cos_loss": 0.4510079253075728,
186
+ "eval_dec_loss": 0.1170106883853801,
187
+ "eval_loss": 1.2785198518208094,
188
+ "eval_mse2_loss": 0.15482012001372603,
189
+ "eval_mse_loss": 1.2785198518208094,
190
+ "eval_rec_loss": 0.047009017791098624,
191
+ "eval_runtime": 150.2303,
192
+ "eval_samples_per_second": 199.693,
193
+ "eval_steps_per_second": 3.122,
194
+ "eval_var_loss": 0.01723895594080501,
195
+ "flow/cos_sim": 0.5489920710703966,
196
+ "flow/improvement_ratio": 0.895310169598187,
197
+ "flow/mag_ratio_mean": 0.5600611698398712,
198
+ "flow/mag_ratio_std": 0.2589119763326035,
199
+ "step": 3072
200
+ },
201
+ {
202
+ "epoch": 0.14342663822268192,
203
+ "grad_norm": 0.7344346046447754,
204
+ "learning_rate": 0.0009625072603358231,
205
+ "loss": 1.277908444404602,
206
+ "step": 3328
207
+ },
208
+ {
209
+ "epoch": 0.15445945654750362,
210
+ "grad_norm": 0.7456739544868469,
211
+ "learning_rate": 0.0009554854632418371,
212
+ "loss": 1.274967074394226,
213
+ "step": 3584
214
+ },
215
+ {
216
+ "epoch": 0.1654922748723253,
217
+ "grad_norm": 0.528167724609375,
218
+ "learning_rate": 0.000947891577689663,
219
+ "loss": 1.2722811698913574,
220
+ "step": 3840
221
+ },
222
+ {
223
+ "epoch": 0.176525093197147,
224
+ "grad_norm": 0.7374073266983032,
225
+ "learning_rate": 0.0009397351415781539,
226
+ "loss": 1.2716022729873657,
227
+ "step": 4096
228
+ },
229
+ {
230
+ "epoch": 0.176525093197147,
231
+ "eval_bleu": 0.9383145863088955,
232
+ "eval_cos_loss": 0.44795799712890755,
233
+ "eval_dec_loss": 0.11301154795406597,
234
+ "eval_loss": 1.2707049117159488,
235
+ "eval_mse2_loss": 0.15204078735890927,
236
+ "eval_mse_loss": 1.2707049117159488,
237
+ "eval_rec_loss": 0.047009017791098624,
238
+ "eval_var_loss": 0.01723895594080501,
239
+ "flow/cos_sim": 0.552042003760714,
240
+ "flow/improvement_ratio": 0.8948889724227157,
241
+ "flow/mag_ratio_mean": 0.5576132778674047,
242
+ "flow/mag_ratio_std": 0.25525683488672984,
243
+ "step": 4096
244
+ },
245
+ {
246
+ "epoch": 0.176525093197147,
247
+ "eval_bleu": 0.9383145863088955,
248
+ "eval_cos_loss": 0.44795799712890755,
249
+ "eval_dec_loss": 0.11301154795406597,
250
+ "eval_loss": 1.2707049117159488,
251
+ "eval_mse2_loss": 0.15204078735890927,
252
+ "eval_mse_loss": 1.2707049117159488,
253
+ "eval_rec_loss": 0.047009017791098624,
254
+ "eval_runtime": 149.5476,
255
+ "eval_samples_per_second": 200.605,
256
+ "eval_steps_per_second": 3.136,
257
+ "eval_var_loss": 0.01723895594080501,
258
+ "flow/cos_sim": 0.552042003760714,
259
+ "flow/improvement_ratio": 0.8948889724227157,
260
+ "flow/mag_ratio_mean": 0.5576132778674047,
261
+ "flow/mag_ratio_std": 0.25525683488672984,
262
+ "step": 4096
263
+ },
264
+ {
265
+ "epoch": 0.18755791152196866,
266
+ "grad_norm": 1.123129963874817,
267
+ "learning_rate": 0.000931026399368079,
268
+ "loss": 1.2691912651062012,
269
+ "step": 4352
270
+ },
271
+ {
272
+ "epoch": 0.19859072984679035,
273
+ "grad_norm": 0.49173882603645325,
274
+ "learning_rate": 0.0009217762892151117,
275
+ "loss": 1.26752769947052,
276
+ "step": 4608
277
+ },
278
+ {
279
+ "epoch": 0.20962354817161205,
280
+ "grad_norm": 0.5665431618690491,
281
+ "learning_rate": 0.0009119964292315354,
282
+ "loss": 1.2669333219528198,
283
+ "step": 4864
284
+ },
285
+ {
286
+ "epoch": 0.22065636649643372,
287
+ "grad_norm": 0.4946308732032776,
288
+ "learning_rate": 0.0009016991028939279,
289
+ "loss": 1.2646225690841675,
290
+ "step": 5120
291
+ },
292
+ {
293
+ "epoch": 0.22065636649643372,
294
+ "eval_bleu": 0.9396675860722136,
295
+ "eval_cos_loss": 0.44516199083724767,
296
+ "eval_dec_loss": 0.10893038547893705,
297
+ "eval_loss": 1.264682760879175,
298
+ "eval_mse2_loss": 0.1498125367073108,
299
+ "eval_mse_loss": 1.264682760879175,
300
+ "eval_rec_loss": 0.047009017791098624,
301
+ "eval_var_loss": 0.01723895594080501,
302
+ "flow/cos_sim": 0.5548380070657872,
303
+ "flow/improvement_ratio": 0.8946977740665997,
304
+ "flow/mag_ratio_mean": 0.5694006043456511,
305
+ "flow/mag_ratio_std": 0.2655116878211625,
306
+ "step": 5120
307
+ },
308
+ {
309
+ "epoch": 0.22065636649643372,
310
+ "eval_bleu": 0.9396675860722136,
311
+ "eval_cos_loss": 0.44516199083724767,
312
+ "eval_dec_loss": 0.10893038547893705,
313
+ "eval_loss": 1.264682760879175,
314
+ "eval_mse2_loss": 0.1498125367073108,
315
+ "eval_mse_loss": 1.264682760879175,
316
+ "eval_rec_loss": 0.047009017791098624,
317
+ "eval_runtime": 151.8799,
318
+ "eval_samples_per_second": 197.524,
319
+ "eval_steps_per_second": 3.088,
320
+ "eval_var_loss": 0.01723895594080501,
321
+ "flow/cos_sim": 0.5548380070657872,
322
+ "flow/improvement_ratio": 0.8946977740665997,
323
+ "flow/mag_ratio_mean": 0.5694006043456511,
324
+ "flow/mag_ratio_std": 0.2655116878211625,
325
+ "step": 5120
326
+ },
327
+ {
328
+ "epoch": 0.23168918482125542,
329
+ "grad_norm": 0.5147830843925476,
330
+ "learning_rate": 0.0008908972436151494,
331
+ "loss": 1.261371374130249,
332
+ "step": 5376
333
+ },
334
+ {
335
+ "epoch": 0.2427220031460771,
336
+ "grad_norm": 0.7221893668174744,
337
+ "learning_rate": 0.0008796044185000127,
338
+ "loss": 1.259010672569275,
339
+ "step": 5632
340
+ },
341
+ {
342
+ "epoch": 0.2537548214708988,
343
+ "grad_norm": 0.6270182132720947,
344
+ "learning_rate": 0.0008678348113050368,
345
+ "loss": 1.2565613985061646,
346
+ "step": 5888
347
+ },
348
+ {
349
+ "epoch": 0.26478763979572045,
350
+ "grad_norm": 0.3954711854457855,
351
+ "learning_rate": 0.0008556032046236897,
352
+ "loss": 1.258548378944397,
353
+ "step": 6144
354
+ },
355
+ {
356
+ "epoch": 0.26478763979572045,
357
+ "eval_bleu": 0.9381239377332383,
358
+ "eval_cos_loss": 0.4434889930524806,
359
+ "eval_dec_loss": 0.11391587999226378,
360
+ "eval_loss": 1.2588644528439812,
361
+ "eval_mse2_loss": 0.15056055846181252,
362
+ "eval_mse_loss": 1.2588644528439812,
363
+ "eval_rec_loss": 0.047009017791098624,
364
+ "eval_var_loss": 0.01723895594080501,
365
+ "flow/cos_sim": 0.5565110067568863,
366
+ "flow/improvement_ratio": 0.8946461940625074,
367
+ "flow/mag_ratio_mean": 0.5628604918146438,
368
+ "flow/mag_ratio_std": 0.2606462057528974,
369
+ "step": 6144
370
+ },
371
+ {
372
+ "epoch": 0.26478763979572045,
373
+ "eval_bleu": 0.9381239377332383,
374
+ "eval_cos_loss": 0.4434889930524806,
375
+ "eval_dec_loss": 0.11391587999226378,
376
+ "eval_loss": 1.2588644528439812,
377
+ "eval_mse2_loss": 0.15056055846181252,
378
+ "eval_mse_loss": 1.2588644528439812,
379
+ "eval_rec_loss": 0.047009017791098624,
380
+ "eval_runtime": 153.8457,
381
+ "eval_samples_per_second": 195.001,
382
+ "eval_steps_per_second": 3.049,
383
+ "eval_var_loss": 0.01723895594080501,
384
+ "flow/cos_sim": 0.5565110067568863,
385
+ "flow/improvement_ratio": 0.8946461940625074,
386
+ "flow/mag_ratio_mean": 0.5628604918146438,
387
+ "flow/mag_ratio_std": 0.2606462057528974,
388
+ "step": 6144
389
+ },
390
+ {
391
+ "epoch": 0.2758204581205422,
392
+ "grad_norm": 0.8126729130744934,
393
+ "learning_rate": 0.000842924961319492,
394
+ "loss": 1.2565950155258179,
395
+ "step": 6400
396
+ },
397
+ {
398
+ "epoch": 0.28685327644536385,
399
+ "grad_norm": 0.84797203540802,
400
+ "learning_rate": 0.0008298160052303045,
401
+ "loss": 1.2548315525054932,
402
+ "step": 6656
403
+ },
404
+ {
405
+ "epoch": 0.2978860947701855,
406
+ "grad_norm": 0.561568021774292,
407
+ "learning_rate": 0.0008162928011680314,
408
+ "loss": 1.2526129484176636,
409
+ "step": 6912
410
+ },
411
+ {
412
+ "epoch": 0.30891891309500724,
413
+ "grad_norm": 0.45474377274513245,
414
+ "learning_rate": 0.000802372334238864,
415
+ "loss": 1.2513761520385742,
416
+ "step": 7168
417
+ },
418
+ {
419
+ "epoch": 0.30891891309500724,
420
+ "eval_bleu": 0.9385536520845816,
421
+ "eval_cos_loss": 0.4402598062557961,
422
+ "eval_dec_loss": 0.11249503215500858,
423
+ "eval_loss": 1.2510530173397267,
424
+ "eval_mse2_loss": 0.1480516226116274,
425
+ "eval_mse_loss": 1.2510530173397267,
426
+ "eval_rec_loss": 0.047009017791098624,
427
+ "eval_var_loss": 0.01723895594080501,
428
+ "flow/cos_sim": 0.5597401952692694,
429
+ "flow/improvement_ratio": 0.895444744939743,
430
+ "flow/mag_ratio_mean": 0.5710282248220464,
431
+ "flow/mag_ratio_std": 0.26387540328858505,
432
+ "step": 7168
433
+ },
434
+ {
435
+ "epoch": 0.30891891309500724,
436
+ "eval_bleu": 0.9385536520845816,
437
+ "eval_cos_loss": 0.4402598062557961,
438
+ "eval_dec_loss": 0.11249503215500858,
439
+ "eval_loss": 1.2510530173397267,
440
+ "eval_mse2_loss": 0.1480516226116274,
441
+ "eval_mse_loss": 1.2510530173397267,
442
+ "eval_rec_loss": 0.047009017791098624,
443
+ "eval_runtime": 152.7181,
444
+ "eval_samples_per_second": 196.44,
445
+ "eval_steps_per_second": 3.071,
446
+ "eval_var_loss": 0.01723895594080501,
447
+ "flow/cos_sim": 0.5597401952692694,
448
+ "flow/improvement_ratio": 0.895444744939743,
449
+ "flow/mag_ratio_mean": 0.5710282248220464,
450
+ "flow/mag_ratio_std": 0.26387540328858505,
451
+ "step": 7168
452
+ },
453
+ {
454
+ "epoch": 0.3199517314198289,
455
+ "grad_norm": 1.3543585538864136,
456
+ "learning_rate": 0.0007880720885100349,
457
+ "loss": 1.2521653175354004,
458
+ "step": 7424
459
+ },
460
+ {
461
+ "epoch": 0.3309845497446506,
462
+ "grad_norm": 0.4370076358318329,
463
+ "learning_rate": 0.0007734100250498788,
464
+ "loss": 1.249273419380188,
465
+ "step": 7680
466
+ },
467
+ {
468
+ "epoch": 0.3420173680694723,
469
+ "grad_norm": 1.0196475982666016,
470
+ "learning_rate": 0.000758404559368781,
471
+ "loss": 1.2500712871551514,
472
+ "step": 7936
473
+ },
474
+ {
475
+ "epoch": 0.353050186394294,
476
+ "grad_norm": 0.733001708984375,
477
+ "learning_rate": 0.0007430745382893488,
478
+ "loss": 1.245364785194397,
479
+ "step": 8192
480
+ },
481
+ {
482
+ "epoch": 0.353050186394294,
483
+ "eval_bleu": 0.9376793187397806,
484
+ "eval_cos_loss": 0.4385024095013706,
485
+ "eval_dec_loss": 0.11364057421017049,
486
+ "eval_loss": 1.2459661925016945,
487
+ "eval_mse2_loss": 0.148339767350571,
488
+ "eval_mse_loss": 1.2459661925016945,
489
+ "eval_rec_loss": 0.047009017791098624,
490
+ "eval_var_loss": 0.01723895594080501,
491
+ "flow/cos_sim": 0.5614975882745755,
492
+ "flow/improvement_ratio": 0.8961417695352518,
493
+ "flow/mag_ratio_mean": 0.5688313084370547,
494
+ "flow/mag_ratio_std": 0.26494109700483554,
495
+ "step": 8192
496
+ },
497
+ {
498
+ "epoch": 0.353050186394294,
499
+ "eval_bleu": 0.9376793187397806,
500
+ "eval_cos_loss": 0.4385024095013706,
501
+ "eval_dec_loss": 0.11364057421017049,
502
+ "eval_loss": 1.2459661925016945,
503
+ "eval_mse2_loss": 0.148339767350571,
504
+ "eval_mse_loss": 1.2459661925016945,
505
+ "eval_rec_loss": 0.047009017791098624,
506
+ "eval_runtime": 152.8054,
507
+ "eval_samples_per_second": 196.328,
508
+ "eval_steps_per_second": 3.069,
509
+ "eval_var_loss": 0.01723895594080501,
510
+ "flow/cos_sim": 0.5614975882745755,
511
+ "flow/improvement_ratio": 0.8961417695352518,
512
+ "flow/mag_ratio_mean": 0.5688313084370547,
513
+ "flow/mag_ratio_std": 0.26494109700483554,
514
+ "step": 8192
515
+ },
516
+ {
517
+ "epoch": 0.36408300471911564,
518
+ "grad_norm": 0.676328718662262,
519
+ "learning_rate": 0.0007274392162748551,
520
+ "loss": 1.2448910474777222,
521
+ "step": 8448
522
+ },
523
+ {
524
+ "epoch": 0.3751158230439373,
525
+ "grad_norm": 0.6379961967468262,
526
+ "learning_rate": 0.000711518231245687,
527
+ "loss": 1.2442706823349,
528
+ "step": 8704
529
+ },
530
+ {
531
+ "epoch": 0.38614864136875904,
532
+ "grad_norm": 0.5386805534362793,
533
+ "learning_rate": 0.0006953315799141723,
534
+ "loss": 1.2446835041046143,
535
+ "step": 8960
536
+ },
537
+ {
538
+ "epoch": 0.3971814596935807,
539
+ "grad_norm": 0.8263258934020996,
540
+ "learning_rate": 0.0006788995926687669,
541
+ "loss": 1.2411766052246094,
542
+ "step": 9216
543
+ },
544
+ {
545
+ "epoch": 0.3971814596935807,
546
+ "eval_bleu": 0.9372486918854673,
547
+ "eval_cos_loss": 0.43675092898452206,
548
+ "eval_dec_loss": 0.11516488874867273,
549
+ "eval_loss": 1.241364901762273,
550
+ "eval_mse2_loss": 0.1478570194196091,
551
+ "eval_mse_loss": 1.241364901762273,
552
+ "eval_rec_loss": 0.047009017791098624,
553
+ "eval_var_loss": 0.01723895594080501,
554
+ "flow/cos_sim": 0.5632490722863659,
555
+ "flow/improvement_ratio": 0.8974738620491679,
556
+ "flow/mag_ratio_mean": 0.5655419154207844,
557
+ "flow/mag_ratio_std": 0.2603240320041998,
558
+ "step": 9216
559
+ },
560
+ {
561
+ "epoch": 0.3971814596935807,
562
+ "eval_bleu": 0.9372486918854673,
563
+ "eval_cos_loss": 0.43675092898452206,
564
+ "eval_dec_loss": 0.11516488874867273,
565
+ "eval_loss": 1.241364901762273,
566
+ "eval_mse2_loss": 0.1478570194196091,
567
+ "eval_mse_loss": 1.241364901762273,
568
+ "eval_rec_loss": 0.047009017791098624,
569
+ "eval_runtime": 152.8433,
570
+ "eval_samples_per_second": 196.28,
571
+ "eval_steps_per_second": 3.069,
572
+ "eval_var_loss": 0.01723895594080501,
573
+ "flow/cos_sim": 0.5632490722863659,
574
+ "flow/improvement_ratio": 0.8974738620491679,
575
+ "flow/mag_ratio_mean": 0.5655419154207844,
576
+ "flow/mag_ratio_std": 0.2603240320041998,
577
+ "step": 9216
578
+ },
579
+ {
580
+ "epoch": 0.4082142780184024,
581
+ "grad_norm": 0.7855456471443176,
582
+ "learning_rate": 0.0006622429080391422,
583
+ "loss": 1.2460049390792847,
584
+ "step": 9472
585
+ },
586
+ {
587
+ "epoch": 0.4192470963432241,
588
+ "grad_norm": 0.4608207941055298,
589
+ "learning_rate": 0.0006453824467742515,
590
+ "loss": 1.2414920330047607,
591
+ "step": 9728
592
+ },
593
+ {
594
+ "epoch": 0.43027991466804577,
595
+ "grad_norm": 0.5247617959976196,
596
+ "learning_rate": 0.0006283393855659275,
597
+ "loss": 1.2424880266189575,
598
+ "step": 9984
599
+ },
600
+ {
601
+ "epoch": 0.44131273299286744,
602
+ "grad_norm": 0.8765453100204468,
603
+ "learning_rate": 0.0006111351304510173,
604
+ "loss": 1.237776517868042,
605
+ "step": 10240
606
+ },
607
+ {
608
+ "epoch": 0.44131273299286744,
609
+ "eval_bleu": 0.937646836000478,
610
+ "eval_cos_loss": 0.4353823194752878,
611
+ "eval_dec_loss": 0.11402556833737632,
612
+ "eval_loss": 1.2377641976260936,
613
+ "eval_mse2_loss": 0.1474350707204357,
614
+ "eval_mse_loss": 1.2377641976260936,
615
+ "eval_rec_loss": 0.047009017791098624,
616
+ "eval_var_loss": 0.01723895594080501,
617
+ "flow/cos_sim": 0.564617679063191,
618
+ "flow/improvement_ratio": 0.899760089830549,
619
+ "flow/mag_ratio_mean": 0.5730336795229394,
620
+ "flow/mag_ratio_std": 0.26344449729172154,
621
+ "step": 10240
622
+ },
623
+ {
624
+ "epoch": 0.44131273299286744,
625
+ "eval_bleu": 0.937646836000478,
626
+ "eval_cos_loss": 0.4353823194752878,
627
+ "eval_dec_loss": 0.11402556833737632,
628
+ "eval_loss": 1.2377641976260936,
629
+ "eval_mse2_loss": 0.1474350707204357,
630
+ "eval_mse_loss": 1.2377641976260936,
631
+ "eval_rec_loss": 0.047009017791098624,
632
+ "eval_runtime": 151.9737,
633
+ "eval_samples_per_second": 197.403,
634
+ "eval_steps_per_second": 3.086,
635
+ "eval_var_loss": 0.01723895594080501,
636
+ "flow/cos_sim": 0.564617679063191,
637
+ "flow/improvement_ratio": 0.899760089830549,
638
+ "flow/mag_ratio_mean": 0.5730336795229394,
639
+ "flow/mag_ratio_std": 0.26344449729172154,
640
+ "step": 10240
641
+ },
642
+ {
643
+ "epoch": 0.45234555131768917,
644
+ "grad_norm": 0.6895334124565125,
645
+ "learning_rate": 0.0005937912899254605,
646
+ "loss": 1.2384426593780518,
647
+ "step": 10496
648
+ },
649
+ {
650
+ "epoch": 0.46337836964251083,
651
+ "grad_norm": 0.6421330571174622,
652
+ "learning_rate": 0.0005763296478040787,
653
+ "loss": 1.240878939628601,
654
+ "step": 10752
655
+ },
656
+ {
657
+ "epoch": 0.4744111879673325,
658
+ "grad_norm": 0.7770284414291382,
659
+ "learning_rate": 0.0005587721358601663,
660
+ "loss": 1.2393468618392944,
661
+ "step": 11008
662
+ },
663
+ {
664
+ "epoch": 0.4854440062921542,
665
+ "grad_norm": 1.0520166158676147,
666
+ "learning_rate": 0.0005411408062792448,
667
+ "loss": 1.237922191619873,
668
+ "step": 11264
669
+ },
670
+ {
671
+ "epoch": 0.4854440062921542,
672
+ "eval_bleu": 0.93652744913201,
673
+ "eval_cos_loss": 0.4366011674851497,
674
+ "eval_dec_loss": 0.11468809016390459,
675
+ "eval_loss": 1.2409222840246108,
676
+ "eval_mse2_loss": 0.14564816977804912,
677
+ "eval_mse_loss": 1.2409222840246108,
678
+ "eval_rec_loss": 0.047009017791098624,
679
+ "eval_var_loss": 0.01723895594080501,
680
+ "flow/cos_sim": 0.5633988297824413,
681
+ "flow/improvement_ratio": 0.897065937773251,
682
+ "flow/mag_ratio_mean": 0.5639294942558956,
683
+ "flow/mag_ratio_std": 0.25510865748564066,
684
+ "step": 11264
685
+ },
686
+ {
687
+ "epoch": 0.4854440062921542,
688
+ "eval_bleu": 0.93652744913201,
689
+ "eval_cos_loss": 0.4366011674851497,
690
+ "eval_dec_loss": 0.11468809016390459,
691
+ "eval_loss": 1.2409222840246108,
692
+ "eval_mse2_loss": 0.14564816977804912,
693
+ "eval_mse_loss": 1.2409222840246108,
694
+ "eval_rec_loss": 0.047009017791098624,
695
+ "eval_runtime": 152.4483,
696
+ "eval_samples_per_second": 196.788,
697
+ "eval_steps_per_second": 3.076,
698
+ "eval_var_loss": 0.01723895594080501,
699
+ "flow/cos_sim": 0.5633988297824413,
700
+ "flow/improvement_ratio": 0.897065937773251,
701
+ "flow/mag_ratio_mean": 0.5639294942558956,
702
+ "flow/mag_ratio_std": 0.25510865748564066,
703
+ "step": 11264
704
+ }
705
+ ],
706
+ "logging_steps": 256,
707
+ "max_steps": 23204,
708
+ "num_input_tokens_seen": 0,
709
+ "num_train_epochs": 1,
710
+ "save_steps": 1024,
711
+ "stateful_callbacks": {
712
+ "TrainerControl": {
713
+ "args": {
714
+ "should_epoch_stop": false,
715
+ "should_evaluate": false,
716
+ "should_log": false,
717
+ "should_save": true,
718
+ "should_training_stop": false
719
+ },
720
+ "attributes": {}
721
+ }
722
+ },
723
+ "total_flos": 0.0,
724
+ "train_batch_size": 64,
725
+ "trial_name": null,
726
+ "trial_params": null
727
+ }
checkpoints-d1.1/checkpoint-11264/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61a16bb839f687414b8e48611327c4b9cfddeefe38c031ca70808f9a97c476b7
3
+ size 5137