ChiefTheLord commited on
Commit
4f4bdfc
·
verified ·
1 Parent(s): 9bcef97

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -79,3 +79,4 @@ checkpoints-d1.2/checkpoint-20480/eval_state.json filter=lfs diff=lfs merge=lfs
79
  checkpoints-v1.1-d/checkpoint-21504/eval_state.json filter=lfs diff=lfs merge=lfs -text
80
  checkpoints-d2.0/checkpoint-7168/eval_state.json filter=lfs diff=lfs merge=lfs -text
81
  checkpoints-d2.0/checkpoint-21504/eval_state.json filter=lfs diff=lfs merge=lfs -text
 
 
79
  checkpoints-v1.1-d/checkpoint-21504/eval_state.json filter=lfs diff=lfs merge=lfs -text
80
  checkpoints-d2.0/checkpoint-7168/eval_state.json filter=lfs diff=lfs merge=lfs -text
81
  checkpoints-d2.0/checkpoint-21504/eval_state.json filter=lfs diff=lfs merge=lfs -text
82
+ checkpoints-d2.0-t1/checkpoint-17408/eval_state.json filter=lfs diff=lfs merge=lfs -text
checkpoints-d2.0-t1/checkpoint-17408/eval_state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9444efb7d76e323b7b3178005a04d895fb24e07fab9707d99122f0f140b11f56
3
+ size 42402279
checkpoints-d2.0-t1/checkpoint-17408/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3da65479c29f71f785b44f33c17a8204bee782d86a25c2e8af7532867fccc21d
3
+ size 32318104
checkpoints-d2.0-t1/checkpoint-17408/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ad8e37cf61b6f2fd7cc202071742e70be39c400ecaa1929a97c9bde1a46273b
3
+ size 10010635
checkpoints-d2.0-t1/checkpoint-17408/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf1f3265930d1f1ea49e43ac26f5b9dcff623d55dd3c8497a3326caee03b2129
3
+ size 14645
checkpoints-d2.0-t1/checkpoint-17408/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cde0a87150daa6e4c1711b6971f37b1f87e96bb470ca1bc2e66650b7bb9252d6
3
+ size 1383
checkpoints-d2.0-t1/checkpoint-17408/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:907bd363cbb9a2ad34bb5cd3891b5c4fef8b45659aaf375f17f2876c5871354f
3
+ size 1465
checkpoints-d2.0-t1/checkpoint-17408/trainer_state.json ADDED
@@ -0,0 +1,833 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.8040275275968778,
6
+ "eval_steps": 1024,
7
+ "global_step": 17408,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.011823934229365849,
14
+ "grad_norm": 0.7295829057693481,
15
+ "learning_rate": 0.000498046875,
16
+ "loss": 1.8308284282684326,
17
+ "step": 256
18
+ },
19
+ {
20
+ "epoch": 0.023647868458731697,
21
+ "grad_norm": 0.8052719831466675,
22
+ "learning_rate": 0.000998046875,
23
+ "loss": 1.8338860273361206,
24
+ "step": 512
25
+ },
26
+ {
27
+ "epoch": 0.03547180268809755,
28
+ "grad_norm": 0.8997901678085327,
29
+ "learning_rate": 0.000999640996023194,
30
+ "loss": 1.8343064785003662,
31
+ "step": 768
32
+ },
33
+ {
34
+ "epoch": 0.047295736917463395,
35
+ "grad_norm": 0.7250568270683289,
36
+ "learning_rate": 0.0009985588674043958,
37
+ "loss": 1.8296632766723633,
38
+ "step": 1024
39
+ },
40
+ {
41
+ "epoch": 0.047295736917463395,
42
+ "eval_beta_ce_loss": 0.6351920716022248,
43
+ "eval_bleu": 0.4998693474981933,
44
+ "eval_loss": 1.8211502122552428,
45
+ "eval_uni_ce_loss": 0.5507660694590443,
46
+ "step": 1024
47
+ },
48
+ {
49
+ "epoch": 0.047295736917463395,
50
+ "eval_beta_ce_loss": 0.6351920716022248,
51
+ "eval_bleu": 0.4998693474981933,
52
+ "eval_loss": 1.8211502122552428,
53
+ "eval_runtime": 139.8967,
54
+ "eval_samples_per_second": 200.098,
55
+ "eval_steps_per_second": 3.131,
56
+ "eval_uni_ce_loss": 0.5507660694590443,
57
+ "step": 1024
58
+ },
59
+ {
60
+ "epoch": 0.05911967114682925,
61
+ "grad_norm": 0.7675038576126099,
62
+ "learning_rate": 0.0009967551747861387,
63
+ "loss": 1.8191157579421997,
64
+ "step": 1280
65
+ },
66
+ {
67
+ "epoch": 0.0709436053761951,
68
+ "grad_norm": 0.8503870368003845,
69
+ "learning_rate": 0.000994232528651847,
70
+ "loss": 1.8177354335784912,
71
+ "step": 1536
72
+ },
73
+ {
74
+ "epoch": 0.08276753960556095,
75
+ "grad_norm": 0.8022050261497498,
76
+ "learning_rate": 0.0009909945800260092,
77
+ "loss": 1.807867407798767,
78
+ "step": 1792
79
+ },
80
+ {
81
+ "epoch": 0.09459147383492679,
82
+ "grad_norm": 0.9533084034919739,
83
+ "learning_rate": 0.0009870460151900522,
84
+ "loss": 1.8013904094696045,
85
+ "step": 2048
86
+ },
87
+ {
88
+ "epoch": 0.09459147383492679,
89
+ "eval_beta_ce_loss": 0.6264337543483194,
90
+ "eval_bleu": 0.49771606091961523,
91
+ "eval_loss": 1.7982131136606818,
92
+ "eval_uni_ce_loss": 0.545345604555792,
93
+ "step": 2048
94
+ },
95
+ {
96
+ "epoch": 0.09459147383492679,
97
+ "eval_beta_ce_loss": 0.6264337543483194,
98
+ "eval_bleu": 0.49771606091961523,
99
+ "eval_loss": 1.7982131136606818,
100
+ "eval_runtime": 133.1265,
101
+ "eval_samples_per_second": 210.274,
102
+ "eval_steps_per_second": 3.29,
103
+ "eval_uni_ce_loss": 0.545345604555792,
104
+ "step": 2048
105
+ },
106
+ {
107
+ "epoch": 0.10641540806429264,
108
+ "grad_norm": 0.9694677591323853,
109
+ "learning_rate": 0.0009823925488998885,
110
+ "loss": 1.8004993200302124,
111
+ "step": 2304
112
+ },
113
+ {
114
+ "epoch": 0.1182393422936585,
115
+ "grad_norm": 0.8076265454292297,
116
+ "learning_rate": 0.0009770409161149525,
117
+ "loss": 1.7975847721099854,
118
+ "step": 2560
119
+ },
120
+ {
121
+ "epoch": 0.13006327652302435,
122
+ "grad_norm": 0.7418428063392639,
123
+ "learning_rate": 0.0009709988622506973,
124
+ "loss": 1.789136528968811,
125
+ "step": 2816
126
+ },
127
+ {
128
+ "epoch": 0.1418872107523902,
129
+ "grad_norm": 0.7103869915008545,
130
+ "learning_rate": 0.000964275131968659,
131
+ "loss": 1.7754944562911987,
132
+ "step": 3072
133
+ },
134
+ {
135
+ "epoch": 0.1418872107523902,
136
+ "eval_beta_ce_loss": 0.620368564918161,
137
+ "eval_bleu": 0.49887627167388116,
138
+ "eval_loss": 1.7806649703413384,
139
+ "eval_uni_ce_loss": 0.5399278415936858,
140
+ "step": 3072
141
+ },
142
+ {
143
+ "epoch": 0.1418872107523902,
144
+ "eval_beta_ce_loss": 0.620368564918161,
145
+ "eval_bleu": 0.49887627167388116,
146
+ "eval_loss": 1.7806649703413384,
147
+ "eval_runtime": 134.7634,
148
+ "eval_samples_per_second": 207.72,
149
+ "eval_steps_per_second": 3.25,
150
+ "eval_uni_ce_loss": 0.5399278415936858,
151
+ "step": 3072
152
+ },
153
+ {
154
+ "epoch": 0.15371114498175603,
155
+ "grad_norm": 0.7851372957229614,
156
+ "learning_rate": 0.0009568794565203123,
157
+ "loss": 1.7792595624923706,
158
+ "step": 3328
159
+ },
160
+ {
161
+ "epoch": 0.1655350792111219,
162
+ "grad_norm": 0.7782461047172546,
163
+ "learning_rate": 0.0009488225396630347,
164
+ "loss": 1.7753657102584839,
165
+ "step": 3584
166
+ },
167
+ {
168
+ "epoch": 0.17735901344048774,
169
+ "grad_norm": 0.7312053442001343,
170
+ "learning_rate": 0.0009401160421685646,
171
+ "loss": 1.769174575805664,
172
+ "step": 3840
173
+ },
174
+ {
175
+ "epoch": 0.18918294766985358,
176
+ "grad_norm": 0.8110184669494629,
177
+ "learning_rate": 0.0009307725649463714,
178
+ "loss": 1.769570231437683,
179
+ "step": 4096
180
+ },
181
+ {
182
+ "epoch": 0.18918294766985358,
183
+ "eval_beta_ce_loss": 0.6151743780260217,
184
+ "eval_bleu": 0.5008352932587944,
185
+ "eval_loss": 1.766069833818636,
186
+ "eval_uni_ce_loss": 0.5357210769500906,
187
+ "step": 4096
188
+ },
189
+ {
190
+ "epoch": 0.18918294766985358,
191
+ "eval_beta_ce_loss": 0.6151743780260217,
192
+ "eval_bleu": 0.5008352932587944,
193
+ "eval_loss": 1.766069833818636,
194
+ "eval_runtime": 135.334,
195
+ "eval_samples_per_second": 206.844,
196
+ "eval_steps_per_second": 3.236,
197
+ "eval_uni_ce_loss": 0.5357210769500906,
198
+ "step": 4096
199
+ },
200
+ {
201
+ "epoch": 0.20100688189921945,
202
+ "grad_norm": 0.9300932288169861,
203
+ "learning_rate": 0.0009208056308063659,
204
+ "loss": 1.7624573707580566,
205
+ "step": 4352
206
+ },
207
+ {
208
+ "epoch": 0.2128308161285853,
209
+ "grad_norm": 0.7691043019294739,
210
+ "learning_rate": 0.0009102296648873445,
211
+ "loss": 1.7650220394134521,
212
+ "step": 4608
213
+ },
214
+ {
215
+ "epoch": 0.22465475035795113,
216
+ "grad_norm": 0.8132173418998718,
217
+ "learning_rate": 0.0008990599737794927,
218
+ "loss": 1.761474370956421,
219
+ "step": 4864
220
+ },
221
+ {
222
+ "epoch": 0.236478684587317,
223
+ "grad_norm": 0.7635871171951294,
224
+ "learning_rate": 0.0008873127233711644,
225
+ "loss": 1.7527934312820435,
226
+ "step": 5120
227
+ },
228
+ {
229
+ "epoch": 0.236478684587317,
230
+ "eval_beta_ce_loss": 0.611639610028158,
231
+ "eval_bleu": 0.5024511500541371,
232
+ "eval_loss": 1.753186773763944,
233
+ "eval_uni_ce_loss": 0.5299075521426658,
234
+ "step": 5120
235
+ },
236
+ {
237
+ "epoch": 0.236478684587317,
238
+ "eval_beta_ce_loss": 0.611639610028158,
239
+ "eval_bleu": 0.5024511500541371,
240
+ "eval_loss": 1.753186773763944,
241
+ "eval_runtime": 135.0739,
242
+ "eval_samples_per_second": 207.242,
243
+ "eval_steps_per_second": 3.243,
244
+ "eval_uni_ce_loss": 0.5299075521426658,
245
+ "step": 5120
246
+ },
247
+ {
248
+ "epoch": 0.24830261881668284,
249
+ "grad_norm": 0.7438808083534241,
250
+ "learning_rate": 0.0008750049154520011,
251
+ "loss": 1.7492367029190063,
252
+ "step": 5376
253
+ },
254
+ {
255
+ "epoch": 0.2601265530460487,
256
+ "grad_norm": 0.7799262404441833,
257
+ "learning_rate": 0.0008621543631062487,
258
+ "loss": 1.744637131690979,
259
+ "step": 5632
260
+ },
261
+ {
262
+ "epoch": 0.27195048727541454,
263
+ "grad_norm": 0.7694032192230225,
264
+ "learning_rate": 0.0008487796649318904,
265
+ "loss": 1.7494758367538452,
266
+ "step": 5888
267
+ },
268
+ {
269
+ "epoch": 0.2837744215047804,
270
+ "grad_norm": 0.7369454503059387,
271
+ "learning_rate": 0.0008349001781229053,
272
+ "loss": 1.7494639158248901,
273
+ "step": 6144
274
+ },
275
+ {
276
+ "epoch": 0.2837744215047804,
277
+ "eval_beta_ce_loss": 0.6058613158524309,
278
+ "eval_bleu": 0.5028032064859724,
279
+ "eval_loss": 1.739504432841523,
280
+ "eval_uni_ce_loss": 0.5277818014088287,
281
+ "step": 6144
282
+ },
283
+ {
284
+ "epoch": 0.2837744215047804,
285
+ "eval_beta_ce_loss": 0.6058613158524309,
286
+ "eval_bleu": 0.5028032064859724,
287
+ "eval_loss": 1.739504432841523,
288
+ "eval_runtime": 136.5059,
289
+ "eval_samples_per_second": 205.068,
290
+ "eval_steps_per_second": 3.209,
291
+ "eval_uni_ce_loss": 0.5277818014088287,
292
+ "step": 6144
293
+ },
294
+ {
295
+ "epoch": 0.2955983557341462,
296
+ "grad_norm": 0.8735672235488892,
297
+ "learning_rate": 0.0008205359904536107,
298
+ "loss": 1.7374849319458008,
299
+ "step": 6400
300
+ },
301
+ {
302
+ "epoch": 0.30742228996351206,
303
+ "grad_norm": 1.0139427185058594,
304
+ "learning_rate": 0.0008057078912056363,
305
+ "loss": 1.7405836582183838,
306
+ "step": 6656
307
+ },
308
+ {
309
+ "epoch": 0.3192462241928779,
310
+ "grad_norm": 1.0008057355880737,
311
+ "learning_rate": 0.0007904373410796086,
312
+ "loss": 1.7392584085464478,
313
+ "step": 6912
314
+ },
315
+ {
316
+ "epoch": 0.3310701584222438,
317
+ "grad_norm": 0.8208015561103821,
318
+ "learning_rate": 0.0007747464411350876,
319
+ "loss": 1.7316131591796875,
320
+ "step": 7168
321
+ },
322
+ {
323
+ "epoch": 0.3310701584222438,
324
+ "eval_beta_ce_loss": 0.6033149586148459,
325
+ "eval_bleu": 0.5017084521574802,
326
+ "eval_loss": 1.7307289455034962,
327
+ "eval_uni_ce_loss": 0.5240990275253444,
328
+ "step": 7168
329
+ },
330
+ {
331
+ "epoch": 0.3310701584222438,
332
+ "eval_beta_ce_loss": 0.6033149586148459,
333
+ "eval_bleu": 0.5017084521574802,
334
+ "eval_loss": 1.7307289455034962,
335
+ "eval_runtime": 136.4837,
336
+ "eval_samples_per_second": 205.101,
337
+ "eval_steps_per_second": 3.209,
338
+ "eval_uni_ce_loss": 0.5240990275253444,
339
+ "step": 7168
340
+ },
341
+ {
342
+ "epoch": 0.34289409265160964,
343
+ "grad_norm": 0.9193022847175598,
344
+ "learning_rate": 0.000758657900803716,
345
+ "loss": 1.7337878942489624,
346
+ "step": 7424
347
+ },
348
+ {
349
+ "epoch": 0.3547180268809755,
350
+ "grad_norm": 0.7078978419303894,
351
+ "learning_rate": 0.000742195005021869,
352
+ "loss": 1.7295589447021484,
353
+ "step": 7680
354
+ },
355
+ {
356
+ "epoch": 0.3665419611103413,
357
+ "grad_norm": 0.7894092798233032,
358
+ "learning_rate": 0.0007253815805303786,
359
+ "loss": 1.723784327507019,
360
+ "step": 7936
361
+ },
362
+ {
363
+ "epoch": 0.37836589533970716,
364
+ "grad_norm": 0.7812065482139587,
365
+ "learning_rate": 0.0007082419613901028,
366
+ "loss": 1.7162002325057983,
367
+ "step": 8192
368
+ },
369
+ {
370
+ "epoch": 0.37836589533970716,
371
+ "eval_beta_ce_loss": 0.6003280380000807,
372
+ "eval_bleu": 0.5041936063198118,
373
+ "eval_loss": 1.7231203064526597,
374
+ "eval_uni_ce_loss": 0.5224642294318709,
375
+ "step": 8192
376
+ },
377
+ {
378
+ "epoch": 0.37836589533970716,
379
+ "eval_beta_ce_loss": 0.6003280380000807,
380
+ "eval_bleu": 0.5041936063198118,
381
+ "eval_loss": 1.7231203064526597,
382
+ "eval_runtime": 136.1645,
383
+ "eval_samples_per_second": 205.582,
384
+ "eval_steps_per_second": 3.217,
385
+ "eval_uni_ce_loss": 0.5224642294318709,
386
+ "step": 8192
387
+ },
388
+ {
389
+ "epoch": 0.390189829569073,
390
+ "grad_norm": 0.6728302836418152,
391
+ "learning_rate": 0.0006908009537632514,
392
+ "loss": 1.7215343713760376,
393
+ "step": 8448
394
+ },
395
+ {
396
+ "epoch": 0.4020137637984389,
397
+ "grad_norm": 0.6643648743629456,
398
+ "learning_rate": 0.0006730838000114403,
399
+ "loss": 1.7175226211547852,
400
+ "step": 8704
401
+ },
402
+ {
403
+ "epoch": 0.41383769802780473,
404
+ "grad_norm": 0.6722604036331177,
405
+ "learning_rate": 0.0006551161421624341,
406
+ "loss": 1.7141364812850952,
407
+ "step": 8960
408
+ },
409
+ {
410
+ "epoch": 0.4256616322571706,
411
+ "grad_norm": 0.7661899328231812,
412
+ "learning_rate": 0.0006369239847984517,
413
+ "loss": 1.7067075967788696,
414
+ "step": 9216
415
+ },
416
+ {
417
+ "epoch": 0.4256616322571706,
418
+ "eval_beta_ce_loss": 0.5960702291906697,
419
+ "eval_bleu": 0.508842331362903,
420
+ "eval_loss": 1.7111802789718593,
421
+ "eval_uni_ce_loss": 0.5190398215431057,
422
+ "step": 9216
423
+ },
424
+ {
425
+ "epoch": 0.4256616322571706,
426
+ "eval_beta_ce_loss": 0.5960702291906697,
427
+ "eval_bleu": 0.508842331362903,
428
+ "eval_loss": 1.7111802789718593,
429
+ "eval_runtime": 135.4442,
430
+ "eval_samples_per_second": 206.676,
431
+ "eval_steps_per_second": 3.234,
432
+ "eval_uni_ce_loss": 0.5190398215431057,
433
+ "step": 9216
434
+ },
435
+ {
436
+ "epoch": 0.4374855664865364,
437
+ "grad_norm": 0.7040799260139465,
438
+ "learning_rate": 0.0006185336574197479,
439
+ "loss": 1.7019784450531006,
440
+ "step": 9472
441
+ },
442
+ {
443
+ "epoch": 0.44930950071590225,
444
+ "grad_norm": 0.7697860598564148,
445
+ "learning_rate": 0.0005999717763379407,
446
+ "loss": 1.7126665115356445,
447
+ "step": 9728
448
+ },
449
+ {
450
+ "epoch": 0.4611334349452681,
451
+ "grad_norm": 0.7353936433792114,
452
+ "learning_rate": 0.0005812652061542363,
453
+ "loss": 1.7056387662887573,
454
+ "step": 9984
455
+ },
456
+ {
457
+ "epoch": 0.472957369174634,
458
+ "grad_norm": 0.8408072590827942,
459
+ "learning_rate": 0.0005624410208783071,
460
+ "loss": 1.7016122341156006,
461
+ "step": 10240
462
+ },
463
+ {
464
+ "epoch": 0.472957369174634,
465
+ "eval_beta_ce_loss": 0.5948017802276567,
466
+ "eval_bleu": 0.5119743719435753,
467
+ "eval_loss": 1.705481736083009,
468
+ "eval_uni_ce_loss": 0.5158781752874863,
469
+ "step": 10240
470
+ },
471
+ {
472
+ "epoch": 0.472957369174634,
473
+ "eval_beta_ce_loss": 0.5948017802276567,
474
+ "eval_bleu": 0.5119743719435753,
475
+ "eval_loss": 1.705481736083009,
476
+ "eval_runtime": 134.7944,
477
+ "eval_samples_per_second": 207.672,
478
+ "eval_steps_per_second": 3.249,
479
+ "eval_uni_ce_loss": 0.5158781752874863,
480
+ "step": 10240
481
+ },
482
+ {
483
+ "epoch": 0.48478130340399983,
484
+ "grad_norm": 0.6450205445289612,
485
+ "learning_rate": 0.0005435264647440881,
486
+ "loss": 1.7052643299102783,
487
+ "step": 10496
488
+ },
489
+ {
490
+ "epoch": 0.49660523763336567,
491
+ "grad_norm": 0.6514373421669006,
492
+ "learning_rate": 0.000524548912779213,
493
+ "loss": 1.69474458694458,
494
+ "step": 10752
495
+ },
496
+ {
497
+ "epoch": 0.5084291718627315,
498
+ "grad_norm": 0.7065825462341309,
499
+ "learning_rate": 0.0005055358311851499,
500
+ "loss": 1.7007395029067993,
501
+ "step": 11008
502
+ },
503
+ {
504
+ "epoch": 0.5202531060920974,
505
+ "grad_norm": 0.8027070164680481,
506
+ "learning_rate": 0.0004865147375853812,
507
+ "loss": 1.696798324584961,
508
+ "step": 11264
509
+ },
510
+ {
511
+ "epoch": 0.5202531060920974,
512
+ "eval_beta_ce_loss": 0.5911326601625033,
513
+ "eval_bleu": 0.5082277536286182,
514
+ "eval_loss": 1.6985954277591617,
515
+ "eval_uni_ce_loss": 0.5163301084547827,
516
+ "step": 11264
517
+ },
518
+ {
519
+ "epoch": 0.5202531060920974,
520
+ "eval_beta_ce_loss": 0.5911326601625033,
521
+ "eval_bleu": 0.5082277536286182,
522
+ "eval_loss": 1.6985954277591617,
523
+ "eval_runtime": 138.4679,
524
+ "eval_samples_per_second": 202.162,
525
+ "eval_steps_per_second": 3.163,
526
+ "eval_uni_ce_loss": 0.5163301084547827,
527
+ "step": 11264
528
+ },
529
+ {
530
+ "epoch": 0.5320770403214632,
531
+ "grad_norm": 0.7604843974113464,
532
+ "learning_rate": 0.0004675131611991607,
533
+ "loss": 1.6915127038955688,
534
+ "step": 11520
535
+ },
536
+ {
537
+ "epoch": 0.5439009745508291,
538
+ "grad_norm": 0.6604936718940735,
539
+ "learning_rate": 0.0004485586029984899,
540
+ "loss": 1.692893147468567,
541
+ "step": 11776
542
+ },
543
+ {
544
+ "epoch": 0.5557249087801949,
545
+ "grad_norm": 0.7172213792800903,
546
+ "learning_rate": 0.00042967849590597266,
547
+ "loss": 1.6868789196014404,
548
+ "step": 12032
549
+ },
550
+ {
551
+ "epoch": 0.5675488430095608,
552
+ "grad_norm": 0.7629127502441406,
553
+ "learning_rate": 0.0004109001650911621,
554
+ "loss": 1.6939620971679688,
555
+ "step": 12288
556
+ },
557
+ {
558
+ "epoch": 0.5675488430095608,
559
+ "eval_beta_ce_loss": 0.5891192779965597,
560
+ "eval_bleu": 0.5114094116436015,
561
+ "eval_loss": 1.6906098777845027,
562
+ "eval_uni_ce_loss": 0.5123713216552995,
563
+ "step": 12288
564
+ },
565
+ {
566
+ "epoch": 0.5675488430095608,
567
+ "eval_beta_ce_loss": 0.5891192779965597,
568
+ "eval_bleu": 0.5114094116436015,
569
+ "eval_loss": 1.6906098777845027,
570
+ "eval_runtime": 135.3599,
571
+ "eval_samples_per_second": 206.804,
572
+ "eval_steps_per_second": 3.236,
573
+ "eval_uni_ce_loss": 0.5123713216552995,
574
+ "step": 12288
575
+ },
576
+ {
577
+ "epoch": 0.5793727772389267,
578
+ "grad_norm": 0.6822903156280518,
579
+ "learning_rate": 0.0003922507884228551,
580
+ "loss": 1.6874535083770752,
581
+ "step": 12544
582
+ },
583
+ {
584
+ "epoch": 0.5911967114682924,
585
+ "grad_norm": 0.7348518371582031,
586
+ "learning_rate": 0.00037375735713457723,
587
+ "loss": 1.6786881685256958,
588
+ "step": 12800
589
+ },
590
+ {
591
+ "epoch": 0.6030206456976583,
592
+ "grad_norm": 0.797898530960083,
593
+ "learning_rate": 0.00035544663676018276,
594
+ "loss": 1.6823772192001343,
595
+ "step": 13056
596
+ },
597
+ {
598
+ "epoch": 0.6148445799270241,
599
+ "grad_norm": 0.666439950466156,
600
+ "learning_rate": 0.00033734512839611255,
601
+ "loss": 1.6804336309432983,
602
+ "step": 13312
603
+ },
604
+ {
605
+ "epoch": 0.6148445799270241,
606
+ "eval_beta_ce_loss": 0.5856027225517246,
607
+ "eval_bleu": 0.5110656031508146,
608
+ "eval_loss": 1.6819111756538148,
609
+ "eval_uni_ce_loss": 0.5107057281689013,
610
+ "step": 13312
611
+ },
612
+ {
613
+ "epoch": 0.6148445799270241,
614
+ "eval_beta_ce_loss": 0.5856027225517246,
615
+ "eval_bleu": 0.5110656031508146,
616
+ "eval_loss": 1.6819111756538148,
617
+ "eval_runtime": 135.0756,
618
+ "eval_samples_per_second": 207.24,
619
+ "eval_steps_per_second": 3.243,
620
+ "eval_uni_ce_loss": 0.5107057281689013,
621
+ "step": 13312
622
+ },
623
+ {
624
+ "epoch": 0.62666851415639,
625
+ "grad_norm": 0.9536261558532715,
626
+ "learning_rate": 0.0003194790303463687,
627
+ "loss": 1.6719762086868286,
628
+ "step": 13568
629
+ },
630
+ {
631
+ "epoch": 0.6384924483857558,
632
+ "grad_norm": 0.7538688778877258,
633
+ "learning_rate": 0.00030187420020572406,
634
+ "loss": 1.6798793077468872,
635
+ "step": 13824
636
+ },
637
+ {
638
+ "epoch": 0.6503163826151217,
639
+ "grad_norm": 0.6934835910797119,
640
+ "learning_rate": 0.00028455611743603626,
641
+ "loss": 1.6709473133087158,
642
+ "step": 14080
643
+ },
644
+ {
645
+ "epoch": 0.6621403168444876,
646
+ "grad_norm": 0.7217169404029846,
647
+ "learning_rate": 0.0002675498464898373,
648
+ "loss": 1.6749387979507446,
649
+ "step": 14336
650
+ },
651
+ {
652
+ "epoch": 0.6621403168444876,
653
+ "eval_beta_ce_loss": 0.5853756803627972,
654
+ "eval_bleu": 0.5112546154669725,
655
+ "eval_loss": 1.6793161832034316,
656
+ "eval_uni_ce_loss": 0.5085648222737116,
657
+ "step": 14336
658
+ },
659
+ {
660
+ "epoch": 0.6621403168444876,
661
+ "eval_beta_ce_loss": 0.5853756803627972,
662
+ "eval_bleu": 0.5112546154669725,
663
+ "eval_loss": 1.6793161832034316,
664
+ "eval_runtime": 137.7212,
665
+ "eval_samples_per_second": 203.258,
666
+ "eval_steps_per_second": 3.18,
667
+ "eval_uni_ce_loss": 0.5085648222737116,
668
+ "step": 14336
669
+ },
670
+ {
671
+ "epoch": 0.6739642510738534,
672
+ "grad_norm": 0.8353447914123535,
673
+ "learning_rate": 0.0002508800005345623,
674
+ "loss": 1.679425835609436,
675
+ "step": 14592
676
+ },
677
+ {
678
+ "epoch": 0.6857881853032193,
679
+ "grad_norm": 0.6642665266990662,
680
+ "learning_rate": 0.00023457070582992562,
681
+ "loss": 1.6819074153900146,
682
+ "step": 14848
683
+ },
684
+ {
685
+ "epoch": 0.6976121195325851,
686
+ "grad_norm": 0.6584897637367249,
687
+ "learning_rate": 0.00021864556680999692,
688
+ "loss": 1.6745400428771973,
689
+ "step": 15104
690
+ },
691
+ {
692
+ "epoch": 0.709436053761951,
693
+ "grad_norm": 0.6884040236473083,
694
+ "learning_rate": 0.0002031276319205152,
695
+ "loss": 1.6730715036392212,
696
+ "step": 15360
697
+ },
698
+ {
699
+ "epoch": 0.709436053761951,
700
+ "eval_beta_ce_loss": 0.582520013870714,
701
+ "eval_bleu": 0.5116472036246329,
702
+ "eval_loss": 1.6724328953925878,
703
+ "eval_uni_ce_loss": 0.5073928678552854,
704
+ "step": 15360
705
+ },
706
+ {
707
+ "epoch": 0.709436053761951,
708
+ "eval_beta_ce_loss": 0.582520013870714,
709
+ "eval_bleu": 0.5116472036246329,
710
+ "eval_loss": 1.6724328953925878,
711
+ "eval_runtime": 135.5849,
712
+ "eval_samples_per_second": 206.461,
713
+ "eval_steps_per_second": 3.23,
714
+ "eval_uni_ce_loss": 0.5073928678552854,
715
+ "step": 15360
716
+ },
717
+ {
718
+ "epoch": 0.7212599879913169,
719
+ "grad_norm": 0.7419734597206116,
720
+ "learning_rate": 0.00018803936026088542,
721
+ "loss": 1.6732308864593506,
722
+ "step": 15616
723
+ },
724
+ {
725
+ "epoch": 0.7330839222206826,
726
+ "grad_norm": 0.6793375611305237,
727
+ "learning_rate": 0.00017340258907913464,
728
+ "loss": 1.6713887453079224,
729
+ "step": 15872
730
+ },
731
+ {
732
+ "epoch": 0.7449078564500485,
733
+ "grad_norm": 0.6444223523139954,
734
+ "learning_rate": 0.0001592385021668743,
735
+ "loss": 1.664468765258789,
736
+ "step": 16128
737
+ },
738
+ {
739
+ "epoch": 0.7567317906794143,
740
+ "grad_norm": 0.7465987205505371,
741
+ "learning_rate": 0.0001455675992000087,
742
+ "loss": 1.6718043088912964,
743
+ "step": 16384
744
+ },
745
+ {
746
+ "epoch": 0.7567317906794143,
747
+ "eval_beta_ce_loss": 0.5802708933614704,
748
+ "eval_bleu": 0.5137912099965332,
749
+ "eval_loss": 1.6668929272590707,
750
+ "eval_uni_ce_loss": 0.5063511407402552,
751
+ "step": 16384
752
+ },
753
+ {
754
+ "epoch": 0.7567317906794143,
755
+ "eval_beta_ce_loss": 0.5802708933614704,
756
+ "eval_bleu": 0.5137912099965332,
757
+ "eval_loss": 1.6668929272590707,
758
+ "eval_runtime": 137.3458,
759
+ "eval_samples_per_second": 203.814,
760
+ "eval_steps_per_second": 3.189,
761
+ "eval_uni_ce_loss": 0.5063511407402552,
762
+ "step": 16384
763
+ },
764
+ {
765
+ "epoch": 0.7685557249087802,
766
+ "grad_norm": 0.7398092746734619,
767
+ "learning_rate": 0.000132409666069565,
768
+ "loss": 1.6554030179977417,
769
+ "step": 16640
770
+ },
771
+ {
772
+ "epoch": 0.780379659138146,
773
+ "grad_norm": 0.7583303451538086,
774
+ "learning_rate": 0.0001197837462455823,
775
+ "loss": 1.6627554893493652,
776
+ "step": 16896
777
+ },
778
+ {
779
+ "epoch": 0.7922035933675119,
780
+ "grad_norm": 0.7260330319404602,
781
+ "learning_rate": 0.00010770811321550749,
782
+ "loss": 1.6633129119873047,
783
+ "step": 17152
784
+ },
785
+ {
786
+ "epoch": 0.8040275275968778,
787
+ "grad_norm": 0.6960119605064392,
788
+ "learning_rate": 9.620024403698591e-05,
789
+ "loss": 1.6641393899917603,
790
+ "step": 17408
791
+ },
792
+ {
793
+ "epoch": 0.8040275275968778,
794
+ "eval_beta_ce_loss": 0.580827721654008,
795
+ "eval_bleu": 0.5137603294282832,
796
+ "eval_loss": 1.6667931901809832,
797
+ "eval_uni_ce_loss": 0.5051377474853437,
798
+ "step": 17408
799
+ },
800
+ {
801
+ "epoch": 0.8040275275968778,
802
+ "eval_beta_ce_loss": 0.580827721654008,
803
+ "eval_bleu": 0.5137603294282832,
804
+ "eval_loss": 1.6667931901809832,
805
+ "eval_runtime": 135.8623,
806
+ "eval_samples_per_second": 206.039,
807
+ "eval_steps_per_second": 3.224,
808
+ "eval_uni_ce_loss": 0.5051377474853437,
809
+ "step": 17408
810
+ }
811
+ ],
812
+ "logging_steps": 256,
813
+ "max_steps": 21651,
814
+ "num_input_tokens_seen": 0,
815
+ "num_train_epochs": 1,
816
+ "save_steps": 1024,
817
+ "stateful_callbacks": {
818
+ "TrainerControl": {
819
+ "args": {
820
+ "should_epoch_stop": false,
821
+ "should_evaluate": false,
822
+ "should_log": false,
823
+ "should_save": true,
824
+ "should_training_stop": false
825
+ },
826
+ "attributes": {}
827
+ }
828
+ },
829
+ "total_flos": 0.0,
830
+ "train_batch_size": 64,
831
+ "trial_name": null,
832
+ "trial_params": null
833
+ }
checkpoints-d2.0-t1/checkpoint-17408/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61a16bb839f687414b8e48611327c4b9cfddeefe38c031ca70808f9a97c476b7
3
+ size 5137