pszemraj commited on
Commit
687ed54
1 Parent(s): 83f1019

update model with approx 1.6 epochs training

Browse files
Files changed (9) hide show
  1. config.json +1 -1
  2. latest +1 -0
  3. merges.txt +1 -1
  4. pytorch_model.bin +2 -2
  5. tokenizer_config.json +1 -1
  6. trainer_state.json +621 -1155
  7. training_args.bin +2 -2
  8. vocab.json +0 -0
  9. zero_to_fp32.py +484 -0
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "pszemraj/opt-peter-1.3B-1E",
3
  "activation_dropout": 0.0,
4
  "activation_function": "relu",
5
  "architectures": [
1
  {
2
+ "_name_or_path": "pszemraj/opt-peter-1.3B",
3
  "activation_dropout": 0.0,
4
  "activation_function": "relu",
5
  "architectures": [
latest ADDED
@@ -0,0 +1 @@
 
1
+ global_step1016
merges.txt CHANGED
@@ -1,4 +1,4 @@
1
- #version: 0.2 - Trained by `huggingface/tokenizers`
2
  Ġ t
3
  Ġ a
4
  h e
1
+ #version: 0.2
2
  Ġ t
3
  Ġ a
4
  h e
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d2508e76de339f54093ad2f86fc540bfa5ac8c7a037fc5c0c51be088f877e80
3
- size 5263011731
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4cad2a7fef5856274323ae7e812eec18858a768fd7d3f83bcfc3c2e9d0d3d1de
3
+ size 5263006227
tokenizer_config.json CHANGED
@@ -1 +1 @@
1
- {"unk_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "errors": "replace", "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_bos_token": true, "special_tokens_map_file": null, "name_or_path": "pszemraj/opt-peter-1.3B-1E", "model_max_length": 512}
1
+ {"errors": "replace", "unk_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "add_bos_token": true, "special_tokens_map_file": null, "name_or_path": "pszemraj/opt-peter-1.3B", "model_max_length": 512, "tokenizer_class": "GPT2Tokenizer"}
trainer_state.json CHANGED
@@ -1,1776 +1,1242 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.1828941378787112,
5
- "global_step": 1461,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
- "epoch": 0.0,
12
- "learning_rate": 6e-06,
13
- "loss": 1.9846,
14
  "step": 5
15
  },
16
  {
17
- "epoch": 0.01,
18
- "learning_rate": 1.2e-05,
19
- "loss": 1.8345,
20
  "step": 10
21
  },
22
  {
23
- "epoch": 0.01,
24
- "learning_rate": 1.8e-05,
25
- "loss": 1.7106,
26
  "step": 15
27
  },
28
  {
29
- "epoch": 0.02,
30
- "learning_rate": 2.4e-05,
31
- "loss": 1.6318,
32
  "step": 20
33
  },
34
  {
35
- "epoch": 0.02,
36
- "learning_rate": 3e-05,
37
- "loss": 1.5838,
38
  "step": 25
39
  },
40
  {
41
- "epoch": 0.02,
42
- "learning_rate": 2.9999690442167746e-05,
43
- "loss": 1.6664,
44
  "step": 30
45
  },
46
  {
47
- "epoch": 0.03,
48
- "learning_rate": 2.999876178144779e-05,
49
- "loss": 1.5785,
50
  "step": 35
51
  },
52
  {
53
- "epoch": 0.03,
54
- "learning_rate": 2.9997214056170024e-05,
55
- "loss": 1.6536,
56
  "step": 40
57
  },
58
  {
59
- "epoch": 0.04,
60
- "learning_rate": 2.9995047330215847e-05,
61
- "loss": 1.5866,
62
  "step": 45
63
  },
64
  {
65
- "epoch": 0.04,
66
- "learning_rate": 2.9992261693015524e-05,
67
- "loss": 1.6324,
68
  "step": 50
69
  },
70
  {
71
- "epoch": 0.04,
72
- "learning_rate": 2.9988857259544498e-05,
73
- "loss": 1.6263,
74
  "step": 55
75
  },
76
  {
77
- "epoch": 0.05,
78
- "learning_rate": 2.9984834170318635e-05,
79
- "loss": 1.6481,
80
  "step": 60
81
  },
82
  {
83
- "epoch": 0.05,
84
- "learning_rate": 2.9980192591388452e-05,
85
- "loss": 1.6703,
86
  "step": 65
87
  },
88
  {
89
- "epoch": 0.06,
90
- "learning_rate": 2.997493271433222e-05,
91
- "loss": 1.6566,
92
  "step": 70
93
  },
94
  {
95
- "epoch": 0.06,
96
- "learning_rate": 2.9969054756248093e-05,
97
- "loss": 1.6168,
98
  "step": 75
99
  },
100
  {
101
- "epoch": 0.06,
102
- "learning_rate": 2.9962558959745133e-05,
103
- "loss": 1.6836,
104
  "step": 80
105
  },
106
  {
107
- "epoch": 0.07,
108
- "learning_rate": 2.9955445592933296e-05,
109
- "loss": 1.7089,
110
  "step": 85
111
  },
112
  {
113
- "epoch": 0.07,
114
- "learning_rate": 2.994771494941238e-05,
115
- "loss": 1.6573,
116
  "step": 90
117
  },
118
  {
119
- "epoch": 0.08,
120
- "learning_rate": 2.9939367348259873e-05,
121
- "loss": 1.5558,
122
  "step": 95
123
  },
124
  {
125
- "epoch": 0.08,
126
- "learning_rate": 2.9930403134017823e-05,
127
- "loss": 1.6069,
128
  "step": 100
129
  },
130
  {
131
- "epoch": 0.08,
132
- "learning_rate": 2.992082267667859e-05,
133
- "loss": 1.6357,
134
  "step": 105
135
  },
136
  {
137
- "epoch": 0.09,
138
- "learning_rate": 2.9910626371669593e-05,
139
- "loss": 1.5968,
140
  "step": 110
141
  },
142
  {
143
- "epoch": 0.09,
144
- "learning_rate": 2.9899814639836972e-05,
145
- "loss": 1.6244,
146
  "step": 115
147
  },
148
  {
149
- "epoch": 0.1,
150
- "learning_rate": 2.9888387927428234e-05,
151
- "loss": 1.5168,
152
  "step": 120
153
  },
154
  {
155
- "epoch": 0.1,
156
- "learning_rate": 2.987634670607381e-05,
157
- "loss": 1.6162,
158
  "step": 125
159
  },
160
  {
161
- "epoch": 0.11,
162
- "learning_rate": 2.9863691472767633e-05,
163
- "loss": 1.6014,
164
  "step": 130
165
  },
166
  {
167
- "epoch": 0.11,
168
- "learning_rate": 2.9850422749846577e-05,
169
- "loss": 1.6596,
170
  "step": 135
171
  },
172
  {
173
- "epoch": 0.11,
174
- "learning_rate": 2.9836541084968914e-05,
175
- "loss": 1.6175,
176
  "step": 140
177
  },
178
  {
179
- "epoch": 0.12,
180
- "learning_rate": 2.9822047051091735e-05,
181
- "loss": 1.6219,
182
  "step": 145
183
  },
184
  {
185
- "epoch": 0.12,
186
- "learning_rate": 2.9806941246447258e-05,
187
- "loss": 1.5605,
188
  "step": 150
189
  },
190
  {
191
- "epoch": 0.13,
192
- "learning_rate": 2.9791224294518173e-05,
193
- "loss": 1.6062,
194
  "step": 155
195
  },
196
  {
197
- "epoch": 0.13,
198
- "learning_rate": 2.9774896844011887e-05,
199
- "loss": 1.5561,
200
  "step": 160
201
  },
202
  {
203
- "epoch": 0.13,
204
- "learning_rate": 2.9757959568833758e-05,
205
- "loss": 1.5929,
206
  "step": 165
207
  },
208
  {
209
- "epoch": 0.14,
210
- "learning_rate": 2.9740413168059278e-05,
211
- "loss": 1.6087,
212
  "step": 170
213
  },
214
  {
215
- "epoch": 0.14,
216
- "learning_rate": 2.9722258365905223e-05,
217
- "loss": 1.603,
218
  "step": 175
219
  },
220
  {
221
- "epoch": 0.15,
222
- "learning_rate": 2.9703495911699746e-05,
223
- "loss": 1.5866,
224
  "step": 180
225
  },
226
  {
227
- "epoch": 0.15,
228
- "learning_rate": 2.9684126579851468e-05,
229
- "loss": 1.696,
230
  "step": 185
231
  },
232
  {
233
- "epoch": 0.15,
234
- "learning_rate": 2.9664151169817515e-05,
235
- "loss": 1.59,
236
  "step": 190
237
  },
238
  {
239
- "epoch": 0.16,
240
- "learning_rate": 2.9643570506070493e-05,
241
- "loss": 1.5724,
242
  "step": 195
243
  },
244
  {
245
- "epoch": 0.16,
246
- "learning_rate": 2.9622385438064493e-05,
247
- "loss": 1.5482,
248
  "step": 200
249
  },
250
  {
251
- "epoch": 0.17,
252
- "learning_rate": 2.9600596840200022e-05,
253
- "loss": 1.6198,
254
  "step": 205
255
  },
256
  {
257
- "epoch": 0.17,
258
- "learning_rate": 2.9578205611787877e-05,
259
- "loss": 1.6246,
260
  "step": 210
261
  },
262
  {
263
- "epoch": 0.17,
264
- "learning_rate": 2.955521267701209e-05,
265
- "loss": 1.6953,
266
  "step": 215
267
  },
268
  {
269
- "epoch": 0.18,
270
- "learning_rate": 2.9531618984891737e-05,
271
- "loss": 1.6397,
272
  "step": 220
273
  },
274
  {
275
- "epoch": 0.18,
276
- "learning_rate": 2.9507425509241757e-05,
277
- "loss": 1.5797,
278
  "step": 225
279
  },
280
  {
281
- "epoch": 0.19,
282
- "learning_rate": 2.948263324863282e-05,
283
- "loss": 1.63,
284
  "step": 230
285
  },
286
  {
287
- "epoch": 0.19,
288
- "learning_rate": 2.945724322635004e-05,
289
- "loss": 1.5258,
290
  "step": 235
291
  },
292
  {
293
- "epoch": 0.19,
294
- "learning_rate": 2.9431256490350795e-05,
295
- "loss": 1.6107,
296
  "step": 240
297
  },
298
  {
299
- "epoch": 0.2,
300
- "learning_rate": 2.9404674113221433e-05,
301
- "loss": 1.5577,
302
  "step": 245
303
  },
304
  {
305
- "epoch": 0.2,
306
- "learning_rate": 2.937749719213303e-05,
307
- "loss": 1.6001,
308
  "step": 250
309
  },
310
  {
311
- "epoch": 0.21,
312
- "learning_rate": 2.9349726848796083e-05,
313
- "loss": 1.6326,
314
  "step": 255
315
  },
316
  {
317
- "epoch": 0.21,
318
- "learning_rate": 2.932136422941424e-05,
319
- "loss": 1.5575,
320
  "step": 260
321
  },
322
  {
323
- "epoch": 0.21,
324
- "learning_rate": 2.929241050463696e-05,
325
- "loss": 1.692,
326
  "step": 265
327
  },
328
  {
329
- "epoch": 0.22,
330
- "learning_rate": 2.926286686951121e-05,
331
- "loss": 1.5985,
332
  "step": 270
333
  },
334
  {
335
- "epoch": 0.22,
336
- "learning_rate": 2.9232734543432146e-05,
337
- "loss": 1.5635,
338
  "step": 275
339
  },
340
  {
341
- "epoch": 0.23,
342
- "learning_rate": 2.920201477009277e-05,
343
- "loss": 1.6745,
344
  "step": 280
345
  },
346
  {
347
- "epoch": 0.23,
348
- "learning_rate": 2.9170708817432612e-05,
349
- "loss": 1.5613,
350
  "step": 285
351
  },
352
  {
353
- "epoch": 0.23,
354
- "learning_rate": 2.9138817977585383e-05,
355
- "loss": 1.6063,
356
  "step": 290
357
  },
358
  {
359
- "epoch": 0.24,
360
- "learning_rate": 2.910634356682565e-05,
361
- "loss": 1.6118,
362
  "step": 295
363
  },
364
  {
365
- "epoch": 0.24,
366
- "learning_rate": 2.9073286925514504e-05,
367
- "loss": 1.5912,
368
  "step": 300
369
  },
370
  {
371
- "epoch": 0.25,
372
- "learning_rate": 2.9039649418044247e-05,
373
- "loss": 1.596,
374
  "step": 305
375
  },
376
  {
377
- "epoch": 0.25,
378
- "learning_rate": 2.900543243278206e-05,
379
- "loss": 1.5501,
380
  "step": 310
381
  },
382
  {
383
- "epoch": 0.25,
384
- "learning_rate": 2.8970637382012714e-05,
385
- "loss": 1.6077,
386
  "step": 315
387
  },
388
  {
389
- "epoch": 0.26,
390
- "learning_rate": 2.8935265701880277e-05,
391
- "loss": 1.5393,
392
  "step": 320
393
  },
394
  {
395
- "epoch": 0.26,
396
- "learning_rate": 2.8899318852328833e-05,
397
- "loss": 1.5622,
398
  "step": 325
399
  },
400
  {
401
- "epoch": 0.27,
402
- "learning_rate": 2.8862798317042222e-05,
403
- "loss": 1.5683,
404
  "step": 330
405
  },
406
  {
407
- "epoch": 0.27,
408
- "learning_rate": 2.882570560338281e-05,
409
- "loss": 1.5644,
410
  "step": 335
411
  },
412
  {
413
- "epoch": 0.28,
414
- "learning_rate": 2.878804224232927e-05,
415
- "loss": 1.6027,
416
  "step": 340
417
  },
418
  {
419
- "epoch": 0.28,
420
- "learning_rate": 2.8749809788413383e-05,
421
- "loss": 1.5418,
422
  "step": 345
423
  },
424
  {
425
- "epoch": 0.28,
426
- "learning_rate": 2.87110098196559e-05,
427
- "loss": 1.5934,
428
  "step": 350
429
  },
430
  {
431
- "epoch": 0.29,
432
- "learning_rate": 2.8671643937501375e-05,
433
- "loss": 1.5265,
434
  "step": 355
435
  },
436
  {
437
- "epoch": 0.29,
438
- "learning_rate": 2.8631713766752097e-05,
439
- "loss": 1.6044,
440
  "step": 360
441
  },
442
  {
443
- "epoch": 0.3,
444
- "learning_rate": 2.859122095550101e-05,
445
- "loss": 1.6244,
446
  "step": 365
447
  },
448
  {
449
- "epoch": 0.3,
450
- "learning_rate": 2.8550167175063705e-05,
451
- "loss": 1.5721,
452
  "step": 370
453
  },
454
  {
455
- "epoch": 0.3,
456
- "learning_rate": 2.850855411990941e-05,
457
- "loss": 1.5572,
458
  "step": 375
459
  },
460
  {
461
- "epoch": 0.31,
462
- "learning_rate": 2.8466383507591083e-05,
463
- "loss": 1.5034,
464
  "step": 380
465
  },
466
  {
467
- "epoch": 0.31,
468
- "learning_rate": 2.84236570786745e-05,
469
- "loss": 1.6026,
470
  "step": 385
471
  },
472
  {
473
- "epoch": 0.32,
474
- "learning_rate": 2.8380376596666425e-05,
475
- "loss": 1.6428,
476
  "step": 390
477
  },
478
  {
479
- "epoch": 0.32,
480
- "learning_rate": 2.833654384794181e-05,
481
- "loss": 1.5364,
482
  "step": 395
483
  },
484
  {
485
- "epoch": 0.32,
486
- "learning_rate": 2.8292160641670088e-05,
487
- "loss": 1.5064,
488
  "step": 400
489
  },
490
  {
491
- "epoch": 0.33,
492
- "learning_rate": 2.8247228809740468e-05,
493
- "loss": 1.6338,
494
  "step": 405
495
  },
496
  {
497
- "epoch": 0.33,
498
- "learning_rate": 2.820175020668635e-05,
499
- "loss": 1.5189,
500
  "step": 410
501
  },
502
  {
503
- "epoch": 0.34,
504
- "learning_rate": 2.8155726709608777e-05,
505
- "loss": 1.5359,
506
  "step": 415
507
  },
508
  {
509
- "epoch": 0.34,
510
- "learning_rate": 2.810916021809894e-05,
511
- "loss": 1.6113,
512
  "step": 420
513
  },
514
  {
515
- "epoch": 0.34,
516
- "learning_rate": 2.8062052654159797e-05,
517
- "loss": 1.5724,
518
  "step": 425
519
  },
520
  {
521
- "epoch": 0.35,
522
- "learning_rate": 2.8014405962126735e-05,
523
- "loss": 1.5174,
524
  "step": 430
525
  },
526
  {
527
- "epoch": 0.35,
528
- "learning_rate": 2.7966222108587307e-05,
529
- "loss": 1.5741,
530
  "step": 435
531
  },
532
  {
533
- "epoch": 0.36,
534
- "learning_rate": 2.7917503082300086e-05,
535
- "loss": 1.595,
536
  "step": 440
537
  },
538
  {
539
- "epoch": 0.36,
540
- "learning_rate": 2.7868250894112555e-05,
541
- "loss": 1.6298,
542
  "step": 445
543
  },
544
  {
545
- "epoch": 0.36,
546
- "learning_rate": 2.7818467576878136e-05,
547
- "loss": 1.5871,
548
  "step": 450
549
  },
550
  {
551
- "epoch": 0.37,
552
- "learning_rate": 2.776815518537226e-05,
553
- "loss": 1.5841,
554
  "step": 455
555
  },
556
  {
557
- "epoch": 0.37,
558
- "learning_rate": 2.7717315796207576e-05,
559
- "loss": 1.5128,
560
  "step": 460
561
  },
562
  {
563
- "epoch": 0.38,
564
- "learning_rate": 2.7665951507748223e-05,
565
- "loss": 1.5055,
566
  "step": 465
567
  },
568
  {
569
- "epoch": 0.38,
570
- "learning_rate": 2.7614064440023254e-05,
571
- "loss": 1.6022,
572
  "step": 470
573
  },
574
  {
575
- "epoch": 0.38,
576
- "learning_rate": 2.7561656734639085e-05,
577
- "loss": 1.5673,
578
  "step": 475
579
  },
580
  {
581
- "epoch": 0.39,
582
- "learning_rate": 2.7508730554691145e-05,
583
- "loss": 1.5504,
584
  "step": 480
585
  },
586
  {
587
- "epoch": 0.39,
588
- "learning_rate": 2.7455288084674565e-05,
589
- "loss": 1.6085,
590
  "step": 485
591
  },
592
  {
593
- "epoch": 0.4,
594
- "learning_rate": 2.7401331530394037e-05,
595
- "loss": 1.5598,
596
  "step": 490
597
  },
598
  {
599
- "epoch": 0.4,
600
- "learning_rate": 2.7346863118872766e-05,
601
- "loss": 1.5559,
602
  "step": 495
603
  },
604
  {
605
- "epoch": 0.4,
606
- "learning_rate": 2.729188509826053e-05,
607
- "loss": 1.5366,
608
  "step": 500
609
  },
610
  {
611
- "epoch": 0.41,
612
- "learning_rate": 2.7236399737740912e-05,
613
- "loss": 1.5431,
614
  "step": 505
615
  },
616
  {
617
- "epoch": 0.41,
618
- "learning_rate": 2.7180409327437648e-05,
619
- "loss": 1.58,
620
  "step": 510
621
  },
622
  {
623
- "epoch": 0.42,
624
- "learning_rate": 2.712391617832006e-05,
625
- "loss": 1.5656,
626
  "step": 515
627
  },
628
  {
629
- "epoch": 0.42,
630
- "learning_rate": 2.7066922622107726e-05,
631
- "loss": 1.5488,
632
  "step": 520
633
  },
634
  {
635
- "epoch": 0.42,
636
- "learning_rate": 2.700943101117421e-05,
637
- "loss": 1.563,
638
  "step": 525
639
  },
640
  {
641
- "epoch": 0.43,
642
- "learning_rate": 2.6951443718449966e-05,
643
- "loss": 1.5845,
644
  "step": 530
645
  },
646
  {
647
- "epoch": 0.43,
648
- "learning_rate": 2.689296313732442e-05,
649
- "loss": 1.5782,
650
  "step": 535
651
  },
652
  {
653
- "epoch": 0.44,
654
- "learning_rate": 2.6833991681547158e-05,
655
- "loss": 1.6135,
656
  "step": 540
657
  },
658
  {
659
- "epoch": 0.44,
660
- "learning_rate": 2.677453178512831e-05,
661
- "loss": 1.5745,
662
  "step": 545
663
  },
664
  {
665
- "epoch": 0.45,
666
- "learning_rate": 2.6714585902238105e-05,
667
- "loss": 1.6138,
668
  "step": 550
669
  },
670
  {
671
- "epoch": 0.45,
672
- "learning_rate": 2.6654156507105543e-05,
673
- "loss": 1.5663,
674
  "step": 555
675
  },
676
  {
677
- "epoch": 0.45,
678
- "learning_rate": 2.6593246093916307e-05,
679
- "loss": 1.5842,
680
  "step": 560
681
  },
682
  {
683
- "epoch": 0.46,
684
- "learning_rate": 2.653185717670978e-05,
685
- "loss": 1.5424,
686
  "step": 565
687
  },
688
  {
689
- "epoch": 0.46,
690
- "learning_rate": 2.6469992289275325e-05,
691
- "loss": 1.5542,
692
  "step": 570
693
  },
694
  {
695
- "epoch": 0.47,
696
- "learning_rate": 2.6407653985047665e-05,
697
- "loss": 1.5571,
698
  "step": 575
699
  },
700
  {
701
- "epoch": 0.47,
702
- "learning_rate": 2.6344844837001508e-05,
703
- "loss": 1.5838,
704
  "step": 580
705
  },
706
  {
707
- "epoch": 0.47,
708
- "learning_rate": 2.6281567437545347e-05,
709
- "loss": 1.5626,
710
  "step": 585
711
  },
712
  {
713
- "epoch": 0.48,
714
- "learning_rate": 2.6217824398414473e-05,
715
- "loss": 1.5316,
716
  "step": 590
717
  },
718
  {
719
- "epoch": 0.48,
720
- "learning_rate": 2.615361835056314e-05,
721
- "loss": 1.6038,
722
  "step": 595
723
  },
724
  {
725
- "epoch": 0.49,
726
- "learning_rate": 2.6088951944056024e-05,
727
- "loss": 1.6042,
728
  "step": 600
729
  },
730
  {
731
- "epoch": 0.49,
732
- "learning_rate": 2.6023827847958802e-05,
733
- "loss": 1.5683,
734
  "step": 605
735
  },
736
  {
737
- "epoch": 0.49,
738
- "learning_rate": 2.5958248750228018e-05,
739
- "loss": 1.5166,
740
  "step": 610
741
  },
742
  {
743
- "epoch": 0.5,
744
- "learning_rate": 2.5892217357600104e-05,
745
- "loss": 1.5522,
746
  "step": 615
747
  },
748
  {
749
- "epoch": 0.5,
750
- "learning_rate": 2.5825736395479708e-05,
751
- "loss": 1.6122,
 
 
 
 
 
 
 
 
752
  "step": 620
753
  },
754
  {
755
- "epoch": 0.51,
756
- "learning_rate": 2.5758808607827153e-05,
757
- "loss": 1.5722,
758
  "step": 625
759
  },
760
  {
761
- "epoch": 0.51,
762
- "learning_rate": 2.5691436757045232e-05,
763
- "loss": 1.5333,
764
  "step": 630
765
  },
766
  {
767
- "epoch": 0.51,
768
- "learning_rate": 2.5623623623865152e-05,
769
- "loss": 1.563,
770
  "step": 635
771
  },
772
  {
773
- "epoch": 0.52,
774
- "learning_rate": 2.5555372007231777e-05,
775
- "loss": 1.5318,
776
  "step": 640
777
  },
778
  {
779
- "epoch": 0.52,
780
- "learning_rate": 2.548668472418811e-05,
781
- "loss": 1.548,
782
  "step": 645
783
  },
784
  {
785
- "epoch": 0.53,
786
- "learning_rate": 2.5417564609759005e-05,
787
- "loss": 1.5749,
788
  "step": 650
789
  },
790
  {
791
- "epoch": 0.53,
792
- "learning_rate": 2.5348014516834175e-05,
793
- "loss": 1.6166,
794
  "step": 655
795
  },
796
  {
797
- "epoch": 0.53,
798
- "learning_rate": 2.5278037316050417e-05,
799
- "loss": 1.61,
800
  "step": 660
801
  },
802
  {
803
- "epoch": 0.54,
804
- "learning_rate": 2.5207635895673138e-05,
805
- "loss": 1.4852,
806
  "step": 665
807
  },
808
  {
809
- "epoch": 0.54,
810
- "learning_rate": 2.513681316147715e-05,
811
- "loss": 1.5476,
812
  "step": 670
813
  },
814
  {
815
- "epoch": 0.55,
816
- "learning_rate": 2.506557203662673e-05,
817
- "loss": 1.6291,
818
  "step": 675
819
  },
820
  {
821
- "epoch": 0.55,
822
- "learning_rate": 2.4993915461554974e-05,
823
- "loss": 1.5232,
824
  "step": 680
825
  },
826
  {
827
- "epoch": 0.55,
828
- "learning_rate": 2.4921846393842414e-05,
829
- "loss": 1.5827,
830
  "step": 685
831
  },
832
  {
833
- "epoch": 0.56,
834
- "learning_rate": 2.484936780809497e-05,
835
- "loss": 1.5019,
836
  "step": 690
837
  },
838
  {
839
- "epoch": 0.56,
840
- "learning_rate": 2.4776482695821154e-05,
841
- "loss": 1.5315,
842
  "step": 695
843
  },
844
  {
845
- "epoch": 0.57,
846
- "learning_rate": 2.470319406530862e-05,
847
- "loss": 1.5725,
848
  "step": 700
849
  },
850
  {
851
- "epoch": 0.57,
852
- "learning_rate": 2.4629504941499984e-05,
853
- "loss": 1.6135,
854
  "step": 705
855
  },
856
  {
857
- "epoch": 0.57,
858
- "learning_rate": 2.4555418365867965e-05,
859
- "loss": 1.556,
860
  "step": 710
861
  },
862
  {
863
- "epoch": 0.58,
864
- "learning_rate": 2.4480937396289856e-05,
865
- "loss": 1.5323,
866
  "step": 715
867
  },
868
  {
869
- "epoch": 0.58,
870
- "learning_rate": 2.4406065106921332e-05,
871
- "loss": 1.5726,
872
  "step": 720
873
  },
874
  {
875
- "epoch": 0.59,
876
- "learning_rate": 2.4330804588069536e-05,
877
- "loss": 1.6077,
878
  "step": 725
879
  },
880
  {
881
- "epoch": 0.59,
882
- "learning_rate": 2.4255158946065542e-05,
883
- "loss": 1.5959,
884
  "step": 730
885
  },
886
  {
887
- "epoch": 0.59,
888
- "learning_rate": 2.4179131303136146e-05,
889
- "loss": 1.5373,
890
  "step": 735
891
  },
892
  {
893
- "epoch": 0.6,
894
- "learning_rate": 2.4102724797274994e-05,
895
- "loss": 1.5624,
896
  "step": 740
897
  },
898
  {
899
- "epoch": 0.6,
900
- "learning_rate": 2.4025942582113067e-05,
901
- "loss": 1.5431,
902
  "step": 745
903
  },
904
  {
905
- "epoch": 0.61,
906
- "learning_rate": 2.3948787826788495e-05,
907
- "loss": 1.5807,
908
  "step": 750
909
  },
910
  {
911
- "epoch": 0.61,
912
- "learning_rate": 2.3871263715815802e-05,
913
- "loss": 1.5659,
914
  "step": 755
915
  },
916
  {
917
- "epoch": 0.62,
918
- "learning_rate": 2.3793373448954406e-05,
919
- "loss": 1.5356,
920
  "step": 760
921
  },
922
  {
923
- "epoch": 0.62,
924
- "learning_rate": 2.3715120241076602e-05,
925
- "loss": 1.5181,
926
  "step": 765
927
  },
928
  {
929
- "epoch": 0.62,
930
- "learning_rate": 2.3636507322034844e-05,
931
- "loss": 1.6218,
932
  "step": 770
933
  },
934
  {
935
- "epoch": 0.63,
936
- "learning_rate": 2.355753793652844e-05,
937
- "loss": 1.6171,
938
  "step": 775
939
  },
940
  {
941
- "epoch": 0.63,
942
- "learning_rate": 2.3478215343969623e-05,
943
- "loss": 1.5306,
944
  "step": 780
945
  },
946
  {
947
- "epoch": 0.64,
948
- "learning_rate": 2.3398542818349042e-05,
949
- "loss": 1.5147,
950
  "step": 785
951
  },
952
  {
953
- "epoch": 0.64,
954
- "learning_rate": 2.33185236481006e-05,
955
- "loss": 1.5031,
956
  "step": 790
957
  },
958
  {
959
- "epoch": 0.64,
960
- "learning_rate": 2.323816113596575e-05,
961
- "loss": 1.5471,
962
  "step": 795
963
  },
964
  {
965
- "epoch": 0.65,
966
- "learning_rate": 2.3157458598857164e-05,
967
- "loss": 1.5904,
968
  "step": 800
969
  },
970
  {
971
- "epoch": 0.65,
972
- "learning_rate": 2.3076419367721834e-05,
973
- "loss": 1.6055,
974
  "step": 805
975
  },
976
  {
977
- "epoch": 0.66,
978
- "learning_rate": 2.299504678740359e-05,
979
- "loss": 1.6248,
980
  "step": 810
981
  },
982
  {
983
- "epoch": 0.66,
984
- "learning_rate": 2.2913344216505043e-05,
985
- "loss": 1.5905,
986
  "step": 815
987
  },
988
  {
989
- "epoch": 0.66,
990
- "learning_rate": 2.283131502724896e-05,
991
- "loss": 1.4956,
992
  "step": 820
993
  },
994
  {
995
- "epoch": 0.67,
996
- "learning_rate": 2.2748962605339066e-05,
997
- "loss": 1.537,
998
  "step": 825
999
  },
1000
  {
1001
- "epoch": 0.67,
1002
- "learning_rate": 2.266629034982033e-05,
1003
- "loss": 1.5609,
1004
  "step": 830
1005
  },
1006
  {
1007
- "epoch": 0.68,
1008
- "learning_rate": 2.2583301672938648e-05,
1009
- "loss": 1.6342,
1010
  "step": 835
1011
  },
1012
  {
1013
- "epoch": 0.68,
1014
- "learning_rate": 2.25e-05,
1015
- "loss": 1.6218,
1016
  "step": 840
1017
  },
1018
  {
1019
- "epoch": 0.68,
1020
- "learning_rate": 2.24163887692291e-05,
1021
- "loss": 1.6035,
1022
  "step": 845
1023
  },
1024
  {
1025
- "epoch": 0.69,
1026
- "learning_rate": 2.233247143162746e-05,
1027
- "loss": 1.4957,
1028
  "step": 850
1029
  },
1030
  {
1031
- "epoch": 0.69,
1032
- "learning_rate": 2.224825145083096e-05,
1033
- "loss": 1.4736,
1034
  "step": 855
1035
  },
1036
  {
1037
- "epoch": 0.7,
1038
- "learning_rate": 2.216373230296689e-05,
1039
- "loss": 1.581,
1040
  "step": 860
1041
  },
1042
  {
1043
- "epoch": 0.7,
1044
- "learning_rate": 2.2078917476510483e-05,
1045
- "loss": 1.6488,
1046
  "step": 865
1047
  },
1048
  {
1049
- "epoch": 0.7,
1050
- "learning_rate": 2.1993810472140908e-05,
1051
- "loss": 1.5308,
1052
  "step": 870
1053
  },
1054
  {
1055
- "epoch": 0.71,
1056
- "learning_rate": 2.190841480259681e-05,
1057
- "loss": 1.4948,
1058
  "step": 875
1059
  },
1060
  {
1061
- "epoch": 0.71,
1062
- "learning_rate": 2.1822733992531294e-05,
1063
- "loss": 1.5796,
1064
  "step": 880
1065
  },
1066
  {
1067
- "epoch": 0.72,
1068
- "learning_rate": 2.1736771578366472e-05,
1069
- "loss": 1.6163,
1070
  "step": 885
1071
  },
1072
  {
1073
- "epoch": 0.72,
1074
- "learning_rate": 2.1650531108147493e-05,
1075
- "loss": 1.5086,
1076
  "step": 890
1077
  },
1078
  {
1079
- "epoch": 0.72,
1080
- "learning_rate": 2.1564016141396093e-05,
1081
- "loss": 1.564,
1082
  "step": 895
1083
  },
1084
  {
1085
- "epoch": 0.73,
1086
- "learning_rate": 2.1477230248963675e-05,
1087
- "loss": 1.6207,
1088
  "step": 900
1089
  },
1090
  {
1091
- "epoch": 0.73,
1092
- "learning_rate": 2.139017701288394e-05,
1093
- "loss": 1.5649,
1094
  "step": 905
1095
  },
1096
  {
1097
- "epoch": 0.74,
1098
- "learning_rate": 2.1302860026225027e-05,
1099
- "loss": 1.5835,
1100
  "step": 910
1101
  },
1102
  {
1103
- "epoch": 0.74,
1104
- "learning_rate": 2.121528289294122e-05,
1105
- "loss": 1.5632,
1106
  "step": 915
1107
  },
1108
  {
1109
- "epoch": 0.74,
1110
- "learning_rate": 2.1127449227724186e-05,
1111
- "loss": 1.5381,
1112
  "step": 920
1113
  },
1114
  {
1115
- "epoch": 0.75,
1116
- "learning_rate": 2.1039362655853796e-05,
1117
- "loss": 1.5546,
1118
  "step": 925
1119
  },
1120
  {
1121
- "epoch": 0.75,
1122
- "learning_rate": 2.0951026813048475e-05,
1123
- "loss": 1.4658,
1124
  "step": 930
1125
  },
1126
  {
1127
- "epoch": 0.76,
1128
- "learning_rate": 2.0862445345315165e-05,
1129
- "loss": 1.5073,
1130
  "step": 935
1131
  },
1132
  {
1133
- "epoch": 0.76,
1134
- "learning_rate": 2.0773621908798818e-05,
1135
- "loss": 1.5564,
1136
  "step": 940
1137
  },
1138
  {
1139
- "epoch": 0.76,
1140
- "learning_rate": 2.068456016963149e-05,
1141
- "loss": 1.6254,
1142
  "step": 945
1143
  },
1144
  {
1145
- "epoch": 0.77,
1146
- "learning_rate": 2.0595263803781037e-05,
1147
- "loss": 1.5497,
1148
  "step": 950
1149
  },
1150
  {
1151
- "epoch": 0.77,
1152
- "learning_rate": 2.050573649689938e-05,
1153
- "loss": 1.5791,
1154
  "step": 955
1155
  },
1156
  {
1157
- "epoch": 0.78,
1158
- "learning_rate": 2.0415981944170405e-05,
1159
- "loss": 1.4777,
1160
  "step": 960
1161
  },
1162
  {
1163
- "epoch": 0.78,
1164
- "learning_rate": 2.0326003850157408e-05,
1165
- "loss": 1.5316,
1166
  "step": 965
1167
  },
1168
  {
1169
- "epoch": 0.78,
1170
- "learning_rate": 2.0235805928650214e-05,
1171
- "loss": 1.5288,
1172
  "step": 970
1173
  },
1174
  {
1175
- "epoch": 0.79,
1176
- "learning_rate": 2.0145391902511905e-05,
1177
- "loss": 1.5519,
1178
  "step": 975
1179
  },
1180
  {
1181
- "epoch": 0.79,
1182
- "learning_rate": 2.0054765503525136e-05,
1183
- "loss": 1.5359,
1184
  "step": 980
1185
  },
1186
  {
1187
- "epoch": 0.8,
1188
- "learning_rate": 1.9963930472238126e-05,
1189
- "loss": 1.5367,
1190
  "step": 985
1191
  },
1192
  {
1193
- "epoch": 0.8,
1194
- "learning_rate": 1.9872890557810258e-05,
1195
- "loss": 1.5804,
1196
  "step": 990
1197
  },
1198
  {
1199
- "epoch": 0.81,
1200
- "learning_rate": 1.978164951785733e-05,
1201
- "loss": 1.6531,
1202
  "step": 995
1203
  },
1204
  {
1205
- "epoch": 0.81,
1206
- "learning_rate": 1.96902111182965e-05,
1207
- "loss": 1.6579,
1208
  "step": 1000
1209
  },
1210
  {
1211
- "epoch": 0.81,
1212
- "learning_rate": 1.959857913319078e-05,
1213
- "loss": 1.6308,
1214
  "step": 1005
1215
  },
1216
  {
1217
- "epoch": 0.82,
1218
- "learning_rate": 1.9506757344593345e-05,
1219
- "loss": 1.6248,
1220
  "step": 1010
1221
  },
1222
  {
1223
- "epoch": 0.82,
1224
- "learning_rate": 1.941474954239136e-05,
1225
- "loss": 1.5152,
1226
  "step": 1015
1227
- },
1228
- {
1229
- "epoch": 0.83,
1230
- "learning_rate": 1.9322559524149603e-05,
1231
- "loss": 1.5966,
1232
- "step": 1020
1233
- },
1234
- {
1235
- "epoch": 0.83,
1236
- "learning_rate": 1.92301910949537e-05,
1237
- "loss": 1.5624,
1238
- "step": 1025
1239
- },
1240
- {
1241
- "epoch": 0.83,
1242
- "learning_rate": 1.9137648067253087e-05,
1243
- "loss": 1.6706,
1244
- "step": 1030
1245
- },
1246
- {
1247
- "epoch": 0.84,
1248
- "learning_rate": 1.9044934260703623e-05,
1249
- "loss": 1.566,
1250
- "step": 1035
1251
- },
1252
- {
1253
- "epoch": 0.84,
1254
- "learning_rate": 1.895205350200998e-05,
1255
- "loss": 1.508,
1256
- "step": 1040
1257
- },
1258
- {
1259
- "epoch": 0.85,
1260
- "learning_rate": 1.885900962476767e-05,
1261
- "loss": 1.6144,
1262
- "step": 1045
1263
- },
1264
- {
1265
- "epoch": 0.85,
1266
- "learning_rate": 1.8765806469304814e-05,
1267
- "loss": 1.6291,
1268
- "step": 1050
1269
- },
1270
- {
1271
- "epoch": 0.85,
1272
- "learning_rate": 1.8672447882523644e-05,
1273
- "loss": 1.6088,
1274
- "step": 1055
1275
- },
1276
- {
1277
- "epoch": 0.86,
1278
- "learning_rate": 1.8578937717741727e-05,
1279
- "loss": 1.5956,
1280
- "step": 1060
1281
- },
1282
- {
1283
- "epoch": 0.86,
1284
- "learning_rate": 1.8485279834532923e-05,
1285
- "loss": 1.6554,
1286
- "step": 1065
1287
- },
1288
- {
1289
- "epoch": 0.87,
1290
- "learning_rate": 1.839147809856807e-05,
1291
- "loss": 1.6333,
1292
- "step": 1070
1293
- },
1294
- {
1295
- "epoch": 0.87,
1296
- "learning_rate": 1.8297536381455434e-05,
1297
- "loss": 1.6016,
1298
- "step": 1075
1299
- },
1300
- {
1301
- "epoch": 0.87,
1302
- "learning_rate": 1.8203458560580934e-05,
1303
- "loss": 1.5765,
1304
- "step": 1080
1305
- },
1306
- {
1307
- "epoch": 0.88,
1308
- "learning_rate": 1.810924851894807e-05,
1309
- "loss": 1.4411,
1310
- "step": 1085
1311
- },
1312
- {
1313
- "epoch": 0.88,
1314
- "learning_rate": 1.801491014501768e-05,
1315
- "loss": 1.6064,
1316
- "step": 1090
1317
- },
1318
- {
1319
- "epoch": 0.89,
1320
- "learning_rate": 1.7920447332547423e-05,
1321
- "loss": 1.5165,
1322
- "step": 1095
1323
- },
1324
- {
1325
- "epoch": 0.89,
1326
- "learning_rate": 1.7825863980431106e-05,
1327
- "loss": 1.628,
1328
- "step": 1100
1329
- },
1330
- {
1331
- "epoch": 0.89,
1332
- "learning_rate": 1.773116399253772e-05,
1333
- "loss": 1.5792,
1334
- "step": 1105
1335
- },
1336
- {
1337
- "epoch": 0.9,
1338
- "learning_rate": 1.7636351277550324e-05,
1339
- "loss": 1.5908,
1340
- "step": 1110
1341
- },
1342
- {
1343
- "epoch": 0.9,
1344
- "learning_rate": 1.7541429748804722e-05,
1345
- "loss": 1.5477,
1346
- "step": 1115
1347
- },
1348
- {
1349
- "epoch": 0.91,
1350
- "learning_rate": 1.744640332412794e-05,
1351
- "loss": 1.6118,
1352
- "step": 1120
1353
- },
1354
- {
1355
- "epoch": 0.91,
1356
- "learning_rate": 1.7351275925676517e-05,
1357
- "loss": 1.6034,
1358
- "step": 1125
1359
- },
1360
- {
1361
- "epoch": 0.91,
1362
- "learning_rate": 1.725605147977461e-05,
1363
- "loss": 1.6053,
1364
- "step": 1130
1365
- },
1366
- {
1367
- "epoch": 0.92,
1368
- "learning_rate": 1.716073391675197e-05,
1369
- "loss": 1.5638,
1370
- "step": 1135
1371
- },
1372
- {
1373
- "epoch": 0.92,
1374
- "learning_rate": 1.7065327170781678e-05,
1375
- "loss": 1.5809,
1376
- "step": 1140
1377
- },
1378
- {
1379
- "epoch": 0.93,
1380
- "learning_rate": 1.69698351797178e-05,
1381
- "loss": 1.5822,
1382
- "step": 1145
1383
- },
1384
- {
1385
- "epoch": 0.93,
1386
- "learning_rate": 1.6874261884932842e-05,
1387
- "loss": 1.5981,
1388
- "step": 1150
1389
- },
1390
- {
1391
- "epoch": 0.93,
1392
- "learning_rate": 1.677861123115506e-05,
1393
- "loss": 1.5995,
1394
- "step": 1155
1395
- },
1396
- {
1397
- "epoch": 0.94,
1398
- "learning_rate": 1.668288716630566e-05,
1399
- "loss": 1.5323,
1400
- "step": 1160
1401
- },
1402
- {
1403
- "epoch": 0.94,
1404
- "learning_rate": 1.6587093641335844e-05,
1405
- "loss": 1.5981,
1406
- "step": 1165
1407
- },
1408
- {
1409
- "epoch": 0.95,
1410
- "learning_rate": 1.6491234610063732e-05,
1411
- "loss": 1.6448,
1412
- "step": 1170
1413
- },
1414
- {
1415
- "epoch": 0.95,
1416
- "learning_rate": 1.6395314029011184e-05,
1417
- "loss": 1.63,
1418
- "step": 1175
1419
- },
1420
- {
1421
- "epoch": 0.95,
1422
- "learning_rate": 1.6299335857240484e-05,
1423
- "loss": 1.6529,
1424
- "step": 1180
1425
- },
1426
- {
1427
- "epoch": 0.96,
1428
- "learning_rate": 1.6203304056190938e-05,
1429
- "loss": 1.6172,
1430
- "step": 1185
1431
- },
1432
- {
1433
- "epoch": 0.96,
1434
- "learning_rate": 1.6107222589515376e-05,
1435
- "loss": 1.5247,
1436
- "step": 1190
1437
- },
1438
- {
1439
- "epoch": 0.97,
1440
- "learning_rate": 1.601109542291652e-05,
1441
- "loss": 1.5932,
1442
- "step": 1195
1443
- },
1444
- {
1445
- "epoch": 0.97,
1446
- "learning_rate": 1.591492652398336e-05,
1447
- "loss": 1.6186,
1448
- "step": 1200
1449
- },
1450
- {
1451
- "epoch": 0.98,
1452
- "learning_rate": 1.5818719862027338e-05,
1453
- "loss": 1.6524,
1454
- "step": 1205
1455
- },
1456
- {
1457
- "epoch": 0.98,
1458
- "learning_rate": 1.572247940791856e-05,
1459
- "loss": 1.611,
1460
- "step": 1210
1461
- },
1462
- {
1463
- "epoch": 0.98,
1464
- "learning_rate": 1.5626209133921863e-05,
1465
- "loss": 1.5913,
1466
- "step": 1215
1467
- },
1468
- {
1469
- "epoch": 0.99,
1470
- "learning_rate": 1.5529913013532894e-05,
1471
- "loss": 1.7348,
1472
- "step": 1220
1473
- },
1474
- {
1475
- "epoch": 0.99,
1476
- "learning_rate": 1.543359502131409e-05,
1477
- "loss": 1.719,
1478
- "step": 1225
1479
- },
1480
- {
1481
- "epoch": 1.0,
1482
- "learning_rate": 1.533725913273064e-05,
1483
- "loss": 1.6274,
1484
- "step": 1230
1485
- },
1486
- {
1487
- "epoch": 1.0,
1488
- "learning_rate": 1.5240909323986384e-05,
1489
- "loss": 1.5266,
1490
- "step": 1235
1491
- },
1492
- {
1493
- "epoch": 1.0,
1494
- "eval_loss": 3.4220237731933594,
1495
- "eval_runtime": 954.9678,
1496
- "eval_samples_per_second": 9.114,
1497
- "eval_steps_per_second": 2.279,
1498
- "step": 1235
1499
- },
1500
- {
1501
- "epoch": 1.0,
1502
- "learning_rate": 1.5144549571859711e-05,
1503
- "loss": 1.3304,
1504
- "step": 1240
1505
- },
1506
- {
1507
- "epoch": 1.01,
1508
- "learning_rate": 1.5048183853539425e-05,
1509
- "loss": 1.0536,
1510
- "step": 1245
1511
- },
1512
- {
1513
- "epoch": 1.01,
1514
- "learning_rate": 1.4951816146460574e-05,
1515
- "loss": 1.0564,
1516
- "step": 1250
1517
- },
1518
- {
1519
- "epoch": 1.02,
1520
- "learning_rate": 1.4855450428140291e-05,
1521
- "loss": 1.0105,
1522
- "step": 1255
1523
- },
1524
- {
1525
- "epoch": 1.02,
1526
- "learning_rate": 1.4759090676013616e-05,
1527
- "loss": 1.0252,
1528
- "step": 1260
1529
- },
1530
- {
1531
- "epoch": 1.02,
1532
- "learning_rate": 1.4662740867269361e-05,
1533
- "loss": 1.0029,
1534
- "step": 1265
1535
- },
1536
- {
1537
- "epoch": 1.03,
1538
- "learning_rate": 1.4566404978685912e-05,
1539
- "loss": 0.9685,
1540
- "step": 1270
1541
- },
1542
- {
1543
- "epoch": 1.03,
1544
- "learning_rate": 1.447008698646711e-05,
1545
- "loss": 1.0095,
1546
- "step": 1275
1547
- },
1548
- {
1549
- "epoch": 1.04,
1550
- "learning_rate": 1.4373790866078143e-05,
1551
- "loss": 0.9759,
1552
- "step": 1280
1553
- },
1554
- {
1555
- "epoch": 1.04,
1556
- "learning_rate": 1.4277520592081442e-05,
1557
- "loss": 1.0217,
1558
- "step": 1285
1559
- },
1560
- {
1561
- "epoch": 1.04,
1562
- "learning_rate": 1.4181280137972666e-05,
1563
- "loss": 0.9472,
1564
- "step": 1290
1565
- },
1566
- {
1567
- "epoch": 1.05,
1568
- "learning_rate": 1.4085073476016642e-05,
1569
- "loss": 1.0432,
1570
- "step": 1295
1571
- },
1572
- {
1573
- "epoch": 1.05,
1574
- "learning_rate": 1.3988904577083481e-05,
1575
- "loss": 1.0355,
1576
- "step": 1300
1577
- },
1578
- {
1579
- "epoch": 1.06,
1580
- "learning_rate": 1.389277741048463e-05,
1581
- "loss": 0.9971,
1582
- "step": 1305
1583
- },
1584
- {
1585
- "epoch": 1.06,
1586
- "learning_rate": 1.3796695943809063e-05,
1587
- "loss": 0.9947,
1588
- "step": 1310
1589
- },
1590
- {
1591
- "epoch": 1.06,
1592
- "learning_rate": 1.3700664142759521e-05,
1593
- "loss": 1.0532,
1594
- "step": 1315
1595
- },
1596
- {
1597
- "epoch": 1.07,
1598
- "learning_rate": 1.3604685970988817e-05,
1599
- "loss": 0.9791,
1600
- "step": 1320
1601
- },
1602
- {
1603
- "epoch": 1.07,
1604
- "learning_rate": 1.3508765389936272e-05,
1605
- "loss": 1.0299,
1606
- "step": 1325
1607
- },
1608
- {
1609
- "epoch": 1.08,
1610
- "learning_rate": 1.341290635866416e-05,
1611
- "loss": 1.036,
1612
- "step": 1330
1613
- },
1614
- {
1615
- "epoch": 1.08,
1616
- "learning_rate": 1.3317112833694344e-05,
1617
- "loss": 1.0534,
1618
- "step": 1335
1619
- },
1620
- {
1621
- "epoch": 1.08,
1622
- "learning_rate": 1.3221388768844937e-05,
1623
- "loss": 1.0144,
1624
- "step": 1340
1625
- },
1626
- {
1627
- "epoch": 1.09,
1628
- "learning_rate": 1.3125738115067159e-05,
1629
- "loss": 1.0696,
1630
- "step": 1345
1631
- },
1632
- {
1633
- "epoch": 1.09,
1634
- "learning_rate": 1.30301648202822e-05,
1635
- "loss": 1.0651,
1636
- "step": 1350
1637
- },
1638
- {
1639
- "epoch": 1.1,
1640
- "learning_rate": 1.2934672829218327e-05,
1641
- "loss": 1.0598,
1642
- "step": 1355
1643
- },
1644
- {
1645
- "epoch": 1.1,
1646
- "learning_rate": 1.283926608324804e-05,
1647
- "loss": 1.0514,
1648
- "step": 1360
1649
- },
1650
- {
1651
- "epoch": 1.11,
1652
- "learning_rate": 1.2743948520225391e-05,
1653
- "loss": 0.9963,
1654
- "step": 1365
1655
- },
1656
- {
1657
- "epoch": 1.11,
1658
- "learning_rate": 1.2648724074323492e-05,
1659
- "loss": 1.0307,
1660
- "step": 1370
1661
- },
1662
- {
1663
- "epoch": 1.11,
1664
- "learning_rate": 1.255359667587206e-05,
1665
- "loss": 1.0354,
1666
- "step": 1375
1667
- },
1668
- {
1669
- "epoch": 1.12,
1670
- "learning_rate": 1.2458570251195279e-05,
1671
- "loss": 1.0965,
1672
- "step": 1380
1673
- },
1674
- {
1675
- "epoch": 1.12,
1676
- "learning_rate": 1.2363648722449679e-05,
1677
- "loss": 1.0484,
1678
- "step": 1385
1679
- },
1680
- {
1681
- "epoch": 1.13,
1682
- "learning_rate": 1.2268836007462284e-05,
1683
- "loss": 1.0421,
1684
- "step": 1390
1685
- },
1686
- {
1687
- "epoch": 1.13,
1688
- "learning_rate": 1.21741360195689e-05,
1689
- "loss": 1.0695,
1690
- "step": 1395
1691
- },
1692
- {
1693
- "epoch": 1.13,
1694
- "learning_rate": 1.207955266745258e-05,
1695
- "loss": 1.0323,
1696
- "step": 1400
1697
- },
1698
- {
1699
- "epoch": 1.14,
1700
- "learning_rate": 1.198508985498233e-05,
1701
- "loss": 1.0874,
1702
- "step": 1405
1703
- },
1704
- {
1705
- "epoch": 1.14,
1706
- "learning_rate": 1.1890751481051933e-05,
1707
- "loss": 1.0702,
1708
- "step": 1410
1709
- },
1710
- {
1711
- "epoch": 1.15,
1712
- "learning_rate": 1.179654143941907e-05,
1713
- "loss": 1.0211,
1714
- "step": 1415
1715
- },
1716
- {
1717
- "epoch": 1.15,
1718
- "learning_rate": 1.1702463618544562e-05,
1719
- "loss": 1.0252,
1720
- "step": 1420
1721
- },
1722
- {
1723
- "epoch": 1.15,
1724
- "learning_rate": 1.1608521901431932e-05,
1725
- "loss": 1.0588,
1726
- "step": 1425
1727
- },
1728
- {
1729
- "epoch": 1.16,
1730
- "learning_rate": 1.1514720165467076e-05,
1731
- "loss": 1.0644,
1732
- "step": 1430
1733
- },
1734
- {
1735
- "epoch": 1.16,
1736
- "learning_rate": 1.1421062282258276e-05,
1737
- "loss": 1.0831,
1738
- "step": 1435
1739
- },
1740
- {
1741
- "epoch": 1.17,
1742
- "learning_rate": 1.1327552117476363e-05,
1743
- "loss": 1.0262,
1744
- "step": 1440
1745
- },
1746
- {
1747
- "epoch": 1.17,
1748
- "learning_rate": 1.1234193530695189e-05,
1749
- "loss": 1.0775,
1750
- "step": 1445
1751
- },
1752
- {
1753
- "epoch": 1.17,
1754
- "learning_rate": 1.1140990375232336e-05,
1755
- "loss": 1.0476,
1756
- "step": 1450
1757
- },
1758
- {
1759
- "epoch": 1.18,
1760
- "learning_rate": 1.1047946497990016e-05,
1761
- "loss": 1.0373,
1762
- "step": 1455
1763
- },
1764
- {
1765
- "epoch": 1.18,
1766
- "learning_rate": 1.0955065739296378e-05,
1767
- "loss": 1.0734,
1768
- "step": 1460
1769
  }
1770
  ],
1771
- "max_steps": 2470,
1772
  "num_train_epochs": 2,
1773
- "total_flos": 3.470813417302917e+17,
1774
  "trial_name": null,
1775
  "trial_params": null
1776
  }
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.6457290244272493,
5
+ "global_step": 1016,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
+ "epoch": 0.01,
12
+ "learning_rate": 2.4193548387096776e-06,
13
+ "loss": 3.4647,
14
  "step": 5
15
  },
16
  {
17
+ "epoch": 0.02,
18
+ "learning_rate": 4.838709677419355e-06,
19
+ "loss": 2.4108,
20
  "step": 10
21
  },
22
  {
23
+ "epoch": 0.02,
24
+ "learning_rate": 7.258064516129032e-06,
25
+ "loss": 1.5747,
26
  "step": 15
27
  },
28
  {
29
+ "epoch": 0.03,
30
+ "learning_rate": 9.67741935483871e-06,
31
+ "loss": 1.5005,
32
  "step": 20
33
  },
34
  {
35
+ "epoch": 0.04,
36
+ "learning_rate": 1.2096774193548387e-05,
37
+ "loss": 1.4694,
38
  "step": 25
39
  },
40
  {
41
+ "epoch": 0.05,
42
+ "learning_rate": 1.4516129032258065e-05,
43
+ "loss": 1.4399,
44
  "step": 30
45
  },
46
  {
47
+ "epoch": 0.06,
48
+ "learning_rate": 1.6935483870967744e-05,
49
+ "loss": 1.4215,
50
  "step": 35
51
  },
52
  {
53
+ "epoch": 0.06,
54
+ "learning_rate": 1.935483870967742e-05,
55
+ "loss": 1.4526,
56
  "step": 40
57
  },
58
  {
59
+ "epoch": 0.07,
60
+ "learning_rate": 2.1774193548387097e-05,
61
+ "loss": 1.4458,
62
  "step": 45
63
  },
64
  {
65
+ "epoch": 0.08,
66
+ "learning_rate": 2.4193548387096773e-05,
67
+ "loss": 1.4237,
68
  "step": 50
69
  },
70
  {
71
+ "epoch": 0.09,
72
+ "learning_rate": 2.6612903225806453e-05,
73
+ "loss": 1.4339,
74
  "step": 55
75
  },
76
  {
77
+ "epoch": 0.1,
78
+ "learning_rate": 2.903225806451613e-05,
79
+ "loss": 1.4294,
80
  "step": 60
81
  },
82
  {
83
+ "epoch": 0.11,
84
+ "learning_rate": 2.999951499529191e-05,
85
+ "loss": 1.4968,
86
  "step": 65
87
  },
88
  {
89
+ "epoch": 0.11,
90
+ "learning_rate": 2.9996551191211948e-05,
91
+ "loss": 1.4261,
92
  "step": 70
93
  },
94
  {
95
+ "epoch": 0.12,
96
+ "learning_rate": 2.9990893561853812e-05,
97
+ "loss": 1.4371,
98
  "step": 75
99
  },
100
  {
101
+ "epoch": 0.13,
102
+ "learning_rate": 2.9982543123495507e-05,
103
+ "loss": 1.4412,
104
  "step": 80
105
  },
106
  {
107
+ "epoch": 0.14,
108
+ "learning_rate": 2.9971501376123366e-05,
109
+ "loss": 1.4638,
110
  "step": 85
111
  },
112
  {
113
+ "epoch": 0.15,
114
+ "learning_rate": 2.9957770303162634e-05,
115
+ "loss": 1.4498,
116
  "step": 90
117
  },
118
  {
119
+ "epoch": 0.15,
120
+ "learning_rate": 2.9941352371121173e-05,
121
+ "loss": 1.4393,
122
  "step": 95
123
  },
124
  {
125
+ "epoch": 0.16,
126
+ "learning_rate": 2.992225052914641e-05,
127
+ "loss": 1.4291,
128
  "step": 100
129
  },
130
  {
131
+ "epoch": 0.17,
132
+ "learning_rate": 2.990046820849558e-05,
133
+ "loss": 1.4587,
134
  "step": 105
135
  },
136
  {
137
+ "epoch": 0.18,
138
+ "learning_rate": 2.9876009321919372e-05,
139
+ "loss": 1.4272,
140
  "step": 110
141
  },
142
  {
143
+ "epoch": 0.19,
144
+ "learning_rate": 2.9848878262959076e-05,
145
+ "loss": 1.4622,
146
  "step": 115
147
  },
148
  {
149
+ "epoch": 0.19,
150
+ "learning_rate": 2.981907990515739e-05,
151
+ "loss": 1.4863,
152
  "step": 120
153
  },
154
  {
155
+ "epoch": 0.2,
156
+ "learning_rate": 2.9786619601182965e-05,
157
+ "loss": 1.4274,
158
  "step": 125
159
  },
160
  {
161
+ "epoch": 0.21,
162
+ "learning_rate": 2.975150318186892e-05,
163
+ "loss": 1.4382,
164
  "step": 130
165
  },
166
  {
167
+ "epoch": 0.22,
168
+ "learning_rate": 2.9713736955165456e-05,
169
+ "loss": 1.4413,
170
  "step": 135
171
  },
172
  {
173
+ "epoch": 0.23,
174
+ "learning_rate": 2.9673327705006745e-05,
175
+ "loss": 1.431,
176
  "step": 140
177
  },
178
  {
179
+ "epoch": 0.23,
180
+ "learning_rate": 2.963028269009235e-05,
181
+ "loss": 1.4659,
182
  "step": 145
183
  },
184
  {
185
+ "epoch": 0.24,
186
+ "learning_rate": 2.9584609642583337e-05,
187
+ "loss": 1.4426,
188
  "step": 150
189
  },
190
  {
191
+ "epoch": 0.25,
192
+ "learning_rate": 2.9536316766713357e-05,
193
+ "loss": 1.4733,
194
  "step": 155
195
  },
196
  {
197
+ "epoch": 0.26,
198
+ "learning_rate": 2.9485412737314923e-05,
199
+ "loss": 1.4595,
200
  "step": 160
201
  },
202
  {
203
+ "epoch": 0.27,
204
+ "learning_rate": 2.9431906698261136e-05,
205
+ "loss": 1.4845,
206
  "step": 165
207
  },
208
  {
209
+ "epoch": 0.28,
210
+ "learning_rate": 2.9375808260823192e-05,
211
+ "loss": 1.5219,
212
  "step": 170
213
  },
214
  {
215
+ "epoch": 0.28,
216
+ "learning_rate": 2.931712750194392e-05,
217
+ "loss": 1.532,
218
  "step": 175
219
  },
220
  {
221
+ "epoch": 0.29,
222
+ "learning_rate": 2.9255874962427638e-05,
223
+ "loss": 1.4632,
224
  "step": 180
225
  },
226
  {
227
+ "epoch": 0.3,
228
+ "learning_rate": 2.9192061645046724e-05,
229
+ "loss": 1.5057,
230
  "step": 185
231
  },
232
  {
233
+ "epoch": 0.31,
234
+ "learning_rate": 2.9125699012565204e-05,
235
+ "loss": 1.5023,
236
  "step": 190
237
  },
238
  {
239
+ "epoch": 0.32,
240
+ "learning_rate": 2.9056798985679688e-05,
241
+ "loss": 1.4423,
242
  "step": 195
243
  },
244
  {
245
+ "epoch": 0.32,
246
+ "learning_rate": 2.8985373940878053e-05,
247
+ "loss": 1.4968,
248
  "step": 200
249
  },
250
  {
251
+ "epoch": 0.33,
252
+ "learning_rate": 2.8911436708216276e-05,
253
+ "loss": 1.4767,
254
  "step": 205
255
  },
256
  {
257
+ "epoch": 0.34,
258
+ "learning_rate": 2.883500056901376e-05,
259
+ "loss": 1.4968,
260
  "step": 210
261
  },
262
  {
263
+ "epoch": 0.35,
264
+ "learning_rate": 2.875607925346762e-05,
265
+ "loss": 1.4842,
266
  "step": 215
267
  },
268
  {
269
+ "epoch": 0.36,
270
+ "learning_rate": 2.867468693818634e-05,
271
+ "loss": 1.4921,
272
  "step": 220
273
  },
274
  {
275
+ "epoch": 0.36,
276
+ "learning_rate": 2.859083824364323e-05,
277
+ "loss": 1.4969,
278
  "step": 225
279
  },
280
  {
281
+ "epoch": 0.37,
282
+ "learning_rate": 2.8504548231550143e-05,
283
+ "loss": 1.4423,
284
  "step": 230
285
  },
286
  {
287
+ "epoch": 0.38,
288
+ "learning_rate": 2.8415832402151956e-05,
289
+ "loss": 1.5165,
290
  "step": 235
291
  },
292
  {
293
+ "epoch": 0.39,
294
+ "learning_rate": 2.832470669144227e-05,
295
+ "loss": 1.4715,
296
  "step": 240
297
  },
298
  {
299
+ "epoch": 0.4,
300
+ "learning_rate": 2.8231187468300836e-05,
301
+ "loss": 1.4747,
302
  "step": 245
303
  },
304
  {
305
+ "epoch": 0.4,
306
+ "learning_rate": 2.8135291531553192e-05,
307
+ "loss": 1.466,
308
  "step": 250
309
  },
310
  {
311
+ "epoch": 0.41,
312
+ "learning_rate": 2.8037036106953134e-05,
313
+ "loss": 1.4609,
314
  "step": 255
315
  },
316
  {
317
+ "epoch": 0.42,
318
+ "learning_rate": 2.793643884408843e-05,
319
+ "loss": 1.4649,
320
  "step": 260
321
  },
322
  {
323
+ "epoch": 0.43,
324
+ "learning_rate": 2.7833517813210437e-05,
325
+ "loss": 1.4923,
326
  "step": 265
327
  },
328
  {
329
+ "epoch": 0.44,
330
+ "learning_rate": 2.7728291501988173e-05,
331
+ "loss": 1.4968,
332
  "step": 270
333
  },
334
  {
335
+ "epoch": 0.45,
336
+ "learning_rate": 2.7620778812187338e-05,
337
+ "loss": 1.4545,
338
  "step": 275
339
  },
340
  {
341
+ "epoch": 0.45,
342
+ "learning_rate": 2.7510999056275038e-05,
343
+ "loss": 1.4791,
344
  "step": 280
345
  },
346
  {
347
+ "epoch": 0.46,
348
+ "learning_rate": 2.739897195395067e-05,
349
+ "loss": 1.4849,
350
  "step": 285
351
  },
352
  {
353
+ "epoch": 0.47,
354
+ "learning_rate": 2.728471762860369e-05,
355
+ "loss": 1.4492,
356
  "step": 290
357
  },
358
  {
359
+ "epoch": 0.48,
360
+ "learning_rate": 2.716825660369885e-05,
361
+ "loss": 1.5052,
362
  "step": 295
363
  },
364
  {
365
+ "epoch": 0.49,
366
+ "learning_rate": 2.704960979908957e-05,
367
+ "loss": 1.4701,
368
  "step": 300
369
  },
370
  {
371
+ "epoch": 0.49,
372
+ "learning_rate": 2.6928798527260127e-05,
373
+ "loss": 1.48,
374
  "step": 305
375
  },
376
  {
377
+ "epoch": 0.5,
378
+ "learning_rate": 2.680584448949729e-05,
379
+ "loss": 1.5158,
380
  "step": 310
381
  },
382
  {
383
+ "epoch": 0.51,
384
+ "learning_rate": 2.6680769771992136e-05,
385
+ "loss": 1.5047,
386
  "step": 315
387
  },
388
  {
389
+ "epoch": 0.52,
390
+ "learning_rate": 2.6553596841872682e-05,
391
+ "loss": 1.5023,
392
  "step": 320
393
  },
394
  {
395
+ "epoch": 0.53,
396
+ "learning_rate": 2.6424348543168177e-05,
397
+ "loss": 1.4932,
398
  "step": 325
399
  },
400
  {
401
+ "epoch": 0.53,
402
+ "learning_rate": 2.6293048092705586e-05,
403
+ "loss": 1.5241,
404
  "step": 330
405
  },
406
  {
407
+ "epoch": 0.54,
408
+ "learning_rate": 2.6159719075939196e-05,
409
+ "loss": 1.4969,
410
  "step": 335
411
  },
412
  {
413
+ "epoch": 0.55,
414
+ "learning_rate": 2.602438544271395e-05,
415
+ "loss": 1.4983,
416
  "step": 340
417
  },
418
  {
419
+ "epoch": 0.56,
420
+ "learning_rate": 2.5887071502963338e-05,
421
+ "loss": 1.4801,
422
  "step": 345
423
  },
424
  {
425
+ "epoch": 0.57,
426
+ "learning_rate": 2.574780192234264e-05,
427
+ "loss": 1.4595,
428
  "step": 350
429
  },
430
  {
431
+ "epoch": 0.57,
432
+ "learning_rate": 2.5606601717798212e-05,
433
+ "loss": 1.4901,
434
  "step": 355
435
  },
436
  {
437
+ "epoch": 0.58,
438
+ "learning_rate": 2.5463496253073726e-05,
439
+ "loss": 1.4939,
440
  "step": 360
441
  },
442
  {
443
+ "epoch": 0.59,
444
+ "learning_rate": 2.531851123415406e-05,
445
+ "loss": 1.4791,
446
  "step": 365
447
  },
448
  {
449
+ "epoch": 0.6,
450
+ "learning_rate": 2.5171672704647785e-05,
451
+ "loss": 1.448,
452
  "step": 370
453
  },
454
  {
455
+ "epoch": 0.61,
456
+ "learning_rate": 2.502300704110891e-05,
457
+ "loss": 1.4857,
458
  "step": 375
459
  },
460
  {
461
+ "epoch": 0.61,
462
+ "learning_rate": 2.4872540948298913e-05,
463
+ "loss": 1.4829,
464
  "step": 380
465
  },
466
  {
467
+ "epoch": 0.62,
468
+ "learning_rate": 2.472030145438974e-05,
469
+ "loss": 1.4786,
470
  "step": 385
471
  },
472
  {
473
+ "epoch": 0.63,
474
+ "learning_rate": 2.4566315906108772e-05,
475
+ "loss": 1.4118,
476
  "step": 390
477
  },
478
  {
479
+ "epoch": 0.64,
480
+ "learning_rate": 2.4410611963826522e-05,
481
+ "loss": 1.429,
482
  "step": 395
483
  },
484
  {
485
+ "epoch": 0.65,
486
+ "learning_rate": 2.4253217596588036e-05,
487
+ "loss": 1.4719,
488
  "step": 400
489
  },
490
  {
491
+ "epoch": 0.66,
492
+ "learning_rate": 2.4094161077088784e-05,
493
+ "loss": 1.5054,
494
  "step": 405
495
  },
496
  {
497
+ "epoch": 0.66,
498
+ "learning_rate": 2.3933470976596088e-05,
499
+ "loss": 1.4493,
500
  "step": 410
501
  },
502
  {
503
+ "epoch": 0.67,
504
+ "learning_rate": 2.3771176159816846e-05,
505
+ "loss": 1.4957,
506
  "step": 415
507
  },
508
  {
509
+ "epoch": 0.68,
510
+ "learning_rate": 2.360730577971259e-05,
511
+ "loss": 1.4764,
512
  "step": 420
513
  },
514
  {
515
+ "epoch": 0.69,
516
+ "learning_rate": 2.3441889272262742e-05,
517
+ "loss": 1.5056,
518
  "step": 425
519
  },
520
  {
521
+ "epoch": 0.7,
522
+ "learning_rate": 2.3274956351177037e-05,
523
+ "loss": 1.4732,
524
  "step": 430
525
  },
526
  {
527
+ "epoch": 0.7,
528
+ "learning_rate": 2.3106537002558074e-05,
529
+ "loss": 1.5047,
530
  "step": 435
531
  },
532
  {
533
+ "epoch": 0.71,
534
+ "learning_rate": 2.293666147951491e-05,
535
+ "loss": 1.5098,
536
  "step": 440
537
  },
538
  {
539
+ "epoch": 0.72,
540
+ "learning_rate": 2.2765360296728697e-05,
541
+ "loss": 1.504,
542
  "step": 445
543
  },
544
  {
545
+ "epoch": 0.73,
546
+ "learning_rate": 2.259266422497137e-05,
547
+ "loss": 1.4742,
548
  "step": 450
549
  },
550
  {
551
+ "epoch": 0.74,
552
+ "learning_rate": 2.2418604285578273e-05,
553
+ "loss": 1.482,
554
  "step": 455
555
  },
556
  {
557
+ "epoch": 0.74,
558
+ "learning_rate": 2.2243211744875818e-05,
559
+ "loss": 1.4702,
560
  "step": 460
561
  },
562
  {
563
+ "epoch": 0.75,
564
+ "learning_rate": 2.2066518108565137e-05,
565
+ "loss": 1.4839,
566
  "step": 465
567
  },
568
  {
569
+ "epoch": 0.76,
570
+ "learning_rate": 2.18885551160627e-05,
571
+ "loss": 1.4942,
572
  "step": 470
573
  },
574
  {
575
+ "epoch": 0.77,
576
+ "learning_rate": 2.1709354734798998e-05,
577
+ "loss": 1.4497,
578
  "step": 475
579
  },
580
  {
581
+ "epoch": 0.78,
582
+ "learning_rate": 2.152894915447624e-05,
583
+ "loss": 1.4927,
584
  "step": 480
585
  },
586
  {
587
+ "epoch": 0.78,
588
+ "learning_rate": 2.134737078128611e-05,
589
+ "loss": 1.4794,
590
  "step": 485
591
  },
592
  {
593
+ "epoch": 0.79,
594
+ "learning_rate": 2.1164652232088674e-05,
595
+ "loss": 1.5034,
596
  "step": 490
597
  },
598
  {
599
+ "epoch": 0.8,
600
+ "learning_rate": 2.0980826328553416e-05,
601
+ "loss": 1.5157,
602
  "step": 495
603
  },
604
  {
605
+ "epoch": 0.81,
606
+ "learning_rate": 2.0795926091263504e-05,
607
+ "loss": 1.4681,
608
  "step": 500
609
  },
610
  {
611
+ "epoch": 0.82,
612
+ "learning_rate": 2.0609984733784287e-05,
613
+ "loss": 1.4732,
614
  "step": 505
615
  },
616
  {
617
+ "epoch": 0.83,
618
+ "learning_rate": 2.042303565669719e-05,
619
+ "loss": 1.5047,
620
  "step": 510
621
  },
622
  {
623
+ "epoch": 0.83,
624
+ "learning_rate": 2.0235112441599948e-05,
625
+ "loss": 1.5093,
626
  "step": 515
627
  },
628
  {
629
+ "epoch": 0.84,
630
+ "learning_rate": 2.0046248845074373e-05,
631
+ "loss": 1.4515,
632
  "step": 520
633
  },
634
  {
635
+ "epoch": 0.85,
636
+ "learning_rate": 1.9856478792622666e-05,
637
+ "loss": 1.5051,
638
  "step": 525
639
  },
640
  {
641
+ "epoch": 0.86,
642
+ "learning_rate": 1.9665836372573397e-05,
643
+ "loss": 1.5073,
644
  "step": 530
645
  },
646
  {
647
+ "epoch": 0.87,
648
+ "learning_rate": 1.947435582995821e-05,
649
+ "loss": 1.4952,
650
  "step": 535
651
  },
652
  {
653
+ "epoch": 0.87,
654
+ "learning_rate": 1.928207156036043e-05,
655
+ "loss": 1.4308,
656
  "step": 540
657
  },
658
  {
659
+ "epoch": 0.88,
660
+ "learning_rate": 1.9089018103736568e-05,
661
+ "loss": 1.4588,
662
  "step": 545
663
  },
664
  {
665
+ "epoch": 0.89,
666
+ "learning_rate": 1.8895230138211942e-05,
667
+ "loss": 1.5477,
668
  "step": 550
669
  },
670
  {
671
+ "epoch": 0.9,
672
+ "learning_rate": 1.870074247385144e-05,
673
+ "loss": 1.4979,
674
  "step": 555
675
  },
676
  {
677
+ "epoch": 0.91,
678
+ "learning_rate": 1.8505590046406615e-05,
679
+ "loss": 1.4487,
680
  "step": 560
681
  },
682
  {
683
+ "epoch": 0.91,
684
+ "learning_rate": 1.8309807911040186e-05,
685
+ "loss": 1.4671,
686
  "step": 565
687
  },
688
  {
689
+ "epoch": 0.92,
690
+ "learning_rate": 1.8113431236029078e-05,
691
+ "loss": 1.4486,
692
  "step": 570
693
  },
694
  {
695
+ "epoch": 0.93,
696
+ "learning_rate": 1.7916495296447162e-05,
697
+ "loss": 1.483,
698
  "step": 575
699
  },
700
  {
701
+ "epoch": 0.94,
702
+ "learning_rate": 1.771903546782883e-05,
703
+ "loss": 1.4896,
704
  "step": 580
705
  },
706
  {
707
+ "epoch": 0.95,
708
+ "learning_rate": 1.7521087219814454e-05,
709
+ "loss": 1.5259,
710
  "step": 585
711
  },
712
  {
713
+ "epoch": 0.95,
714
+ "learning_rate": 1.7322686109779032e-05,
715
+ "loss": 1.4845,
716
  "step": 590
717
  },
718
  {
719
+ "epoch": 0.96,
720
+ "learning_rate": 1.7123867776445e-05,
721
+ "loss": 1.4866,
722
  "step": 595
723
  },
724
  {
725
+ "epoch": 0.97,
726
+ "learning_rate": 1.692466793348047e-05,
727
+ "loss": 1.4968,
728
  "step": 600
729
  },
730
  {
731
+ "epoch": 0.98,
732
+ "learning_rate": 1.6725122363084004e-05,
733
+ "loss": 1.4582,
734
  "step": 605
735
  },
736
  {
737
+ "epoch": 0.99,
738
+ "learning_rate": 1.6525266909557046e-05,
739
+ "loss": 1.4605,
740
  "step": 610
741
  },
742
  {
743
+ "epoch": 1.0,
744
+ "learning_rate": 1.6325137472865262e-05,
745
+ "loss": 1.4391,
746
  "step": 615
747
  },
748
  {
749
+ "epoch": 1.0,
750
+ "eval_loss": 3.6259799003601074,
751
+ "eval_runtime": 967.3528,
752
+ "eval_samples_per_second": 8.998,
753
+ "eval_steps_per_second": 2.249,
754
+ "step": 617
755
+ },
756
+ {
757
+ "epoch": 1.0,
758
+ "learning_rate": 1.6124770002189804e-05,
759
+ "loss": 1.5992,
760
  "step": 620
761
  },
762
  {
763
+ "epoch": 1.01,
764
+ "learning_rate": 1.5924200489469782e-05,
765
+ "loss": 1.1688,
766
  "step": 625
767
  },
768
  {
769
+ "epoch": 1.02,
770
+ "learning_rate": 1.572346496293706e-05,
771
+ "loss": 1.1778,
772
  "step": 630
773
  },
774
  {
775
+ "epoch": 1.03,
776
+ "learning_rate": 1.5522599480644496e-05,
777
+ "loss": 1.1652,
778
  "step": 635
779
  },
780
  {
781
+ "epoch": 1.04,
782
+ "learning_rate": 1.532164012398886e-05,
783
+ "loss": 1.1344,
784
  "step": 640
785
  },
786
  {
787
+ "epoch": 1.05,
788
+ "learning_rate": 1.5120622991229545e-05,
789
+ "loss": 1.1474,
790
  "step": 645
791
  },
792
  {
793
+ "epoch": 1.05,
794
+ "learning_rate": 1.4919584191004244e-05,
795
+ "loss": 1.1457,
796
  "step": 650
797
  },
798
  {
799
+ "epoch": 1.06,
800
+ "learning_rate": 1.471855983584276e-05,
801
+ "loss": 1.1441,
802
  "step": 655
803
  },
804
  {
805
+ "epoch": 1.07,
806
+ "learning_rate": 1.4517586035680145e-05,
807
+ "loss": 1.1546,
808
  "step": 660
809
  },
810
  {
811
+ "epoch": 1.08,
812
+ "learning_rate": 1.431669889137027e-05,
813
+ "loss": 1.1526,
814
  "step": 665
815
  },
816
  {
817
+ "epoch": 1.09,
818
+ "learning_rate": 1.4115934488201047e-05,
819
+ "loss": 1.1778,
820
  "step": 670
821
  },
822
  {
823
+ "epoch": 1.09,
824
+ "learning_rate": 1.3915328889412434e-05,
825
+ "loss": 1.1468,
826
  "step": 675
827
  },
828
  {
829
+ "epoch": 1.1,
830
+ "learning_rate": 1.3714918129718418e-05,
831
+ "loss": 1.1367,
832
  "step": 680
833
  },
834
  {
835
+ "epoch": 1.11,
836
+ "learning_rate": 1.3514738208834112e-05,
837
+ "loss": 1.1972,
838
  "step": 685
839
  },
840
  {
841
+ "epoch": 1.12,
842
+ "learning_rate": 1.331482508500912e-05,
843
+ "loss": 1.1701,
844
  "step": 690
845
  },
846
  {
847
+ "epoch": 1.13,
848
+ "learning_rate": 1.31152146685684e-05,
849
+ "loss": 1.1911,
850
  "step": 695
851
  },
852
  {
853
+ "epoch": 1.13,
854
+ "learning_rate": 1.2915942815461677e-05,
855
+ "loss": 1.1758,
856
  "step": 700
857
  },
858
  {
859
+ "epoch": 1.14,
860
+ "learning_rate": 1.2717045320822658e-05,
861
+ "loss": 1.1486,
862
  "step": 705
863
  },
864
  {
865
+ "epoch": 1.15,
866
+ "learning_rate": 1.2518557912539185e-05,
867
+ "loss": 1.1502,
868
  "step": 710
869
  },
870
  {
871
+ "epoch": 1.16,
872
+ "learning_rate": 1.232051624483541e-05,
873
+ "loss": 1.1459,
874
  "step": 715
875
  },
876
  {
877
+ "epoch": 1.17,
878
+ "learning_rate": 1.2122955891867278e-05,
879
+ "loss": 1.1546,
880
  "step": 720
881
  },
882
  {
883
+ "epoch": 1.17,
884
+ "learning_rate": 1.1925912341332324e-05,
885
+ "loss": 1.165,
886
  "step": 725
887
  },
888
  {
889
+ "epoch": 1.18,
890
+ "learning_rate": 1.1729420988095042e-05,
891
+ "loss": 1.1548,
892
  "step": 730
893
  },
894
  {
895
+ "epoch": 1.19,
896
+ "learning_rate": 1.1533517127828926e-05,
897
+ "loss": 1.1454,
898
  "step": 735
899
  },
900
  {
901
+ "epoch": 1.2,
902
+ "learning_rate": 1.1338235950676305e-05,
903
+ "loss": 1.19,
904
  "step": 740
905
  },
906
  {
907
+ "epoch": 1.21,
908
+ "learning_rate": 1.1143612534927153e-05,
909
+ "loss": 1.1475,
910
  "step": 745
911
  },
912
  {
913
+ "epoch": 1.22,
914
+ "learning_rate": 1.0949681840717997e-05,
915
+ "loss": 1.1754,
916
  "step": 750
917
  },
918
  {
919
+ "epoch": 1.22,
920
+ "learning_rate": 1.0756478703752036e-05,
921
+ "loss": 1.1041,
922
  "step": 755
923
  },
924
  {
925
+ "epoch": 1.23,
926
+ "learning_rate": 1.0564037829041609e-05,
927
+ "loss": 1.1465,
928
  "step": 760
929
  },
930
  {
931
+ "epoch": 1.24,
932
+ "learning_rate": 1.037239378467416e-05,
933
+ "loss": 1.1704,
934
  "step": 765
935
  },
936
  {
937
+ "epoch": 1.25,
938
+ "learning_rate": 1.0181580995602766e-05,
939
+ "loss": 1.1716,
940
  "step": 770
941
  },
942
  {
943
+ "epoch": 1.26,
944
+ "learning_rate": 9.991633737462405e-06,
945
+ "loss": 1.1902,
946
  "step": 775
947
  },
948
  {
949
+ "epoch": 1.26,
950
+ "learning_rate": 9.802586130413045e-06,
951
+ "loss": 1.1342,
952
  "step": 780
953
  },
954
  {
955
+ "epoch": 1.27,
956
+ "learning_rate": 9.614472133010623e-06,
957
+ "loss": 1.1848,
958
  "step": 785
959
  },
960
  {
961
+ "epoch": 1.28,
962
+ "learning_rate": 9.42732553610712e-06,
963
+ "loss": 1.1652,
964
  "step": 790
965
  },
966
  {
967
+ "epoch": 1.29,
968
+ "learning_rate": 9.241179956780689e-06,
969
+ "loss": 1.1832,
970
  "step": 795
971
  },
972
  {
973
+ "epoch": 1.3,
974
+ "learning_rate": 9.056068832297041e-06,
975
+ "loss": 1.1935,
976
  "step": 800
977
  },
978
  {
979
+ "epoch": 1.3,
980
+ "learning_rate": 8.872025414103135e-06,
981
+ "loss": 1.1714,
982
  "step": 805
983
  },
984
  {
985
+ "epoch": 1.31,
986
+ "learning_rate": 8.689082761854213e-06,
987
+ "loss": 1.1525,
988
  "step": 810
989
  },
990
  {
991
+ "epoch": 1.32,
992
+ "learning_rate": 8.507273737475307e-06,
993
+ "loss": 1.1701,
994
  "step": 815
995
  },
996
  {
997
+ "epoch": 1.33,
998
+ "learning_rate": 8.326630999258286e-06,
999
+ "loss": 1.1761,
1000
  "step": 820
1001
  },
1002
  {
1003
+ "epoch": 1.34,
1004
+ "learning_rate": 8.14718699599542e-06,
1005
+ "loss": 1.1323,
1006
  "step": 825
1007
  },
1008
  {
1009
+ "epoch": 1.34,
1010
+ "learning_rate": 7.968973961150653e-06,
1011
+ "loss": 1.1768,
1012
  "step": 830
1013
  },
1014
  {
1015
+ "epoch": 1.35,
1016
+ "learning_rate": 7.792023907069486e-06,
1017
+ "loss": 1.1487,
1018
  "step": 835
1019
  },
1020
  {
1021
+ "epoch": 1.36,
1022
+ "learning_rate": 7.616368619228645e-06,
1023
+ "loss": 1.1445,
1024
  "step": 840
1025
  },
1026
  {
1027
+ "epoch": 1.37,
1028
+ "learning_rate": 7.442039650526419e-06,
1029
+ "loss": 1.1348,
1030
  "step": 845
1031
  },
1032
  {
1033
+ "epoch": 1.38,
1034
+ "learning_rate": 7.2690683156148705e-06,
1035
+ "loss": 1.1478,
1036
  "step": 850
1037
  },
1038
  {
1039
+ "epoch": 1.39,
1040
+ "learning_rate": 7.097485685274776e-06,
1041
+ "loss": 1.1625,
1042
  "step": 855
1043
  },
1044
  {
1045
+ "epoch": 1.39,
1046
+ "learning_rate": 6.927322580834376e-06,
1047
+ "loss": 1.1331,
1048
  "step": 860
1049
  },
1050
  {
1051
+ "epoch": 1.4,
1052
+ "learning_rate": 6.758609568632982e-06,
1053
+ "loss": 1.1608,
1054
  "step": 865
1055
  },
1056
  {
1057
+ "epoch": 1.41,
1058
+ "learning_rate": 6.591376954530345e-06,
1059
+ "loss": 1.1349,
1060
  "step": 870
1061
  },
1062
  {
1063
+ "epoch": 1.42,
1064
+ "learning_rate": 6.4256547784628e-06,
1065
+ "loss": 1.1569,
1066
  "step": 875
1067
  },
1068
  {
1069
+ "epoch": 1.43,
1070
+ "learning_rate": 6.261472809047244e-06,
1071
+ "loss": 1.152,
1072
  "step": 880
1073
  },
1074
  {
1075
+ "epoch": 1.43,
1076
+ "learning_rate": 6.098860538233769e-06,
1077
+ "loss": 1.1498,
1078
  "step": 885
1079
  },
1080
  {
1081
+ "epoch": 1.44,
1082
+ "learning_rate": 5.937847176008072e-06,
1083
+ "loss": 1.1802,
1084
  "step": 890
1085
  },
1086
  {
1087
+ "epoch": 1.45,
1088
+ "learning_rate": 5.778461645144438e-06,
1089
+ "loss": 1.1717,
1090
  "step": 895
1091
  },
1092
  {
1093
+ "epoch": 1.46,
1094
+ "learning_rate": 5.6207325760103845e-06,
1095
+ "loss": 1.152,
1096
  "step": 900
1097
  },
1098
  {
1099
+ "epoch": 1.47,
1100
+ "learning_rate": 5.464688301423782e-06,
1101
+ "loss": 1.1414,
1102
  "step": 905
1103
  },
1104
  {
1105
+ "epoch": 1.47,
1106
+ "learning_rate": 5.310356851563427e-06,
1107
+ "loss": 1.154,
1108
  "step": 910
1109
  },
1110
  {
1111
+ "epoch": 1.48,
1112
+ "learning_rate": 5.1577659489340255e-06,
1113
+ "loss": 1.1353,
1114
  "step": 915
1115
  },
1116
  {
1117
+ "epoch": 1.49,
1118
+ "learning_rate": 5.00694300338638e-06,
1119
+ "loss": 1.1783,
1120
  "step": 920
1121
  },
1122
  {
1123
+ "epoch": 1.5,
1124
+ "learning_rate": 4.857915107193783e-06,
1125
+ "loss": 1.14,
1126
  "step": 925
1127
  },
1128
  {
1129
+ "epoch": 1.51,
1130
+ "learning_rate": 4.710709030185422e-06,
1131
+ "loss": 1.1755,
1132
  "step": 930
1133
  },
1134
  {
1135
+ "epoch": 1.51,
1136
+ "learning_rate": 4.565351214937748e-06,
1137
+ "loss": 1.1121,
1138
  "step": 935
1139
  },
1140
  {
1141
+ "epoch": 1.52,
1142
+ "learning_rate": 4.421867772024601e-06,
1143
+ "loss": 1.171,
1144
  "step": 940
1145
  },
1146
  {
1147
+ "epoch": 1.53,
1148
+ "learning_rate": 4.280284475326948e-06,
1149
+ "loss": 1.1517,
1150
  "step": 945
1151
  },
1152
  {
1153
+ "epoch": 1.54,
1154
+ "learning_rate": 4.140626757403176e-06,
1155
+ "loss": 1.1694,
1156
  "step": 950
1157
  },
1158
  {
1159
+ "epoch": 1.55,
1160
+ "learning_rate": 4.002919704920607e-06,
1161
+ "loss": 1.1465,
1162
  "step": 955
1163
  },
1164
  {
1165
+ "epoch": 1.56,
1166
+ "learning_rate": 3.8671880541492236e-06,
1167
+ "loss": 1.179,
1168
  "step": 960
1169
  },
1170
  {
1171
+ "epoch": 1.56,
1172
+ "learning_rate": 3.7334561865182694e-06,
1173
+ "loss": 1.1498,
1174
  "step": 965
1175
  },
1176
  {
1177
+ "epoch": 1.57,
1178
+ "learning_rate": 3.6017481242366503e-06,
1179
+ "loss": 1.1438,
1180
  "step": 970
1181
  },
1182
  {
1183
+ "epoch": 1.58,
1184
+ "learning_rate": 3.472087525977823e-06,
1185
+ "loss": 1.1647,
1186
  "step": 975
1187
  },
1188
  {
1189
+ "epoch": 1.59,
1190
+ "learning_rate": 3.3444976826299754e-06,
1191
+ "loss": 1.1475,
1192
  "step": 980
1193
  },
1194
  {
1195
+ "epoch": 1.6,
1196
+ "learning_rate": 3.219001513112329e-06,
1197
+ "loss": 1.15,
1198
  "step": 985
1199
  },
1200
  {
1201
+ "epoch": 1.6,
1202
+ "learning_rate": 3.0956215602581933e-06,
1203
+ "loss": 1.1613,
1204
  "step": 990
1205
  },
1206
  {
1207
+ "epoch": 1.61,
1208
+ "learning_rate": 2.974379986765622e-06,
1209
+ "loss": 1.1672,
1210
  "step": 995
1211
  },
1212
  {
1213
+ "epoch": 1.62,
1214
+ "learning_rate": 2.855298571216316e-06,
1215
+ "loss": 1.1702,
1216
  "step": 1000
1217
  },
1218
  {
1219
+ "epoch": 1.63,
1220
+ "learning_rate": 2.738398704163561e-06,
1221
+ "loss": 1.1634,
1222
  "step": 1005
1223
  },
1224
  {
1225
+ "epoch": 1.64,
1226
+ "learning_rate": 2.6237013842898533e-06,
1227
+ "loss": 1.1756,
1228
  "step": 1010
1229
  },
1230
  {
1231
+ "epoch": 1.64,
1232
+ "learning_rate": 2.511227214634887e-06,
1233
+ "loss": 1.1075,
1234
  "step": 1015
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1235
  }
1236
  ],
1237
+ "max_steps": 1234,
1238
  "num_train_epochs": 2,
1239
+ "total_flos": 4.827958566725878e+17,
1240
  "trial_name": null,
1241
  "trial_params": null
1242
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b96a0dcac8585789fb3747446a827a33824ee43a375597e136dd02ef9963c321
3
- size 4335
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41b3235764a2f119a2586ca316b39c8e5986e318c5b3665781ee71759bb93cc0
3
+ size 4271
vocab.json CHANGED
The diff for this file is too large to render. See raw diff
zero_to_fp32.py ADDED
@@ -0,0 +1,484 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # This script extracts consolidated fp32 weights from ZeRO stage 2 and stage 3 DeepSpeed checkpoints. It gets
4
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
5
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
6
+ # application.
7
+ #
8
+ # example: python zero_to_fp32.py . pytorch_model.bin
9
+
10
+ import argparse
11
+ import torch
12
+ import glob
13
+ import math
14
+ import os
15
+ import re
16
+ from collections import OrderedDict
17
+
18
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
19
+ # DeepSpeed data structures it has to be available in the current python environment.
20
+ import deepspeed
21
+ from deepspeed.utils import logger
22
+ from deepspeed.checkpoint.constants import (DS_VERSION,
23
+ OPTIMIZER_STATE_DICT,
24
+ PARAM_SHAPES,
25
+ SINGLE_PARTITION_OF_FP32_GROUPS,
26
+ FP32_FLAT_GROUPS,
27
+ ZERO_STAGE,
28
+ PARTITION_COUNT,
29
+ PARAM_SHAPES,
30
+ BUFFER_NAMES)
31
+
32
# Module-wide verbosity switch; the command-line entry point overwrites it from ``--debug``.
debug = 0

# Every checkpoint file is mapped onto the CPU when it is loaded.
device = torch.device("cpu")
36
+
37
+
38
def atoi(text):
    """Return ``int(text)`` when ``text`` consists only of digits, otherwise ``text`` unchanged."""
    if text.isdigit():
        return int(text)
    return text
40
+
41
+
42
def natural_keys(text):
    '''
    Sort key that yields human ("natural") ordering, e.g. ``alist.sort(key=natural_keys)``.

    The text is split on runs of digits; each piece is passed through ``atoi`` so that digit
    runs become integers while the remaining pieces stay strings.

    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    pieces = re.split(r'(\d+)', text)
    return list(map(atoi, pieces))
49
+
50
+
51
def get_model_state_file(checkpoint_dir, zero_stage):
    """
    Return the path of the model-states file inside a DeepSpeed checkpoint folder.

    Args:
        - ``checkpoint_dir``: folder holding the ``*_model_states.pt`` file
        - ``zero_stage``: ZeRO stage of the checkpoint (2 or 3); it selects the expected file name

    Returns:
        - path to the model states file

    Raises:
        - ``FileNotFoundError``: ``checkpoint_dir`` is not a directory, or the expected file is missing
        - ``ValueError``: ``zero_stage`` is neither 2 nor 3
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage == 2:
        file_name = "mp_rank_00_model_states.pt"
    elif zero_stage == 3:
        file_name = "zero_pp_rank_0_mp_rank_00_model_states.pt"
    else:
        # Without this branch the path below would be unbound and surface as an UnboundLocalError.
        raise ValueError(f"unknown zero stage {zero_stage}")

    file = os.path.join(checkpoint_dir, file_name)
    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
65
+
66
+
67
def get_optim_files(checkpoint_dir):
    """
    Collect the ``*_optim_states.pt`` files of a checkpoint folder in natural (human) order.

    Raises:
        - ``FileNotFoundError``: no matching file exists in ``checkpoint_dir``
    """
    # XXX: need to test that this simple glob rule works for multi-node setup too
    pattern = os.path.join(checkpoint_dir, "*_optim_states.pt")
    optim_files = glob.glob(pattern)
    optim_files.sort(key=natural_keys)

    if not optim_files:
        raise FileNotFoundError(
            f"can't find '*_optim_states.pt' files in directory '{checkpoint_dir}'")

    return optim_files
78
+
79
+
80
def parse_model_state(file):
    """
    Load a model-states checkpoint file and pull out what the reconstruction needs.

    Args:
        - ``file``: path to the ``*_model_states.pt`` file

    Returns:
        - ``(buffers, param_shapes, ds_version)``: the module buffers converted to fp32, the
          recorded parameter shapes, and the stored DeepSpeed version (``None`` when absent)

    Raises:
        - ``ValueError``: the loaded file has no buffer-names entry
    """
    # NOTE: torch.load unpickles the file - only use it on checkpoints you trust
    state_dict = torch.load(file, map_location=device)

    if BUFFER_NAMES not in state_dict:
        raise ValueError(f"{file} is not a model state checkpoint")

    buffer_names = state_dict[BUFFER_NAMES]
    if debug:
        print("Found buffers:", buffer_names)

    # recover just the buffers while restoring them to fp32 if they were saved in fp16
    buffers = {}
    for key, tensor in state_dict["module"].items():
        if key in buffer_names:
            buffers[key] = tensor.float()

    param_shapes = state_dict[PARAM_SHAPES]
    ds_version = state_dict.get(DS_VERSION, None)

    return buffers, param_shapes, ds_version
100
+
101
+
102
def parse_optim_states(files, ds_checkpoint_dir):
    """
    Load all optimizer-state files of a checkpoint and extract the flat fp32 weight groups.

    Args:
        - ``files``: paths of the ``*_optim_states.pt`` files, one per rank, already sorted
        - ``ds_checkpoint_dir``: checkpoint folder; only used in error messages

    Returns:
        - ``(zero_stage, world_size, fp32_flat_groups)`` where ``fp32_flat_groups`` has one entry
          per file: for stage 2 the stored list of per-group partitions, for stage 3 a single
          tensor obtained by concatenating that file's groups

    Raises:
        - ``ValueError``: no files were given, the first file is not a zero checkpoint, the number
          of files differs from the recorded partition count, or the zero stage is unknown
    """
    if not files:
        # indexing the first state dict below would otherwise fail with a bare IndexError
        raise ValueError(
            f"no '*_optim_states.pt' files were given for '{ds_checkpoint_dir}'")

    total_files = len(files)
    # NOTE: torch.load unpickles the files - only use it on checkpoints you trust
    state_dicts = [torch.load(f, map_location=device) for f in files]

    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
    if ZERO_STAGE not in first_optim_state:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = first_optim_state[ZERO_STAGE]
    world_size = first_optim_state[PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if isinstance(world_size, list):
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage == 2:
        fp32_flat_groups = [
            sd[OPTIMIZER_STATE_DICT][SINGLE_PARTITION_OF_FP32_GROUPS] for sd in state_dicts
        ]
    elif zero_stage == 3:
        # if there is more than one param group, there will be multiple flattened tensors - one
        # flattened tensor per group - for simplicity merge them into a single tensor
        #
        # XXX: could make the script more memory efficient for when there are multiple groups - it
        # will require matching the sub-lists of param_shapes for each param group flattened tensor
        fp32_flat_groups = [
            torch.cat(sd[OPTIMIZER_STATE_DICT][FP32_FLAT_GROUPS], 0) for sd in state_dicts
        ]
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    return zero_stage, world_size, fp32_flat_groups
153
+
154
+
155
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
    """
    Rebuild the fp32 state_dict from a DeepSpeed checkpoint folder.

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)

    Returns:
        - the state_dict produced by the stage-2 or stage-3 reconstruction routine
    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    # optimizer files first: they tell us the zero stage and how many partitions exist
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(
        get_optim_files(ds_checkpoint_dir),
        ds_checkpoint_dir)
    print(
        f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    # the model states file supplies the buffers and the parameter shapes
    buffers, param_shapes, ds_version = parse_model_state(
        get_model_state_file(ds_checkpoint_dir, zero_stage))
    print(f'Parsing checkpoint created by deepspeed=={ds_version}')

    if zero_stage == 2:
        reconstruct = _get_fp32_state_dict_from_zero2_checkpoint
    elif zero_stage == 3:
        reconstruct = _get_fp32_state_dict_from_zero3_checkpoint
    else:
        return None

    return reconstruct(world_size, param_shapes, fp32_flat_groups, buffers)
184
+
185
+
186
def _get_fp32_state_dict_from_zero2_checkpoint(world_size,
                                               param_shapes,
                                               fp32_flat_groups,
                                               buffers):
    """
    Rebuild the fp32 state_dict from ZeRO stage 2 partitions.

    Args:
        - ``world_size``: number of partitions (one per optimizer-states file)
        - ``param_shapes``: one ``name -> shape`` mapping per param group
        - ``fp32_flat_groups``: per rank, the list of flat fp32 partitions (one per param group)
        - ``buffers``: buffer tensors copied into the result as-is

    Returns:
        - ``OrderedDict`` holding the buffers followed by every reconstructed parameter

    Raises:
        - ``ValueError``: the aligned number of consumed elements of a group does not match the
          aligned number of available elements
    """
    # Reconstruction protocol:
    #
    # XXX: document this

    num_param_groups = len(fp32_flat_groups[0])

    if debug:
        for rank in range(world_size):
            for group in range(num_param_groups):
                print(
                    f"{FP32_FLAT_GROUPS}[{rank}][{group}].shape={fp32_flat_groups[rank][group].shape}")

    # XXX: memory usage doubles here (zero2)
    # for each param group, glue the partitions of all ranks into one flat vector
    merged_single_partition_of_fp32_groups = [
        torch.cat([rank_groups[group] for rank_groups in fp32_flat_groups], 0)
        for group in range(num_param_groups)
    ]
    avail_numel = sum(vector.numel() for vector in merged_single_partition_of_fp32_groups)

    if debug:
        wanted_params = sum(len(shapes) for shapes in param_shapes)
        wanted_numel = sum(
            sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes)
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    state_dict = OrderedDict()

    # buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    align_to = 2 * world_size

    def zero2_align(x):
        return align_to * math.ceil(x / align_to)

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, flat_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = flat_vector.numel()

        # carve each parameter out of the flat vector, in the recorded order
        for name, shape in shapes.items():
            unpartitioned_numel = shape.numel()
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(
                    f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} "
                )

            param_slice = flat_vector.narrow(0, offset, unpartitioned_numel)
            state_dict[name] = param_slice.view(shape)
            offset += unpartitioned_numel

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(
                f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(
        f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
    )

    return state_dict
280
+
281
+
282
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """
    Compute how a parameter of ``unpartitioned_numel`` elements splits across ``world_size`` ranks.

    Args:
        - ``unpartitioned_numel``: number of elements of the full parameter
        - ``world_size``: number of partitions

    Returns:
        - ``(partitioned_numel, padding_numel)``: the per-rank element count (ceiling of the
          division) and the number of padding elements needed to reach a multiple of ``world_size``
    """
    # Integer arithmetic only: ``math.ceil(a / b)`` goes through a float and is inexact above 2**53.
    partitioned_numel, remainder = divmod(unpartitioned_numel, world_size)
    if remainder:
        partitioned_numel += 1
        padding_numel = world_size - remainder
    else:
        padding_numel = 0
    return partitioned_numel, padding_numel
287
+
288
+
289
def _get_fp32_state_dict_from_zero3_checkpoint(world_size,
                                               param_shapes,
                                               fp32_flat_groups,
                                               buffers):
    """
    Rebuild the fp32 state_dict from ZeRO stage 3 partitions.

    Args:
        - ``world_size``: number of partitions (one per optimizer-states file)
        - ``param_shapes``: list of ``name -> shape`` mappings, merged here into a single mapping
        - ``fp32_flat_groups``: one flat fp32 tensor per rank
        - ``buffers``: buffer tensors copied into the result as-is

    Returns:
        - ``OrderedDict`` holding the buffers followed by every reconstructed parameter

    Raises:
        - ``ValueError``: the number of consumed elements does not match the available elements
    """
    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    avail_numel = fp32_flat_groups[0].numel() * world_size

    # merge list of dicts, preserving order
    merged_shapes = {}
    for group_shapes in param_shapes:
        merged_shapes.update(group_shapes)
    param_shapes = merged_shapes

    if debug:
        for rank in range(world_size):
            print(f"{FP32_FLAT_GROUPS}[{rank}].shape={fp32_flat_groups[rank].shape}")

        wanted_params = len(param_shapes)
        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    state_dict = OrderedDict()

    # buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    for name, shape in param_shapes.items():
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(
            unpartitioned_numel,
            world_size)

        if debug:
            print(
                f"{total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # XXX: memory usage doubles here
        # take this param's slice from every rank, join the slices, then drop the trailing padding
        rank_slices = [
            fp32_flat_groups[rank].narrow(0, offset, partitioned_numel)
            for rank in range(world_size)
        ]
        joined = torch.cat(rank_slices, 0)
        state_dict[name] = joined.narrow(0, 0, unpartitioned_numel).view(shape)
        offset += partitioned_numel

    # offset counted elements per rank; scale it to compare against the total over all ranks
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(
            f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(
        f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements"
    )

    return state_dict
360
+
361
+
362
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
    """
    Turn a ZeRO 2 or 3 checkpoint into one consolidated fp32 state_dict that can be passed to
    ``load_state_dict()`` and then used for training without DeepSpeed, or shared with others,
    for example via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint, e.g., ``global_step14``. If not provided, the tag is read from the 'latest' file in ``checkpoint_dir``

    Returns:
        - pytorch ``state_dict``

    Raises:
        - ``ValueError``: ``tag`` was not given and there is no 'latest' file
        - ``FileNotFoundError``: the tag folder does not exist inside ``checkpoint_dir``

    Note: this approach may not work if your application doesn't have sufficient free CPU memory;
    in that case use the offline approach with the ``zero_to_fp32.py`` script that is saved with
    the checkpoint.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    """
    if tag is None:
        # fall back to the tag recorded in the 'latest' file
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if not os.path.isfile(latest_path):
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
        with open(latest_path, 'r') as fd:
            tag = fd.read().strip()

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
    if os.path.isdir(ds_checkpoint_dir):
        return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)

    raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
409
+
410
+
411
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
    """
    Consolidate a ZeRO 2 or 3 checkpoint into a single fp32 ``state_dict`` and save it to a file
    that can be read back with ``torch.load(file)`` + ``load_state_dict()`` and used for training
    without DeepSpeed.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint, e.g., ``global_step14``. If not provided, the tag is read from the file named ``latest`` in the checkpoint folder
    """
    fp32_state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    print(f"Saving fp32 state dict to {output_file}")
    torch.save(fp32_state_dict, output_file)
425
+
426
+
427
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
    2. Put the provided model to cpu
    3. Load the ``state_dict`` into the provided model (non-strict)

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint, e.g., ``global_step14``. If not provided, the tag is read from the file named ``latest`` in the checkpoint folder

    Returns:
        - ``model``: modified model

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    logger.info("Extracting fp32 weights")
    fp32_state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    model = model.cpu()
    # strict=False: keys that do not line up between model and state_dict are not treated as errors
    model.load_state_dict(fp32_state_dict, strict=False)

    return model
464
+
465
+
466
if __name__ == "__main__":
    # Command-line entry point: <checkpoint_dir> <output_file> [-d]
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("checkpoint_dir",
                            type=str,
                            help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    arg_parser.add_argument(
        "output_file",
        type=str,
        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
    arg_parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = arg_parser.parse_args()

    # module-level flag consulted by the parsing/reconstruction functions
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file)