Mitchel Hsu committed on
Commit
507c812
1 Parent(s): e77bf98

add: Update model

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. adapter_config.json +0 -22
  2. checkpoint-1400/scaler.pt +0 -3
  3. checkpoint-1400/scheduler.pt +0 -3
  4. checkpoint-1400/trainer_state.json +0 -912
  5. checkpoint-1400/training_args.bin +0 -3
  6. checkpoint-3600/optimizer.pt +0 -3
  7. checkpoint-3600/pytorch_model.bin +0 -3
  8. checkpoint-3600/rng_state.pth +0 -3
  9. checkpoint-3600/scaler.pt +0 -3
  10. checkpoint-3600/scheduler.pt +0 -3
  11. checkpoint-3600/trainer_state.json +0 -2320
  12. checkpoint-3600/training_args.bin +0 -3
  13. checkpoint-3800/optimizer.pt +0 -3
  14. checkpoint-3800/pytorch_model.bin +0 -3
  15. checkpoint-3800/rng_state.pth +0 -3
  16. checkpoint-3800/scaler.pt +0 -3
  17. checkpoint-3800/scheduler.pt +0 -3
  18. checkpoint-3800/trainer_state.json +0 -2448
  19. checkpoint-3800/training_args.bin +0 -3
  20. config.json +24 -0
  21. generation_config.json +7 -0
  22. checkpoint-1400/rng_state.pth → pytorch_model-00001-of-00039.bin +2 -2
  23. adapter_model.bin → pytorch_model-00002-of-00039.bin +2 -2
  24. checkpoint-1400/optimizer.pt → pytorch_model-00003-of-00039.bin +2 -2
  25. checkpoint-1400/pytorch_model.bin → pytorch_model-00004-of-00039.bin +2 -2
  26. pytorch_model-00005-of-00039.bin +3 -0
  27. pytorch_model-00006-of-00039.bin +3 -0
  28. pytorch_model-00007-of-00039.bin +3 -0
  29. pytorch_model-00008-of-00039.bin +3 -0
  30. pytorch_model-00009-of-00039.bin +3 -0
  31. pytorch_model-00010-of-00039.bin +3 -0
  32. pytorch_model-00011-of-00039.bin +3 -0
  33. pytorch_model-00012-of-00039.bin +3 -0
  34. pytorch_model-00013-of-00039.bin +3 -0
  35. pytorch_model-00014-of-00039.bin +3 -0
  36. pytorch_model-00015-of-00039.bin +3 -0
  37. pytorch_model-00016-of-00039.bin +3 -0
  38. pytorch_model-00017-of-00039.bin +3 -0
  39. pytorch_model-00018-of-00039.bin +3 -0
  40. pytorch_model-00019-of-00039.bin +3 -0
  41. pytorch_model-00020-of-00039.bin +3 -0
  42. pytorch_model-00021-of-00039.bin +3 -0
  43. pytorch_model-00022-of-00039.bin +3 -0
  44. pytorch_model-00023-of-00039.bin +3 -0
  45. pytorch_model-00024-of-00039.bin +3 -0
  46. pytorch_model-00025-of-00039.bin +3 -0
  47. pytorch_model-00026-of-00039.bin +3 -0
  48. pytorch_model-00027-of-00039.bin +3 -0
  49. pytorch_model-00028-of-00039.bin +3 -0
  50. pytorch_model-00029-of-00039.bin +3 -0
adapter_config.json DELETED
@@ -1,22 +0,0 @@
1
- {
2
- "base_model_name_or_path": "decapoda-research/llama-7b-hf",
3
- "bias": "none",
4
- "fan_in_fan_out": false,
5
- "inference_mode": true,
6
- "init_lora_weights": true,
7
- "layers_pattern": null,
8
- "layers_to_transform": null,
9
- "lora_alpha": 16,
10
- "lora_dropout": 0.05,
11
- "modules_to_save": null,
12
- "peft_type": "LORA",
13
- "r": 16,
14
- "revision": null,
15
- "target_modules": [
16
- "q_proj",
17
- "k_proj",
18
- "v_proj",
19
- "o_proj"
20
- ],
21
- "task_type": "CAUSAL_LM"
22
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-1400/scaler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:16fdfc03b58220402968eacaac23fb5471cdb9061302380bd3c8d4d326c02ade
3
- size 557
 
 
 
 
checkpoint-1400/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0dcd27fef07230ceb8ed85e2a9692df56c7d01b0fe9f962e0a8e2690c9acc9b3
3
- size 627
 
 
 
 
checkpoint-1400/trainer_state.json DELETED
@@ -1,912 +0,0 @@
1
- {
2
- "best_metric": 0.8923280239105225,
3
- "best_model_checkpoint": "./lora-alpaca-hc8/checkpoint-1400",
4
- "epoch": 3.6012861736334405,
5
- "global_step": 1400,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 0.03,
12
- "learning_rate": 2.9999999999999997e-05,
13
- "loss": 1.3719,
14
- "step": 10
15
- },
16
- {
17
- "epoch": 0.05,
18
- "learning_rate": 5.9999999999999995e-05,
19
- "loss": 1.5358,
20
- "step": 20
21
- },
22
- {
23
- "epoch": 0.08,
24
- "learning_rate": 8.999999999999999e-05,
25
- "loss": 1.5633,
26
- "step": 30
27
- },
28
- {
29
- "epoch": 0.1,
30
- "learning_rate": 0.00011999999999999999,
31
- "loss": 1.2433,
32
- "step": 40
33
- },
34
- {
35
- "epoch": 0.13,
36
- "learning_rate": 0.00015,
37
- "loss": 0.9843,
38
- "step": 50
39
- },
40
- {
41
- "epoch": 0.15,
42
- "learning_rate": 0.00017999999999999998,
43
- "loss": 1.012,
44
- "step": 60
45
- },
46
- {
47
- "epoch": 0.18,
48
- "learning_rate": 0.00020999999999999998,
49
- "loss": 1.0392,
50
- "step": 70
51
- },
52
- {
53
- "epoch": 0.21,
54
- "learning_rate": 0.00023999999999999998,
55
- "loss": 1.0541,
56
- "step": 80
57
- },
58
- {
59
- "epoch": 0.23,
60
- "learning_rate": 0.00027,
61
- "loss": 0.9608,
62
- "step": 90
63
- },
64
- {
65
- "epoch": 0.26,
66
- "learning_rate": 0.0003,
67
- "loss": 0.8697,
68
- "step": 100
69
- },
70
- {
71
- "epoch": 0.28,
72
- "learning_rate": 0.00029920634920634916,
73
- "loss": 0.9836,
74
- "step": 110
75
- },
76
- {
77
- "epoch": 0.31,
78
- "learning_rate": 0.00029841269841269835,
79
- "loss": 1.003,
80
- "step": 120
81
- },
82
- {
83
- "epoch": 0.33,
84
- "learning_rate": 0.0002976190476190476,
85
- "loss": 1.0052,
86
- "step": 130
87
- },
88
- {
89
- "epoch": 0.36,
90
- "learning_rate": 0.0002968253968253968,
91
- "loss": 0.9267,
92
- "step": 140
93
- },
94
- {
95
- "epoch": 0.39,
96
- "learning_rate": 0.000296031746031746,
97
- "loss": 0.8318,
98
- "step": 150
99
- },
100
- {
101
- "epoch": 0.41,
102
- "learning_rate": 0.0002952380952380952,
103
- "loss": 0.9594,
104
- "step": 160
105
- },
106
- {
107
- "epoch": 0.44,
108
- "learning_rate": 0.00029444444444444445,
109
- "loss": 0.987,
110
- "step": 170
111
- },
112
- {
113
- "epoch": 0.46,
114
- "learning_rate": 0.00029365079365079364,
115
- "loss": 0.9646,
116
- "step": 180
117
- },
118
- {
119
- "epoch": 0.49,
120
- "learning_rate": 0.00029285714285714283,
121
- "loss": 0.8501,
122
- "step": 190
123
- },
124
- {
125
- "epoch": 0.51,
126
- "learning_rate": 0.000292063492063492,
127
- "loss": 0.7568,
128
- "step": 200
129
- },
130
- {
131
- "epoch": 0.51,
132
- "eval_loss": 0.9958714246749878,
133
- "eval_runtime": 189.2223,
134
- "eval_samples_per_second": 10.57,
135
- "eval_steps_per_second": 1.321,
136
- "step": 200
137
- },
138
- {
139
- "epoch": 0.54,
140
- "learning_rate": 0.00029126984126984126,
141
- "loss": 0.949,
142
- "step": 210
143
- },
144
- {
145
- "epoch": 0.57,
146
- "learning_rate": 0.00029047619047619045,
147
- "loss": 0.9581,
148
- "step": 220
149
- },
150
- {
151
- "epoch": 0.59,
152
- "learning_rate": 0.00028968253968253963,
153
- "loss": 0.9526,
154
- "step": 230
155
- },
156
- {
157
- "epoch": 0.62,
158
- "learning_rate": 0.0002888888888888888,
159
- "loss": 0.847,
160
- "step": 240
161
- },
162
- {
163
- "epoch": 0.64,
164
- "learning_rate": 0.00028809523809523806,
165
- "loss": 0.7414,
166
- "step": 250
167
- },
168
- {
169
- "epoch": 0.67,
170
- "learning_rate": 0.00028730158730158725,
171
- "loss": 0.9449,
172
- "step": 260
173
- },
174
- {
175
- "epoch": 0.69,
176
- "learning_rate": 0.0002865079365079365,
177
- "loss": 0.9607,
178
- "step": 270
179
- },
180
- {
181
- "epoch": 0.72,
182
- "learning_rate": 0.0002857142857142857,
183
- "loss": 0.9456,
184
- "step": 280
185
- },
186
- {
187
- "epoch": 0.75,
188
- "learning_rate": 0.0002849206349206349,
189
- "loss": 0.8346,
190
- "step": 290
191
- },
192
- {
193
- "epoch": 0.77,
194
- "learning_rate": 0.0002841269841269841,
195
- "loss": 0.7275,
196
- "step": 300
197
- },
198
- {
199
- "epoch": 0.8,
200
- "learning_rate": 0.0002833333333333333,
201
- "loss": 0.9337,
202
- "step": 310
203
- },
204
- {
205
- "epoch": 0.82,
206
- "learning_rate": 0.0002825396825396825,
207
- "loss": 0.9466,
208
- "step": 320
209
- },
210
- {
211
- "epoch": 0.85,
212
- "learning_rate": 0.00028174603174603173,
213
- "loss": 0.9386,
214
- "step": 330
215
- },
216
- {
217
- "epoch": 0.87,
218
- "learning_rate": 0.0002809523809523809,
219
- "loss": 0.8254,
220
- "step": 340
221
- },
222
- {
223
- "epoch": 0.9,
224
- "learning_rate": 0.0002801587301587301,
225
- "loss": 0.723,
226
- "step": 350
227
- },
228
- {
229
- "epoch": 0.93,
230
- "learning_rate": 0.00027936507936507935,
231
- "loss": 0.9274,
232
- "step": 360
233
- },
234
- {
235
- "epoch": 0.95,
236
- "learning_rate": 0.00027857142857142854,
237
- "loss": 0.9492,
238
- "step": 370
239
- },
240
- {
241
- "epoch": 0.98,
242
- "learning_rate": 0.0002777777777777778,
243
- "loss": 0.871,
244
- "step": 380
245
- },
246
- {
247
- "epoch": 1.0,
248
- "learning_rate": 0.00027698412698412697,
249
- "loss": 0.7533,
250
- "step": 390
251
- },
252
- {
253
- "epoch": 1.03,
254
- "learning_rate": 0.00027619047619047615,
255
- "loss": 0.9284,
256
- "step": 400
257
- },
258
- {
259
- "epoch": 1.03,
260
- "eval_loss": 0.9276881217956543,
261
- "eval_runtime": 189.2181,
262
- "eval_samples_per_second": 10.57,
263
- "eval_steps_per_second": 1.321,
264
- "step": 400
265
- },
266
- {
267
- "epoch": 1.05,
268
- "learning_rate": 0.0002753968253968254,
269
- "loss": 0.9402,
270
- "step": 410
271
- },
272
- {
273
- "epoch": 1.08,
274
- "learning_rate": 0.0002746031746031746,
275
- "loss": 0.9188,
276
- "step": 420
277
- },
278
- {
279
- "epoch": 1.11,
280
- "learning_rate": 0.00027380952380952377,
281
- "loss": 0.7976,
282
- "step": 430
283
- },
284
- {
285
- "epoch": 1.13,
286
- "learning_rate": 0.00027301587301587296,
287
- "loss": 0.7172,
288
- "step": 440
289
- },
290
- {
291
- "epoch": 1.16,
292
- "learning_rate": 0.0002722222222222222,
293
- "loss": 0.9195,
294
- "step": 450
295
- },
296
- {
297
- "epoch": 1.18,
298
- "learning_rate": 0.0002714285714285714,
299
- "loss": 0.9426,
300
- "step": 460
301
- },
302
- {
303
- "epoch": 1.21,
304
- "learning_rate": 0.00027063492063492063,
305
- "loss": 0.9034,
306
- "step": 470
307
- },
308
- {
309
- "epoch": 1.23,
310
- "learning_rate": 0.0002698412698412698,
311
- "loss": 0.788,
312
- "step": 480
313
- },
314
- {
315
- "epoch": 1.26,
316
- "learning_rate": 0.000269047619047619,
317
- "loss": 0.7213,
318
- "step": 490
319
- },
320
- {
321
- "epoch": 1.29,
322
- "learning_rate": 0.00026825396825396825,
323
- "loss": 0.9149,
324
- "step": 500
325
- },
326
- {
327
- "epoch": 1.31,
328
- "learning_rate": 0.00026746031746031744,
329
- "loss": 0.9386,
330
- "step": 510
331
- },
332
- {
333
- "epoch": 1.34,
334
- "learning_rate": 0.0002666666666666666,
335
- "loss": 0.9099,
336
- "step": 520
337
- },
338
- {
339
- "epoch": 1.36,
340
- "learning_rate": 0.00026587301587301587,
341
- "loss": 0.7802,
342
- "step": 530
343
- },
344
- {
345
- "epoch": 1.39,
346
- "learning_rate": 0.00026507936507936506,
347
- "loss": 0.7246,
348
- "step": 540
349
- },
350
- {
351
- "epoch": 1.41,
352
- "learning_rate": 0.00026428571428571424,
353
- "loss": 0.9213,
354
- "step": 550
355
- },
356
- {
357
- "epoch": 1.44,
358
- "learning_rate": 0.00026349206349206343,
359
- "loss": 0.9348,
360
- "step": 560
361
- },
362
- {
363
- "epoch": 1.47,
364
- "learning_rate": 0.0002626984126984127,
365
- "loss": 0.8995,
366
- "step": 570
367
- },
368
- {
369
- "epoch": 1.49,
370
- "learning_rate": 0.00026190476190476186,
371
- "loss": 0.7866,
372
- "step": 580
373
- },
374
- {
375
- "epoch": 1.52,
376
- "learning_rate": 0.0002611111111111111,
377
- "loss": 0.7257,
378
- "step": 590
379
- },
380
- {
381
- "epoch": 1.54,
382
- "learning_rate": 0.0002603174603174603,
383
- "loss": 0.918,
384
- "step": 600
385
- },
386
- {
387
- "epoch": 1.54,
388
- "eval_loss": 0.9236659407615662,
389
- "eval_runtime": 189.1236,
390
- "eval_samples_per_second": 10.575,
391
- "eval_steps_per_second": 1.322,
392
- "step": 600
393
- },
394
- {
395
- "epoch": 1.57,
396
- "learning_rate": 0.00025952380952380953,
397
- "loss": 0.933,
398
- "step": 610
399
- },
400
- {
401
- "epoch": 1.59,
402
- "learning_rate": 0.0002587301587301587,
403
- "loss": 0.9085,
404
- "step": 620
405
- },
406
- {
407
- "epoch": 1.62,
408
- "learning_rate": 0.0002579365079365079,
409
- "loss": 0.7928,
410
- "step": 630
411
- },
412
- {
413
- "epoch": 1.65,
414
- "learning_rate": 0.0002571428571428571,
415
- "loss": 0.7162,
416
- "step": 640
417
- },
418
- {
419
- "epoch": 1.67,
420
- "learning_rate": 0.00025634920634920634,
421
- "loss": 0.9076,
422
- "step": 650
423
- },
424
- {
425
- "epoch": 1.7,
426
- "learning_rate": 0.00025555555555555553,
427
- "loss": 0.9345,
428
- "step": 660
429
- },
430
- {
431
- "epoch": 1.72,
432
- "learning_rate": 0.0002547619047619047,
433
- "loss": 0.9107,
434
- "step": 670
435
- },
436
- {
437
- "epoch": 1.75,
438
- "learning_rate": 0.00025396825396825396,
439
- "loss": 0.7721,
440
- "step": 680
441
- },
442
- {
443
- "epoch": 1.77,
444
- "learning_rate": 0.00025317460317460315,
445
- "loss": 0.7112,
446
- "step": 690
447
- },
448
- {
449
- "epoch": 1.8,
450
- "learning_rate": 0.0002523809523809524,
451
- "loss": 0.9118,
452
- "step": 700
453
- },
454
- {
455
- "epoch": 1.83,
456
- "learning_rate": 0.0002515873015873016,
457
- "loss": 0.9205,
458
- "step": 710
459
- },
460
- {
461
- "epoch": 1.85,
462
- "learning_rate": 0.00025079365079365076,
463
- "loss": 0.9004,
464
- "step": 720
465
- },
466
- {
467
- "epoch": 1.88,
468
- "learning_rate": 0.00025,
469
- "loss": 0.7741,
470
- "step": 730
471
- },
472
- {
473
- "epoch": 1.9,
474
- "learning_rate": 0.0002492063492063492,
475
- "loss": 0.7186,
476
- "step": 740
477
- },
478
- {
479
- "epoch": 1.93,
480
- "learning_rate": 0.0002484126984126984,
481
- "loss": 0.9002,
482
- "step": 750
483
- },
484
- {
485
- "epoch": 1.95,
486
- "learning_rate": 0.00024761904761904757,
487
- "loss": 0.9066,
488
- "step": 760
489
- },
490
- {
491
- "epoch": 1.98,
492
- "learning_rate": 0.0002468253968253968,
493
- "loss": 0.8127,
494
- "step": 770
495
- },
496
- {
497
- "epoch": 2.01,
498
- "learning_rate": 0.000246031746031746,
499
- "loss": 0.7305,
500
- "step": 780
501
- },
502
- {
503
- "epoch": 2.03,
504
- "learning_rate": 0.0002452380952380952,
505
- "loss": 0.8921,
506
- "step": 790
507
- },
508
- {
509
- "epoch": 2.06,
510
- "learning_rate": 0.00024444444444444443,
511
- "loss": 0.9178,
512
- "step": 800
513
- },
514
- {
515
- "epoch": 2.06,
516
- "eval_loss": 0.901778519153595,
517
- "eval_runtime": 189.2038,
518
- "eval_samples_per_second": 10.571,
519
- "eval_steps_per_second": 1.321,
520
- "step": 800
521
- },
522
- {
523
- "epoch": 2.08,
524
- "learning_rate": 0.00024365079365079364,
525
- "loss": 0.8823,
526
- "step": 810
527
- },
528
- {
529
- "epoch": 2.11,
530
- "learning_rate": 0.00024285714285714283,
531
- "loss": 0.7521,
532
- "step": 820
533
- },
534
- {
535
- "epoch": 2.14,
536
- "learning_rate": 0.00024206349206349205,
537
- "loss": 0.717,
538
- "step": 830
539
- },
540
- {
541
- "epoch": 2.16,
542
- "learning_rate": 0.00024126984126984123,
543
- "loss": 0.9045,
544
- "step": 840
545
- },
546
- {
547
- "epoch": 2.19,
548
- "learning_rate": 0.00024047619047619048,
549
- "loss": 0.9146,
550
- "step": 850
551
- },
552
- {
553
- "epoch": 2.21,
554
- "learning_rate": 0.00023968253968253966,
555
- "loss": 0.8678,
556
- "step": 860
557
- },
558
- {
559
- "epoch": 2.24,
560
- "learning_rate": 0.00023888888888888885,
561
- "loss": 0.745,
562
- "step": 870
563
- },
564
- {
565
- "epoch": 2.26,
566
- "learning_rate": 0.00023809523809523807,
567
- "loss": 0.7226,
568
- "step": 880
569
- },
570
- {
571
- "epoch": 2.29,
572
- "learning_rate": 0.00023730158730158728,
573
- "loss": 0.9002,
574
- "step": 890
575
- },
576
- {
577
- "epoch": 2.32,
578
- "learning_rate": 0.0002365079365079365,
579
- "loss": 0.9035,
580
- "step": 900
581
- },
582
- {
583
- "epoch": 2.34,
584
- "learning_rate": 0.00023571428571428569,
585
- "loss": 0.878,
586
- "step": 910
587
- },
588
- {
589
- "epoch": 2.37,
590
- "learning_rate": 0.00023492063492063487,
591
- "loss": 0.7397,
592
- "step": 920
593
- },
594
- {
595
- "epoch": 2.39,
596
- "learning_rate": 0.00023412698412698412,
597
- "loss": 0.7251,
598
- "step": 930
599
- },
600
- {
601
- "epoch": 2.42,
602
- "learning_rate": 0.0002333333333333333,
603
- "loss": 0.9014,
604
- "step": 940
605
- },
606
- {
607
- "epoch": 2.44,
608
- "learning_rate": 0.00023253968253968252,
609
- "loss": 0.9158,
610
- "step": 950
611
- },
612
- {
613
- "epoch": 2.47,
614
- "learning_rate": 0.0002317460317460317,
615
- "loss": 0.8596,
616
- "step": 960
617
- },
618
- {
619
- "epoch": 2.5,
620
- "learning_rate": 0.00023095238095238095,
621
- "loss": 0.7312,
622
- "step": 970
623
- },
624
- {
625
- "epoch": 2.52,
626
- "learning_rate": 0.00023015873015873014,
627
- "loss": 0.7271,
628
- "step": 980
629
- },
630
- {
631
- "epoch": 2.55,
632
- "learning_rate": 0.00022936507936507935,
633
- "loss": 0.9007,
634
- "step": 990
635
- },
636
- {
637
- "epoch": 2.57,
638
- "learning_rate": 0.00022857142857142854,
639
- "loss": 0.9186,
640
- "step": 1000
641
- },
642
- {
643
- "epoch": 2.57,
644
- "eval_loss": 0.8995742201805115,
645
- "eval_runtime": 189.2401,
646
- "eval_samples_per_second": 10.569,
647
- "eval_steps_per_second": 1.321,
648
- "step": 1000
649
- },
650
- {
651
- "epoch": 2.6,
652
- "learning_rate": 0.00022777777777777778,
653
- "loss": 0.8685,
654
- "step": 1010
655
- },
656
- {
657
- "epoch": 2.62,
658
- "learning_rate": 0.00022698412698412697,
659
- "loss": 0.7359,
660
- "step": 1020
661
- },
662
- {
663
- "epoch": 2.65,
664
- "learning_rate": 0.00022619047619047616,
665
- "loss": 0.7166,
666
- "step": 1030
667
- },
668
- {
669
- "epoch": 2.68,
670
- "learning_rate": 0.00022539682539682537,
671
- "loss": 0.9012,
672
- "step": 1040
673
- },
674
- {
675
- "epoch": 2.7,
676
- "learning_rate": 0.0002246031746031746,
677
- "loss": 0.9195,
678
- "step": 1050
679
- },
680
- {
681
- "epoch": 2.73,
682
- "learning_rate": 0.0002238095238095238,
683
- "loss": 0.8733,
684
- "step": 1060
685
- },
686
- {
687
- "epoch": 2.75,
688
- "learning_rate": 0.000223015873015873,
689
- "loss": 0.7488,
690
- "step": 1070
691
- },
692
- {
693
- "epoch": 2.78,
694
- "learning_rate": 0.00022222222222222218,
695
- "loss": 0.7223,
696
- "step": 1080
697
- },
698
- {
699
- "epoch": 2.8,
700
- "learning_rate": 0.00022142857142857142,
701
- "loss": 0.9034,
702
- "step": 1090
703
- },
704
- {
705
- "epoch": 2.83,
706
- "learning_rate": 0.0002206349206349206,
707
- "loss": 0.9174,
708
- "step": 1100
709
- },
710
- {
711
- "epoch": 2.86,
712
- "learning_rate": 0.00021984126984126982,
713
- "loss": 0.868,
714
- "step": 1110
715
- },
716
- {
717
- "epoch": 2.88,
718
- "learning_rate": 0.000219047619047619,
719
- "loss": 0.7394,
720
- "step": 1120
721
- },
722
- {
723
- "epoch": 2.91,
724
- "learning_rate": 0.00021825396825396825,
725
- "loss": 0.7198,
726
- "step": 1130
727
- },
728
- {
729
- "epoch": 2.93,
730
- "learning_rate": 0.00021746031746031744,
731
- "loss": 0.9085,
732
- "step": 1140
733
- },
734
- {
735
- "epoch": 2.96,
736
- "learning_rate": 0.00021666666666666666,
737
- "loss": 0.9148,
738
- "step": 1150
739
- },
740
- {
741
- "epoch": 2.98,
742
- "learning_rate": 0.00021587301587301584,
743
- "loss": 0.7775,
744
- "step": 1160
745
- },
746
- {
747
- "epoch": 3.01,
748
- "learning_rate": 0.0002150793650793651,
749
- "loss": 0.7366,
750
- "step": 1170
751
- },
752
- {
753
- "epoch": 3.04,
754
- "learning_rate": 0.00021428571428571427,
755
- "loss": 0.8865,
756
- "step": 1180
757
- },
758
- {
759
- "epoch": 3.06,
760
- "learning_rate": 0.00021349206349206346,
761
- "loss": 0.8989,
762
- "step": 1190
763
- },
764
- {
765
- "epoch": 3.09,
766
- "learning_rate": 0.00021269841269841268,
767
- "loss": 0.8376,
768
- "step": 1200
769
- },
770
- {
771
- "epoch": 3.09,
772
- "eval_loss": 0.8965018391609192,
773
- "eval_runtime": 189.2693,
774
- "eval_samples_per_second": 10.567,
775
- "eval_steps_per_second": 1.321,
776
- "step": 1200
777
- },
778
- {
779
- "epoch": 3.11,
780
- "learning_rate": 0.0002119047619047619,
781
- "loss": 0.7012,
782
- "step": 1210
783
- },
784
- {
785
- "epoch": 3.14,
786
- "learning_rate": 0.0002111111111111111,
787
- "loss": 0.7288,
788
- "step": 1220
789
- },
790
- {
791
- "epoch": 3.16,
792
- "learning_rate": 0.0002103174603174603,
793
- "loss": 0.8904,
794
- "step": 1230
795
- },
796
- {
797
- "epoch": 3.19,
798
- "learning_rate": 0.00020952380952380948,
799
- "loss": 0.9081,
800
- "step": 1240
801
- },
802
- {
803
- "epoch": 3.22,
804
- "learning_rate": 0.00020873015873015873,
805
- "loss": 0.8461,
806
- "step": 1250
807
- },
808
- {
809
- "epoch": 3.24,
810
- "learning_rate": 0.00020793650793650791,
811
- "loss": 0.6997,
812
- "step": 1260
813
- },
814
- {
815
- "epoch": 3.27,
816
- "learning_rate": 0.00020714285714285713,
817
- "loss": 0.7189,
818
- "step": 1270
819
- },
820
- {
821
- "epoch": 3.29,
822
- "learning_rate": 0.00020634920634920632,
823
- "loss": 0.8863,
824
- "step": 1280
825
- },
826
- {
827
- "epoch": 3.32,
828
- "learning_rate": 0.00020555555555555556,
829
- "loss": 0.906,
830
- "step": 1290
831
- },
832
- {
833
- "epoch": 3.34,
834
- "learning_rate": 0.00020476190476190475,
835
- "loss": 0.8287,
836
- "step": 1300
837
- },
838
- {
839
- "epoch": 3.37,
840
- "learning_rate": 0.00020396825396825393,
841
- "loss": 0.7015,
842
- "step": 1310
843
- },
844
- {
845
- "epoch": 3.4,
846
- "learning_rate": 0.00020317460317460315,
847
- "loss": 0.7325,
848
- "step": 1320
849
- },
850
- {
851
- "epoch": 3.42,
852
- "learning_rate": 0.00020238095238095236,
853
- "loss": 0.8878,
854
- "step": 1330
855
- },
856
- {
857
- "epoch": 3.45,
858
- "learning_rate": 0.00020158730158730158,
859
- "loss": 0.9057,
860
- "step": 1340
861
- },
862
- {
863
- "epoch": 3.47,
864
- "learning_rate": 0.00020079365079365077,
865
- "loss": 0.8399,
866
- "step": 1350
867
- },
868
- {
869
- "epoch": 3.5,
870
- "learning_rate": 0.00019999999999999998,
871
- "loss": 0.7073,
872
- "step": 1360
873
- },
874
- {
875
- "epoch": 3.52,
876
- "learning_rate": 0.0001992063492063492,
877
- "loss": 0.7281,
878
- "step": 1370
879
- },
880
- {
881
- "epoch": 3.55,
882
- "learning_rate": 0.0001984126984126984,
883
- "loss": 0.8829,
884
- "step": 1380
885
- },
886
- {
887
- "epoch": 3.58,
888
- "learning_rate": 0.0001976190476190476,
889
- "loss": 0.8923,
890
- "step": 1390
891
- },
892
- {
893
- "epoch": 3.6,
894
- "learning_rate": 0.0001968253968253968,
895
- "loss": 0.8389,
896
- "step": 1400
897
- },
898
- {
899
- "epoch": 3.6,
900
- "eval_loss": 0.8923280239105225,
901
- "eval_runtime": 189.1693,
902
- "eval_samples_per_second": 10.573,
903
- "eval_steps_per_second": 1.322,
904
- "step": 1400
905
- }
906
- ],
907
- "max_steps": 3880,
908
- "num_train_epochs": 10,
909
- "total_flos": 1.627337272190042e+18,
910
- "trial_name": null,
911
- "trial_params": null
912
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-1400/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e9adb78996a536c4aa514741768e2b05cafc3e20ac4a0a0fe98e38b91109396
3
- size 3899
 
 
 
 
checkpoint-3600/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:33b24cfa8e04448aada6bd0f35798f33e4037e4f5c5e4d990ab5168be1ac720c
3
- size 134433093
 
 
 
 
checkpoint-3600/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:58653c0888eea93a18cfef68476391e6cf3aaabd0a866d5bd1a63232af5da325
3
- size 67201357
 
 
 
 
checkpoint-3600/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:efd17e8763b5b06bd50bbfccd0ef28c0faa97523b4dd293e16d443dfb1f74431
3
- size 14575
 
 
 
 
checkpoint-3600/scaler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7fa181fa360d46feed4180ea17c8b6a4a879a9b4231c2e91aff2be20be9076cc
3
- size 557
 
 
 
 
checkpoint-3600/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f0b82598b73ddc100e1ea38611348b8278822263257c909d620c6530bad49649
3
- size 627
 
 
 
 
checkpoint-3600/trainer_state.json DELETED
@@ -1,2320 +0,0 @@
1
- {
2
- "best_metric": 0.8923280239105225,
3
- "best_model_checkpoint": "./lora-alpaca-hc8/checkpoint-1400",
4
- "epoch": 9.260450160771704,
5
- "global_step": 3600,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 0.03,
12
- "learning_rate": 2.9999999999999997e-05,
13
- "loss": 1.3719,
14
- "step": 10
15
- },
16
- {
17
- "epoch": 0.05,
18
- "learning_rate": 5.9999999999999995e-05,
19
- "loss": 1.5358,
20
- "step": 20
21
- },
22
- {
23
- "epoch": 0.08,
24
- "learning_rate": 8.999999999999999e-05,
25
- "loss": 1.5633,
26
- "step": 30
27
- },
28
- {
29
- "epoch": 0.1,
30
- "learning_rate": 0.00011999999999999999,
31
- "loss": 1.2433,
32
- "step": 40
33
- },
34
- {
35
- "epoch": 0.13,
36
- "learning_rate": 0.00015,
37
- "loss": 0.9843,
38
- "step": 50
39
- },
40
- {
41
- "epoch": 0.15,
42
- "learning_rate": 0.00017999999999999998,
43
- "loss": 1.012,
44
- "step": 60
45
- },
46
- {
47
- "epoch": 0.18,
48
- "learning_rate": 0.00020999999999999998,
49
- "loss": 1.0392,
50
- "step": 70
51
- },
52
- {
53
- "epoch": 0.21,
54
- "learning_rate": 0.00023999999999999998,
55
- "loss": 1.0541,
56
- "step": 80
57
- },
58
- {
59
- "epoch": 0.23,
60
- "learning_rate": 0.00027,
61
- "loss": 0.9608,
62
- "step": 90
63
- },
64
- {
65
- "epoch": 0.26,
66
- "learning_rate": 0.0003,
67
- "loss": 0.8697,
68
- "step": 100
69
- },
70
- {
71
- "epoch": 0.28,
72
- "learning_rate": 0.00029920634920634916,
73
- "loss": 0.9836,
74
- "step": 110
75
- },
76
- {
77
- "epoch": 0.31,
78
- "learning_rate": 0.00029841269841269835,
79
- "loss": 1.003,
80
- "step": 120
81
- },
82
- {
83
- "epoch": 0.33,
84
- "learning_rate": 0.0002976190476190476,
85
- "loss": 1.0052,
86
- "step": 130
87
- },
88
- {
89
- "epoch": 0.36,
90
- "learning_rate": 0.0002968253968253968,
91
- "loss": 0.9267,
92
- "step": 140
93
- },
94
- {
95
- "epoch": 0.39,
96
- "learning_rate": 0.000296031746031746,
97
- "loss": 0.8318,
98
- "step": 150
99
- },
100
- {
101
- "epoch": 0.41,
102
- "learning_rate": 0.0002952380952380952,
103
- "loss": 0.9594,
104
- "step": 160
105
- },
106
- {
107
- "epoch": 0.44,
108
- "learning_rate": 0.00029444444444444445,
109
- "loss": 0.987,
110
- "step": 170
111
- },
112
- {
113
- "epoch": 0.46,
114
- "learning_rate": 0.00029365079365079364,
115
- "loss": 0.9646,
116
- "step": 180
117
- },
118
- {
119
- "epoch": 0.49,
120
- "learning_rate": 0.00029285714285714283,
121
- "loss": 0.8501,
122
- "step": 190
123
- },
124
- {
125
- "epoch": 0.51,
126
- "learning_rate": 0.000292063492063492,
127
- "loss": 0.7568,
128
- "step": 200
129
- },
130
- {
131
- "epoch": 0.51,
132
- "eval_loss": 0.9958714246749878,
133
- "eval_runtime": 189.2223,
134
- "eval_samples_per_second": 10.57,
135
- "eval_steps_per_second": 1.321,
136
- "step": 200
137
- },
138
- {
139
- "epoch": 0.54,
140
- "learning_rate": 0.00029126984126984126,
141
- "loss": 0.949,
142
- "step": 210
143
- },
144
- {
145
- "epoch": 0.57,
146
- "learning_rate": 0.00029047619047619045,
147
- "loss": 0.9581,
148
- "step": 220
149
- },
150
- {
151
- "epoch": 0.59,
152
- "learning_rate": 0.00028968253968253963,
153
- "loss": 0.9526,
154
- "step": 230
155
- },
156
- {
157
- "epoch": 0.62,
158
- "learning_rate": 0.0002888888888888888,
159
- "loss": 0.847,
160
- "step": 240
161
- },
162
- {
163
- "epoch": 0.64,
164
- "learning_rate": 0.00028809523809523806,
165
- "loss": 0.7414,
166
- "step": 250
167
- },
168
- {
169
- "epoch": 0.67,
170
- "learning_rate": 0.00028730158730158725,
171
- "loss": 0.9449,
172
- "step": 260
173
- },
174
- {
175
- "epoch": 0.69,
176
- "learning_rate": 0.0002865079365079365,
177
- "loss": 0.9607,
178
- "step": 270
179
- },
180
- {
181
- "epoch": 0.72,
182
- "learning_rate": 0.0002857142857142857,
183
- "loss": 0.9456,
184
- "step": 280
185
- },
186
- {
187
- "epoch": 0.75,
188
- "learning_rate": 0.0002849206349206349,
189
- "loss": 0.8346,
190
- "step": 290
191
- },
192
- {
193
- "epoch": 0.77,
194
- "learning_rate": 0.0002841269841269841,
195
- "loss": 0.7275,
196
- "step": 300
197
- },
198
- {
199
- "epoch": 0.8,
200
- "learning_rate": 0.0002833333333333333,
201
- "loss": 0.9337,
202
- "step": 310
203
- },
204
- {
205
- "epoch": 0.82,
206
- "learning_rate": 0.0002825396825396825,
207
- "loss": 0.9466,
208
- "step": 320
209
- },
210
- {
211
- "epoch": 0.85,
212
- "learning_rate": 0.00028174603174603173,
213
- "loss": 0.9386,
214
- "step": 330
215
- },
216
- {
217
- "epoch": 0.87,
218
- "learning_rate": 0.0002809523809523809,
219
- "loss": 0.8254,
220
- "step": 340
221
- },
222
- {
223
- "epoch": 0.9,
224
- "learning_rate": 0.0002801587301587301,
225
- "loss": 0.723,
226
- "step": 350
227
- },
228
- {
229
- "epoch": 0.93,
230
- "learning_rate": 0.00027936507936507935,
231
- "loss": 0.9274,
232
- "step": 360
233
- },
234
- {
235
- "epoch": 0.95,
236
- "learning_rate": 0.00027857142857142854,
237
- "loss": 0.9492,
238
- "step": 370
239
- },
240
- {
241
- "epoch": 0.98,
242
- "learning_rate": 0.0002777777777777778,
243
- "loss": 0.871,
244
- "step": 380
245
- },
246
- {
247
- "epoch": 1.0,
248
- "learning_rate": 0.00027698412698412697,
249
- "loss": 0.7533,
250
- "step": 390
251
- },
252
- {
253
- "epoch": 1.03,
254
- "learning_rate": 0.00027619047619047615,
255
- "loss": 0.9284,
256
- "step": 400
257
- },
258
- {
259
- "epoch": 1.03,
260
- "eval_loss": 0.9276881217956543,
261
- "eval_runtime": 189.2181,
262
- "eval_samples_per_second": 10.57,
263
- "eval_steps_per_second": 1.321,
264
- "step": 400
265
- },
266
- {
267
- "epoch": 1.05,
268
- "learning_rate": 0.0002753968253968254,
269
- "loss": 0.9402,
270
- "step": 410
271
- },
272
- {
273
- "epoch": 1.08,
274
- "learning_rate": 0.0002746031746031746,
275
- "loss": 0.9188,
276
- "step": 420
277
- },
278
- {
279
- "epoch": 1.11,
280
- "learning_rate": 0.00027380952380952377,
281
- "loss": 0.7976,
282
- "step": 430
283
- },
284
- {
285
- "epoch": 1.13,
286
- "learning_rate": 0.00027301587301587296,
287
- "loss": 0.7172,
288
- "step": 440
289
- },
290
- {
291
- "epoch": 1.16,
292
- "learning_rate": 0.0002722222222222222,
293
- "loss": 0.9195,
294
- "step": 450
295
- },
296
- {
297
- "epoch": 1.18,
298
- "learning_rate": 0.0002714285714285714,
299
- "loss": 0.9426,
300
- "step": 460
301
- },
302
- {
303
- "epoch": 1.21,
304
- "learning_rate": 0.00027063492063492063,
305
- "loss": 0.9034,
306
- "step": 470
307
- },
308
- {
309
- "epoch": 1.23,
310
- "learning_rate": 0.0002698412698412698,
311
- "loss": 0.788,
312
- "step": 480
313
- },
314
- {
315
- "epoch": 1.26,
316
- "learning_rate": 0.000269047619047619,
317
- "loss": 0.7213,
318
- "step": 490
319
- },
320
- {
321
- "epoch": 1.29,
322
- "learning_rate": 0.00026825396825396825,
323
- "loss": 0.9149,
324
- "step": 500
325
- },
326
- {
327
- "epoch": 1.31,
328
- "learning_rate": 0.00026746031746031744,
329
- "loss": 0.9386,
330
- "step": 510
331
- },
332
- {
333
- "epoch": 1.34,
334
- "learning_rate": 0.0002666666666666666,
335
- "loss": 0.9099,
336
- "step": 520
337
- },
338
- {
339
- "epoch": 1.36,
340
- "learning_rate": 0.00026587301587301587,
341
- "loss": 0.7802,
342
- "step": 530
343
- },
344
- {
345
- "epoch": 1.39,
346
- "learning_rate": 0.00026507936507936506,
347
- "loss": 0.7246,
348
- "step": 540
349
- },
350
- {
351
- "epoch": 1.41,
352
- "learning_rate": 0.00026428571428571424,
353
- "loss": 0.9213,
354
- "step": 550
355
- },
356
- {
357
- "epoch": 1.44,
358
- "learning_rate": 0.00026349206349206343,
359
- "loss": 0.9348,
360
- "step": 560
361
- },
362
- {
363
- "epoch": 1.47,
364
- "learning_rate": 0.0002626984126984127,
365
- "loss": 0.8995,
366
- "step": 570
367
- },
368
- {
369
- "epoch": 1.49,
370
- "learning_rate": 0.00026190476190476186,
371
- "loss": 0.7866,
372
- "step": 580
373
- },
374
- {
375
- "epoch": 1.52,
376
- "learning_rate": 0.0002611111111111111,
377
- "loss": 0.7257,
378
- "step": 590
379
- },
380
- {
381
- "epoch": 1.54,
382
- "learning_rate": 0.0002603174603174603,
383
- "loss": 0.918,
384
- "step": 600
385
- },
386
- {
387
- "epoch": 1.54,
388
- "eval_loss": 0.9236659407615662,
389
- "eval_runtime": 189.1236,
390
- "eval_samples_per_second": 10.575,
391
- "eval_steps_per_second": 1.322,
392
- "step": 600
393
- },
394
- {
395
- "epoch": 1.57,
396
- "learning_rate": 0.00025952380952380953,
397
- "loss": 0.933,
398
- "step": 610
399
- },
400
- {
401
- "epoch": 1.59,
402
- "learning_rate": 0.0002587301587301587,
403
- "loss": 0.9085,
404
- "step": 620
405
- },
406
- {
407
- "epoch": 1.62,
408
- "learning_rate": 0.0002579365079365079,
409
- "loss": 0.7928,
410
- "step": 630
411
- },
412
- {
413
- "epoch": 1.65,
414
- "learning_rate": 0.0002571428571428571,
415
- "loss": 0.7162,
416
- "step": 640
417
- },
418
- {
419
- "epoch": 1.67,
420
- "learning_rate": 0.00025634920634920634,
421
- "loss": 0.9076,
422
- "step": 650
423
- },
424
- {
425
- "epoch": 1.7,
426
- "learning_rate": 0.00025555555555555553,
427
- "loss": 0.9345,
428
- "step": 660
429
- },
430
- {
431
- "epoch": 1.72,
432
- "learning_rate": 0.0002547619047619047,
433
- "loss": 0.9107,
434
- "step": 670
435
- },
436
- {
437
- "epoch": 1.75,
438
- "learning_rate": 0.00025396825396825396,
439
- "loss": 0.7721,
440
- "step": 680
441
- },
442
- {
443
- "epoch": 1.77,
444
- "learning_rate": 0.00025317460317460315,
445
- "loss": 0.7112,
446
- "step": 690
447
- },
448
- {
449
- "epoch": 1.8,
450
- "learning_rate": 0.0002523809523809524,
451
- "loss": 0.9118,
452
- "step": 700
453
- },
454
- {
455
- "epoch": 1.83,
456
- "learning_rate": 0.0002515873015873016,
457
- "loss": 0.9205,
458
- "step": 710
459
- },
460
- {
461
- "epoch": 1.85,
462
- "learning_rate": 0.00025079365079365076,
463
- "loss": 0.9004,
464
- "step": 720
465
- },
466
- {
467
- "epoch": 1.88,
468
- "learning_rate": 0.00025,
469
- "loss": 0.7741,
470
- "step": 730
471
- },
472
- {
473
- "epoch": 1.9,
474
- "learning_rate": 0.0002492063492063492,
475
- "loss": 0.7186,
476
- "step": 740
477
- },
478
- {
479
- "epoch": 1.93,
480
- "learning_rate": 0.0002484126984126984,
481
- "loss": 0.9002,
482
- "step": 750
483
- },
484
- {
485
- "epoch": 1.95,
486
- "learning_rate": 0.00024761904761904757,
487
- "loss": 0.9066,
488
- "step": 760
489
- },
490
- {
491
- "epoch": 1.98,
492
- "learning_rate": 0.0002468253968253968,
493
- "loss": 0.8127,
494
- "step": 770
495
- },
496
- {
497
- "epoch": 2.01,
498
- "learning_rate": 0.000246031746031746,
499
- "loss": 0.7305,
500
- "step": 780
501
- },
502
- {
503
- "epoch": 2.03,
504
- "learning_rate": 0.0002452380952380952,
505
- "loss": 0.8921,
506
- "step": 790
507
- },
508
- {
509
- "epoch": 2.06,
510
- "learning_rate": 0.00024444444444444443,
511
- "loss": 0.9178,
512
- "step": 800
513
- },
514
- {
515
- "epoch": 2.06,
516
- "eval_loss": 0.901778519153595,
517
- "eval_runtime": 189.2038,
518
- "eval_samples_per_second": 10.571,
519
- "eval_steps_per_second": 1.321,
520
- "step": 800
521
- },
522
- {
523
- "epoch": 2.08,
524
- "learning_rate": 0.00024365079365079364,
525
- "loss": 0.8823,
526
- "step": 810
527
- },
528
- {
529
- "epoch": 2.11,
530
- "learning_rate": 0.00024285714285714283,
531
- "loss": 0.7521,
532
- "step": 820
533
- },
534
- {
535
- "epoch": 2.14,
536
- "learning_rate": 0.00024206349206349205,
537
- "loss": 0.717,
538
- "step": 830
539
- },
540
- {
541
- "epoch": 2.16,
542
- "learning_rate": 0.00024126984126984123,
543
- "loss": 0.9045,
544
- "step": 840
545
- },
546
- {
547
- "epoch": 2.19,
548
- "learning_rate": 0.00024047619047619048,
549
- "loss": 0.9146,
550
- "step": 850
551
- },
552
- {
553
- "epoch": 2.21,
554
- "learning_rate": 0.00023968253968253966,
555
- "loss": 0.8678,
556
- "step": 860
557
- },
558
- {
559
- "epoch": 2.24,
560
- "learning_rate": 0.00023888888888888885,
561
- "loss": 0.745,
562
- "step": 870
563
- },
564
- {
565
- "epoch": 2.26,
566
- "learning_rate": 0.00023809523809523807,
567
- "loss": 0.7226,
568
- "step": 880
569
- },
570
- {
571
- "epoch": 2.29,
572
- "learning_rate": 0.00023730158730158728,
573
- "loss": 0.9002,
574
- "step": 890
575
- },
576
- {
577
- "epoch": 2.32,
578
- "learning_rate": 0.0002365079365079365,
579
- "loss": 0.9035,
580
- "step": 900
581
- },
582
- {
583
- "epoch": 2.34,
584
- "learning_rate": 0.00023571428571428569,
585
- "loss": 0.878,
586
- "step": 910
587
- },
588
- {
589
- "epoch": 2.37,
590
- "learning_rate": 0.00023492063492063487,
591
- "loss": 0.7397,
592
- "step": 920
593
- },
594
- {
595
- "epoch": 2.39,
596
- "learning_rate": 0.00023412698412698412,
597
- "loss": 0.7251,
598
- "step": 930
599
- },
600
- {
601
- "epoch": 2.42,
602
- "learning_rate": 0.0002333333333333333,
603
- "loss": 0.9014,
604
- "step": 940
605
- },
606
- {
607
- "epoch": 2.44,
608
- "learning_rate": 0.00023253968253968252,
609
- "loss": 0.9158,
610
- "step": 950
611
- },
612
- {
613
- "epoch": 2.47,
614
- "learning_rate": 0.0002317460317460317,
615
- "loss": 0.8596,
616
- "step": 960
617
- },
618
- {
619
- "epoch": 2.5,
620
- "learning_rate": 0.00023095238095238095,
621
- "loss": 0.7312,
622
- "step": 970
623
- },
624
- {
625
- "epoch": 2.52,
626
- "learning_rate": 0.00023015873015873014,
627
- "loss": 0.7271,
628
- "step": 980
629
- },
630
- {
631
- "epoch": 2.55,
632
- "learning_rate": 0.00022936507936507935,
633
- "loss": 0.9007,
634
- "step": 990
635
- },
636
- {
637
- "epoch": 2.57,
638
- "learning_rate": 0.00022857142857142854,
639
- "loss": 0.9186,
640
- "step": 1000
641
- },
642
- {
643
- "epoch": 2.57,
644
- "eval_loss": 0.8995742201805115,
645
- "eval_runtime": 189.2401,
646
- "eval_samples_per_second": 10.569,
647
- "eval_steps_per_second": 1.321,
648
- "step": 1000
649
- },
650
- {
651
- "epoch": 2.6,
652
- "learning_rate": 0.00022777777777777778,
653
- "loss": 0.8685,
654
- "step": 1010
655
- },
656
- {
657
- "epoch": 2.62,
658
- "learning_rate": 0.00022698412698412697,
659
- "loss": 0.7359,
660
- "step": 1020
661
- },
662
- {
663
- "epoch": 2.65,
664
- "learning_rate": 0.00022619047619047616,
665
- "loss": 0.7166,
666
- "step": 1030
667
- },
668
- {
669
- "epoch": 2.68,
670
- "learning_rate": 0.00022539682539682537,
671
- "loss": 0.9012,
672
- "step": 1040
673
- },
674
- {
675
- "epoch": 2.7,
676
- "learning_rate": 0.0002246031746031746,
677
- "loss": 0.9195,
678
- "step": 1050
679
- },
680
- {
681
- "epoch": 2.73,
682
- "learning_rate": 0.0002238095238095238,
683
- "loss": 0.8733,
684
- "step": 1060
685
- },
686
- {
687
- "epoch": 2.75,
688
- "learning_rate": 0.000223015873015873,
689
- "loss": 0.7488,
690
- "step": 1070
691
- },
692
- {
693
- "epoch": 2.78,
694
- "learning_rate": 0.00022222222222222218,
695
- "loss": 0.7223,
696
- "step": 1080
697
- },
698
- {
699
- "epoch": 2.8,
700
- "learning_rate": 0.00022142857142857142,
701
- "loss": 0.9034,
702
- "step": 1090
703
- },
704
- {
705
- "epoch": 2.83,
706
- "learning_rate": 0.0002206349206349206,
707
- "loss": 0.9174,
708
- "step": 1100
709
- },
710
- {
711
- "epoch": 2.86,
712
- "learning_rate": 0.00021984126984126982,
713
- "loss": 0.868,
714
- "step": 1110
715
- },
716
- {
717
- "epoch": 2.88,
718
- "learning_rate": 0.000219047619047619,
719
- "loss": 0.7394,
720
- "step": 1120
721
- },
722
- {
723
- "epoch": 2.91,
724
- "learning_rate": 0.00021825396825396825,
725
- "loss": 0.7198,
726
- "step": 1130
727
- },
728
- {
729
- "epoch": 2.93,
730
- "learning_rate": 0.00021746031746031744,
731
- "loss": 0.9085,
732
- "step": 1140
733
- },
734
- {
735
- "epoch": 2.96,
736
- "learning_rate": 0.00021666666666666666,
737
- "loss": 0.9148,
738
- "step": 1150
739
- },
740
- {
741
- "epoch": 2.98,
742
- "learning_rate": 0.00021587301587301584,
743
- "loss": 0.7775,
744
- "step": 1160
745
- },
746
- {
747
- "epoch": 3.01,
748
- "learning_rate": 0.0002150793650793651,
749
- "loss": 0.7366,
750
- "step": 1170
751
- },
752
- {
753
- "epoch": 3.04,
754
- "learning_rate": 0.00021428571428571427,
755
- "loss": 0.8865,
756
- "step": 1180
757
- },
758
- {
759
- "epoch": 3.06,
760
- "learning_rate": 0.00021349206349206346,
761
- "loss": 0.8989,
762
- "step": 1190
763
- },
764
- {
765
- "epoch": 3.09,
766
- "learning_rate": 0.00021269841269841268,
767
- "loss": 0.8376,
768
- "step": 1200
769
- },
770
- {
771
- "epoch": 3.09,
772
- "eval_loss": 0.8965018391609192,
773
- "eval_runtime": 189.2693,
774
- "eval_samples_per_second": 10.567,
775
- "eval_steps_per_second": 1.321,
776
- "step": 1200
777
- },
778
- {
779
- "epoch": 3.11,
780
- "learning_rate": 0.0002119047619047619,
781
- "loss": 0.7012,
782
- "step": 1210
783
- },
784
- {
785
- "epoch": 3.14,
786
- "learning_rate": 0.0002111111111111111,
787
- "loss": 0.7288,
788
- "step": 1220
789
- },
790
- {
791
- "epoch": 3.16,
792
- "learning_rate": 0.0002103174603174603,
793
- "loss": 0.8904,
794
- "step": 1230
795
- },
796
- {
797
- "epoch": 3.19,
798
- "learning_rate": 0.00020952380952380948,
799
- "loss": 0.9081,
800
- "step": 1240
801
- },
802
- {
803
- "epoch": 3.22,
804
- "learning_rate": 0.00020873015873015873,
805
- "loss": 0.8461,
806
- "step": 1250
807
- },
808
- {
809
- "epoch": 3.24,
810
- "learning_rate": 0.00020793650793650791,
811
- "loss": 0.6997,
812
- "step": 1260
813
- },
814
- {
815
- "epoch": 3.27,
816
- "learning_rate": 0.00020714285714285713,
817
- "loss": 0.7189,
818
- "step": 1270
819
- },
820
- {
821
- "epoch": 3.29,
822
- "learning_rate": 0.00020634920634920632,
823
- "loss": 0.8863,
824
- "step": 1280
825
- },
826
- {
827
- "epoch": 3.32,
828
- "learning_rate": 0.00020555555555555556,
829
- "loss": 0.906,
830
- "step": 1290
831
- },
832
- {
833
- "epoch": 3.34,
834
- "learning_rate": 0.00020476190476190475,
835
- "loss": 0.8287,
836
- "step": 1300
837
- },
838
- {
839
- "epoch": 3.37,
840
- "learning_rate": 0.00020396825396825393,
841
- "loss": 0.7015,
842
- "step": 1310
843
- },
844
- {
845
- "epoch": 3.4,
846
- "learning_rate": 0.00020317460317460315,
847
- "loss": 0.7325,
848
- "step": 1320
849
- },
850
- {
851
- "epoch": 3.42,
852
- "learning_rate": 0.00020238095238095236,
853
- "loss": 0.8878,
854
- "step": 1330
855
- },
856
- {
857
- "epoch": 3.45,
858
- "learning_rate": 0.00020158730158730158,
859
- "loss": 0.9057,
860
- "step": 1340
861
- },
862
- {
863
- "epoch": 3.47,
864
- "learning_rate": 0.00020079365079365077,
865
- "loss": 0.8399,
866
- "step": 1350
867
- },
868
- {
869
- "epoch": 3.5,
870
- "learning_rate": 0.00019999999999999998,
871
- "loss": 0.7073,
872
- "step": 1360
873
- },
874
- {
875
- "epoch": 3.52,
876
- "learning_rate": 0.0001992063492063492,
877
- "loss": 0.7281,
878
- "step": 1370
879
- },
880
- {
881
- "epoch": 3.55,
882
- "learning_rate": 0.0001984126984126984,
883
- "loss": 0.8829,
884
- "step": 1380
885
- },
886
- {
887
- "epoch": 3.58,
888
- "learning_rate": 0.0001976190476190476,
889
- "loss": 0.8923,
890
- "step": 1390
891
- },
892
- {
893
- "epoch": 3.6,
894
- "learning_rate": 0.0001968253968253968,
895
- "loss": 0.8389,
896
- "step": 1400
897
- },
898
- {
899
- "epoch": 3.6,
900
- "eval_loss": 0.8923280239105225,
901
- "eval_runtime": 189.1693,
902
- "eval_samples_per_second": 10.573,
903
- "eval_steps_per_second": 1.322,
904
- "step": 1400
905
- },
906
- {
907
- "epoch": 3.63,
908
- "learning_rate": 0.00019603174603174603,
909
- "loss": 0.7148,
910
- "step": 1410
911
- },
912
- {
913
- "epoch": 3.65,
914
- "learning_rate": 0.00019523809523809522,
915
- "loss": 0.7331,
916
- "step": 1420
917
- },
918
- {
919
- "epoch": 3.68,
920
- "learning_rate": 0.00019444444444444443,
921
- "loss": 0.8944,
922
- "step": 1430
923
- },
924
- {
925
- "epoch": 3.7,
926
- "learning_rate": 0.00019365079365079362,
927
- "loss": 0.9001,
928
- "step": 1440
929
- },
930
- {
931
- "epoch": 3.73,
932
- "learning_rate": 0.00019285714285714286,
933
- "loss": 0.8397,
934
- "step": 1450
935
- },
936
- {
937
- "epoch": 3.76,
938
- "learning_rate": 0.00019206349206349205,
939
- "loss": 0.6953,
940
- "step": 1460
941
- },
942
- {
943
- "epoch": 3.78,
944
- "learning_rate": 0.00019126984126984124,
945
- "loss": 0.7229,
946
- "step": 1470
947
- },
948
- {
949
- "epoch": 3.81,
950
- "learning_rate": 0.00019047619047619045,
951
- "loss": 0.8833,
952
- "step": 1480
953
- },
954
- {
955
- "epoch": 3.83,
956
- "learning_rate": 0.00018968253968253967,
957
- "loss": 0.9028,
958
- "step": 1490
959
- },
960
- {
961
- "epoch": 3.86,
962
- "learning_rate": 0.00018888888888888888,
963
- "loss": 0.8412,
964
- "step": 1500
965
- },
966
- {
967
- "epoch": 3.88,
968
- "learning_rate": 0.00018809523809523807,
969
- "loss": 0.7024,
970
- "step": 1510
971
- },
972
- {
973
- "epoch": 3.91,
974
- "learning_rate": 0.0001873015873015873,
975
- "loss": 0.7262,
976
- "step": 1520
977
- },
978
- {
979
- "epoch": 3.94,
980
- "learning_rate": 0.0001865079365079365,
981
- "loss": 0.8926,
982
- "step": 1530
983
- },
984
- {
985
- "epoch": 3.96,
986
- "learning_rate": 0.00018571428571428572,
987
- "loss": 0.8703,
988
- "step": 1540
989
- },
990
- {
991
- "epoch": 3.99,
992
- "learning_rate": 0.0001849206349206349,
993
- "loss": 0.7311,
994
- "step": 1550
995
- },
996
- {
997
- "epoch": 4.01,
998
- "learning_rate": 0.0001841269841269841,
999
- "loss": 0.7535,
1000
- "step": 1560
1001
- },
1002
- {
1003
- "epoch": 4.04,
1004
- "learning_rate": 0.00018333333333333334,
1005
- "loss": 0.8779,
1006
- "step": 1570
1007
- },
1008
- {
1009
- "epoch": 4.06,
1010
- "learning_rate": 0.00018253968253968252,
1011
- "loss": 0.887,
1012
- "step": 1580
1013
- },
1014
- {
1015
- "epoch": 4.09,
1016
- "learning_rate": 0.00018174603174603174,
1017
- "loss": 0.797,
1018
- "step": 1590
1019
- },
1020
- {
1021
- "epoch": 4.12,
1022
- "learning_rate": 0.00018095238095238093,
1023
- "loss": 0.6651,
1024
- "step": 1600
1025
- },
1026
- {
1027
- "epoch": 4.12,
1028
- "eval_loss": 0.904344916343689,
1029
- "eval_runtime": 189.1226,
1030
- "eval_samples_per_second": 10.575,
1031
- "eval_steps_per_second": 1.322,
1032
- "step": 1600
1033
- },
1034
- {
1035
- "epoch": 4.14,
1036
- "learning_rate": 0.00018015873015873017,
1037
- "loss": 0.7348,
1038
- "step": 1610
1039
- },
1040
- {
1041
- "epoch": 4.17,
1042
- "learning_rate": 0.00017936507936507936,
1043
- "loss": 0.8756,
1044
- "step": 1620
1045
- },
1046
- {
1047
- "epoch": 4.19,
1048
- "learning_rate": 0.00017857142857142854,
1049
- "loss": 0.8934,
1050
- "step": 1630
1051
- },
1052
- {
1053
- "epoch": 4.22,
1054
- "learning_rate": 0.00017777777777777776,
1055
- "loss": 0.8023,
1056
- "step": 1640
1057
- },
1058
- {
1059
- "epoch": 4.24,
1060
- "learning_rate": 0.00017698412698412697,
1061
- "loss": 0.6788,
1062
- "step": 1650
1063
- },
1064
- {
1065
- "epoch": 4.27,
1066
- "learning_rate": 0.0001761904761904762,
1067
- "loss": 0.7387,
1068
- "step": 1660
1069
- },
1070
- {
1071
- "epoch": 4.3,
1072
- "learning_rate": 0.00017539682539682538,
1073
- "loss": 0.885,
1074
- "step": 1670
1075
- },
1076
- {
1077
- "epoch": 4.32,
1078
- "learning_rate": 0.00017460317460317457,
1079
- "loss": 0.8738,
1080
- "step": 1680
1081
- },
1082
- {
1083
- "epoch": 4.35,
1084
- "learning_rate": 0.0001738095238095238,
1085
- "loss": 0.8059,
1086
- "step": 1690
1087
- },
1088
- {
1089
- "epoch": 4.37,
1090
- "learning_rate": 0.000173015873015873,
1091
- "loss": 0.6667,
1092
- "step": 1700
1093
- },
1094
- {
1095
- "epoch": 4.4,
1096
- "learning_rate": 0.0001722222222222222,
1097
- "loss": 0.7406,
1098
- "step": 1710
1099
- },
1100
- {
1101
- "epoch": 4.42,
1102
- "learning_rate": 0.0001714285714285714,
1103
- "loss": 0.8764,
1104
- "step": 1720
1105
- },
1106
- {
1107
- "epoch": 4.45,
1108
- "learning_rate": 0.00017063492063492064,
1109
- "loss": 0.8839,
1110
- "step": 1730
1111
- },
1112
- {
1113
- "epoch": 4.48,
1114
- "learning_rate": 0.00016984126984126983,
1115
- "loss": 0.8009,
1116
- "step": 1740
1117
- },
1118
- {
1119
- "epoch": 4.5,
1120
- "learning_rate": 0.00016904761904761904,
1121
- "loss": 0.6658,
1122
- "step": 1750
1123
- },
1124
- {
1125
- "epoch": 4.53,
1126
- "learning_rate": 0.00016825396825396823,
1127
- "loss": 0.7423,
1128
- "step": 1760
1129
- },
1130
- {
1131
- "epoch": 4.55,
1132
- "learning_rate": 0.00016746031746031747,
1133
- "loss": 0.8748,
1134
- "step": 1770
1135
- },
1136
- {
1137
- "epoch": 4.58,
1138
- "learning_rate": 0.00016666666666666666,
1139
- "loss": 0.887,
1140
- "step": 1780
1141
- },
1142
- {
1143
- "epoch": 4.6,
1144
- "learning_rate": 0.00016587301587301585,
1145
- "loss": 0.8038,
1146
- "step": 1790
1147
- },
1148
- {
1149
- "epoch": 4.63,
1150
- "learning_rate": 0.00016507936507936506,
1151
- "loss": 0.6631,
1152
- "step": 1800
1153
- },
1154
- {
1155
- "epoch": 4.63,
1156
- "eval_loss": 0.9004252552986145,
1157
- "eval_runtime": 189.1263,
1158
- "eval_samples_per_second": 10.575,
1159
- "eval_steps_per_second": 1.322,
1160
- "step": 1800
1161
- },
1162
- {
1163
- "epoch": 4.66,
1164
- "learning_rate": 0.00016428571428571428,
1165
- "loss": 0.7327,
1166
- "step": 1810
1167
- },
1168
- {
1169
- "epoch": 4.68,
1170
- "learning_rate": 0.0001634920634920635,
1171
- "loss": 0.8703,
1172
- "step": 1820
1173
- },
1174
- {
1175
- "epoch": 4.71,
1176
- "learning_rate": 0.00016269841269841268,
1177
- "loss": 0.8734,
1178
- "step": 1830
1179
- },
1180
- {
1181
- "epoch": 4.73,
1182
- "learning_rate": 0.00016190476190476187,
1183
- "loss": 0.8066,
1184
- "step": 1840
1185
- },
1186
- {
1187
- "epoch": 4.76,
1188
- "learning_rate": 0.0001611111111111111,
1189
- "loss": 0.6655,
1190
- "step": 1850
1191
- },
1192
- {
1193
- "epoch": 4.78,
1194
- "learning_rate": 0.0001603174603174603,
1195
- "loss": 0.736,
1196
- "step": 1860
1197
- },
1198
- {
1199
- "epoch": 4.81,
1200
- "learning_rate": 0.00015952380952380951,
1201
- "loss": 0.8707,
1202
- "step": 1870
1203
- },
1204
- {
1205
- "epoch": 4.84,
1206
- "learning_rate": 0.0001587301587301587,
1207
- "loss": 0.8789,
1208
- "step": 1880
1209
- },
1210
- {
1211
- "epoch": 4.86,
1212
- "learning_rate": 0.00015793650793650795,
1213
- "loss": 0.804,
1214
- "step": 1890
1215
- },
1216
- {
1217
- "epoch": 4.89,
1218
- "learning_rate": 0.00015714285714285713,
1219
- "loss": 0.6628,
1220
- "step": 1900
1221
- },
1222
- {
1223
- "epoch": 4.91,
1224
- "learning_rate": 0.00015634920634920635,
1225
- "loss": 0.7394,
1226
- "step": 1910
1227
- },
1228
- {
1229
- "epoch": 4.94,
1230
- "learning_rate": 0.00015555555555555554,
1231
- "loss": 0.8856,
1232
- "step": 1920
1233
- },
1234
- {
1235
- "epoch": 4.96,
1236
- "learning_rate": 0.00015476190476190478,
1237
- "loss": 0.8558,
1238
- "step": 1930
1239
- },
1240
- {
1241
- "epoch": 4.99,
1242
- "learning_rate": 0.00015396825396825397,
1243
- "loss": 0.6766,
1244
- "step": 1940
1245
- },
1246
- {
1247
- "epoch": 5.02,
1248
- "learning_rate": 0.00015317460317460315,
1249
- "loss": 0.7662,
1250
- "step": 1950
1251
- },
1252
- {
1253
- "epoch": 5.04,
1254
- "learning_rate": 0.00015238095238095237,
1255
- "loss": 0.8602,
1256
- "step": 1960
1257
- },
1258
- {
1259
- "epoch": 5.07,
1260
- "learning_rate": 0.00015158730158730158,
1261
- "loss": 0.8567,
1262
- "step": 1970
1263
- },
1264
- {
1265
- "epoch": 5.09,
1266
- "learning_rate": 0.0001507936507936508,
1267
- "loss": 0.7482,
1268
- "step": 1980
1269
- },
1270
- {
1271
- "epoch": 5.12,
1272
- "learning_rate": 0.00015,
1273
- "loss": 0.6249,
1274
- "step": 1990
1275
- },
1276
- {
1277
- "epoch": 5.14,
1278
- "learning_rate": 0.00014920634920634917,
1279
- "loss": 0.7506,
1280
- "step": 2000
1281
- },
1282
- {
1283
- "epoch": 5.14,
1284
- "eval_loss": 0.9112463593482971,
1285
- "eval_runtime": 189.2232,
1286
- "eval_samples_per_second": 10.57,
1287
- "eval_steps_per_second": 1.321,
1288
- "step": 2000
1289
- },
1290
- {
1291
- "epoch": 5.17,
1292
- "learning_rate": 0.0001484126984126984,
1293
- "loss": 0.8679,
1294
- "step": 2010
1295
- },
1296
- {
1297
- "epoch": 5.2,
1298
- "learning_rate": 0.0001476190476190476,
1299
- "loss": 0.8575,
1300
- "step": 2020
1301
- },
1302
- {
1303
- "epoch": 5.22,
1304
- "learning_rate": 0.00014682539682539682,
1305
- "loss": 0.7545,
1306
- "step": 2030
1307
- },
1308
- {
1309
- "epoch": 5.25,
1310
- "learning_rate": 0.000146031746031746,
1311
- "loss": 0.6237,
1312
- "step": 2040
1313
- },
1314
- {
1315
- "epoch": 5.27,
1316
- "learning_rate": 0.00014523809523809522,
1317
- "loss": 0.7561,
1318
- "step": 2050
1319
- },
1320
- {
1321
- "epoch": 5.3,
1322
- "learning_rate": 0.0001444444444444444,
1323
- "loss": 0.8688,
1324
- "step": 2060
1325
- },
1326
- {
1327
- "epoch": 5.32,
1328
- "learning_rate": 0.00014365079365079363,
1329
- "loss": 0.8578,
1330
- "step": 2070
1331
- },
1332
- {
1333
- "epoch": 5.35,
1334
- "learning_rate": 0.00014285714285714284,
1335
- "loss": 0.7743,
1336
- "step": 2080
1337
- },
1338
- {
1339
- "epoch": 5.38,
1340
- "learning_rate": 0.00014206349206349206,
1341
- "loss": 0.6337,
1342
- "step": 2090
1343
- },
1344
- {
1345
- "epoch": 5.4,
1346
- "learning_rate": 0.00014126984126984124,
1347
- "loss": 0.7518,
1348
- "step": 2100
1349
- },
1350
- {
1351
- "epoch": 5.43,
1352
- "learning_rate": 0.00014047619047619046,
1353
- "loss": 0.8736,
1354
- "step": 2110
1355
- },
1356
- {
1357
- "epoch": 5.45,
1358
- "learning_rate": 0.00013968253968253967,
1359
- "loss": 0.8623,
1360
- "step": 2120
1361
- },
1362
- {
1363
- "epoch": 5.48,
1364
- "learning_rate": 0.0001388888888888889,
1365
- "loss": 0.7719,
1366
- "step": 2130
1367
- },
1368
- {
1369
- "epoch": 5.5,
1370
- "learning_rate": 0.00013809523809523808,
1371
- "loss": 0.6363,
1372
- "step": 2140
1373
- },
1374
- {
1375
- "epoch": 5.53,
1376
- "learning_rate": 0.0001373015873015873,
1377
- "loss": 0.7458,
1378
- "step": 2150
1379
- },
1380
- {
1381
- "epoch": 5.56,
1382
- "learning_rate": 0.00013650793650793648,
1383
- "loss": 0.865,
1384
- "step": 2160
1385
- },
1386
- {
1387
- "epoch": 5.58,
1388
- "learning_rate": 0.00013587301587301588,
1389
- "loss": 2.6458,
1390
- "step": 2170
1391
- },
1392
- {
1393
- "epoch": 5.61,
1394
- "learning_rate": 0.00013587301587301588,
1395
- "loss": 801.9857,
1396
- "step": 2180
1397
- },
1398
- {
1399
- "epoch": 5.63,
1400
- "learning_rate": 0.0001357142857142857,
1401
- "loss": 4068.018,
1402
- "step": 2190
1403
- },
1404
- {
1405
- "epoch": 5.66,
1406
- "learning_rate": 0.00013547619047619047,
1407
- "loss": 41837456.0,
1408
- "step": 2200
1409
- },
1410
- {
1411
- "epoch": 5.66,
1412
- "eval_loss": NaN,
1413
- "eval_runtime": 189.1905,
1414
- "eval_samples_per_second": 10.571,
1415
- "eval_steps_per_second": 1.321,
1416
- "step": 2200
1417
- },
1418
- {
1419
- "epoch": 5.68,
1420
- "learning_rate": 0.00013547619047619047,
1421
- "loss": 1.4509301308571507e+26,
1422
- "step": 2210
1423
- },
1424
- {
1425
- "epoch": 5.71,
1426
- "learning_rate": 0.00013547619047619047,
1427
- "loss": 3.3847730092507856e+24,
1428
- "step": 2220
1429
- },
1430
- {
1431
- "epoch": 5.74,
1432
- "learning_rate": 0.00013547619047619047,
1433
- "loss": 1.5211943209070177e+23,
1434
- "step": 2230
1435
- },
1436
- {
1437
- "epoch": 5.76,
1438
- "learning_rate": 0.00013547619047619047,
1439
- "loss": 6.678915709304036e+21,
1440
- "step": 2240
1441
- },
1442
- {
1443
- "epoch": 5.79,
1444
- "learning_rate": 0.00013531746031746032,
1445
- "loss": 9.238063623264189e+19,
1446
- "step": 2250
1447
- },
1448
- {
1449
- "epoch": 5.81,
1450
- "learning_rate": 0.00013531746031746032,
1451
- "loss": 1.0532014646514701e+20,
1452
- "step": 2260
1453
- },
1454
- {
1455
- "epoch": 5.84,
1456
- "learning_rate": 0.00013531746031746032,
1457
- "loss": 3.161435096687031e+25,
1458
- "step": 2270
1459
- },
1460
- {
1461
- "epoch": 5.86,
1462
- "learning_rate": 0.00013531746031746032,
1463
- "loss": 2.0162940987179532e+19,
1464
- "step": 2280
1465
- },
1466
- {
1467
- "epoch": 5.89,
1468
- "learning_rate": 0.00013531746031746032,
1469
- "loss": 2.55694599151234e+20,
1470
- "step": 2290
1471
- },
1472
- {
1473
- "epoch": 5.92,
1474
- "learning_rate": 0.00013523809523809522,
1475
- "loss": 5.808441058207432e+20,
1476
- "step": 2300
1477
- },
1478
- {
1479
- "epoch": 5.94,
1480
- "learning_rate": 0.00013523809523809522,
1481
- "loss": 1.9880088514154103e+22,
1482
- "step": 2310
1483
- },
1484
- {
1485
- "epoch": 5.97,
1486
- "learning_rate": 0.00013523809523809522,
1487
- "loss": 2.0954874078435546e+24,
1488
- "step": 2320
1489
- },
1490
- {
1491
- "epoch": 5.99,
1492
- "learning_rate": 0.00013523809523809522,
1493
- "loss": 6.309141694629275e+20,
1494
- "step": 2330
1495
- },
1496
- {
1497
- "epoch": 6.02,
1498
- "learning_rate": 0.00013515873015873016,
1499
- "loss": 7.353349497283535e+23,
1500
- "step": 2340
1501
- },
1502
- {
1503
- "epoch": 6.05,
1504
- "learning_rate": 0.00013468253968253966,
1505
- "loss": 0.0,
1506
- "step": 2350
1507
- },
1508
- {
1509
- "epoch": 6.07,
1510
- "learning_rate": 0.00013388888888888888,
1511
- "loss": 0.0,
1512
- "step": 2360
1513
- },
1514
- {
1515
- "epoch": 6.1,
1516
- "learning_rate": 0.00013309523809523806,
1517
- "loss": 0.0,
1518
- "step": 2370
1519
- },
1520
- {
1521
- "epoch": 6.12,
1522
- "learning_rate": 0.00013230158730158728,
1523
- "loss": 0.0,
1524
- "step": 2380
1525
- },
1526
- {
1527
- "epoch": 6.15,
1528
- "learning_rate": 0.0001315079365079365,
1529
- "loss": 0.0,
1530
- "step": 2390
1531
- },
1532
- {
1533
- "epoch": 6.17,
1534
- "learning_rate": 0.0001307142857142857,
1535
- "loss": 0.0,
1536
- "step": 2400
1537
- },
1538
- {
1539
- "epoch": 6.17,
1540
- "eval_loss": NaN,
1541
- "eval_runtime": 164.902,
1542
- "eval_samples_per_second": 12.128,
1543
- "eval_steps_per_second": 1.516,
1544
- "step": 2400
1545
- },
1546
- {
1547
- "epoch": 6.2,
1548
- "learning_rate": 0.0001299206349206349,
1549
- "loss": 0.0,
1550
- "step": 2410
1551
- },
1552
- {
1553
- "epoch": 6.23,
1554
- "learning_rate": 0.0001291269841269841,
1555
- "loss": 0.0,
1556
- "step": 2420
1557
- },
1558
- {
1559
- "epoch": 6.25,
1560
- "learning_rate": 0.00012833333333333333,
1561
- "loss": 0.0,
1562
- "step": 2430
1563
- },
1564
- {
1565
- "epoch": 6.28,
1566
- "learning_rate": 0.00012753968253968254,
1567
- "loss": 0.0,
1568
- "step": 2440
1569
- },
1570
- {
1571
- "epoch": 6.3,
1572
- "learning_rate": 0.00012674603174603173,
1573
- "loss": 0.0,
1574
- "step": 2450
1575
- },
1576
- {
1577
- "epoch": 6.33,
1578
- "learning_rate": 0.00012595238095238094,
1579
- "loss": 0.0,
1580
- "step": 2460
1581
- },
1582
- {
1583
- "epoch": 6.35,
1584
- "learning_rate": 0.00012515873015873013,
1585
- "loss": 0.0,
1586
- "step": 2470
1587
- },
1588
- {
1589
- "epoch": 6.38,
1590
- "learning_rate": 0.00012436507936507935,
1591
- "loss": 0.0,
1592
- "step": 2480
1593
- },
1594
- {
1595
- "epoch": 6.41,
1596
- "learning_rate": 0.00012357142857142856,
1597
- "loss": 0.0,
1598
- "step": 2490
1599
- },
1600
- {
1601
- "epoch": 6.43,
1602
- "learning_rate": 0.00012277777777777778,
1603
- "loss": 0.0,
1604
- "step": 2500
1605
- },
1606
- {
1607
- "epoch": 6.46,
1608
- "learning_rate": 0.00012198412698412697,
1609
- "loss": 0.0,
1610
- "step": 2510
1611
- },
1612
- {
1613
- "epoch": 6.48,
1614
- "learning_rate": 0.00012119047619047618,
1615
- "loss": 0.0,
1616
- "step": 2520
1617
- },
1618
- {
1619
- "epoch": 6.51,
1620
- "learning_rate": 0.00012039682539682538,
1621
- "loss": 0.0,
1622
- "step": 2530
1623
- },
1624
- {
1625
- "epoch": 6.53,
1626
- "learning_rate": 0.0001196031746031746,
1627
- "loss": 0.0,
1628
- "step": 2540
1629
- },
1630
- {
1631
- "epoch": 6.56,
1632
- "learning_rate": 0.0001188095238095238,
1633
- "loss": 0.0,
1634
- "step": 2550
1635
- },
1636
- {
1637
- "epoch": 6.59,
1638
- "learning_rate": 0.00011801587301587301,
1639
- "loss": 0.0,
1640
- "step": 2560
1641
- },
1642
- {
1643
- "epoch": 6.61,
1644
- "learning_rate": 0.0001172222222222222,
1645
- "loss": 0.0,
1646
- "step": 2570
1647
- },
1648
- {
1649
- "epoch": 6.64,
1650
- "learning_rate": 0.00011642857142857142,
1651
- "loss": 0.0,
1652
- "step": 2580
1653
- },
1654
- {
1655
- "epoch": 6.66,
1656
- "learning_rate": 0.00011563492063492062,
1657
- "loss": 0.0,
1658
- "step": 2590
1659
- },
1660
- {
1661
- "epoch": 6.69,
1662
- "learning_rate": 0.00011484126984126983,
1663
- "loss": 0.0,
1664
- "step": 2600
1665
- },
1666
- {
1667
- "epoch": 6.69,
1668
- "eval_loss": NaN,
1669
- "eval_runtime": 164.9294,
1670
- "eval_samples_per_second": 12.126,
1671
- "eval_steps_per_second": 1.516,
1672
- "step": 2600
1673
- },
1674
- {
1675
- "epoch": 6.71,
1676
- "learning_rate": 0.00011404761904761903,
1677
- "loss": 0.0,
1678
- "step": 2610
1679
- },
1680
- {
1681
- "epoch": 6.74,
1682
- "learning_rate": 0.00011325396825396825,
1683
- "loss": 0.0,
1684
- "step": 2620
1685
- },
1686
- {
1687
- "epoch": 6.77,
1688
- "learning_rate": 0.00011246031746031745,
1689
- "loss": 0.0,
1690
- "step": 2630
1691
- },
1692
- {
1693
- "epoch": 6.79,
1694
- "learning_rate": 0.00011166666666666667,
1695
- "loss": 0.0,
1696
- "step": 2640
1697
- },
1698
- {
1699
- "epoch": 6.82,
1700
- "learning_rate": 0.00011087301587301585,
1701
- "loss": 0.0,
1702
- "step": 2650
1703
- },
1704
- {
1705
- "epoch": 6.84,
1706
- "learning_rate": 0.00011007936507936507,
1707
- "loss": 0.0,
1708
- "step": 2660
1709
- },
1710
- {
1711
- "epoch": 6.87,
1712
- "learning_rate": 0.00010928571428571427,
1713
- "loss": 0.0,
1714
- "step": 2670
1715
- },
1716
- {
1717
- "epoch": 6.89,
1718
- "learning_rate": 0.00010849206349206349,
1719
- "loss": 0.0,
1720
- "step": 2680
1721
- },
1722
- {
1723
- "epoch": 6.92,
1724
- "learning_rate": 0.00010769841269841269,
1725
- "loss": 0.0,
1726
- "step": 2690
1727
- },
1728
- {
1729
- "epoch": 6.95,
1730
- "learning_rate": 0.0001069047619047619,
1731
- "loss": 0.0,
1732
- "step": 2700
1733
- },
1734
- {
1735
- "epoch": 6.97,
1736
- "learning_rate": 0.0001061111111111111,
1737
- "loss": 0.0,
1738
- "step": 2710
1739
- },
1740
- {
1741
- "epoch": 7.0,
1742
- "learning_rate": 0.0001053174603174603,
1743
- "loss": 0.0,
1744
- "step": 2720
1745
- },
1746
- {
1747
- "epoch": 7.02,
1748
- "learning_rate": 0.0001045238095238095,
1749
- "loss": 0.0,
1750
- "step": 2730
1751
- },
1752
- {
1753
- "epoch": 7.05,
1754
- "learning_rate": 0.00010373015873015872,
1755
- "loss": 0.0,
1756
- "step": 2740
1757
- },
1758
- {
1759
- "epoch": 7.07,
1760
- "learning_rate": 0.00010293650793650792,
1761
- "loss": 0.0,
1762
- "step": 2750
1763
- },
1764
- {
1765
- "epoch": 7.1,
1766
- "learning_rate": 0.00010214285714285714,
1767
- "loss": 0.0,
1768
- "step": 2760
1769
- },
1770
- {
1771
- "epoch": 7.13,
1772
- "learning_rate": 0.00010134920634920634,
1773
- "loss": 0.0,
1774
- "step": 2770
1775
- },
1776
- {
1777
- "epoch": 7.15,
1778
- "learning_rate": 0.00010055555555555555,
1779
- "loss": 0.0,
1780
- "step": 2780
1781
- },
1782
- {
1783
- "epoch": 7.18,
1784
- "learning_rate": 9.976190476190474e-05,
1785
- "loss": 0.0,
1786
- "step": 2790
1787
- },
1788
- {
1789
- "epoch": 7.2,
1790
- "learning_rate": 9.896825396825396e-05,
1791
- "loss": 0.0,
1792
- "step": 2800
1793
- },
1794
- {
1795
- "epoch": 7.2,
1796
- "eval_loss": NaN,
1797
- "eval_runtime": 164.8859,
1798
- "eval_samples_per_second": 12.13,
1799
- "eval_steps_per_second": 1.516,
1800
- "step": 2800
1801
- },
1802
- {
1803
- "epoch": 7.23,
1804
- "learning_rate": 9.817460317460316e-05,
1805
- "loss": 0.0,
1806
- "step": 2810
1807
- },
1808
- {
1809
- "epoch": 7.25,
1810
- "learning_rate": 9.738095238095237e-05,
1811
- "loss": 0.0,
1812
- "step": 2820
1813
- },
1814
- {
1815
- "epoch": 7.28,
1816
- "learning_rate": 9.658730158730158e-05,
1817
- "loss": 0.0,
1818
- "step": 2830
1819
- },
1820
- {
1821
- "epoch": 7.31,
1822
- "learning_rate": 9.579365079365079e-05,
1823
- "loss": 0.0,
1824
- "step": 2840
1825
- },
1826
- {
1827
- "epoch": 7.33,
1828
- "learning_rate": 9.499999999999999e-05,
1829
- "loss": 0.0,
1830
- "step": 2850
1831
- },
1832
- {
1833
- "epoch": 7.36,
1834
- "learning_rate": 9.42063492063492e-05,
1835
- "loss": 0.0,
1836
- "step": 2860
1837
- },
1838
- {
1839
- "epoch": 7.38,
1840
- "learning_rate": 9.34126984126984e-05,
1841
- "loss": 0.0,
1842
- "step": 2870
1843
- },
1844
- {
1845
- "epoch": 7.41,
1846
- "learning_rate": 9.261904761904761e-05,
1847
- "loss": 0.0,
1848
- "step": 2880
1849
- },
1850
- {
1851
- "epoch": 7.43,
1852
- "learning_rate": 9.182539682539681e-05,
1853
- "loss": 0.0,
1854
- "step": 2890
1855
- },
1856
- {
1857
- "epoch": 7.46,
1858
- "learning_rate": 9.103174603174603e-05,
1859
- "loss": 0.0,
1860
- "step": 2900
1861
- },
1862
- {
1863
- "epoch": 7.49,
1864
- "learning_rate": 9.023809523809523e-05,
1865
- "loss": 0.0,
1866
- "step": 2910
1867
- },
1868
- {
1869
- "epoch": 7.51,
1870
- "learning_rate": 8.944444444444444e-05,
1871
- "loss": 0.0,
1872
- "step": 2920
1873
- },
1874
- {
1875
- "epoch": 7.54,
1876
- "learning_rate": 8.865079365079364e-05,
1877
- "loss": 0.0,
1878
- "step": 2930
1879
- },
1880
- {
1881
- "epoch": 7.56,
1882
- "learning_rate": 8.785714285714286e-05,
1883
- "loss": 0.0,
1884
- "step": 2940
1885
- },
1886
- {
1887
- "epoch": 7.59,
1888
- "learning_rate": 8.706349206349205e-05,
1889
- "loss": 0.0,
1890
- "step": 2950
1891
- },
1892
- {
1893
- "epoch": 7.61,
1894
- "learning_rate": 8.626984126984126e-05,
1895
- "loss": 0.0,
1896
- "step": 2960
1897
- },
1898
- {
1899
- "epoch": 7.64,
1900
- "learning_rate": 8.547619047619046e-05,
1901
- "loss": 0.0,
1902
- "step": 2970
1903
- },
1904
- {
1905
- "epoch": 7.67,
1906
- "learning_rate": 8.468253968253968e-05,
1907
- "loss": 0.0,
1908
- "step": 2980
1909
- },
1910
- {
1911
- "epoch": 7.69,
1912
- "learning_rate": 8.388888888888888e-05,
1913
- "loss": 0.0,
1914
- "step": 2990
1915
- },
1916
- {
1917
- "epoch": 7.72,
1918
- "learning_rate": 8.30952380952381e-05,
1919
- "loss": 0.0,
1920
- "step": 3000
1921
- },
1922
- {
1923
- "epoch": 7.72,
1924
- "eval_loss": NaN,
1925
- "eval_runtime": 164.8915,
1926
- "eval_samples_per_second": 12.129,
1927
- "eval_steps_per_second": 1.516,
1928
- "step": 3000
1929
- },
1930
- {
1931
- "epoch": 7.74,
1932
- "learning_rate": 8.23015873015873e-05,
1933
- "loss": 0.0,
1934
- "step": 3010
1935
- },
1936
- {
1937
- "epoch": 7.77,
1938
- "learning_rate": 8.150793650793651e-05,
1939
- "loss": 0.0,
1940
- "step": 3020
1941
- },
1942
- {
1943
- "epoch": 7.79,
1944
- "learning_rate": 8.07142857142857e-05,
1945
- "loss": 0.0,
1946
- "step": 3030
1947
- },
1948
- {
1949
- "epoch": 7.82,
1950
- "learning_rate": 7.992063492063491e-05,
1951
- "loss": 0.0,
1952
- "step": 3040
1953
- },
1954
- {
1955
- "epoch": 7.85,
1956
- "learning_rate": 7.912698412698412e-05,
1957
- "loss": 0.0,
1958
- "step": 3050
1959
- },
1960
- {
1961
- "epoch": 7.87,
1962
- "learning_rate": 7.833333333333333e-05,
1963
- "loss": 0.0,
1964
- "step": 3060
1965
- },
1966
- {
1967
- "epoch": 7.9,
1968
- "learning_rate": 7.753968253968253e-05,
1969
- "loss": 0.0,
1970
- "step": 3070
1971
- },
1972
- {
1973
- "epoch": 7.92,
1974
- "learning_rate": 7.674603174603175e-05,
1975
- "loss": 0.0,
1976
- "step": 3080
1977
- },
1978
- {
1979
- "epoch": 7.95,
1980
- "learning_rate": 7.595238095238095e-05,
1981
- "loss": 0.0,
1982
- "step": 3090
1983
- },
1984
- {
1985
- "epoch": 7.97,
1986
- "learning_rate": 7.515873015873015e-05,
1987
- "loss": 0.0,
1988
- "step": 3100
1989
- },
1990
- {
1991
- "epoch": 8.0,
1992
- "learning_rate": 7.436507936507935e-05,
1993
- "loss": 0.0,
1994
- "step": 3110
1995
- },
1996
- {
1997
- "epoch": 8.03,
1998
- "learning_rate": 7.357142857142857e-05,
1999
- "loss": 0.0,
2000
- "step": 3120
2001
- },
2002
- {
2003
- "epoch": 8.05,
2004
- "learning_rate": 7.277777777777777e-05,
2005
- "loss": 0.0,
2006
- "step": 3130
2007
- },
2008
- {
2009
- "epoch": 8.08,
2010
- "learning_rate": 7.198412698412697e-05,
2011
- "loss": 0.0,
2012
- "step": 3140
2013
- },
2014
- {
2015
- "epoch": 8.1,
2016
- "learning_rate": 7.119047619047618e-05,
2017
- "loss": 0.0,
2018
- "step": 3150
2019
- },
2020
- {
2021
- "epoch": 8.13,
2022
- "learning_rate": 7.039682539682539e-05,
2023
- "loss": 0.0,
2024
- "step": 3160
2025
- },
2026
- {
2027
- "epoch": 8.15,
2028
- "learning_rate": 6.960317460317459e-05,
2029
- "loss": 0.0,
2030
- "step": 3170
2031
- },
2032
- {
2033
- "epoch": 8.18,
2034
- "learning_rate": 6.88095238095238e-05,
2035
- "loss": 0.0,
2036
- "step": 3180
2037
- },
2038
- {
2039
- "epoch": 8.21,
2040
- "learning_rate": 6.8015873015873e-05,
2041
- "loss": 0.0,
2042
- "step": 3190
2043
- },
2044
- {
2045
- "epoch": 8.23,
2046
- "learning_rate": 6.722222222222222e-05,
2047
- "loss": 0.0,
2048
- "step": 3200
2049
- },
2050
- {
2051
- "epoch": 8.23,
2052
- "eval_loss": NaN,
2053
- "eval_runtime": 164.9214,
2054
- "eval_samples_per_second": 12.127,
2055
- "eval_steps_per_second": 1.516,
2056
- "step": 3200
2057
- },
2058
- {
2059
- "epoch": 8.26,
2060
- "learning_rate": 6.642857142857142e-05,
2061
- "loss": 0.0,
2062
- "step": 3210
2063
- },
2064
- {
2065
- "epoch": 8.28,
2066
- "learning_rate": 6.563492063492062e-05,
2067
- "loss": 0.0,
2068
- "step": 3220
2069
- },
2070
- {
2071
- "epoch": 8.31,
2072
- "learning_rate": 6.484126984126984e-05,
2073
- "loss": 0.0,
2074
- "step": 3230
2075
- },
2076
- {
2077
- "epoch": 8.33,
2078
- "learning_rate": 6.404761904761904e-05,
2079
- "loss": 0.0,
2080
- "step": 3240
2081
- },
2082
- {
2083
- "epoch": 8.36,
2084
- "learning_rate": 6.325396825396824e-05,
2085
- "loss": 0.0,
2086
- "step": 3250
2087
- },
2088
- {
2089
- "epoch": 8.39,
2090
- "learning_rate": 6.246031746031746e-05,
2091
- "loss": 0.0,
2092
- "step": 3260
2093
- },
2094
- {
2095
- "epoch": 8.41,
2096
- "learning_rate": 6.166666666666666e-05,
2097
- "loss": 0.0,
2098
- "step": 3270
2099
- },
2100
- {
2101
- "epoch": 8.44,
2102
- "learning_rate": 6.0873015873015865e-05,
2103
- "loss": 0.0,
2104
- "step": 3280
2105
- },
2106
- {
2107
- "epoch": 8.46,
2108
- "learning_rate": 6.007936507936507e-05,
2109
- "loss": 0.0,
2110
- "step": 3290
2111
- },
2112
- {
2113
- "epoch": 8.49,
2114
- "learning_rate": 5.9285714285714275e-05,
2115
- "loss": 0.0,
2116
- "step": 3300
2117
- },
2118
- {
2119
- "epoch": 8.51,
2120
- "learning_rate": 5.849206349206348e-05,
2121
- "loss": 0.0,
2122
- "step": 3310
2123
- },
2124
- {
2125
- "epoch": 8.54,
2126
- "learning_rate": 5.769841269841269e-05,
2127
- "loss": 0.0,
2128
- "step": 3320
2129
- },
2130
- {
2131
- "epoch": 8.57,
2132
- "learning_rate": 5.69047619047619e-05,
2133
- "loss": 0.0,
2134
- "step": 3330
2135
- },
2136
- {
2137
- "epoch": 8.59,
2138
- "learning_rate": 5.61111111111111e-05,
2139
- "loss": 0.0,
2140
- "step": 3340
2141
- },
2142
- {
2143
- "epoch": 8.62,
2144
- "learning_rate": 5.531746031746031e-05,
2145
- "loss": 0.0,
2146
- "step": 3350
2147
- },
2148
- {
2149
- "epoch": 8.64,
2150
- "learning_rate": 5.452380952380952e-05,
2151
- "loss": 0.0,
2152
- "step": 3360
2153
- },
2154
- {
2155
- "epoch": 8.67,
2156
- "learning_rate": 5.3730158730158726e-05,
2157
- "loss": 0.0,
2158
- "step": 3370
2159
- },
2160
- {
2161
- "epoch": 8.69,
2162
- "learning_rate": 5.293650793650793e-05,
2163
- "loss": 0.0,
2164
- "step": 3380
2165
- },
2166
- {
2167
- "epoch": 8.72,
2168
- "learning_rate": 5.2142857142857135e-05,
2169
- "loss": 0.0,
2170
- "step": 3390
2171
- },
2172
- {
2173
- "epoch": 8.75,
2174
- "learning_rate": 5.1349206349206344e-05,
2175
- "loss": 0.0,
2176
- "step": 3400
2177
- },
2178
- {
2179
- "epoch": 8.75,
2180
- "eval_loss": NaN,
2181
- "eval_runtime": 164.9331,
2182
- "eval_samples_per_second": 12.126,
2183
- "eval_steps_per_second": 1.516,
2184
- "step": 3400
2185
- },
2186
- {
2187
- "epoch": 8.77,
2188
- "learning_rate": 5.055555555555555e-05,
2189
- "loss": 0.0,
2190
- "step": 3410
2191
- },
2192
- {
2193
- "epoch": 8.8,
2194
- "learning_rate": 4.976190476190475e-05,
2195
- "loss": 0.0,
2196
- "step": 3420
2197
- },
2198
- {
2199
- "epoch": 8.82,
2200
- "learning_rate": 4.896825396825396e-05,
2201
- "loss": 0.0,
2202
- "step": 3430
2203
- },
2204
- {
2205
- "epoch": 8.85,
2206
- "learning_rate": 4.817460317460317e-05,
2207
- "loss": 0.0,
2208
- "step": 3440
2209
- },
2210
- {
2211
- "epoch": 8.87,
2212
- "learning_rate": 4.738095238095238e-05,
2213
- "loss": 0.0,
2214
- "step": 3450
2215
- },
2216
- {
2217
- "epoch": 8.9,
2218
- "learning_rate": 4.658730158730158e-05,
2219
- "loss": 0.0,
2220
- "step": 3460
2221
- },
2222
- {
2223
- "epoch": 8.93,
2224
- "learning_rate": 4.579365079365079e-05,
2225
- "loss": 0.0,
2226
- "step": 3470
2227
- },
2228
- {
2229
- "epoch": 8.95,
2230
- "learning_rate": 4.4999999999999996e-05,
2231
- "loss": 0.0,
2232
- "step": 3480
2233
- },
2234
- {
2235
- "epoch": 8.98,
2236
- "learning_rate": 4.42063492063492e-05,
2237
- "loss": 0.0,
2238
- "step": 3490
2239
- },
2240
- {
2241
- "epoch": 9.0,
2242
- "learning_rate": 4.3412698412698406e-05,
2243
- "loss": 0.0,
2244
- "step": 3500
2245
- },
2246
- {
2247
- "epoch": 9.03,
2248
- "learning_rate": 4.2619047619047614e-05,
2249
- "loss": 0.0,
2250
- "step": 3510
2251
- },
2252
- {
2253
- "epoch": 9.05,
2254
- "learning_rate": 4.182539682539682e-05,
2255
- "loss": 0.0,
2256
- "step": 3520
2257
- },
2258
- {
2259
- "epoch": 9.08,
2260
- "learning_rate": 4.1031746031746024e-05,
2261
- "loss": 0.0,
2262
- "step": 3530
2263
- },
2264
- {
2265
- "epoch": 9.11,
2266
- "learning_rate": 4.023809523809523e-05,
2267
- "loss": 0.0,
2268
- "step": 3540
2269
- },
2270
- {
2271
- "epoch": 9.13,
2272
- "learning_rate": 3.944444444444444e-05,
2273
- "loss": 0.0,
2274
- "step": 3550
2275
- },
2276
- {
2277
- "epoch": 9.16,
2278
- "learning_rate": 3.865079365079365e-05,
2279
- "loss": 0.0,
2280
- "step": 3560
2281
- },
2282
- {
2283
- "epoch": 9.18,
2284
- "learning_rate": 3.785714285714285e-05,
2285
- "loss": 0.0,
2286
- "step": 3570
2287
- },
2288
- {
2289
- "epoch": 9.21,
2290
- "learning_rate": 3.706349206349206e-05,
2291
- "loss": 0.0,
2292
- "step": 3580
2293
- },
2294
- {
2295
- "epoch": 9.23,
2296
- "learning_rate": 3.6269841269841266e-05,
2297
- "loss": 0.0,
2298
- "step": 3590
2299
- },
2300
- {
2301
- "epoch": 9.26,
2302
- "learning_rate": 3.5476190476190475e-05,
2303
- "loss": 0.0,
2304
- "step": 3600
2305
- },
2306
- {
2307
- "epoch": 9.26,
2308
- "eval_loss": NaN,
2309
- "eval_runtime": 164.9466,
2310
- "eval_samples_per_second": 12.125,
2311
- "eval_steps_per_second": 1.516,
2312
- "step": 3600
2313
- }
2314
- ],
2315
- "max_steps": 3880,
2316
- "num_train_epochs": 10,
2317
- "total_flos": 4.1583436794093896e+18,
2318
- "trial_name": null,
2319
- "trial_params": null
2320
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-3600/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e9adb78996a536c4aa514741768e2b05cafc3e20ac4a0a0fe98e38b91109396
3
- size 3899
 
 
 
 
checkpoint-3800/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:18c2f0237ccfb8afd6cd7875393f49b2693b0535a82600fa7a3ed83c4fb9d324
3
- size 134433093
 
 
 
 
checkpoint-3800/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:58653c0888eea93a18cfef68476391e6cf3aaabd0a866d5bd1a63232af5da325
3
- size 67201357
 
 
 
 
checkpoint-3800/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:94a7728da6a7fa81be20f01daede1f4c98ccfa8b9f494eba5cd524ec56cdd102
3
- size 14575
 
 
 
 
checkpoint-3800/scaler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7fa181fa360d46feed4180ea17c8b6a4a879a9b4231c2e91aff2be20be9076cc
3
- size 557
 
 
 
 
checkpoint-3800/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:34b8c336e51219d5b13963d925f0201df2c0b333a86c61eb7fe22364210b844d
3
- size 627
 
 
 
 
checkpoint-3800/trainer_state.json DELETED
@@ -1,2448 +0,0 @@
1
- {
2
- "best_metric": 0.8923280239105225,
3
- "best_model_checkpoint": "./lora-alpaca-hc8/checkpoint-1400",
4
- "epoch": 9.77491961414791,
5
- "global_step": 3800,
6
- "is_hyper_param_search": false,
7
- "is_local_process_zero": true,
8
- "is_world_process_zero": true,
9
- "log_history": [
10
- {
11
- "epoch": 0.03,
12
- "learning_rate": 2.9999999999999997e-05,
13
- "loss": 1.3719,
14
- "step": 10
15
- },
16
- {
17
- "epoch": 0.05,
18
- "learning_rate": 5.9999999999999995e-05,
19
- "loss": 1.5358,
20
- "step": 20
21
- },
22
- {
23
- "epoch": 0.08,
24
- "learning_rate": 8.999999999999999e-05,
25
- "loss": 1.5633,
26
- "step": 30
27
- },
28
- {
29
- "epoch": 0.1,
30
- "learning_rate": 0.00011999999999999999,
31
- "loss": 1.2433,
32
- "step": 40
33
- },
34
- {
35
- "epoch": 0.13,
36
- "learning_rate": 0.00015,
37
- "loss": 0.9843,
38
- "step": 50
39
- },
40
- {
41
- "epoch": 0.15,
42
- "learning_rate": 0.00017999999999999998,
43
- "loss": 1.012,
44
- "step": 60
45
- },
46
- {
47
- "epoch": 0.18,
48
- "learning_rate": 0.00020999999999999998,
49
- "loss": 1.0392,
50
- "step": 70
51
- },
52
- {
53
- "epoch": 0.21,
54
- "learning_rate": 0.00023999999999999998,
55
- "loss": 1.0541,
56
- "step": 80
57
- },
58
- {
59
- "epoch": 0.23,
60
- "learning_rate": 0.00027,
61
- "loss": 0.9608,
62
- "step": 90
63
- },
64
- {
65
- "epoch": 0.26,
66
- "learning_rate": 0.0003,
67
- "loss": 0.8697,
68
- "step": 100
69
- },
70
- {
71
- "epoch": 0.28,
72
- "learning_rate": 0.00029920634920634916,
73
- "loss": 0.9836,
74
- "step": 110
75
- },
76
- {
77
- "epoch": 0.31,
78
- "learning_rate": 0.00029841269841269835,
79
- "loss": 1.003,
80
- "step": 120
81
- },
82
- {
83
- "epoch": 0.33,
84
- "learning_rate": 0.0002976190476190476,
85
- "loss": 1.0052,
86
- "step": 130
87
- },
88
- {
89
- "epoch": 0.36,
90
- "learning_rate": 0.0002968253968253968,
91
- "loss": 0.9267,
92
- "step": 140
93
- },
94
- {
95
- "epoch": 0.39,
96
- "learning_rate": 0.000296031746031746,
97
- "loss": 0.8318,
98
- "step": 150
99
- },
100
- {
101
- "epoch": 0.41,
102
- "learning_rate": 0.0002952380952380952,
103
- "loss": 0.9594,
104
- "step": 160
105
- },
106
- {
107
- "epoch": 0.44,
108
- "learning_rate": 0.00029444444444444445,
109
- "loss": 0.987,
110
- "step": 170
111
- },
112
- {
113
- "epoch": 0.46,
114
- "learning_rate": 0.00029365079365079364,
115
- "loss": 0.9646,
116
- "step": 180
117
- },
118
- {
119
- "epoch": 0.49,
120
- "learning_rate": 0.00029285714285714283,
121
- "loss": 0.8501,
122
- "step": 190
123
- },
124
- {
125
- "epoch": 0.51,
126
- "learning_rate": 0.000292063492063492,
127
- "loss": 0.7568,
128
- "step": 200
129
- },
130
- {
131
- "epoch": 0.51,
132
- "eval_loss": 0.9958714246749878,
133
- "eval_runtime": 189.2223,
134
- "eval_samples_per_second": 10.57,
135
- "eval_steps_per_second": 1.321,
136
- "step": 200
137
- },
138
- {
139
- "epoch": 0.54,
140
- "learning_rate": 0.00029126984126984126,
141
- "loss": 0.949,
142
- "step": 210
143
- },
144
- {
145
- "epoch": 0.57,
146
- "learning_rate": 0.00029047619047619045,
147
- "loss": 0.9581,
148
- "step": 220
149
- },
150
- {
151
- "epoch": 0.59,
152
- "learning_rate": 0.00028968253968253963,
153
- "loss": 0.9526,
154
- "step": 230
155
- },
156
- {
157
- "epoch": 0.62,
158
- "learning_rate": 0.0002888888888888888,
159
- "loss": 0.847,
160
- "step": 240
161
- },
162
- {
163
- "epoch": 0.64,
164
- "learning_rate": 0.00028809523809523806,
165
- "loss": 0.7414,
166
- "step": 250
167
- },
168
- {
169
- "epoch": 0.67,
170
- "learning_rate": 0.00028730158730158725,
171
- "loss": 0.9449,
172
- "step": 260
173
- },
174
- {
175
- "epoch": 0.69,
176
- "learning_rate": 0.0002865079365079365,
177
- "loss": 0.9607,
178
- "step": 270
179
- },
180
- {
181
- "epoch": 0.72,
182
- "learning_rate": 0.0002857142857142857,
183
- "loss": 0.9456,
184
- "step": 280
185
- },
186
- {
187
- "epoch": 0.75,
188
- "learning_rate": 0.0002849206349206349,
189
- "loss": 0.8346,
190
- "step": 290
191
- },
192
- {
193
- "epoch": 0.77,
194
- "learning_rate": 0.0002841269841269841,
195
- "loss": 0.7275,
196
- "step": 300
197
- },
198
- {
199
- "epoch": 0.8,
200
- "learning_rate": 0.0002833333333333333,
201
- "loss": 0.9337,
202
- "step": 310
203
- },
204
- {
205
- "epoch": 0.82,
206
- "learning_rate": 0.0002825396825396825,
207
- "loss": 0.9466,
208
- "step": 320
209
- },
210
- {
211
- "epoch": 0.85,
212
- "learning_rate": 0.00028174603174603173,
213
- "loss": 0.9386,
214
- "step": 330
215
- },
216
- {
217
- "epoch": 0.87,
218
- "learning_rate": 0.0002809523809523809,
219
- "loss": 0.8254,
220
- "step": 340
221
- },
222
- {
223
- "epoch": 0.9,
224
- "learning_rate": 0.0002801587301587301,
225
- "loss": 0.723,
226
- "step": 350
227
- },
228
- {
229
- "epoch": 0.93,
230
- "learning_rate": 0.00027936507936507935,
231
- "loss": 0.9274,
232
- "step": 360
233
- },
234
- {
235
- "epoch": 0.95,
236
- "learning_rate": 0.00027857142857142854,
237
- "loss": 0.9492,
238
- "step": 370
239
- },
240
- {
241
- "epoch": 0.98,
242
- "learning_rate": 0.0002777777777777778,
243
- "loss": 0.871,
244
- "step": 380
245
- },
246
- {
247
- "epoch": 1.0,
248
- "learning_rate": 0.00027698412698412697,
249
- "loss": 0.7533,
250
- "step": 390
251
- },
252
- {
253
- "epoch": 1.03,
254
- "learning_rate": 0.00027619047619047615,
255
- "loss": 0.9284,
256
- "step": 400
257
- },
258
- {
259
- "epoch": 1.03,
260
- "eval_loss": 0.9276881217956543,
261
- "eval_runtime": 189.2181,
262
- "eval_samples_per_second": 10.57,
263
- "eval_steps_per_second": 1.321,
264
- "step": 400
265
- },
266
- {
267
- "epoch": 1.05,
268
- "learning_rate": 0.0002753968253968254,
269
- "loss": 0.9402,
270
- "step": 410
271
- },
272
- {
273
- "epoch": 1.08,
274
- "learning_rate": 0.0002746031746031746,
275
- "loss": 0.9188,
276
- "step": 420
277
- },
278
- {
279
- "epoch": 1.11,
280
- "learning_rate": 0.00027380952380952377,
281
- "loss": 0.7976,
282
- "step": 430
283
- },
284
- {
285
- "epoch": 1.13,
286
- "learning_rate": 0.00027301587301587296,
287
- "loss": 0.7172,
288
- "step": 440
289
- },
290
- {
291
- "epoch": 1.16,
292
- "learning_rate": 0.0002722222222222222,
293
- "loss": 0.9195,
294
- "step": 450
295
- },
296
- {
297
- "epoch": 1.18,
298
- "learning_rate": 0.0002714285714285714,
299
- "loss": 0.9426,
300
- "step": 460
301
- },
302
- {
303
- "epoch": 1.21,
304
- "learning_rate": 0.00027063492063492063,
305
- "loss": 0.9034,
306
- "step": 470
307
- },
308
- {
309
- "epoch": 1.23,
310
- "learning_rate": 0.0002698412698412698,
311
- "loss": 0.788,
312
- "step": 480
313
- },
314
- {
315
- "epoch": 1.26,
316
- "learning_rate": 0.000269047619047619,
317
- "loss": 0.7213,
318
- "step": 490
319
- },
320
- {
321
- "epoch": 1.29,
322
- "learning_rate": 0.00026825396825396825,
323
- "loss": 0.9149,
324
- "step": 500
325
- },
326
- {
327
- "epoch": 1.31,
328
- "learning_rate": 0.00026746031746031744,
329
- "loss": 0.9386,
330
- "step": 510
331
- },
332
- {
333
- "epoch": 1.34,
334
- "learning_rate": 0.0002666666666666666,
335
- "loss": 0.9099,
336
- "step": 520
337
- },
338
- {
339
- "epoch": 1.36,
340
- "learning_rate": 0.00026587301587301587,
341
- "loss": 0.7802,
342
- "step": 530
343
- },
344
- {
345
- "epoch": 1.39,
346
- "learning_rate": 0.00026507936507936506,
347
- "loss": 0.7246,
348
- "step": 540
349
- },
350
- {
351
- "epoch": 1.41,
352
- "learning_rate": 0.00026428571428571424,
353
- "loss": 0.9213,
354
- "step": 550
355
- },
356
- {
357
- "epoch": 1.44,
358
- "learning_rate": 0.00026349206349206343,
359
- "loss": 0.9348,
360
- "step": 560
361
- },
362
- {
363
- "epoch": 1.47,
364
- "learning_rate": 0.0002626984126984127,
365
- "loss": 0.8995,
366
- "step": 570
367
- },
368
- {
369
- "epoch": 1.49,
370
- "learning_rate": 0.00026190476190476186,
371
- "loss": 0.7866,
372
- "step": 580
373
- },
374
- {
375
- "epoch": 1.52,
376
- "learning_rate": 0.0002611111111111111,
377
- "loss": 0.7257,
378
- "step": 590
379
- },
380
- {
381
- "epoch": 1.54,
382
- "learning_rate": 0.0002603174603174603,
383
- "loss": 0.918,
384
- "step": 600
385
- },
386
- {
387
- "epoch": 1.54,
388
- "eval_loss": 0.9236659407615662,
389
- "eval_runtime": 189.1236,
390
- "eval_samples_per_second": 10.575,
391
- "eval_steps_per_second": 1.322,
392
- "step": 600
393
- },
394
- {
395
- "epoch": 1.57,
396
- "learning_rate": 0.00025952380952380953,
397
- "loss": 0.933,
398
- "step": 610
399
- },
400
- {
401
- "epoch": 1.59,
402
- "learning_rate": 0.0002587301587301587,
403
- "loss": 0.9085,
404
- "step": 620
405
- },
406
- {
407
- "epoch": 1.62,
408
- "learning_rate": 0.0002579365079365079,
409
- "loss": 0.7928,
410
- "step": 630
411
- },
412
- {
413
- "epoch": 1.65,
414
- "learning_rate": 0.0002571428571428571,
415
- "loss": 0.7162,
416
- "step": 640
417
- },
418
- {
419
- "epoch": 1.67,
420
- "learning_rate": 0.00025634920634920634,
421
- "loss": 0.9076,
422
- "step": 650
423
- },
424
- {
425
- "epoch": 1.7,
426
- "learning_rate": 0.00025555555555555553,
427
- "loss": 0.9345,
428
- "step": 660
429
- },
430
- {
431
- "epoch": 1.72,
432
- "learning_rate": 0.0002547619047619047,
433
- "loss": 0.9107,
434
- "step": 670
435
- },
436
- {
437
- "epoch": 1.75,
438
- "learning_rate": 0.00025396825396825396,
439
- "loss": 0.7721,
440
- "step": 680
441
- },
442
- {
443
- "epoch": 1.77,
444
- "learning_rate": 0.00025317460317460315,
445
- "loss": 0.7112,
446
- "step": 690
447
- },
448
- {
449
- "epoch": 1.8,
450
- "learning_rate": 0.0002523809523809524,
451
- "loss": 0.9118,
452
- "step": 700
453
- },
454
- {
455
- "epoch": 1.83,
456
- "learning_rate": 0.0002515873015873016,
457
- "loss": 0.9205,
458
- "step": 710
459
- },
460
- {
461
- "epoch": 1.85,
462
- "learning_rate": 0.00025079365079365076,
463
- "loss": 0.9004,
464
- "step": 720
465
- },
466
- {
467
- "epoch": 1.88,
468
- "learning_rate": 0.00025,
469
- "loss": 0.7741,
470
- "step": 730
471
- },
472
- {
473
- "epoch": 1.9,
474
- "learning_rate": 0.0002492063492063492,
475
- "loss": 0.7186,
476
- "step": 740
477
- },
478
- {
479
- "epoch": 1.93,
480
- "learning_rate": 0.0002484126984126984,
481
- "loss": 0.9002,
482
- "step": 750
483
- },
484
- {
485
- "epoch": 1.95,
486
- "learning_rate": 0.00024761904761904757,
487
- "loss": 0.9066,
488
- "step": 760
489
- },
490
- {
491
- "epoch": 1.98,
492
- "learning_rate": 0.0002468253968253968,
493
- "loss": 0.8127,
494
- "step": 770
495
- },
496
- {
497
- "epoch": 2.01,
498
- "learning_rate": 0.000246031746031746,
499
- "loss": 0.7305,
500
- "step": 780
501
- },
502
- {
503
- "epoch": 2.03,
504
- "learning_rate": 0.0002452380952380952,
505
- "loss": 0.8921,
506
- "step": 790
507
- },
508
- {
509
- "epoch": 2.06,
510
- "learning_rate": 0.00024444444444444443,
511
- "loss": 0.9178,
512
- "step": 800
513
- },
514
- {
515
- "epoch": 2.06,
516
- "eval_loss": 0.901778519153595,
517
- "eval_runtime": 189.2038,
518
- "eval_samples_per_second": 10.571,
519
- "eval_steps_per_second": 1.321,
520
- "step": 800
521
- },
522
- {
523
- "epoch": 2.08,
524
- "learning_rate": 0.00024365079365079364,
525
- "loss": 0.8823,
526
- "step": 810
527
- },
528
- {
529
- "epoch": 2.11,
530
- "learning_rate": 0.00024285714285714283,
531
- "loss": 0.7521,
532
- "step": 820
533
- },
534
- {
535
- "epoch": 2.14,
536
- "learning_rate": 0.00024206349206349205,
537
- "loss": 0.717,
538
- "step": 830
539
- },
540
- {
541
- "epoch": 2.16,
542
- "learning_rate": 0.00024126984126984123,
543
- "loss": 0.9045,
544
- "step": 840
545
- },
546
- {
547
- "epoch": 2.19,
548
- "learning_rate": 0.00024047619047619048,
549
- "loss": 0.9146,
550
- "step": 850
551
- },
552
- {
553
- "epoch": 2.21,
554
- "learning_rate": 0.00023968253968253966,
555
- "loss": 0.8678,
556
- "step": 860
557
- },
558
- {
559
- "epoch": 2.24,
560
- "learning_rate": 0.00023888888888888885,
561
- "loss": 0.745,
562
- "step": 870
563
- },
564
- {
565
- "epoch": 2.26,
566
- "learning_rate": 0.00023809523809523807,
567
- "loss": 0.7226,
568
- "step": 880
569
- },
570
- {
571
- "epoch": 2.29,
572
- "learning_rate": 0.00023730158730158728,
573
- "loss": 0.9002,
574
- "step": 890
575
- },
576
- {
577
- "epoch": 2.32,
578
- "learning_rate": 0.0002365079365079365,
579
- "loss": 0.9035,
580
- "step": 900
581
- },
582
- {
583
- "epoch": 2.34,
584
- "learning_rate": 0.00023571428571428569,
585
- "loss": 0.878,
586
- "step": 910
587
- },
588
- {
589
- "epoch": 2.37,
590
- "learning_rate": 0.00023492063492063487,
591
- "loss": 0.7397,
592
- "step": 920
593
- },
594
- {
595
- "epoch": 2.39,
596
- "learning_rate": 0.00023412698412698412,
597
- "loss": 0.7251,
598
- "step": 930
599
- },
600
- {
601
- "epoch": 2.42,
602
- "learning_rate": 0.0002333333333333333,
603
- "loss": 0.9014,
604
- "step": 940
605
- },
606
- {
607
- "epoch": 2.44,
608
- "learning_rate": 0.00023253968253968252,
609
- "loss": 0.9158,
610
- "step": 950
611
- },
612
- {
613
- "epoch": 2.47,
614
- "learning_rate": 0.0002317460317460317,
615
- "loss": 0.8596,
616
- "step": 960
617
- },
618
- {
619
- "epoch": 2.5,
620
- "learning_rate": 0.00023095238095238095,
621
- "loss": 0.7312,
622
- "step": 970
623
- },
624
- {
625
- "epoch": 2.52,
626
- "learning_rate": 0.00023015873015873014,
627
- "loss": 0.7271,
628
- "step": 980
629
- },
630
- {
631
- "epoch": 2.55,
632
- "learning_rate": 0.00022936507936507935,
633
- "loss": 0.9007,
634
- "step": 990
635
- },
636
- {
637
- "epoch": 2.57,
638
- "learning_rate": 0.00022857142857142854,
639
- "loss": 0.9186,
640
- "step": 1000
641
- },
642
- {
643
- "epoch": 2.57,
644
- "eval_loss": 0.8995742201805115,
645
- "eval_runtime": 189.2401,
646
- "eval_samples_per_second": 10.569,
647
- "eval_steps_per_second": 1.321,
648
- "step": 1000
649
- },
650
- {
651
- "epoch": 2.6,
652
- "learning_rate": 0.00022777777777777778,
653
- "loss": 0.8685,
654
- "step": 1010
655
- },
656
- {
657
- "epoch": 2.62,
658
- "learning_rate": 0.00022698412698412697,
659
- "loss": 0.7359,
660
- "step": 1020
661
- },
662
- {
663
- "epoch": 2.65,
664
- "learning_rate": 0.00022619047619047616,
665
- "loss": 0.7166,
666
- "step": 1030
667
- },
668
- {
669
- "epoch": 2.68,
670
- "learning_rate": 0.00022539682539682537,
671
- "loss": 0.9012,
672
- "step": 1040
673
- },
674
- {
675
- "epoch": 2.7,
676
- "learning_rate": 0.0002246031746031746,
677
- "loss": 0.9195,
678
- "step": 1050
679
- },
680
- {
681
- "epoch": 2.73,
682
- "learning_rate": 0.0002238095238095238,
683
- "loss": 0.8733,
684
- "step": 1060
685
- },
686
- {
687
- "epoch": 2.75,
688
- "learning_rate": 0.000223015873015873,
689
- "loss": 0.7488,
690
- "step": 1070
691
- },
692
- {
693
- "epoch": 2.78,
694
- "learning_rate": 0.00022222222222222218,
695
- "loss": 0.7223,
696
- "step": 1080
697
- },
698
- {
699
- "epoch": 2.8,
700
- "learning_rate": 0.00022142857142857142,
701
- "loss": 0.9034,
702
- "step": 1090
703
- },
704
- {
705
- "epoch": 2.83,
706
- "learning_rate": 0.0002206349206349206,
707
- "loss": 0.9174,
708
- "step": 1100
709
- },
710
- {
711
- "epoch": 2.86,
712
- "learning_rate": 0.00021984126984126982,
713
- "loss": 0.868,
714
- "step": 1110
715
- },
716
- {
717
- "epoch": 2.88,
718
- "learning_rate": 0.000219047619047619,
719
- "loss": 0.7394,
720
- "step": 1120
721
- },
722
- {
723
- "epoch": 2.91,
724
- "learning_rate": 0.00021825396825396825,
725
- "loss": 0.7198,
726
- "step": 1130
727
- },
728
- {
729
- "epoch": 2.93,
730
- "learning_rate": 0.00021746031746031744,
731
- "loss": 0.9085,
732
- "step": 1140
733
- },
734
- {
735
- "epoch": 2.96,
736
- "learning_rate": 0.00021666666666666666,
737
- "loss": 0.9148,
738
- "step": 1150
739
- },
740
- {
741
- "epoch": 2.98,
742
- "learning_rate": 0.00021587301587301584,
743
- "loss": 0.7775,
744
- "step": 1160
745
- },
746
- {
747
- "epoch": 3.01,
748
- "learning_rate": 0.0002150793650793651,
749
- "loss": 0.7366,
750
- "step": 1170
751
- },
752
- {
753
- "epoch": 3.04,
754
- "learning_rate": 0.00021428571428571427,
755
- "loss": 0.8865,
756
- "step": 1180
757
- },
758
- {
759
- "epoch": 3.06,
760
- "learning_rate": 0.00021349206349206346,
761
- "loss": 0.8989,
762
- "step": 1190
763
- },
764
- {
765
- "epoch": 3.09,
766
- "learning_rate": 0.00021269841269841268,
767
- "loss": 0.8376,
768
- "step": 1200
769
- },
770
- {
771
- "epoch": 3.09,
772
- "eval_loss": 0.8965018391609192,
773
- "eval_runtime": 189.2693,
774
- "eval_samples_per_second": 10.567,
775
- "eval_steps_per_second": 1.321,
776
- "step": 1200
777
- },
778
- {
779
- "epoch": 3.11,
780
- "learning_rate": 0.0002119047619047619,
781
- "loss": 0.7012,
782
- "step": 1210
783
- },
784
- {
785
- "epoch": 3.14,
786
- "learning_rate": 0.0002111111111111111,
787
- "loss": 0.7288,
788
- "step": 1220
789
- },
790
- {
791
- "epoch": 3.16,
792
- "learning_rate": 0.0002103174603174603,
793
- "loss": 0.8904,
794
- "step": 1230
795
- },
796
- {
797
- "epoch": 3.19,
798
- "learning_rate": 0.00020952380952380948,
799
- "loss": 0.9081,
800
- "step": 1240
801
- },
802
- {
803
- "epoch": 3.22,
804
- "learning_rate": 0.00020873015873015873,
805
- "loss": 0.8461,
806
- "step": 1250
807
- },
808
- {
809
- "epoch": 3.24,
810
- "learning_rate": 0.00020793650793650791,
811
- "loss": 0.6997,
812
- "step": 1260
813
- },
814
- {
815
- "epoch": 3.27,
816
- "learning_rate": 0.00020714285714285713,
817
- "loss": 0.7189,
818
- "step": 1270
819
- },
820
- {
821
- "epoch": 3.29,
822
- "learning_rate": 0.00020634920634920632,
823
- "loss": 0.8863,
824
- "step": 1280
825
- },
826
- {
827
- "epoch": 3.32,
828
- "learning_rate": 0.00020555555555555556,
829
- "loss": 0.906,
830
- "step": 1290
831
- },
832
- {
833
- "epoch": 3.34,
834
- "learning_rate": 0.00020476190476190475,
835
- "loss": 0.8287,
836
- "step": 1300
837
- },
838
- {
839
- "epoch": 3.37,
840
- "learning_rate": 0.00020396825396825393,
841
- "loss": 0.7015,
842
- "step": 1310
843
- },
844
- {
845
- "epoch": 3.4,
846
- "learning_rate": 0.00020317460317460315,
847
- "loss": 0.7325,
848
- "step": 1320
849
- },
850
- {
851
- "epoch": 3.42,
852
- "learning_rate": 0.00020238095238095236,
853
- "loss": 0.8878,
854
- "step": 1330
855
- },
856
- {
857
- "epoch": 3.45,
858
- "learning_rate": 0.00020158730158730158,
859
- "loss": 0.9057,
860
- "step": 1340
861
- },
862
- {
863
- "epoch": 3.47,
864
- "learning_rate": 0.00020079365079365077,
865
- "loss": 0.8399,
866
- "step": 1350
867
- },
868
- {
869
- "epoch": 3.5,
870
- "learning_rate": 0.00019999999999999998,
871
- "loss": 0.7073,
872
- "step": 1360
873
- },
874
- {
875
- "epoch": 3.52,
876
- "learning_rate": 0.0001992063492063492,
877
- "loss": 0.7281,
878
- "step": 1370
879
- },
880
- {
881
- "epoch": 3.55,
882
- "learning_rate": 0.0001984126984126984,
883
- "loss": 0.8829,
884
- "step": 1380
885
- },
886
- {
887
- "epoch": 3.58,
888
- "learning_rate": 0.0001976190476190476,
889
- "loss": 0.8923,
890
- "step": 1390
891
- },
892
- {
893
- "epoch": 3.6,
894
- "learning_rate": 0.0001968253968253968,
895
- "loss": 0.8389,
896
- "step": 1400
897
- },
898
- {
899
- "epoch": 3.6,
900
- "eval_loss": 0.8923280239105225,
901
- "eval_runtime": 189.1693,
902
- "eval_samples_per_second": 10.573,
903
- "eval_steps_per_second": 1.322,
904
- "step": 1400
905
- },
906
- {
907
- "epoch": 3.63,
908
- "learning_rate": 0.00019603174603174603,
909
- "loss": 0.7148,
910
- "step": 1410
911
- },
912
- {
913
- "epoch": 3.65,
914
- "learning_rate": 0.00019523809523809522,
915
- "loss": 0.7331,
916
- "step": 1420
917
- },
918
- {
919
- "epoch": 3.68,
920
- "learning_rate": 0.00019444444444444443,
921
- "loss": 0.8944,
922
- "step": 1430
923
- },
924
- {
925
- "epoch": 3.7,
926
- "learning_rate": 0.00019365079365079362,
927
- "loss": 0.9001,
928
- "step": 1440
929
- },
930
- {
931
- "epoch": 3.73,
932
- "learning_rate": 0.00019285714285714286,
933
- "loss": 0.8397,
934
- "step": 1450
935
- },
936
- {
937
- "epoch": 3.76,
938
- "learning_rate": 0.00019206349206349205,
939
- "loss": 0.6953,
940
- "step": 1460
941
- },
942
- {
943
- "epoch": 3.78,
944
- "learning_rate": 0.00019126984126984124,
945
- "loss": 0.7229,
946
- "step": 1470
947
- },
948
- {
949
- "epoch": 3.81,
950
- "learning_rate": 0.00019047619047619045,
951
- "loss": 0.8833,
952
- "step": 1480
953
- },
954
- {
955
- "epoch": 3.83,
956
- "learning_rate": 0.00018968253968253967,
957
- "loss": 0.9028,
958
- "step": 1490
959
- },
960
- {
961
- "epoch": 3.86,
962
- "learning_rate": 0.00018888888888888888,
963
- "loss": 0.8412,
964
- "step": 1500
965
- },
966
- {
967
- "epoch": 3.88,
968
- "learning_rate": 0.00018809523809523807,
969
- "loss": 0.7024,
970
- "step": 1510
971
- },
972
- {
973
- "epoch": 3.91,
974
- "learning_rate": 0.0001873015873015873,
975
- "loss": 0.7262,
976
- "step": 1520
977
- },
978
- {
979
- "epoch": 3.94,
980
- "learning_rate": 0.0001865079365079365,
981
- "loss": 0.8926,
982
- "step": 1530
983
- },
984
- {
985
- "epoch": 3.96,
986
- "learning_rate": 0.00018571428571428572,
987
- "loss": 0.8703,
988
- "step": 1540
989
- },
990
- {
991
- "epoch": 3.99,
992
- "learning_rate": 0.0001849206349206349,
993
- "loss": 0.7311,
994
- "step": 1550
995
- },
996
- {
997
- "epoch": 4.01,
998
- "learning_rate": 0.0001841269841269841,
999
- "loss": 0.7535,
1000
- "step": 1560
1001
- },
1002
- {
1003
- "epoch": 4.04,
1004
- "learning_rate": 0.00018333333333333334,
1005
- "loss": 0.8779,
1006
- "step": 1570
1007
- },
1008
- {
1009
- "epoch": 4.06,
1010
- "learning_rate": 0.00018253968253968252,
1011
- "loss": 0.887,
1012
- "step": 1580
1013
- },
1014
- {
1015
- "epoch": 4.09,
1016
- "learning_rate": 0.00018174603174603174,
1017
- "loss": 0.797,
1018
- "step": 1590
1019
- },
1020
- {
1021
- "epoch": 4.12,
1022
- "learning_rate": 0.00018095238095238093,
1023
- "loss": 0.6651,
1024
- "step": 1600
1025
- },
1026
- {
1027
- "epoch": 4.12,
1028
- "eval_loss": 0.904344916343689,
1029
- "eval_runtime": 189.1226,
1030
- "eval_samples_per_second": 10.575,
1031
- "eval_steps_per_second": 1.322,
1032
- "step": 1600
1033
- },
1034
- {
1035
- "epoch": 4.14,
1036
- "learning_rate": 0.00018015873015873017,
1037
- "loss": 0.7348,
1038
- "step": 1610
1039
- },
1040
- {
1041
- "epoch": 4.17,
1042
- "learning_rate": 0.00017936507936507936,
1043
- "loss": 0.8756,
1044
- "step": 1620
1045
- },
1046
- {
1047
- "epoch": 4.19,
1048
- "learning_rate": 0.00017857142857142854,
1049
- "loss": 0.8934,
1050
- "step": 1630
1051
- },
1052
- {
1053
- "epoch": 4.22,
1054
- "learning_rate": 0.00017777777777777776,
1055
- "loss": 0.8023,
1056
- "step": 1640
1057
- },
1058
- {
1059
- "epoch": 4.24,
1060
- "learning_rate": 0.00017698412698412697,
1061
- "loss": 0.6788,
1062
- "step": 1650
1063
- },
1064
- {
1065
- "epoch": 4.27,
1066
- "learning_rate": 0.0001761904761904762,
1067
- "loss": 0.7387,
1068
- "step": 1660
1069
- },
1070
- {
1071
- "epoch": 4.3,
1072
- "learning_rate": 0.00017539682539682538,
1073
- "loss": 0.885,
1074
- "step": 1670
1075
- },
1076
- {
1077
- "epoch": 4.32,
1078
- "learning_rate": 0.00017460317460317457,
1079
- "loss": 0.8738,
1080
- "step": 1680
1081
- },
1082
- {
1083
- "epoch": 4.35,
1084
- "learning_rate": 0.0001738095238095238,
1085
- "loss": 0.8059,
1086
- "step": 1690
1087
- },
1088
- {
1089
- "epoch": 4.37,
1090
- "learning_rate": 0.000173015873015873,
1091
- "loss": 0.6667,
1092
- "step": 1700
1093
- },
1094
- {
1095
- "epoch": 4.4,
1096
- "learning_rate": 0.0001722222222222222,
1097
- "loss": 0.7406,
1098
- "step": 1710
1099
- },
1100
- {
1101
- "epoch": 4.42,
1102
- "learning_rate": 0.0001714285714285714,
1103
- "loss": 0.8764,
1104
- "step": 1720
1105
- },
1106
- {
1107
- "epoch": 4.45,
1108
- "learning_rate": 0.00017063492063492064,
1109
- "loss": 0.8839,
1110
- "step": 1730
1111
- },
1112
- {
1113
- "epoch": 4.48,
1114
- "learning_rate": 0.00016984126984126983,
1115
- "loss": 0.8009,
1116
- "step": 1740
1117
- },
1118
- {
1119
- "epoch": 4.5,
1120
- "learning_rate": 0.00016904761904761904,
1121
- "loss": 0.6658,
1122
- "step": 1750
1123
- },
1124
- {
1125
- "epoch": 4.53,
1126
- "learning_rate": 0.00016825396825396823,
1127
- "loss": 0.7423,
1128
- "step": 1760
1129
- },
1130
- {
1131
- "epoch": 4.55,
1132
- "learning_rate": 0.00016746031746031747,
1133
- "loss": 0.8748,
1134
- "step": 1770
1135
- },
1136
- {
1137
- "epoch": 4.58,
1138
- "learning_rate": 0.00016666666666666666,
1139
- "loss": 0.887,
1140
- "step": 1780
1141
- },
1142
- {
1143
- "epoch": 4.6,
1144
- "learning_rate": 0.00016587301587301585,
1145
- "loss": 0.8038,
1146
- "step": 1790
1147
- },
1148
- {
1149
- "epoch": 4.63,
1150
- "learning_rate": 0.00016507936507936506,
1151
- "loss": 0.6631,
1152
- "step": 1800
1153
- },
1154
- {
1155
- "epoch": 4.63,
1156
- "eval_loss": 0.9004252552986145,
1157
- "eval_runtime": 189.1263,
1158
- "eval_samples_per_second": 10.575,
1159
- "eval_steps_per_second": 1.322,
1160
- "step": 1800
1161
- },
1162
- {
1163
- "epoch": 4.66,
1164
- "learning_rate": 0.00016428571428571428,
1165
- "loss": 0.7327,
1166
- "step": 1810
1167
- },
1168
- {
1169
- "epoch": 4.68,
1170
- "learning_rate": 0.0001634920634920635,
1171
- "loss": 0.8703,
1172
- "step": 1820
1173
- },
1174
- {
1175
- "epoch": 4.71,
1176
- "learning_rate": 0.00016269841269841268,
1177
- "loss": 0.8734,
1178
- "step": 1830
1179
- },
1180
- {
1181
- "epoch": 4.73,
1182
- "learning_rate": 0.00016190476190476187,
1183
- "loss": 0.8066,
1184
- "step": 1840
1185
- },
1186
- {
1187
- "epoch": 4.76,
1188
- "learning_rate": 0.0001611111111111111,
1189
- "loss": 0.6655,
1190
- "step": 1850
1191
- },
1192
- {
1193
- "epoch": 4.78,
1194
- "learning_rate": 0.0001603174603174603,
1195
- "loss": 0.736,
1196
- "step": 1860
1197
- },
1198
- {
1199
- "epoch": 4.81,
1200
- "learning_rate": 0.00015952380952380951,
1201
- "loss": 0.8707,
1202
- "step": 1870
1203
- },
1204
- {
1205
- "epoch": 4.84,
1206
- "learning_rate": 0.0001587301587301587,
1207
- "loss": 0.8789,
1208
- "step": 1880
1209
- },
1210
- {
1211
- "epoch": 4.86,
1212
- "learning_rate": 0.00015793650793650795,
1213
- "loss": 0.804,
1214
- "step": 1890
1215
- },
1216
- {
1217
- "epoch": 4.89,
1218
- "learning_rate": 0.00015714285714285713,
1219
- "loss": 0.6628,
1220
- "step": 1900
1221
- },
1222
- {
1223
- "epoch": 4.91,
1224
- "learning_rate": 0.00015634920634920635,
1225
- "loss": 0.7394,
1226
- "step": 1910
1227
- },
1228
- {
1229
- "epoch": 4.94,
1230
- "learning_rate": 0.00015555555555555554,
1231
- "loss": 0.8856,
1232
- "step": 1920
1233
- },
1234
- {
1235
- "epoch": 4.96,
1236
- "learning_rate": 0.00015476190476190478,
1237
- "loss": 0.8558,
1238
- "step": 1930
1239
- },
1240
- {
1241
- "epoch": 4.99,
1242
- "learning_rate": 0.00015396825396825397,
1243
- "loss": 0.6766,
1244
- "step": 1940
1245
- },
1246
- {
1247
- "epoch": 5.02,
1248
- "learning_rate": 0.00015317460317460315,
1249
- "loss": 0.7662,
1250
- "step": 1950
1251
- },
1252
- {
1253
- "epoch": 5.04,
1254
- "learning_rate": 0.00015238095238095237,
1255
- "loss": 0.8602,
1256
- "step": 1960
1257
- },
1258
- {
1259
- "epoch": 5.07,
1260
- "learning_rate": 0.00015158730158730158,
1261
- "loss": 0.8567,
1262
- "step": 1970
1263
- },
1264
- {
1265
- "epoch": 5.09,
1266
- "learning_rate": 0.0001507936507936508,
1267
- "loss": 0.7482,
1268
- "step": 1980
1269
- },
1270
- {
1271
- "epoch": 5.12,
1272
- "learning_rate": 0.00015,
1273
- "loss": 0.6249,
1274
- "step": 1990
1275
- },
1276
- {
1277
- "epoch": 5.14,
1278
- "learning_rate": 0.00014920634920634917,
1279
- "loss": 0.7506,
1280
- "step": 2000
1281
- },
1282
- {
1283
- "epoch": 5.14,
1284
- "eval_loss": 0.9112463593482971,
1285
- "eval_runtime": 189.2232,
1286
- "eval_samples_per_second": 10.57,
1287
- "eval_steps_per_second": 1.321,
1288
- "step": 2000
1289
- },
1290
- {
1291
- "epoch": 5.17,
1292
- "learning_rate": 0.0001484126984126984,
1293
- "loss": 0.8679,
1294
- "step": 2010
1295
- },
1296
- {
1297
- "epoch": 5.2,
1298
- "learning_rate": 0.0001476190476190476,
1299
- "loss": 0.8575,
1300
- "step": 2020
1301
- },
1302
- {
1303
- "epoch": 5.22,
1304
- "learning_rate": 0.00014682539682539682,
1305
- "loss": 0.7545,
1306
- "step": 2030
1307
- },
1308
- {
1309
- "epoch": 5.25,
1310
- "learning_rate": 0.000146031746031746,
1311
- "loss": 0.6237,
1312
- "step": 2040
1313
- },
1314
- {
1315
- "epoch": 5.27,
1316
- "learning_rate": 0.00014523809523809522,
1317
- "loss": 0.7561,
1318
- "step": 2050
1319
- },
1320
- {
1321
- "epoch": 5.3,
1322
- "learning_rate": 0.0001444444444444444,
1323
- "loss": 0.8688,
1324
- "step": 2060
1325
- },
1326
- {
1327
- "epoch": 5.32,
1328
- "learning_rate": 0.00014365079365079363,
1329
- "loss": 0.8578,
1330
- "step": 2070
1331
- },
1332
- {
1333
- "epoch": 5.35,
1334
- "learning_rate": 0.00014285714285714284,
1335
- "loss": 0.7743,
1336
- "step": 2080
1337
- },
1338
- {
1339
- "epoch": 5.38,
1340
- "learning_rate": 0.00014206349206349206,
1341
- "loss": 0.6337,
1342
- "step": 2090
1343
- },
1344
- {
1345
- "epoch": 5.4,
1346
- "learning_rate": 0.00014126984126984124,
1347
- "loss": 0.7518,
1348
- "step": 2100
1349
- },
1350
- {
1351
- "epoch": 5.43,
1352
- "learning_rate": 0.00014047619047619046,
1353
- "loss": 0.8736,
1354
- "step": 2110
1355
- },
1356
- {
1357
- "epoch": 5.45,
1358
- "learning_rate": 0.00013968253968253967,
1359
- "loss": 0.8623,
1360
- "step": 2120
1361
- },
1362
- {
1363
- "epoch": 5.48,
1364
- "learning_rate": 0.0001388888888888889,
1365
- "loss": 0.7719,
1366
- "step": 2130
1367
- },
1368
- {
1369
- "epoch": 5.5,
1370
- "learning_rate": 0.00013809523809523808,
1371
- "loss": 0.6363,
1372
- "step": 2140
1373
- },
1374
- {
1375
- "epoch": 5.53,
1376
- "learning_rate": 0.0001373015873015873,
1377
- "loss": 0.7458,
1378
- "step": 2150
1379
- },
1380
- {
1381
- "epoch": 5.56,
1382
- "learning_rate": 0.00013650793650793648,
1383
- "loss": 0.865,
1384
- "step": 2160
1385
- },
1386
- {
1387
- "epoch": 5.58,
1388
- "learning_rate": 0.00013587301587301588,
1389
- "loss": 2.6458,
1390
- "step": 2170
1391
- },
1392
- {
1393
- "epoch": 5.61,
1394
- "learning_rate": 0.00013587301587301588,
1395
- "loss": 801.9857,
1396
- "step": 2180
1397
- },
1398
- {
1399
- "epoch": 5.63,
1400
- "learning_rate": 0.0001357142857142857,
1401
- "loss": 4068.018,
1402
- "step": 2190
1403
- },
1404
- {
1405
- "epoch": 5.66,
1406
- "learning_rate": 0.00013547619047619047,
1407
- "loss": 41837456.0,
1408
- "step": 2200
1409
- },
1410
- {
1411
- "epoch": 5.66,
1412
- "eval_loss": NaN,
1413
- "eval_runtime": 189.1905,
1414
- "eval_samples_per_second": 10.571,
1415
- "eval_steps_per_second": 1.321,
1416
- "step": 2200
1417
- },
1418
- {
1419
- "epoch": 5.68,
1420
- "learning_rate": 0.00013547619047619047,
1421
- "loss": 1.4509301308571507e+26,
1422
- "step": 2210
1423
- },
1424
- {
1425
- "epoch": 5.71,
1426
- "learning_rate": 0.00013547619047619047,
1427
- "loss": 3.3847730092507856e+24,
1428
- "step": 2220
1429
- },
1430
- {
1431
- "epoch": 5.74,
1432
- "learning_rate": 0.00013547619047619047,
1433
- "loss": 1.5211943209070177e+23,
1434
- "step": 2230
1435
- },
1436
- {
1437
- "epoch": 5.76,
1438
- "learning_rate": 0.00013547619047619047,
1439
- "loss": 6.678915709304036e+21,
1440
- "step": 2240
1441
- },
1442
- {
1443
- "epoch": 5.79,
1444
- "learning_rate": 0.00013531746031746032,
1445
- "loss": 9.238063623264189e+19,
1446
- "step": 2250
1447
- },
1448
- {
1449
- "epoch": 5.81,
1450
- "learning_rate": 0.00013531746031746032,
1451
- "loss": 1.0532014646514701e+20,
1452
- "step": 2260
1453
- },
1454
- {
1455
- "epoch": 5.84,
1456
- "learning_rate": 0.00013531746031746032,
1457
- "loss": 3.161435096687031e+25,
1458
- "step": 2270
1459
- },
1460
- {
1461
- "epoch": 5.86,
1462
- "learning_rate": 0.00013531746031746032,
1463
- "loss": 2.0162940987179532e+19,
1464
- "step": 2280
1465
- },
1466
- {
1467
- "epoch": 5.89,
1468
- "learning_rate": 0.00013531746031746032,
1469
- "loss": 2.55694599151234e+20,
1470
- "step": 2290
1471
- },
1472
- {
1473
- "epoch": 5.92,
1474
- "learning_rate": 0.00013523809523809522,
1475
- "loss": 5.808441058207432e+20,
1476
- "step": 2300
1477
- },
1478
- {
1479
- "epoch": 5.94,
1480
- "learning_rate": 0.00013523809523809522,
1481
- "loss": 1.9880088514154103e+22,
1482
- "step": 2310
1483
- },
1484
- {
1485
- "epoch": 5.97,
1486
- "learning_rate": 0.00013523809523809522,
1487
- "loss": 2.0954874078435546e+24,
1488
- "step": 2320
1489
- },
1490
- {
1491
- "epoch": 5.99,
1492
- "learning_rate": 0.00013523809523809522,
1493
- "loss": 6.309141694629275e+20,
1494
- "step": 2330
1495
- },
1496
- {
1497
- "epoch": 6.02,
1498
- "learning_rate": 0.00013515873015873016,
1499
- "loss": 7.353349497283535e+23,
1500
- "step": 2340
1501
- },
1502
- {
1503
- "epoch": 6.05,
1504
- "learning_rate": 0.00013468253968253966,
1505
- "loss": 0.0,
1506
- "step": 2350
1507
- },
1508
- {
1509
- "epoch": 6.07,
1510
- "learning_rate": 0.00013388888888888888,
1511
- "loss": 0.0,
1512
- "step": 2360
1513
- },
1514
- {
1515
- "epoch": 6.1,
1516
- "learning_rate": 0.00013309523809523806,
1517
- "loss": 0.0,
1518
- "step": 2370
1519
- },
1520
- {
1521
- "epoch": 6.12,
1522
- "learning_rate": 0.00013230158730158728,
1523
- "loss": 0.0,
1524
- "step": 2380
1525
- },
1526
- {
1527
- "epoch": 6.15,
1528
- "learning_rate": 0.0001315079365079365,
1529
- "loss": 0.0,
1530
- "step": 2390
1531
- },
1532
- {
1533
- "epoch": 6.17,
1534
- "learning_rate": 0.0001307142857142857,
1535
- "loss": 0.0,
1536
- "step": 2400
1537
- },
1538
- {
1539
- "epoch": 6.17,
1540
- "eval_loss": NaN,
1541
- "eval_runtime": 164.902,
1542
- "eval_samples_per_second": 12.128,
1543
- "eval_steps_per_second": 1.516,
1544
- "step": 2400
1545
- },
1546
- {
1547
- "epoch": 6.2,
1548
- "learning_rate": 0.0001299206349206349,
1549
- "loss": 0.0,
1550
- "step": 2410
1551
- },
1552
- {
1553
- "epoch": 6.23,
1554
- "learning_rate": 0.0001291269841269841,
1555
- "loss": 0.0,
1556
- "step": 2420
1557
- },
1558
- {
1559
- "epoch": 6.25,
1560
- "learning_rate": 0.00012833333333333333,
1561
- "loss": 0.0,
1562
- "step": 2430
1563
- },
1564
- {
1565
- "epoch": 6.28,
1566
- "learning_rate": 0.00012753968253968254,
1567
- "loss": 0.0,
1568
- "step": 2440
1569
- },
1570
- {
1571
- "epoch": 6.3,
1572
- "learning_rate": 0.00012674603174603173,
1573
- "loss": 0.0,
1574
- "step": 2450
1575
- },
1576
- {
1577
- "epoch": 6.33,
1578
- "learning_rate": 0.00012595238095238094,
1579
- "loss": 0.0,
1580
- "step": 2460
1581
- },
1582
- {
1583
- "epoch": 6.35,
1584
- "learning_rate": 0.00012515873015873013,
1585
- "loss": 0.0,
1586
- "step": 2470
1587
- },
1588
- {
1589
- "epoch": 6.38,
1590
- "learning_rate": 0.00012436507936507935,
1591
- "loss": 0.0,
1592
- "step": 2480
1593
- },
1594
- {
1595
- "epoch": 6.41,
1596
- "learning_rate": 0.00012357142857142856,
1597
- "loss": 0.0,
1598
- "step": 2490
1599
- },
1600
- {
1601
- "epoch": 6.43,
1602
- "learning_rate": 0.00012277777777777778,
1603
- "loss": 0.0,
1604
- "step": 2500
1605
- },
1606
- {
1607
- "epoch": 6.46,
1608
- "learning_rate": 0.00012198412698412697,
1609
- "loss": 0.0,
1610
- "step": 2510
1611
- },
1612
- {
1613
- "epoch": 6.48,
1614
- "learning_rate": 0.00012119047619047618,
1615
- "loss": 0.0,
1616
- "step": 2520
1617
- },
1618
- {
1619
- "epoch": 6.51,
1620
- "learning_rate": 0.00012039682539682538,
1621
- "loss": 0.0,
1622
- "step": 2530
1623
- },
1624
- {
1625
- "epoch": 6.53,
1626
- "learning_rate": 0.0001196031746031746,
1627
- "loss": 0.0,
1628
- "step": 2540
1629
- },
1630
- {
1631
- "epoch": 6.56,
1632
- "learning_rate": 0.0001188095238095238,
1633
- "loss": 0.0,
1634
- "step": 2550
1635
- },
1636
- {
1637
- "epoch": 6.59,
1638
- "learning_rate": 0.00011801587301587301,
1639
- "loss": 0.0,
1640
- "step": 2560
1641
- },
1642
- {
1643
- "epoch": 6.61,
1644
- "learning_rate": 0.0001172222222222222,
1645
- "loss": 0.0,
1646
- "step": 2570
1647
- },
1648
- {
1649
- "epoch": 6.64,
1650
- "learning_rate": 0.00011642857142857142,
1651
- "loss": 0.0,
1652
- "step": 2580
1653
- },
1654
- {
1655
- "epoch": 6.66,
1656
- "learning_rate": 0.00011563492063492062,
1657
- "loss": 0.0,
1658
- "step": 2590
1659
- },
1660
- {
1661
- "epoch": 6.69,
1662
- "learning_rate": 0.00011484126984126983,
1663
- "loss": 0.0,
1664
- "step": 2600
1665
- },
1666
- {
1667
- "epoch": 6.69,
1668
- "eval_loss": NaN,
1669
- "eval_runtime": 164.9294,
1670
- "eval_samples_per_second": 12.126,
1671
- "eval_steps_per_second": 1.516,
1672
- "step": 2600
1673
- },
1674
- {
1675
- "epoch": 6.71,
1676
- "learning_rate": 0.00011404761904761903,
1677
- "loss": 0.0,
1678
- "step": 2610
1679
- },
1680
- {
1681
- "epoch": 6.74,
1682
- "learning_rate": 0.00011325396825396825,
1683
- "loss": 0.0,
1684
- "step": 2620
1685
- },
1686
- {
1687
- "epoch": 6.77,
1688
- "learning_rate": 0.00011246031746031745,
1689
- "loss": 0.0,
1690
- "step": 2630
1691
- },
1692
- {
1693
- "epoch": 6.79,
1694
- "learning_rate": 0.00011166666666666667,
1695
- "loss": 0.0,
1696
- "step": 2640
1697
- },
1698
- {
1699
- "epoch": 6.82,
1700
- "learning_rate": 0.00011087301587301585,
1701
- "loss": 0.0,
1702
- "step": 2650
1703
- },
1704
- {
1705
- "epoch": 6.84,
1706
- "learning_rate": 0.00011007936507936507,
1707
- "loss": 0.0,
1708
- "step": 2660
1709
- },
1710
- {
1711
- "epoch": 6.87,
1712
- "learning_rate": 0.00010928571428571427,
1713
- "loss": 0.0,
1714
- "step": 2670
1715
- },
1716
- {
1717
- "epoch": 6.89,
1718
- "learning_rate": 0.00010849206349206349,
1719
- "loss": 0.0,
1720
- "step": 2680
1721
- },
1722
- {
1723
- "epoch": 6.92,
1724
- "learning_rate": 0.00010769841269841269,
1725
- "loss": 0.0,
1726
- "step": 2690
1727
- },
1728
- {
1729
- "epoch": 6.95,
1730
- "learning_rate": 0.0001069047619047619,
1731
- "loss": 0.0,
1732
- "step": 2700
1733
- },
1734
- {
1735
- "epoch": 6.97,
1736
- "learning_rate": 0.0001061111111111111,
1737
- "loss": 0.0,
1738
- "step": 2710
1739
- },
1740
- {
1741
- "epoch": 7.0,
1742
- "learning_rate": 0.0001053174603174603,
1743
- "loss": 0.0,
1744
- "step": 2720
1745
- },
1746
- {
1747
- "epoch": 7.02,
1748
- "learning_rate": 0.0001045238095238095,
1749
- "loss": 0.0,
1750
- "step": 2730
1751
- },
1752
- {
1753
- "epoch": 7.05,
1754
- "learning_rate": 0.00010373015873015872,
1755
- "loss": 0.0,
1756
- "step": 2740
1757
- },
1758
- {
1759
- "epoch": 7.07,
1760
- "learning_rate": 0.00010293650793650792,
1761
- "loss": 0.0,
1762
- "step": 2750
1763
- },
1764
- {
1765
- "epoch": 7.1,
1766
- "learning_rate": 0.00010214285714285714,
1767
- "loss": 0.0,
1768
- "step": 2760
1769
- },
1770
- {
1771
- "epoch": 7.13,
1772
- "learning_rate": 0.00010134920634920634,
1773
- "loss": 0.0,
1774
- "step": 2770
1775
- },
1776
- {
1777
- "epoch": 7.15,
1778
- "learning_rate": 0.00010055555555555555,
1779
- "loss": 0.0,
1780
- "step": 2780
1781
- },
1782
- {
1783
- "epoch": 7.18,
1784
- "learning_rate": 9.976190476190474e-05,
1785
- "loss": 0.0,
1786
- "step": 2790
1787
- },
1788
- {
1789
- "epoch": 7.2,
1790
- "learning_rate": 9.896825396825396e-05,
1791
- "loss": 0.0,
1792
- "step": 2800
1793
- },
1794
- {
1795
- "epoch": 7.2,
1796
- "eval_loss": NaN,
1797
- "eval_runtime": 164.8859,
1798
- "eval_samples_per_second": 12.13,
1799
- "eval_steps_per_second": 1.516,
1800
- "step": 2800
1801
- },
1802
- {
1803
- "epoch": 7.23,
1804
- "learning_rate": 9.817460317460316e-05,
1805
- "loss": 0.0,
1806
- "step": 2810
1807
- },
1808
- {
1809
- "epoch": 7.25,
1810
- "learning_rate": 9.738095238095237e-05,
1811
- "loss": 0.0,
1812
- "step": 2820
1813
- },
1814
- {
1815
- "epoch": 7.28,
1816
- "learning_rate": 9.658730158730158e-05,
1817
- "loss": 0.0,
1818
- "step": 2830
1819
- },
1820
- {
1821
- "epoch": 7.31,
1822
- "learning_rate": 9.579365079365079e-05,
1823
- "loss": 0.0,
1824
- "step": 2840
1825
- },
1826
- {
1827
- "epoch": 7.33,
1828
- "learning_rate": 9.499999999999999e-05,
1829
- "loss": 0.0,
1830
- "step": 2850
1831
- },
1832
- {
1833
- "epoch": 7.36,
1834
- "learning_rate": 9.42063492063492e-05,
1835
- "loss": 0.0,
1836
- "step": 2860
1837
- },
1838
- {
1839
- "epoch": 7.38,
1840
- "learning_rate": 9.34126984126984e-05,
1841
- "loss": 0.0,
1842
- "step": 2870
1843
- },
1844
- {
1845
- "epoch": 7.41,
1846
- "learning_rate": 9.261904761904761e-05,
1847
- "loss": 0.0,
1848
- "step": 2880
1849
- },
1850
- {
1851
- "epoch": 7.43,
1852
- "learning_rate": 9.182539682539681e-05,
1853
- "loss": 0.0,
1854
- "step": 2890
1855
- },
1856
- {
1857
- "epoch": 7.46,
1858
- "learning_rate": 9.103174603174603e-05,
1859
- "loss": 0.0,
1860
- "step": 2900
1861
- },
1862
- {
1863
- "epoch": 7.49,
1864
- "learning_rate": 9.023809523809523e-05,
1865
- "loss": 0.0,
1866
- "step": 2910
1867
- },
1868
- {
1869
- "epoch": 7.51,
1870
- "learning_rate": 8.944444444444444e-05,
1871
- "loss": 0.0,
1872
- "step": 2920
1873
- },
1874
- {
1875
- "epoch": 7.54,
1876
- "learning_rate": 8.865079365079364e-05,
1877
- "loss": 0.0,
1878
- "step": 2930
1879
- },
1880
- {
1881
- "epoch": 7.56,
1882
- "learning_rate": 8.785714285714286e-05,
1883
- "loss": 0.0,
1884
- "step": 2940
1885
- },
1886
- {
1887
- "epoch": 7.59,
1888
- "learning_rate": 8.706349206349205e-05,
1889
- "loss": 0.0,
1890
- "step": 2950
1891
- },
1892
- {
1893
- "epoch": 7.61,
1894
- "learning_rate": 8.626984126984126e-05,
1895
- "loss": 0.0,
1896
- "step": 2960
1897
- },
1898
- {
1899
- "epoch": 7.64,
1900
- "learning_rate": 8.547619047619046e-05,
1901
- "loss": 0.0,
1902
- "step": 2970
1903
- },
1904
- {
1905
- "epoch": 7.67,
1906
- "learning_rate": 8.468253968253968e-05,
1907
- "loss": 0.0,
1908
- "step": 2980
1909
- },
1910
- {
1911
- "epoch": 7.69,
1912
- "learning_rate": 8.388888888888888e-05,
1913
- "loss": 0.0,
1914
- "step": 2990
1915
- },
1916
- {
1917
- "epoch": 7.72,
1918
- "learning_rate": 8.30952380952381e-05,
1919
- "loss": 0.0,
1920
- "step": 3000
1921
- },
1922
- {
1923
- "epoch": 7.72,
1924
- "eval_loss": NaN,
1925
- "eval_runtime": 164.8915,
1926
- "eval_samples_per_second": 12.129,
1927
- "eval_steps_per_second": 1.516,
1928
- "step": 3000
1929
- },
1930
- {
1931
- "epoch": 7.74,
1932
- "learning_rate": 8.23015873015873e-05,
1933
- "loss": 0.0,
1934
- "step": 3010
1935
- },
1936
- {
1937
- "epoch": 7.77,
1938
- "learning_rate": 8.150793650793651e-05,
1939
- "loss": 0.0,
1940
- "step": 3020
1941
- },
1942
- {
1943
- "epoch": 7.79,
1944
- "learning_rate": 8.07142857142857e-05,
1945
- "loss": 0.0,
1946
- "step": 3030
1947
- },
1948
- {
1949
- "epoch": 7.82,
1950
- "learning_rate": 7.992063492063491e-05,
1951
- "loss": 0.0,
1952
- "step": 3040
1953
- },
1954
- {
1955
- "epoch": 7.85,
1956
- "learning_rate": 7.912698412698412e-05,
1957
- "loss": 0.0,
1958
- "step": 3050
1959
- },
1960
- {
1961
- "epoch": 7.87,
1962
- "learning_rate": 7.833333333333333e-05,
1963
- "loss": 0.0,
1964
- "step": 3060
1965
- },
1966
- {
1967
- "epoch": 7.9,
1968
- "learning_rate": 7.753968253968253e-05,
1969
- "loss": 0.0,
1970
- "step": 3070
1971
- },
1972
- {
1973
- "epoch": 7.92,
1974
- "learning_rate": 7.674603174603175e-05,
1975
- "loss": 0.0,
1976
- "step": 3080
1977
- },
1978
- {
1979
- "epoch": 7.95,
1980
- "learning_rate": 7.595238095238095e-05,
1981
- "loss": 0.0,
1982
- "step": 3090
1983
- },
1984
- {
1985
- "epoch": 7.97,
1986
- "learning_rate": 7.515873015873015e-05,
1987
- "loss": 0.0,
1988
- "step": 3100
1989
- },
1990
- {
1991
- "epoch": 8.0,
1992
- "learning_rate": 7.436507936507935e-05,
1993
- "loss": 0.0,
1994
- "step": 3110
1995
- },
1996
- {
1997
- "epoch": 8.03,
1998
- "learning_rate": 7.357142857142857e-05,
1999
- "loss": 0.0,
2000
- "step": 3120
2001
- },
2002
- {
2003
- "epoch": 8.05,
2004
- "learning_rate": 7.277777777777777e-05,
2005
- "loss": 0.0,
2006
- "step": 3130
2007
- },
2008
- {
2009
- "epoch": 8.08,
2010
- "learning_rate": 7.198412698412697e-05,
2011
- "loss": 0.0,
2012
- "step": 3140
2013
- },
2014
- {
2015
- "epoch": 8.1,
2016
- "learning_rate": 7.119047619047618e-05,
2017
- "loss": 0.0,
2018
- "step": 3150
2019
- },
2020
- {
2021
- "epoch": 8.13,
2022
- "learning_rate": 7.039682539682539e-05,
2023
- "loss": 0.0,
2024
- "step": 3160
2025
- },
2026
- {
2027
- "epoch": 8.15,
2028
- "learning_rate": 6.960317460317459e-05,
2029
- "loss": 0.0,
2030
- "step": 3170
2031
- },
2032
- {
2033
- "epoch": 8.18,
2034
- "learning_rate": 6.88095238095238e-05,
2035
- "loss": 0.0,
2036
- "step": 3180
2037
- },
2038
- {
2039
- "epoch": 8.21,
2040
- "learning_rate": 6.8015873015873e-05,
2041
- "loss": 0.0,
2042
- "step": 3190
2043
- },
2044
- {
2045
- "epoch": 8.23,
2046
- "learning_rate": 6.722222222222222e-05,
2047
- "loss": 0.0,
2048
- "step": 3200
2049
- },
2050
- {
2051
- "epoch": 8.23,
2052
- "eval_loss": NaN,
2053
- "eval_runtime": 164.9214,
2054
- "eval_samples_per_second": 12.127,
2055
- "eval_steps_per_second": 1.516,
2056
- "step": 3200
2057
- },
2058
- {
2059
- "epoch": 8.26,
2060
- "learning_rate": 6.642857142857142e-05,
2061
- "loss": 0.0,
2062
- "step": 3210
2063
- },
2064
- {
2065
- "epoch": 8.28,
2066
- "learning_rate": 6.563492063492062e-05,
2067
- "loss": 0.0,
2068
- "step": 3220
2069
- },
2070
- {
2071
- "epoch": 8.31,
2072
- "learning_rate": 6.484126984126984e-05,
2073
- "loss": 0.0,
2074
- "step": 3230
2075
- },
2076
- {
2077
- "epoch": 8.33,
2078
- "learning_rate": 6.404761904761904e-05,
2079
- "loss": 0.0,
2080
- "step": 3240
2081
- },
2082
- {
2083
- "epoch": 8.36,
2084
- "learning_rate": 6.325396825396824e-05,
2085
- "loss": 0.0,
2086
- "step": 3250
2087
- },
2088
- {
2089
- "epoch": 8.39,
2090
- "learning_rate": 6.246031746031746e-05,
2091
- "loss": 0.0,
2092
- "step": 3260
2093
- },
2094
- {
2095
- "epoch": 8.41,
2096
- "learning_rate": 6.166666666666666e-05,
2097
- "loss": 0.0,
2098
- "step": 3270
2099
- },
2100
- {
2101
- "epoch": 8.44,
2102
- "learning_rate": 6.0873015873015865e-05,
2103
- "loss": 0.0,
2104
- "step": 3280
2105
- },
2106
- {
2107
- "epoch": 8.46,
2108
- "learning_rate": 6.007936507936507e-05,
2109
- "loss": 0.0,
2110
- "step": 3290
2111
- },
2112
- {
2113
- "epoch": 8.49,
2114
- "learning_rate": 5.9285714285714275e-05,
2115
- "loss": 0.0,
2116
- "step": 3300
2117
- },
2118
- {
2119
- "epoch": 8.51,
2120
- "learning_rate": 5.849206349206348e-05,
2121
- "loss": 0.0,
2122
- "step": 3310
2123
- },
2124
- {
2125
- "epoch": 8.54,
2126
- "learning_rate": 5.769841269841269e-05,
2127
- "loss": 0.0,
2128
- "step": 3320
2129
- },
2130
- {
2131
- "epoch": 8.57,
2132
- "learning_rate": 5.69047619047619e-05,
2133
- "loss": 0.0,
2134
- "step": 3330
2135
- },
2136
- {
2137
- "epoch": 8.59,
2138
- "learning_rate": 5.61111111111111e-05,
2139
- "loss": 0.0,
2140
- "step": 3340
2141
- },
2142
- {
2143
- "epoch": 8.62,
2144
- "learning_rate": 5.531746031746031e-05,
2145
- "loss": 0.0,
2146
- "step": 3350
2147
- },
2148
- {
2149
- "epoch": 8.64,
2150
- "learning_rate": 5.452380952380952e-05,
2151
- "loss": 0.0,
2152
- "step": 3360
2153
- },
2154
- {
2155
- "epoch": 8.67,
2156
- "learning_rate": 5.3730158730158726e-05,
2157
- "loss": 0.0,
2158
- "step": 3370
2159
- },
2160
- {
2161
- "epoch": 8.69,
2162
- "learning_rate": 5.293650793650793e-05,
2163
- "loss": 0.0,
2164
- "step": 3380
2165
- },
2166
- {
2167
- "epoch": 8.72,
2168
- "learning_rate": 5.2142857142857135e-05,
2169
- "loss": 0.0,
2170
- "step": 3390
2171
- },
2172
- {
2173
- "epoch": 8.75,
2174
- "learning_rate": 5.1349206349206344e-05,
2175
- "loss": 0.0,
2176
- "step": 3400
2177
- },
2178
- {
2179
- "epoch": 8.75,
2180
- "eval_loss": NaN,
2181
- "eval_runtime": 164.9331,
2182
- "eval_samples_per_second": 12.126,
2183
- "eval_steps_per_second": 1.516,
2184
- "step": 3400
2185
- },
2186
- {
2187
- "epoch": 8.77,
2188
- "learning_rate": 5.055555555555555e-05,
2189
- "loss": 0.0,
2190
- "step": 3410
2191
- },
2192
- {
2193
- "epoch": 8.8,
2194
- "learning_rate": 4.976190476190475e-05,
2195
- "loss": 0.0,
2196
- "step": 3420
2197
- },
2198
- {
2199
- "epoch": 8.82,
2200
- "learning_rate": 4.896825396825396e-05,
2201
- "loss": 0.0,
2202
- "step": 3430
2203
- },
2204
- {
2205
- "epoch": 8.85,
2206
- "learning_rate": 4.817460317460317e-05,
2207
- "loss": 0.0,
2208
- "step": 3440
2209
- },
2210
- {
2211
- "epoch": 8.87,
2212
- "learning_rate": 4.738095238095238e-05,
2213
- "loss": 0.0,
2214
- "step": 3450
2215
- },
2216
- {
2217
- "epoch": 8.9,
2218
- "learning_rate": 4.658730158730158e-05,
2219
- "loss": 0.0,
2220
- "step": 3460
2221
- },
2222
- {
2223
- "epoch": 8.93,
2224
- "learning_rate": 4.579365079365079e-05,
2225
- "loss": 0.0,
2226
- "step": 3470
2227
- },
2228
- {
2229
- "epoch": 8.95,
2230
- "learning_rate": 4.4999999999999996e-05,
2231
- "loss": 0.0,
2232
- "step": 3480
2233
- },
2234
- {
2235
- "epoch": 8.98,
2236
- "learning_rate": 4.42063492063492e-05,
2237
- "loss": 0.0,
2238
- "step": 3490
2239
- },
2240
- {
2241
- "epoch": 9.0,
2242
- "learning_rate": 4.3412698412698406e-05,
2243
- "loss": 0.0,
2244
- "step": 3500
2245
- },
2246
- {
2247
- "epoch": 9.03,
2248
- "learning_rate": 4.2619047619047614e-05,
2249
- "loss": 0.0,
2250
- "step": 3510
2251
- },
2252
- {
2253
- "epoch": 9.05,
2254
- "learning_rate": 4.182539682539682e-05,
2255
- "loss": 0.0,
2256
- "step": 3520
2257
- },
2258
- {
2259
- "epoch": 9.08,
2260
- "learning_rate": 4.1031746031746024e-05,
2261
- "loss": 0.0,
2262
- "step": 3530
2263
- },
2264
- {
2265
- "epoch": 9.11,
2266
- "learning_rate": 4.023809523809523e-05,
2267
- "loss": 0.0,
2268
- "step": 3540
2269
- },
2270
- {
2271
- "epoch": 9.13,
2272
- "learning_rate": 3.944444444444444e-05,
2273
- "loss": 0.0,
2274
- "step": 3550
2275
- },
2276
- {
2277
- "epoch": 9.16,
2278
- "learning_rate": 3.865079365079365e-05,
2279
- "loss": 0.0,
2280
- "step": 3560
2281
- },
2282
- {
2283
- "epoch": 9.18,
2284
- "learning_rate": 3.785714285714285e-05,
2285
- "loss": 0.0,
2286
- "step": 3570
2287
- },
2288
- {
2289
- "epoch": 9.21,
2290
- "learning_rate": 3.706349206349206e-05,
2291
- "loss": 0.0,
2292
- "step": 3580
2293
- },
2294
- {
2295
- "epoch": 9.23,
2296
- "learning_rate": 3.6269841269841266e-05,
2297
- "loss": 0.0,
2298
- "step": 3590
2299
- },
2300
- {
2301
- "epoch": 9.26,
2302
- "learning_rate": 3.5476190476190475e-05,
2303
- "loss": 0.0,
2304
- "step": 3600
2305
- },
2306
- {
2307
- "epoch": 9.26,
2308
- "eval_loss": NaN,
2309
- "eval_runtime": 164.9466,
2310
- "eval_samples_per_second": 12.125,
2311
- "eval_steps_per_second": 1.516,
2312
- "step": 3600
2313
- },
2314
- {
2315
- "epoch": 9.29,
2316
- "learning_rate": 3.4682539682539676e-05,
2317
- "loss": 0.0,
2318
- "step": 3610
2319
- },
2320
- {
2321
- "epoch": 9.31,
2322
- "learning_rate": 3.3888888888888884e-05,
2323
- "loss": 0.0,
2324
- "step": 3620
2325
- },
2326
- {
2327
- "epoch": 9.34,
2328
- "learning_rate": 3.309523809523809e-05,
2329
- "loss": 0.0,
2330
- "step": 3630
2331
- },
2332
- {
2333
- "epoch": 9.36,
2334
- "learning_rate": 3.23015873015873e-05,
2335
- "loss": 0.0,
2336
- "step": 3640
2337
- },
2338
- {
2339
- "epoch": 9.39,
2340
- "learning_rate": 3.15079365079365e-05,
2341
- "loss": 0.0,
2342
- "step": 3650
2343
- },
2344
- {
2345
- "epoch": 9.41,
2346
- "learning_rate": 3.071428571428571e-05,
2347
- "loss": 0.0,
2348
- "step": 3660
2349
- },
2350
- {
2351
- "epoch": 9.44,
2352
- "learning_rate": 2.992063492063492e-05,
2353
- "loss": 0.0,
2354
- "step": 3670
2355
- },
2356
- {
2357
- "epoch": 9.47,
2358
- "learning_rate": 2.9126984126984124e-05,
2359
- "loss": 0.0,
2360
- "step": 3680
2361
- },
2362
- {
2363
- "epoch": 9.49,
2364
- "learning_rate": 2.833333333333333e-05,
2365
- "loss": 0.0,
2366
- "step": 3690
2367
- },
2368
- {
2369
- "epoch": 9.52,
2370
- "learning_rate": 2.7539682539682537e-05,
2371
- "loss": 0.0,
2372
- "step": 3700
2373
- },
2374
- {
2375
- "epoch": 9.54,
2376
- "learning_rate": 2.6746031746031742e-05,
2377
- "loss": 0.0,
2378
- "step": 3710
2379
- },
2380
- {
2381
- "epoch": 9.57,
2382
- "learning_rate": 2.595238095238095e-05,
2383
- "loss": 0.0,
2384
- "step": 3720
2385
- },
2386
- {
2387
- "epoch": 9.59,
2388
- "learning_rate": 2.5158730158730155e-05,
2389
- "loss": 0.0,
2390
- "step": 3730
2391
- },
2392
- {
2393
- "epoch": 9.62,
2394
- "learning_rate": 2.4365079365079363e-05,
2395
- "loss": 0.0,
2396
- "step": 3740
2397
- },
2398
- {
2399
- "epoch": 9.65,
2400
- "learning_rate": 2.3571428571428568e-05,
2401
- "loss": 0.0,
2402
- "step": 3750
2403
- },
2404
- {
2405
- "epoch": 9.67,
2406
- "learning_rate": 2.2777777777777776e-05,
2407
- "loss": 0.0,
2408
- "step": 3760
2409
- },
2410
- {
2411
- "epoch": 9.7,
2412
- "learning_rate": 2.198412698412698e-05,
2413
- "loss": 0.0,
2414
- "step": 3770
2415
- },
2416
- {
2417
- "epoch": 9.72,
2418
- "learning_rate": 2.119047619047619e-05,
2419
- "loss": 0.0,
2420
- "step": 3780
2421
- },
2422
- {
2423
- "epoch": 9.75,
2424
- "learning_rate": 2.0396825396825394e-05,
2425
- "loss": 0.0,
2426
- "step": 3790
2427
- },
2428
- {
2429
- "epoch": 9.77,
2430
- "learning_rate": 1.9603174603174602e-05,
2431
- "loss": 0.0,
2432
- "step": 3800
2433
- },
2434
- {
2435
- "epoch": 9.77,
2436
- "eval_loss": NaN,
2437
- "eval_runtime": 164.8991,
2438
- "eval_samples_per_second": 12.129,
2439
- "eval_steps_per_second": 1.516,
2440
- "step": 3800
2441
- }
2442
- ],
2443
- "max_steps": 3880,
2444
- "num_train_epochs": 10,
2445
- "total_flos": 4.3886595913710305e+18,
2446
- "trial_name": null,
2447
- "trial_params": null
2448
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoint-3800/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e9adb78996a536c4aa514741768e2b05cafc3e20ac4a0a0fe98e38b91109396
3
- size 3899
 
 
 
 
config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "decapoda-research/llama-7b-hf",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 1,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 4096,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 11008,
12
+ "max_position_embeddings": 2048,
13
+ "max_sequence_length": 2048,
14
+ "model_type": "llama",
15
+ "num_attention_heads": 32,
16
+ "num_hidden_layers": 32,
17
+ "pad_token_id": -1,
18
+ "rms_norm_eps": 1e-06,
19
+ "tie_word_embeddings": false,
20
+ "torch_dtype": "float16",
21
+ "transformers_version": "4.29.2",
22
+ "use_cache": true,
23
+ "vocab_size": 32000
24
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.29.2"
7
+ }
checkpoint-1400/rng_state.pth → pytorch_model-00001-of-00039.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b813319309bb78de43fcb3df443c8fa6445901aa3ecbd7af077bb6cad5abbd16
3
- size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:975b4fcce1cc0f5d39f984b7ebcac7a505bb56623bc6eff75df7f381f0007f3e
3
+ size 396364479
adapter_model.bin → pytorch_model-00002-of-00039.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e5e1621f48d9ad8feb1d6d31050275f0aafd080c5c07153301fe2f48411f4406
3
- size 443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27283e5ff7771322644820763a40e77ee0010486bf6d2ff868b91b593368e54d
3
+ size 371215393
checkpoint-1400/optimizer.pt → pytorch_model-00003-of-00039.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:08f7a1d9c9173c3436019668ea26863da5987bfaefcf8c31239fa1070132548c
3
- size 134433093
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5d680f0fd9b39af959244383207858b3267e2dcc52f1175ae2b4ee02ba89553
3
+ size 371215986
checkpoint-1400/pytorch_model.bin → pytorch_model-00004-of-00039.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a08267c1710aaea04b40844ab403cd930673f05fd06be55a0be7f6cec062b8d
3
- size 67201357
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b473df902ef42eebd86da908b28c363106f9c6d941135a6f1e2f218cc60bc683
3
+ size 371215986
pytorch_model-00005-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a2a87142731bdcb6ad27a6a9c38414e3f05789b9d00fabdca75f9c917332171
3
+ size 371215986
pytorch_model-00006-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:038201b714a5eeddbffc0f7d6401fa7d5105ff0c9480eaeeda9161c7bd3d7a2c
3
+ size 314575888
pytorch_model-00007-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e89e8728cf12d0cdf2b802153930d91d2ef1839327b7de2b1a16494463e275f9
3
+ size 314592882
pytorch_model-00008-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:934849ae9e6ef4da628697d02ba984b8ba57c0b3de2a6ebccd4801f26547b74a
3
+ size 314592882
pytorch_model-00009-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81e3b2e0d5f945e056c2e7e36f537c5a6630dd3c10525d931e913a04e86cf700
3
+ size 371215393
pytorch_model-00010-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc8bcc28c6492ebac9031ea9581290ac6eb08c485b55e5aaea4f0c3ae2d89d6a
3
+ size 371215986
pytorch_model-00011-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c5c844df0791f6483bdbb1c61579026529145ca7581eb58dbe33d468cbbad20
3
+ size 371215986
pytorch_model-00012-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c503b904ea8996ac79dcb7a8c4a7b5ecee5aa75192280f4ad5b30df9162030a
3
+ size 371215986
pytorch_model-00013-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd65605bdc1fa36223bd6df6e74077380b95f4183db2d139bd2b765041e25503
3
+ size 314575888
pytorch_model-00014-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ff0bb12cf5798c6fb85b7462cd4a83d1a395c354875fe739f628a693fc3eb04
3
+ size 314592882
pytorch_model-00015-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bac5ee8a9a61324a31abfbee8c59daaa4c6292581c4e3f28db43686a43f3388d
3
+ size 314592882
pytorch_model-00016-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12e14c1ed24eab48141dc66e06c2e4cf4c1fa8b84d571f25f13f1994ea3c4932
3
+ size 371215393
pytorch_model-00017-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2c122693cd9e13cdc97eefe31237f04b1aa689bb934c4411465b10400d26c0a
3
+ size 371215986
pytorch_model-00018-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be7157b8619d9ee86c047bb142e6a3a7a7e4c0401c733e7446a1b5c10a5b4d3c
3
+ size 371215986
pytorch_model-00019-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c736d13013d2fd3f627c4068c884950e6fa78fadf08b60afd6162288297a8dd9
3
+ size 371215986
pytorch_model-00020-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e04f1429986b18496845282f36b032afb172f5172aba0c3fa825a714024f1e34
3
+ size 314575888
pytorch_model-00021-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d780c33c5bed6d8fac81d50538c06a3348dd34e7ad12c3f8cc61993a1866dbd1
3
+ size 314592882
pytorch_model-00022-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:822a8b3a34f79d5ca9ad4fa070b700ebe1b687d275b280d19e4b9c1705c586a7
3
+ size 314592882
pytorch_model-00023-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f44dade49e92d3706ac1207d25b36f19b1fcf150f46deb07ee22d74184050bfb
3
+ size 371215393
pytorch_model-00024-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b6d9eff68d0909e909eed722bbe2e263df705ff486c446472c737a2c0fc31a2
3
+ size 371215986
pytorch_model-00025-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc292c68330f900426448a4b661ae593347db7cdc664cc354fa965ff1c608e66
3
+ size 371215986
pytorch_model-00026-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f9f1a1f95473652f893bd5f339cd6d6d9f75c3d81e525009b5898a6c21e784f
3
+ size 371215986
pytorch_model-00027-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63775c85e6504ea5d5d024f7029e90c3a4b9d66079573bd1c36284ce390b7037
3
+ size 314575888
pytorch_model-00028-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df35c532086f4e6b6d097bdd038afdf3bd4e271240ededf03d2747bd0544fd6f
3
+ size 314592882
pytorch_model-00029-of-00039.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d4b8da74b497dc675f74e756426fe1cd6384e8df50fe7d392a2e0eb4c4f06fd
3
+ size 314592882