kittinan commited on
Commit
e1a1c47
1 Parent(s): 8f6f665

update model

Browse files
all_results.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
- "epoch": 3.0,
3
- "eval_loss": 1.6228220462799072,
4
- "eval_runtime": 39.4036,
5
  "eval_samples": 455,
6
- "eval_samples_per_second": 11.547,
7
- "eval_steps_per_second": 1.447,
8
- "perplexity": 5.067370511502039,
9
- "train_loss": 1.6758505107466766,
10
- "train_runtime": 8429.8505,
11
  "train_samples": 9572,
12
- "train_samples_per_second": 3.406,
13
- "train_steps_per_second": 1.703
14
  }
 
1
  {
2
+ "epoch": 12.0,
3
+ "eval_loss": 1.7010221481323242,
4
+ "eval_runtime": 34.7991,
5
  "eval_samples": 455,
6
+ "eval_samples_per_second": 13.075,
7
+ "eval_steps_per_second": 1.638,
8
+ "perplexity": 5.479545437358641,
9
+ "train_loss": 1.0414019944791026,
10
+ "train_runtime": 23906.9056,
11
  "train_samples": 9572,
12
+ "train_samples_per_second": 4.805,
13
+ "train_steps_per_second": 2.402
14
  }
eval_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 3.0,
3
- "eval_loss": 1.6228220462799072,
4
- "eval_runtime": 39.4036,
5
  "eval_samples": 455,
6
- "eval_samples_per_second": 11.547,
7
- "eval_steps_per_second": 1.447,
8
- "perplexity": 5.067370511502039
9
  }
 
1
  {
2
+ "epoch": 12.0,
3
+ "eval_loss": 1.7010221481323242,
4
+ "eval_runtime": 34.7991,
5
  "eval_samples": 455,
6
+ "eval_samples_per_second": 13.075,
7
+ "eval_steps_per_second": 1.638,
8
+ "perplexity": 5.479545437358641
9
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:409331efe280435524cd7722ac9e7987b776bfeda5497d04d5e947a08cd772ec
3
  size 510435963
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f2d41238a98adcd748d7db02b0080e34378e845a398f3d85ed71d96942e129e
3
  size 510435963
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 3.0,
3
- "train_loss": 1.6758505107466766,
4
- "train_runtime": 8429.8505,
5
  "train_samples": 9572,
6
- "train_samples_per_second": 3.406,
7
- "train_steps_per_second": 1.703
8
  }
 
1
  {
2
+ "epoch": 12.0,
3
+ "train_loss": 1.0414019944791026,
4
+ "train_runtime": 23906.9056,
5
  "train_samples": 9572,
6
+ "train_samples_per_second": 4.805,
7
+ "train_steps_per_second": 2.402
8
  }
trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.0,
5
- "global_step": 14358,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -207,11 +207,608 @@
207
  "train_runtime": 8429.8505,
208
  "train_samples_per_second": 3.406,
209
  "train_steps_per_second": 1.703
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  }
211
  ],
212
- "max_steps": 14358,
213
- "num_train_epochs": 3,
214
- "total_flos": 1.5006523981824e+16,
215
  "trial_name": null,
216
  "trial_params": null
217
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 12.0,
5
+ "global_step": 57432,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
207
  "train_runtime": 8429.8505,
208
  "train_samples_per_second": 3.406,
209
  "train_steps_per_second": 1.703
210
+ },
211
+ {
212
+ "epoch": 3.03,
213
+ "learning_rate": 4.987811672934949e-05,
214
+ "loss": 1.5627,
215
+ "step": 14500
216
+ },
217
+ {
218
+ "epoch": 3.13,
219
+ "learning_rate": 4.944368992895947e-05,
220
+ "loss": 1.5686,
221
+ "step": 15000
222
+ },
223
+ {
224
+ "epoch": 3.24,
225
+ "learning_rate": 4.900839253377908e-05,
226
+ "loss": 1.5776,
227
+ "step": 15500
228
+ },
229
+ {
230
+ "epoch": 3.34,
231
+ "learning_rate": 4.85730951385987e-05,
232
+ "loss": 1.5699,
233
+ "step": 16000
234
+ },
235
+ {
236
+ "epoch": 3.45,
237
+ "learning_rate": 4.8137797743418304e-05,
238
+ "loss": 1.5476,
239
+ "step": 16500
240
+ },
241
+ {
242
+ "epoch": 3.55,
243
+ "learning_rate": 4.770250034823792e-05,
244
+ "loss": 1.5448,
245
+ "step": 17000
246
+ },
247
+ {
248
+ "epoch": 3.66,
249
+ "learning_rate": 4.726720295305753e-05,
250
+ "loss": 1.5656,
251
+ "step": 17500
252
+ },
253
+ {
254
+ "epoch": 3.76,
255
+ "learning_rate": 4.683277615266751e-05,
256
+ "loss": 1.5694,
257
+ "step": 18000
258
+ },
259
+ {
260
+ "epoch": 3.87,
261
+ "learning_rate": 4.6397478757487115e-05,
262
+ "loss": 1.542,
263
+ "step": 18500
264
+ },
265
+ {
266
+ "epoch": 3.97,
267
+ "learning_rate": 4.596218136230673e-05,
268
+ "loss": 1.5474,
269
+ "step": 19000
270
+ },
271
+ {
272
+ "epoch": 4.0,
273
+ "eval_loss": 1.5994045734405518,
274
+ "eval_runtime": 39.2637,
275
+ "eval_samples_per_second": 11.588,
276
+ "eval_steps_per_second": 1.452,
277
+ "step": 19144
278
+ },
279
+ {
280
+ "epoch": 4.07,
281
+ "learning_rate": 4.552688396712634e-05,
282
+ "loss": 1.5336,
283
+ "step": 19500
284
+ },
285
+ {
286
+ "epoch": 4.18,
287
+ "learning_rate": 4.5091586571945955e-05,
288
+ "loss": 1.5259,
289
+ "step": 20000
290
+ },
291
+ {
292
+ "epoch": 4.28,
293
+ "learning_rate": 4.465628917676557e-05,
294
+ "loss": 1.495,
295
+ "step": 20500
296
+ },
297
+ {
298
+ "epoch": 4.39,
299
+ "learning_rate": 4.422099178158518e-05,
300
+ "loss": 1.5003,
301
+ "step": 21000
302
+ },
303
+ {
304
+ "epoch": 4.49,
305
+ "learning_rate": 4.378656498119515e-05,
306
+ "loss": 1.4912,
307
+ "step": 21500
308
+ },
309
+ {
310
+ "epoch": 4.6,
311
+ "learning_rate": 4.3351267586014767e-05,
312
+ "loss": 1.491,
313
+ "step": 22000
314
+ },
315
+ {
316
+ "epoch": 4.7,
317
+ "learning_rate": 4.291597019083438e-05,
318
+ "loss": 1.4888,
319
+ "step": 22500
320
+ },
321
+ {
322
+ "epoch": 4.81,
323
+ "learning_rate": 4.2480672795653994e-05,
324
+ "loss": 1.5098,
325
+ "step": 23000
326
+ },
327
+ {
328
+ "epoch": 4.91,
329
+ "learning_rate": 4.2046245995263964e-05,
330
+ "loss": 1.5011,
331
+ "step": 23500
332
+ },
333
+ {
334
+ "epoch": 5.0,
335
+ "eval_loss": 1.5889110565185547,
336
+ "eval_runtime": 42.2475,
337
+ "eval_samples_per_second": 10.77,
338
+ "eval_steps_per_second": 1.349,
339
+ "step": 23930
340
+ },
341
+ {
342
+ "epoch": 5.01,
343
+ "learning_rate": 4.161094860008358e-05,
344
+ "loss": 1.4888,
345
+ "step": 24000
346
+ },
347
+ {
348
+ "epoch": 5.12,
349
+ "learning_rate": 4.117565120490319e-05,
350
+ "loss": 1.4664,
351
+ "step": 24500
352
+ },
353
+ {
354
+ "epoch": 5.22,
355
+ "learning_rate": 4.0740353809722805e-05,
356
+ "loss": 1.455,
357
+ "step": 25000
358
+ },
359
+ {
360
+ "epoch": 5.33,
361
+ "learning_rate": 4.030505641454242e-05,
362
+ "loss": 1.4491,
363
+ "step": 25500
364
+ },
365
+ {
366
+ "epoch": 5.43,
367
+ "learning_rate": 3.986975901936203e-05,
368
+ "loss": 1.4543,
369
+ "step": 26000
370
+ },
371
+ {
372
+ "epoch": 5.54,
373
+ "learning_rate": 3.9435332218972e-05,
374
+ "loss": 1.4588,
375
+ "step": 26500
376
+ },
377
+ {
378
+ "epoch": 5.64,
379
+ "learning_rate": 3.9000034823791616e-05,
380
+ "loss": 1.446,
381
+ "step": 27000
382
+ },
383
+ {
384
+ "epoch": 5.75,
385
+ "learning_rate": 3.856473742861123e-05,
386
+ "loss": 1.4386,
387
+ "step": 27500
388
+ },
389
+ {
390
+ "epoch": 5.85,
391
+ "learning_rate": 3.812944003343084e-05,
392
+ "loss": 1.4473,
393
+ "step": 28000
394
+ },
395
+ {
396
+ "epoch": 5.95,
397
+ "learning_rate": 3.7694142638250456e-05,
398
+ "loss": 1.4724,
399
+ "step": 28500
400
+ },
401
+ {
402
+ "epoch": 6.0,
403
+ "eval_loss": 1.5893619060516357,
404
+ "eval_runtime": 39.6175,
405
+ "eval_samples_per_second": 11.485,
406
+ "eval_steps_per_second": 1.439,
407
+ "step": 28716
408
+ },
409
+ {
410
+ "epoch": 6.06,
411
+ "learning_rate": 3.725884524307007e-05,
412
+ "loss": 1.4439,
413
+ "step": 29000
414
+ },
415
+ {
416
+ "epoch": 6.16,
417
+ "learning_rate": 3.6823547847889676e-05,
418
+ "loss": 1.4021,
419
+ "step": 29500
420
+ },
421
+ {
422
+ "epoch": 6.27,
423
+ "learning_rate": 3.6388250452709297e-05,
424
+ "loss": 1.4147,
425
+ "step": 30000
426
+ },
427
+ {
428
+ "epoch": 6.37,
429
+ "learning_rate": 3.595382365231927e-05,
430
+ "loss": 1.4157,
431
+ "step": 30500
432
+ },
433
+ {
434
+ "epoch": 6.48,
435
+ "learning_rate": 3.551852625713888e-05,
436
+ "loss": 1.417,
437
+ "step": 31000
438
+ },
439
+ {
440
+ "epoch": 6.58,
441
+ "learning_rate": 3.508322886195849e-05,
442
+ "loss": 1.4094,
443
+ "step": 31500
444
+ },
445
+ {
446
+ "epoch": 6.69,
447
+ "learning_rate": 3.464793146677811e-05,
448
+ "loss": 1.4209,
449
+ "step": 32000
450
+ },
451
+ {
452
+ "epoch": 6.79,
453
+ "learning_rate": 3.4212634071597714e-05,
454
+ "loss": 1.415,
455
+ "step": 32500
456
+ },
457
+ {
458
+ "epoch": 6.9,
459
+ "learning_rate": 3.377907786599805e-05,
460
+ "loss": 1.4128,
461
+ "step": 33000
462
+ },
463
+ {
464
+ "epoch": 7.0,
465
+ "learning_rate": 3.334378047081766e-05,
466
+ "loss": 1.414,
467
+ "step": 33500
468
+ },
469
+ {
470
+ "epoch": 7.0,
471
+ "eval_loss": 1.5933945178985596,
472
+ "eval_runtime": 39.5291,
473
+ "eval_samples_per_second": 11.51,
474
+ "eval_steps_per_second": 1.442,
475
+ "step": 33502
476
+ },
477
+ {
478
+ "epoch": 7.1,
479
+ "learning_rate": 3.2908483075637276e-05,
480
+ "loss": 1.358,
481
+ "step": 34000
482
+ },
483
+ {
484
+ "epoch": 7.21,
485
+ "learning_rate": 3.247318568045689e-05,
486
+ "loss": 1.3821,
487
+ "step": 34500
488
+ },
489
+ {
490
+ "epoch": 7.31,
491
+ "learning_rate": 3.20378882852765e-05,
492
+ "loss": 1.377,
493
+ "step": 35000
494
+ },
495
+ {
496
+ "epoch": 7.42,
497
+ "learning_rate": 3.1602590890096116e-05,
498
+ "loss": 1.3772,
499
+ "step": 35500
500
+ },
501
+ {
502
+ "epoch": 7.52,
503
+ "learning_rate": 3.116729349491573e-05,
504
+ "loss": 1.3777,
505
+ "step": 36000
506
+ },
507
+ {
508
+ "epoch": 7.63,
509
+ "learning_rate": 3.0731996099735337e-05,
510
+ "loss": 1.3672,
511
+ "step": 36500
512
+ },
513
+ {
514
+ "epoch": 7.73,
515
+ "learning_rate": 3.0298439894135678e-05,
516
+ "loss": 1.3949,
517
+ "step": 37000
518
+ },
519
+ {
520
+ "epoch": 7.84,
521
+ "learning_rate": 2.9864013093745645e-05,
522
+ "loss": 1.3626,
523
+ "step": 37500
524
+ },
525
+ {
526
+ "epoch": 7.94,
527
+ "learning_rate": 2.9428715698565262e-05,
528
+ "loss": 1.3939,
529
+ "step": 38000
530
+ },
531
+ {
532
+ "epoch": 8.0,
533
+ "eval_loss": 1.6080235242843628,
534
+ "eval_runtime": 39.1335,
535
+ "eval_samples_per_second": 11.627,
536
+ "eval_steps_per_second": 1.457,
537
+ "step": 38288
538
+ },
539
+ {
540
+ "epoch": 8.04,
541
+ "learning_rate": 2.8993418303384872e-05,
542
+ "loss": 1.3667,
543
+ "step": 38500
544
+ },
545
+ {
546
+ "epoch": 8.15,
547
+ "learning_rate": 2.855812090820449e-05,
548
+ "loss": 1.336,
549
+ "step": 39000
550
+ },
551
+ {
552
+ "epoch": 8.25,
553
+ "learning_rate": 2.81228235130241e-05,
554
+ "loss": 1.3212,
555
+ "step": 39500
556
+ },
557
+ {
558
+ "epoch": 8.36,
559
+ "learning_rate": 2.768752611784371e-05,
560
+ "loss": 1.3235,
561
+ "step": 40000
562
+ },
563
+ {
564
+ "epoch": 8.46,
565
+ "learning_rate": 2.7252228722663326e-05,
566
+ "loss": 1.3649,
567
+ "step": 40500
568
+ },
569
+ {
570
+ "epoch": 8.57,
571
+ "learning_rate": 2.6816931327482936e-05,
572
+ "loss": 1.3423,
573
+ "step": 41000
574
+ },
575
+ {
576
+ "epoch": 8.67,
577
+ "learning_rate": 2.6381633932302553e-05,
578
+ "loss": 1.3396,
579
+ "step": 41500
580
+ },
581
+ {
582
+ "epoch": 8.78,
583
+ "learning_rate": 2.5946336537122163e-05,
584
+ "loss": 1.3528,
585
+ "step": 42000
586
+ },
587
+ {
588
+ "epoch": 8.88,
589
+ "learning_rate": 2.5511909736732137e-05,
590
+ "loss": 1.3488,
591
+ "step": 42500
592
+ },
593
+ {
594
+ "epoch": 8.98,
595
+ "learning_rate": 2.5076612341551747e-05,
596
+ "loss": 1.3448,
597
+ "step": 43000
598
+ },
599
+ {
600
+ "epoch": 9.0,
601
+ "eval_loss": 1.626845121383667,
602
+ "eval_runtime": 39.7529,
603
+ "eval_samples_per_second": 11.446,
604
+ "eval_steps_per_second": 1.434,
605
+ "step": 43074
606
+ },
607
+ {
608
+ "epoch": 9.09,
609
+ "learning_rate": 2.4641314946371364e-05,
610
+ "loss": 1.3208,
611
+ "step": 43500
612
+ },
613
+ {
614
+ "epoch": 9.19,
615
+ "learning_rate": 2.4206017551190974e-05,
616
+ "loss": 1.3107,
617
+ "step": 44000
618
+ },
619
+ {
620
+ "epoch": 9.3,
621
+ "learning_rate": 2.377159075080095e-05,
622
+ "loss": 1.3029,
623
+ "step": 44500
624
+ },
625
+ {
626
+ "epoch": 9.4,
627
+ "learning_rate": 2.3336293355620562e-05,
628
+ "loss": 1.3034,
629
+ "step": 45000
630
+ },
631
+ {
632
+ "epoch": 9.51,
633
+ "learning_rate": 2.2900995960440175e-05,
634
+ "loss": 1.3041,
635
+ "step": 45500
636
+ },
637
+ {
638
+ "epoch": 9.61,
639
+ "learning_rate": 2.2466569160050146e-05,
640
+ "loss": 1.3025,
641
+ "step": 46000
642
+ },
643
+ {
644
+ "epoch": 9.72,
645
+ "learning_rate": 2.203127176486976e-05,
646
+ "loss": 1.3186,
647
+ "step": 46500
648
+ },
649
+ {
650
+ "epoch": 9.82,
651
+ "learning_rate": 2.1595974369689373e-05,
652
+ "loss": 1.3185,
653
+ "step": 47000
654
+ },
655
+ {
656
+ "epoch": 9.92,
657
+ "learning_rate": 2.1160676974508987e-05,
658
+ "loss": 1.3227,
659
+ "step": 47500
660
+ },
661
+ {
662
+ "epoch": 10.0,
663
+ "eval_loss": 1.6497271060943604,
664
+ "eval_runtime": 39.0443,
665
+ "eval_samples_per_second": 11.653,
666
+ "eval_steps_per_second": 1.46,
667
+ "step": 47860
668
+ },
669
+ {
670
+ "epoch": 10.03,
671
+ "learning_rate": 2.07253795793286e-05,
672
+ "loss": 1.3038,
673
+ "step": 48000
674
+ },
675
+ {
676
+ "epoch": 10.13,
677
+ "learning_rate": 2.029008218414821e-05,
678
+ "loss": 1.2825,
679
+ "step": 48500
680
+ },
681
+ {
682
+ "epoch": 10.24,
683
+ "learning_rate": 1.9854784788967824e-05,
684
+ "loss": 1.2657,
685
+ "step": 49000
686
+ },
687
+ {
688
+ "epoch": 10.34,
689
+ "learning_rate": 1.9419487393787437e-05,
690
+ "loss": 1.2851,
691
+ "step": 49500
692
+ },
693
+ {
694
+ "epoch": 10.45,
695
+ "learning_rate": 1.898418999860705e-05,
696
+ "loss": 1.294,
697
+ "step": 50000
698
+ },
699
+ {
700
+ "epoch": 10.55,
701
+ "learning_rate": 1.8549763198217025e-05,
702
+ "loss": 1.2857,
703
+ "step": 50500
704
+ },
705
+ {
706
+ "epoch": 10.66,
707
+ "learning_rate": 1.8114465803036635e-05,
708
+ "loss": 1.2665,
709
+ "step": 51000
710
+ },
711
+ {
712
+ "epoch": 10.76,
713
+ "learning_rate": 1.7679168407856248e-05,
714
+ "loss": 1.273,
715
+ "step": 51500
716
+ },
717
+ {
718
+ "epoch": 10.87,
719
+ "learning_rate": 1.724387101267586e-05,
720
+ "loss": 1.2838,
721
+ "step": 52000
722
+ },
723
+ {
724
+ "epoch": 10.97,
725
+ "learning_rate": 1.6809444212285836e-05,
726
+ "loss": 1.2873,
727
+ "step": 52500
728
+ },
729
+ {
730
+ "epoch": 11.0,
731
+ "eval_loss": 1.673952341079712,
732
+ "eval_runtime": 38.6787,
733
+ "eval_samples_per_second": 11.764,
734
+ "eval_steps_per_second": 1.474,
735
+ "step": 52646
736
+ },
737
+ {
738
+ "epoch": 11.07,
739
+ "learning_rate": 1.637414681710545e-05,
740
+ "loss": 1.2753,
741
+ "step": 53000
742
+ },
743
+ {
744
+ "epoch": 11.18,
745
+ "learning_rate": 1.593884942192506e-05,
746
+ "loss": 1.2625,
747
+ "step": 53500
748
+ },
749
+ {
750
+ "epoch": 11.28,
751
+ "learning_rate": 1.5503552026744673e-05,
752
+ "loss": 1.2471,
753
+ "step": 54000
754
+ },
755
+ {
756
+ "epoch": 11.39,
757
+ "learning_rate": 1.5068254631564285e-05,
758
+ "loss": 1.2523,
759
+ "step": 54500
760
+ },
761
+ {
762
+ "epoch": 11.49,
763
+ "learning_rate": 1.463382783117426e-05,
764
+ "loss": 1.2534,
765
+ "step": 55000
766
+ },
767
+ {
768
+ "epoch": 11.6,
769
+ "learning_rate": 1.4199401030784231e-05,
770
+ "loss": 1.2606,
771
+ "step": 55500
772
+ },
773
+ {
774
+ "epoch": 11.7,
775
+ "learning_rate": 1.3764103635603845e-05,
776
+ "loss": 1.244,
777
+ "step": 56000
778
+ },
779
+ {
780
+ "epoch": 11.81,
781
+ "learning_rate": 1.332967683521382e-05,
782
+ "loss": 1.2577,
783
+ "step": 56500
784
+ },
785
+ {
786
+ "epoch": 11.91,
787
+ "learning_rate": 1.2894379440033432e-05,
788
+ "loss": 1.2671,
789
+ "step": 57000
790
+ },
791
+ {
792
+ "epoch": 12.0,
793
+ "eval_loss": 1.7010221481323242,
794
+ "eval_runtime": 38.3243,
795
+ "eval_samples_per_second": 11.872,
796
+ "eval_steps_per_second": 1.487,
797
+ "step": 57432
798
+ },
799
+ {
800
+ "epoch": 12.0,
801
+ "step": 57432,
802
+ "total_flos": 6.0026095927296e+16,
803
+ "train_loss": 1.0414019944791026,
804
+ "train_runtime": 23906.9056,
805
+ "train_samples_per_second": 4.805,
806
+ "train_steps_per_second": 2.402
807
  }
808
  ],
809
+ "max_steps": 57432,
810
+ "num_train_epochs": 12,
811
+ "total_flos": 6.0026095927296e+16,
812
  "trial_name": null,
813
  "trial_params": null
814
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7685f4b1b08905ec0c46a302540d4cb5df073837ba31d9ea1b1a8aa902f04819
3
  size 2671
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c53e7fa950a52eaad61b7c8a110974066f4a67ab2333e62df9941b90476f3148
3
  size 2671