8BitStudio commited on
Commit
3e305fd
·
verified ·
1 Parent(s): 95fe703

Training in progress, step 6000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7ed7f5192373055df50388d1e8a342b0008cc7f264c290f2f40d0816847f2899
3
  size 1520630616
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9310a4b888df283774971e4e671540bfed2da01aea080fa39eda067305eeba86
3
  size 1520630616
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b8be61aa4b411ba072b5dd099697cc18dd1215103eeea9cd79dbfb70d181d7a
3
  size 3041448587
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1f256b63f8887aa92c9795198c14b259ff29bd76f4e601214dd8ad4add4ccd6
3
  size 3041448587
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:82f11385365889b74991a13277667854d4ee120983e8addb357d466767c0b9ff
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2669ee2d37691d1bc42e7a0090a126e105acbd5de1cf305e31cb6b68e55636b7
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3ac42a4d50be277865df4f8c22478009406dfd138fc6ebe8a41f41d644b86db8
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a30b126d1da8ae8870320a9f300ee7d428169650eb20c3a488c09fc00bef14d8
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.0060874316939892,
6
  "eval_steps": 500,
7
- "global_step": 4000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -568,6 +568,286 @@
568
  "learning_rate": 0.00029976166518534735,
569
  "loss": 2.4739,
570
  "step": 4000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
571
  }
572
  ],
573
  "logging_steps": 50,
@@ -587,7 +867,7 @@
587
  "attributes": {}
588
  }
589
  },
590
- "total_flos": 2.1391181977674056e+18,
591
  "train_batch_size": 16,
592
  "trial_name": null,
593
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.0279453551912567,
6
  "eval_steps": 500,
7
+ "global_step": 6000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
568
  "learning_rate": 0.00029976166518534735,
569
  "loss": 2.4739,
570
  "step": 4000
571
+ },
572
+ {
573
+ "epoch": 1.0066338797814207,
574
+ "grad_norm": 0.5546875,
575
+ "learning_rate": 0.00029974653116842764,
576
+ "loss": 2.4487,
577
+ "step": 4050
578
+ },
579
+ {
580
+ "epoch": 1.0071803278688525,
581
+ "grad_norm": 0.60546875,
582
+ "learning_rate": 0.0002997309317358347,
583
+ "loss": 2.4674,
584
+ "step": 4100
585
+ },
586
+ {
587
+ "epoch": 1.0077267759562842,
588
+ "grad_norm": 0.67578125,
589
+ "learning_rate": 0.0002997148669360519,
590
+ "loss": 2.4814,
591
+ "step": 4150
592
+ },
593
+ {
594
+ "epoch": 1.0082732240437158,
595
+ "grad_norm": 0.53515625,
596
+ "learning_rate": 0.00029969833681900914,
597
+ "loss": 2.448,
598
+ "step": 4200
599
+ },
600
+ {
601
+ "epoch": 1.0088196721311475,
602
+ "grad_norm": 0.68359375,
603
+ "learning_rate": 0.0002996813414360822,
604
+ "loss": 2.4299,
605
+ "step": 4250
606
+ },
607
+ {
608
+ "epoch": 1.0093661202185793,
609
+ "grad_norm": 0.57421875,
610
+ "learning_rate": 0.00029966388084009334,
611
+ "loss": 2.4271,
612
+ "step": 4300
613
+ },
614
+ {
615
+ "epoch": 1.0099125683060108,
616
+ "grad_norm": 0.515625,
617
+ "learning_rate": 0.00029964595508531034,
618
+ "loss": 2.4848,
619
+ "step": 4350
620
+ },
621
+ {
622
+ "epoch": 1.0104590163934426,
623
+ "grad_norm": 0.53125,
624
+ "learning_rate": 0.00029962756422744695,
625
+ "loss": 2.414,
626
+ "step": 4400
627
+ },
628
+ {
629
+ "epoch": 1.0110054644808744,
630
+ "grad_norm": 0.51171875,
631
+ "learning_rate": 0.00029960870832366224,
632
+ "loss": 2.3993,
633
+ "step": 4450
634
+ },
635
+ {
636
+ "epoch": 1.0115519125683061,
637
+ "grad_norm": 0.54296875,
638
+ "learning_rate": 0.000299589387432561,
639
+ "loss": 2.4171,
640
+ "step": 4500
641
+ },
642
+ {
643
+ "epoch": 1.0120983606557377,
644
+ "grad_norm": 0.490234375,
645
+ "learning_rate": 0.00029956960161419283,
646
+ "loss": 2.4038,
647
+ "step": 4550
648
+ },
649
+ {
650
+ "epoch": 1.0126448087431694,
651
+ "grad_norm": 0.498046875,
652
+ "learning_rate": 0.0002995493509300526,
653
+ "loss": 2.4128,
654
+ "step": 4600
655
+ },
656
+ {
657
+ "epoch": 1.0131912568306012,
658
+ "grad_norm": 0.51171875,
659
+ "learning_rate": 0.0002995286354430799,
660
+ "loss": 2.3721,
661
+ "step": 4650
662
+ },
663
+ {
664
+ "epoch": 1.0137377049180327,
665
+ "grad_norm": 0.51171875,
666
+ "learning_rate": 0.0002995074552176589,
667
+ "loss": 2.3734,
668
+ "step": 4700
669
+ },
670
+ {
671
+ "epoch": 1.0142841530054645,
672
+ "grad_norm": 0.515625,
673
+ "learning_rate": 0.00029948581031961826,
674
+ "loss": 2.3805,
675
+ "step": 4750
676
+ },
677
+ {
678
+ "epoch": 1.0148306010928962,
679
+ "grad_norm": 0.515625,
680
+ "learning_rate": 0.0002994637008162308,
681
+ "loss": 2.3819,
682
+ "step": 4800
683
+ },
684
+ {
685
+ "epoch": 1.0153770491803278,
686
+ "grad_norm": 0.53515625,
687
+ "learning_rate": 0.00029944112677621345,
688
+ "loss": 2.3839,
689
+ "step": 4850
690
+ },
691
+ {
692
+ "epoch": 1.0159234972677595,
693
+ "grad_norm": 0.4921875,
694
+ "learning_rate": 0.00029941808826972673,
695
+ "loss": 2.336,
696
+ "step": 4900
697
+ },
698
+ {
699
+ "epoch": 1.0164699453551913,
700
+ "grad_norm": 0.515625,
701
+ "learning_rate": 0.0002993945853683749,
702
+ "loss": 2.3126,
703
+ "step": 4950
704
+ },
705
+ {
706
+ "epoch": 1.0170163934426228,
707
+ "grad_norm": 0.53515625,
708
+ "learning_rate": 0.00029937061814520546,
709
+ "loss": 2.3271,
710
+ "step": 5000
711
+ },
712
+ {
713
+ "epoch": 1.0175628415300546,
714
+ "grad_norm": 0.53125,
715
+ "learning_rate": 0.00029934618667470925,
716
+ "loss": 2.3275,
717
+ "step": 5050
718
+ },
719
+ {
720
+ "epoch": 1.0181092896174864,
721
+ "grad_norm": 0.51953125,
722
+ "learning_rate": 0.0002993212910328197,
723
+ "loss": 2.2837,
724
+ "step": 5100
725
+ },
726
+ {
727
+ "epoch": 1.0186557377049181,
728
+ "grad_norm": 0.56640625,
729
+ "learning_rate": 0.00029929593129691305,
730
+ "loss": 2.2964,
731
+ "step": 5150
732
+ },
733
+ {
734
+ "epoch": 1.0192021857923497,
735
+ "grad_norm": 0.50390625,
736
+ "learning_rate": 0.000299270107545808,
737
+ "loss": 2.3155,
738
+ "step": 5200
739
+ },
740
+ {
741
+ "epoch": 1.0197486338797814,
742
+ "grad_norm": 0.55078125,
743
+ "learning_rate": 0.00029924381985976534,
744
+ "loss": 2.2722,
745
+ "step": 5250
746
+ },
747
+ {
748
+ "epoch": 1.0202950819672132,
749
+ "grad_norm": 0.49609375,
750
+ "learning_rate": 0.00029921706832048784,
751
+ "loss": 2.3175,
752
+ "step": 5300
753
+ },
754
+ {
755
+ "epoch": 1.0208415300546447,
756
+ "grad_norm": 0.48828125,
757
+ "learning_rate": 0.00029918985301111985,
758
+ "loss": 2.2834,
759
+ "step": 5350
760
+ },
761
+ {
762
+ "epoch": 1.0213879781420765,
763
+ "grad_norm": 0.734375,
764
+ "learning_rate": 0.00029916217401624716,
765
+ "loss": 2.2522,
766
+ "step": 5400
767
+ },
768
+ {
769
+ "epoch": 1.0219344262295083,
770
+ "grad_norm": 0.46484375,
771
+ "learning_rate": 0.00029913403142189677,
772
+ "loss": 2.2872,
773
+ "step": 5450
774
+ },
775
+ {
776
+ "epoch": 1.0224808743169398,
777
+ "grad_norm": 0.51953125,
778
+ "learning_rate": 0.00029910542531553656,
779
+ "loss": 2.2793,
780
+ "step": 5500
781
+ },
782
+ {
783
+ "epoch": 1.0230273224043716,
784
+ "grad_norm": 0.474609375,
785
+ "learning_rate": 0.00029907635578607487,
786
+ "loss": 2.218,
787
+ "step": 5550
788
+ },
789
+ {
790
+ "epoch": 1.0235737704918033,
791
+ "grad_norm": 0.49609375,
792
+ "learning_rate": 0.00029904682292386053,
793
+ "loss": 2.2309,
794
+ "step": 5600
795
+ },
796
+ {
797
+ "epoch": 1.024120218579235,
798
+ "grad_norm": 0.52734375,
799
+ "learning_rate": 0.0002990168268206823,
800
+ "loss": 2.2285,
801
+ "step": 5650
802
+ },
803
+ {
804
+ "epoch": 1.0246666666666666,
805
+ "grad_norm": 0.48828125,
806
+ "learning_rate": 0.00029898636756976884,
807
+ "loss": 2.2338,
808
+ "step": 5700
809
+ },
810
+ {
811
+ "epoch": 1.0252131147540984,
812
+ "grad_norm": 0.462890625,
813
+ "learning_rate": 0.0002989554452657881,
814
+ "loss": 2.2048,
815
+ "step": 5750
816
+ },
817
+ {
818
+ "epoch": 1.0257595628415301,
819
+ "grad_norm": 0.62109375,
820
+ "learning_rate": 0.0002989240600048475,
821
+ "loss": 2.2716,
822
+ "step": 5800
823
+ },
824
+ {
825
+ "epoch": 1.0263060109289617,
826
+ "grad_norm": 0.6015625,
827
+ "learning_rate": 0.00029889221188449295,
828
+ "loss": 2.2618,
829
+ "step": 5850
830
+ },
831
+ {
832
+ "epoch": 1.0268524590163934,
833
+ "grad_norm": 0.47265625,
834
+ "learning_rate": 0.0002988599010037092,
835
+ "loss": 2.2181,
836
+ "step": 5900
837
+ },
838
+ {
839
+ "epoch": 1.0273989071038252,
840
+ "grad_norm": 0.5234375,
841
+ "learning_rate": 0.0002988271274629192,
842
+ "loss": 2.2005,
843
+ "step": 5950
844
+ },
845
+ {
846
+ "epoch": 1.0279453551912567,
847
+ "grad_norm": 0.515625,
848
+ "learning_rate": 0.00029879389136398403,
849
+ "loss": 2.1958,
850
+ "step": 6000
851
  }
852
  ],
853
  "logging_steps": 50,
 
867
  "attributes": {}
868
  }
869
  },
870
+ "total_flos": 3.2086020985643336e+18,
871
  "train_batch_size": 16,
872
  "trial_name": null,
873
  "trial_params": null