andreas122001 committed on
Commit
8d8346d
1 Parent(s): 064b4dc

Upload 11 files

Browse files
Files changed (5) hide show
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +279 -3
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a640828f9d526d1b8de3e127a01bb0b2bf2a5a80c39b6c6b8cc6e472a9682bdd
3
  size 133466376
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65e67105c48220d2074098ec1e37f0d61b97375cc55304f903567d3695295370
3
  size 133466376
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:319c57b813206d1595bb4568e829ffe00cf1dae5479933d61e57b031110ab3c8
3
  size 267054330
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0714840fa9604d2fc8f7ea5d59a366c8a93781aa1d7c495a9de1474c626848b4
3
  size 267054330
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4cdeee1988eb8068c8ab116f87717a64e03e46695a5c02d86b0b0d194f7a377c
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38f537364220577dd4e5269dcefc13c34f9b3778f6daf6768a70fafb5a921478
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:34631147f877348c90c7ffcffd8096f87c949df488c55e07965421da16ce55dd
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5da051eb48121dd050f992d01ba9f253f3d6fbc88a7c642a234fae20f61e7eb3
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 35.0,
5
  "eval_steps": 10000,
6
- "global_step": 88795,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -637,6 +637,282 @@
637
  "learning_rate": 6.131178557351203e-06,
638
  "loss": 0.0135,
639
  "step": 88000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
640
  }
641
  ],
642
  "logging_steps": 1000,
@@ -644,7 +920,7 @@
644
  "num_input_tokens_seen": 0,
645
  "num_train_epochs": 50,
646
  "save_steps": 500,
647
- "total_flos": 7400666137523400.0,
648
  "train_batch_size": 8,
649
  "trial_name": null,
650
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 50.0,
5
  "eval_steps": 10000,
6
+ "global_step": 126850,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
637
  "learning_rate": 6.131178557351203e-06,
638
  "loss": 0.0135,
639
  "step": 88000
640
+ },
641
+ {
642
+ "epoch": 35.08,
643
+ "learning_rate": 5.973512022073315e-06,
644
+ "loss": 0.016,
645
+ "step": 89000
646
+ },
647
+ {
648
+ "epoch": 35.47,
649
+ "learning_rate": 5.815845486795429e-06,
650
+ "loss": 0.0144,
651
+ "step": 90000
652
+ },
653
+ {
654
+ "epoch": 35.47,
655
+ "eval_accuracy": 0.996594778660613,
656
+ "eval_f1": 0.9872340425531915,
657
+ "eval_loss": 0.009375466965138912,
658
+ "eval_precision": 0.9914529914529915,
659
+ "eval_recall": 0.9830508474576272,
660
+ "eval_runtime": 6.361,
661
+ "eval_samples_per_second": 276.999,
662
+ "eval_steps_per_second": 34.743,
663
+ "step": 90000
664
+ },
665
+ {
666
+ "epoch": 35.87,
667
+ "learning_rate": 5.658336618052819e-06,
668
+ "loss": 0.0155,
669
+ "step": 91000
670
+ },
671
+ {
672
+ "epoch": 36.26,
673
+ "learning_rate": 5.500670082774931e-06,
674
+ "loss": 0.0126,
675
+ "step": 92000
676
+ },
677
+ {
678
+ "epoch": 36.66,
679
+ "learning_rate": 5.343161214032322e-06,
680
+ "loss": 0.0094,
681
+ "step": 93000
682
+ },
683
+ {
684
+ "epoch": 37.05,
685
+ "learning_rate": 5.185494678754435e-06,
686
+ "loss": 0.0106,
687
+ "step": 94000
688
+ },
689
+ {
690
+ "epoch": 37.45,
691
+ "learning_rate": 5.027828143476547e-06,
692
+ "loss": 0.0115,
693
+ "step": 95000
694
+ },
695
+ {
696
+ "epoch": 37.84,
697
+ "learning_rate": 4.870161608198661e-06,
698
+ "loss": 0.0087,
699
+ "step": 96000
700
+ },
701
+ {
702
+ "epoch": 38.23,
703
+ "learning_rate": 4.712495072920773e-06,
704
+ "loss": 0.0126,
705
+ "step": 97000
706
+ },
707
+ {
708
+ "epoch": 38.63,
709
+ "learning_rate": 4.554986204178163e-06,
710
+ "loss": 0.009,
711
+ "step": 98000
712
+ },
713
+ {
714
+ "epoch": 39.02,
715
+ "learning_rate": 4.397319668900277e-06,
716
+ "loss": 0.0129,
717
+ "step": 99000
718
+ },
719
+ {
720
+ "epoch": 39.42,
721
+ "learning_rate": 4.239653133622389e-06,
722
+ "loss": 0.009,
723
+ "step": 100000
724
+ },
725
+ {
726
+ "epoch": 39.42,
727
+ "eval_accuracy": 0.996594778660613,
728
+ "eval_f1": 0.9872881355932204,
729
+ "eval_loss": 0.008661070838570595,
730
+ "eval_precision": 0.9872881355932204,
731
+ "eval_recall": 0.9872881355932204,
732
+ "eval_runtime": 6.0434,
733
+ "eval_samples_per_second": 291.559,
734
+ "eval_steps_per_second": 36.569,
735
+ "step": 100000
736
+ },
737
+ {
738
+ "epoch": 39.81,
739
+ "learning_rate": 4.081986598344502e-06,
740
+ "loss": 0.0066,
741
+ "step": 101000
742
+ },
743
+ {
744
+ "epoch": 40.2,
745
+ "learning_rate": 3.924477729601892e-06,
746
+ "loss": 0.0116,
747
+ "step": 102000
748
+ },
749
+ {
750
+ "epoch": 40.6,
751
+ "learning_rate": 3.7668111943240047e-06,
752
+ "loss": 0.0099,
753
+ "step": 103000
754
+ },
755
+ {
756
+ "epoch": 40.99,
757
+ "learning_rate": 3.6091446590461178e-06,
758
+ "loss": 0.0087,
759
+ "step": 104000
760
+ },
761
+ {
762
+ "epoch": 41.39,
763
+ "learning_rate": 3.4516357903035086e-06,
764
+ "loss": 0.0092,
765
+ "step": 105000
766
+ },
767
+ {
768
+ "epoch": 41.78,
769
+ "learning_rate": 3.2939692550256207e-06,
770
+ "loss": 0.0098,
771
+ "step": 106000
772
+ },
773
+ {
774
+ "epoch": 42.18,
775
+ "learning_rate": 3.1363027197477337e-06,
776
+ "loss": 0.0092,
777
+ "step": 107000
778
+ },
779
+ {
780
+ "epoch": 42.57,
781
+ "learning_rate": 2.9787938510051245e-06,
782
+ "loss": 0.0111,
783
+ "step": 108000
784
+ },
785
+ {
786
+ "epoch": 42.96,
787
+ "learning_rate": 2.821127315727237e-06,
788
+ "loss": 0.0077,
789
+ "step": 109000
790
+ },
791
+ {
792
+ "epoch": 43.36,
793
+ "learning_rate": 2.66346078044935e-06,
794
+ "loss": 0.0084,
795
+ "step": 110000
796
+ },
797
+ {
798
+ "epoch": 43.36,
799
+ "eval_accuracy": 0.9954597048808173,
800
+ "eval_f1": 0.9828326180257511,
801
+ "eval_loss": 0.018768297508358955,
802
+ "eval_precision": 0.9956521739130435,
803
+ "eval_recall": 0.9703389830508474,
804
+ "eval_runtime": 5.9996,
805
+ "eval_samples_per_second": 293.685,
806
+ "eval_steps_per_second": 36.836,
807
+ "step": 110000
808
+ },
809
+ {
810
+ "epoch": 43.75,
811
+ "learning_rate": 2.5059519117067405e-06,
812
+ "loss": 0.0086,
813
+ "step": 111000
814
+ },
815
+ {
816
+ "epoch": 44.15,
817
+ "learning_rate": 2.348285376428853e-06,
818
+ "loss": 0.0092,
819
+ "step": 112000
820
+ },
821
+ {
822
+ "epoch": 44.54,
823
+ "learning_rate": 2.190618841150966e-06,
824
+ "loss": 0.0099,
825
+ "step": 113000
826
+ },
827
+ {
828
+ "epoch": 44.93,
829
+ "learning_rate": 2.0331099724083565e-06,
830
+ "loss": 0.0101,
831
+ "step": 114000
832
+ },
833
+ {
834
+ "epoch": 45.33,
835
+ "learning_rate": 1.8754434371304691e-06,
836
+ "loss": 0.0079,
837
+ "step": 115000
838
+ },
839
+ {
840
+ "epoch": 45.72,
841
+ "learning_rate": 1.71793456838786e-06,
842
+ "loss": 0.0102,
843
+ "step": 116000
844
+ },
845
+ {
846
+ "epoch": 46.12,
847
+ "learning_rate": 1.5602680331099725e-06,
848
+ "loss": 0.0086,
849
+ "step": 117000
850
+ },
851
+ {
852
+ "epoch": 46.51,
853
+ "learning_rate": 1.4026014978320853e-06,
854
+ "loss": 0.0097,
855
+ "step": 118000
856
+ },
857
+ {
858
+ "epoch": 46.91,
859
+ "learning_rate": 1.2449349625541981e-06,
860
+ "loss": 0.0048,
861
+ "step": 119000
862
+ },
863
+ {
864
+ "epoch": 47.3,
865
+ "learning_rate": 1.0872684272763107e-06,
866
+ "loss": 0.0112,
867
+ "step": 120000
868
+ },
869
+ {
870
+ "epoch": 47.3,
871
+ "eval_accuracy": 0.996594778660613,
872
+ "eval_f1": 0.9872881355932204,
873
+ "eval_loss": 0.008344221860170364,
874
+ "eval_precision": 0.9872881355932204,
875
+ "eval_recall": 0.9872881355932204,
876
+ "eval_runtime": 6.0461,
877
+ "eval_samples_per_second": 291.429,
878
+ "eval_steps_per_second": 36.553,
879
+ "step": 120000
880
+ },
881
+ {
882
+ "epoch": 47.69,
883
+ "learning_rate": 9.297595585337013e-07,
884
+ "loss": 0.0069,
885
+ "step": 121000
886
+ },
887
+ {
888
+ "epoch": 48.09,
889
+ "learning_rate": 7.72250689791092e-07,
890
+ "loss": 0.0103,
891
+ "step": 122000
892
+ },
893
+ {
894
+ "epoch": 48.48,
895
+ "learning_rate": 6.145841545132046e-07,
896
+ "loss": 0.0079,
897
+ "step": 123000
898
+ },
899
+ {
900
+ "epoch": 48.88,
901
+ "learning_rate": 4.5691761923531733e-07,
902
+ "loss": 0.0061,
903
+ "step": 124000
904
+ },
905
+ {
906
+ "epoch": 49.27,
907
+ "learning_rate": 2.994087504927079e-07,
908
+ "loss": 0.0081,
909
+ "step": 125000
910
+ },
911
+ {
912
+ "epoch": 49.66,
913
+ "learning_rate": 1.4174221521482067e-07,
914
+ "loss": 0.0101,
915
+ "step": 126000
916
  }
917
  ],
918
  "logging_steps": 1000,
 
920
  "num_input_tokens_seen": 0,
921
  "num_train_epochs": 50,
922
  "save_steps": 500,
923
+ "total_flos": 1.0572380196462e+16,
924
  "train_batch_size": 8,
925
  "trial_name": null,
926
  "trial_params": null