kerpr commited on
Commit
f18fc82
1 Parent(s): 23ed661

End of training

Browse files
Files changed (4) hide show
  1. all_results.json +10 -10
  2. eval_results.json +6 -6
  3. train_results.json +5 -5
  4. trainer_state.json +642 -66
all_results.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
- "epoch": 9.97,
3
- "eval_f1": 0.1941564561734213,
4
- "eval_loss": 4.426720142364502,
5
- "eval_runtime": 22.9552,
6
  "eval_samples": 1916,
7
- "eval_samples_per_second": 83.467,
8
- "eval_steps_per_second": 1.307,
9
- "train_loss": 1.9846372914412187,
10
- "train_runtime": 5392.4255,
11
  "train_samples": 5743,
12
- "train_samples_per_second": 10.65,
13
- "train_steps_per_second": 0.332
14
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "eval_f1": 0.0,
4
+ "eval_loss": 0.34448280930519104,
5
+ "eval_runtime": 20.531,
6
  "eval_samples": 1916,
7
+ "eval_samples_per_second": 93.322,
8
+ "eval_steps_per_second": 1.461,
9
+ "train_loss": 0.3510875488058106,
10
+ "train_runtime": 3949.5247,
11
  "train_samples": 5743,
12
+ "train_samples_per_second": 14.541,
13
+ "train_steps_per_second": 0.909
14
  }
eval_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 9.97,
3
- "eval_f1": 0.1941564561734213,
4
- "eval_loss": 4.426720142364502,
5
- "eval_runtime": 22.9552,
6
  "eval_samples": 1916,
7
- "eval_samples_per_second": 83.467,
8
- "eval_steps_per_second": 1.307
9
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "eval_f1": 0.0,
4
+ "eval_loss": 0.34448280930519104,
5
+ "eval_runtime": 20.531,
6
  "eval_samples": 1916,
7
+ "eval_samples_per_second": 93.322,
8
+ "eval_steps_per_second": 1.461
9
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 9.97,
3
- "train_loss": 1.9846372914412187,
4
- "train_runtime": 5392.4255,
5
  "train_samples": 5743,
6
- "train_samples_per_second": 10.65,
7
- "train_steps_per_second": 0.332
8
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "train_loss": 0.3510875488058106,
4
+ "train_runtime": 3949.5247,
5
  "train_samples": 5743,
6
+ "train_samples_per_second": 14.541,
7
+ "train_steps_per_second": 0.909
8
  }
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 9.972144846796658,
5
  "eval_steps": 500,
6
- "global_step": 1790,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -531,123 +531,699 @@
531
  "step": 1500
532
  },
533
  {
534
- "epoch": 8.47,
535
- "learning_rate": 2e-05,
536
- "loss": 0.0035,
537
  "step": 1520
538
  },
539
  {
540
- "epoch": 8.58,
541
- "learning_rate": 2e-05,
542
- "loss": 0.139,
543
  "step": 1540
544
  },
545
  {
546
- "epoch": 8.69,
547
- "learning_rate": 2e-05,
548
- "loss": 0.0617,
549
  "step": 1560
550
  },
551
  {
552
- "epoch": 8.8,
553
- "learning_rate": 2e-05,
554
- "loss": 0.0684,
555
  "step": 1580
556
  },
557
  {
558
- "epoch": 8.91,
559
- "learning_rate": 2e-05,
560
- "loss": 0.0231,
561
  "step": 1600
562
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
563
  {
564
  "epoch": 9.0,
565
- "eval_f1": 0.1941564561734213,
566
- "eval_loss": 8.572157859802246,
567
- "eval_runtime": 22.7298,
568
- "eval_samples_per_second": 84.295,
569
- "eval_steps_per_second": 1.32,
570
- "step": 1615
571
  },
572
  {
573
  "epoch": 9.03,
574
- "learning_rate": 2e-05,
575
- "loss": 0.0565,
576
- "step": 1620
 
 
 
 
 
 
577
  },
578
  {
579
  "epoch": 9.14,
580
- "learning_rate": 2e-05,
581
- "loss": 0.0025,
582
- "step": 1640
 
 
 
 
 
 
583
  },
584
  {
585
  "epoch": 9.25,
586
- "learning_rate": 2e-05,
587
- "loss": 0.0,
588
- "step": 1660
 
 
 
 
 
 
589
  },
590
  {
591
  "epoch": 9.36,
592
- "learning_rate": 2e-05,
593
- "loss": 0.1041,
594
- "step": 1680
 
 
 
 
 
 
595
  },
596
  {
597
  "epoch": 9.47,
598
- "learning_rate": 2e-05,
599
- "loss": 0.0822,
600
- "step": 1700
 
 
 
 
 
 
601
  },
602
  {
603
  "epoch": 9.58,
604
- "learning_rate": 2e-05,
605
- "loss": 0.0923,
606
- "step": 1720
 
 
 
 
 
 
607
  },
608
  {
609
  "epoch": 9.69,
610
- "learning_rate": 2e-05,
611
- "loss": 0.0367,
612
- "step": 1740
 
 
 
 
 
 
613
  },
614
  {
615
  "epoch": 9.81,
616
- "learning_rate": 2e-05,
617
- "loss": 0.0083,
618
- "step": 1760
619
  },
620
  {
621
- "epoch": 9.92,
622
- "learning_rate": 2e-05,
623
- "loss": 0.0521,
624
- "step": 1780
625
  },
626
  {
627
- "epoch": 9.97,
628
- "eval_f1": 0.1941564561734213,
629
- "eval_loss": 4.426720142364502,
630
- "eval_runtime": 22.899,
631
- "eval_samples_per_second": 83.672,
632
- "eval_steps_per_second": 1.31,
633
- "step": 1790
634
  },
635
  {
636
  "epoch": 9.97,
637
- "step": 1790,
638
- "total_flos": 4485749565030400.0,
639
- "train_loss": 1.9846372914412187,
640
- "train_runtime": 5392.4255,
641
- "train_samples_per_second": 10.65,
642
- "train_steps_per_second": 0.332
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
643
  }
644
  ],
645
  "logging_steps": 20,
646
- "max_steps": 1790,
647
  "num_input_tokens_seen": 0,
648
  "num_train_epochs": 10,
649
  "save_steps": 500,
650
- "total_flos": 4485749565030400.0,
651
  "train_batch_size": 2,
652
  "trial_name": null,
653
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
  "eval_steps": 500,
6
+ "global_step": 3590,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
531
  "step": 1500
532
  },
533
  {
534
+ "epoch": 4.23,
535
+ "learning_rate": 0.001,
536
+ "loss": 18.7223,
537
  "step": 1520
538
  },
539
  {
540
+ "epoch": 4.29,
541
+ "learning_rate": 0.001,
542
+ "loss": 5.5701,
543
  "step": 1540
544
  },
545
  {
546
+ "epoch": 4.35,
547
+ "learning_rate": 0.001,
548
+ "loss": 1.2935,
549
  "step": 1560
550
  },
551
  {
552
+ "epoch": 4.4,
553
+ "learning_rate": 0.001,
554
+ "loss": 0.6161,
555
  "step": 1580
556
  },
557
  {
558
+ "epoch": 4.46,
559
+ "learning_rate": 0.001,
560
+ "loss": 0.757,
561
  "step": 1600
562
  },
563
+ {
564
+ "epoch": 4.51,
565
+ "learning_rate": 0.001,
566
+ "loss": 0.6241,
567
+ "step": 1620
568
+ },
569
+ {
570
+ "epoch": 4.57,
571
+ "learning_rate": 0.001,
572
+ "loss": 0.5211,
573
+ "step": 1640
574
+ },
575
+ {
576
+ "epoch": 4.62,
577
+ "learning_rate": 0.001,
578
+ "loss": 0.4467,
579
+ "step": 1660
580
+ },
581
+ {
582
+ "epoch": 4.68,
583
+ "learning_rate": 0.001,
584
+ "loss": 0.424,
585
+ "step": 1680
586
+ },
587
+ {
588
+ "epoch": 4.74,
589
+ "learning_rate": 0.001,
590
+ "loss": 0.3741,
591
+ "step": 1700
592
+ },
593
+ {
594
+ "epoch": 4.79,
595
+ "learning_rate": 0.001,
596
+ "loss": 0.3276,
597
+ "step": 1720
598
+ },
599
+ {
600
+ "epoch": 4.85,
601
+ "learning_rate": 0.001,
602
+ "loss": 0.3692,
603
+ "step": 1740
604
+ },
605
+ {
606
+ "epoch": 4.9,
607
+ "learning_rate": 0.001,
608
+ "loss": 0.3626,
609
+ "step": 1760
610
+ },
611
+ {
612
+ "epoch": 4.96,
613
+ "learning_rate": 0.001,
614
+ "loss": 0.3698,
615
+ "step": 1780
616
+ },
617
+ {
618
+ "epoch": 5.0,
619
+ "eval_f1": 0.0,
620
+ "eval_loss": 0.3513816297054291,
621
+ "eval_runtime": 20.8512,
622
+ "eval_samples_per_second": 91.889,
623
+ "eval_steps_per_second": 1.439,
624
+ "step": 1795
625
+ },
626
+ {
627
+ "epoch": 5.01,
628
+ "learning_rate": 0.001,
629
+ "loss": 0.3672,
630
+ "step": 1800
631
+ },
632
+ {
633
+ "epoch": 5.07,
634
+ "learning_rate": 0.001,
635
+ "loss": 0.3879,
636
+ "step": 1820
637
+ },
638
+ {
639
+ "epoch": 5.13,
640
+ "learning_rate": 0.001,
641
+ "loss": 0.458,
642
+ "step": 1840
643
+ },
644
+ {
645
+ "epoch": 5.18,
646
+ "learning_rate": 0.001,
647
+ "loss": 0.3949,
648
+ "step": 1860
649
+ },
650
+ {
651
+ "epoch": 5.24,
652
+ "learning_rate": 0.001,
653
+ "loss": 0.372,
654
+ "step": 1880
655
+ },
656
+ {
657
+ "epoch": 5.29,
658
+ "learning_rate": 0.001,
659
+ "loss": 0.3578,
660
+ "step": 1900
661
+ },
662
+ {
663
+ "epoch": 5.35,
664
+ "learning_rate": 0.001,
665
+ "loss": 0.3906,
666
+ "step": 1920
667
+ },
668
+ {
669
+ "epoch": 5.4,
670
+ "learning_rate": 0.001,
671
+ "loss": 0.3888,
672
+ "step": 1940
673
+ },
674
+ {
675
+ "epoch": 5.46,
676
+ "learning_rate": 0.001,
677
+ "loss": 0.4049,
678
+ "step": 1960
679
+ },
680
+ {
681
+ "epoch": 5.52,
682
+ "learning_rate": 0.001,
683
+ "loss": 0.3692,
684
+ "step": 1980
685
+ },
686
+ {
687
+ "epoch": 5.57,
688
+ "learning_rate": 0.001,
689
+ "loss": 0.3299,
690
+ "step": 2000
691
+ },
692
+ {
693
+ "epoch": 5.63,
694
+ "learning_rate": 0.001,
695
+ "loss": 0.3714,
696
+ "step": 2020
697
+ },
698
+ {
699
+ "epoch": 5.68,
700
+ "learning_rate": 0.001,
701
+ "loss": 0.3423,
702
+ "step": 2040
703
+ },
704
+ {
705
+ "epoch": 5.74,
706
+ "learning_rate": 0.001,
707
+ "loss": 0.3534,
708
+ "step": 2060
709
+ },
710
+ {
711
+ "epoch": 5.79,
712
+ "learning_rate": 0.001,
713
+ "loss": 0.3426,
714
+ "step": 2080
715
+ },
716
+ {
717
+ "epoch": 5.85,
718
+ "learning_rate": 0.001,
719
+ "loss": 0.3684,
720
+ "step": 2100
721
+ },
722
+ {
723
+ "epoch": 5.91,
724
+ "learning_rate": 0.001,
725
+ "loss": 0.3472,
726
+ "step": 2120
727
+ },
728
+ {
729
+ "epoch": 5.96,
730
+ "learning_rate": 0.001,
731
+ "loss": 0.299,
732
+ "step": 2140
733
+ },
734
+ {
735
+ "epoch": 6.0,
736
+ "eval_f1": 0.0,
737
+ "eval_loss": 0.3469391465187073,
738
+ "eval_runtime": 20.5335,
739
+ "eval_samples_per_second": 93.311,
740
+ "eval_steps_per_second": 1.461,
741
+ "step": 2154
742
+ },
743
+ {
744
+ "epoch": 6.02,
745
+ "learning_rate": 0.001,
746
+ "loss": 0.3336,
747
+ "step": 2160
748
+ },
749
+ {
750
+ "epoch": 6.07,
751
+ "learning_rate": 0.001,
752
+ "loss": 0.4366,
753
+ "step": 2180
754
+ },
755
+ {
756
+ "epoch": 6.13,
757
+ "learning_rate": 0.001,
758
+ "loss": 0.3709,
759
+ "step": 2200
760
+ },
761
+ {
762
+ "epoch": 6.18,
763
+ "learning_rate": 0.001,
764
+ "loss": 0.3357,
765
+ "step": 2220
766
+ },
767
+ {
768
+ "epoch": 6.24,
769
+ "learning_rate": 0.001,
770
+ "loss": 0.4034,
771
+ "step": 2240
772
+ },
773
+ {
774
+ "epoch": 6.3,
775
+ "learning_rate": 0.001,
776
+ "loss": 0.3868,
777
+ "step": 2260
778
+ },
779
+ {
780
+ "epoch": 6.35,
781
+ "learning_rate": 0.001,
782
+ "loss": 0.3328,
783
+ "step": 2280
784
+ },
785
+ {
786
+ "epoch": 6.41,
787
+ "learning_rate": 0.001,
788
+ "loss": 0.3974,
789
+ "step": 2300
790
+ },
791
+ {
792
+ "epoch": 6.46,
793
+ "learning_rate": 0.001,
794
+ "loss": 0.3707,
795
+ "step": 2320
796
+ },
797
+ {
798
+ "epoch": 6.52,
799
+ "learning_rate": 0.001,
800
+ "loss": 0.3753,
801
+ "step": 2340
802
+ },
803
+ {
804
+ "epoch": 6.57,
805
+ "learning_rate": 0.001,
806
+ "loss": 0.3255,
807
+ "step": 2360
808
+ },
809
+ {
810
+ "epoch": 6.63,
811
+ "learning_rate": 0.001,
812
+ "loss": 0.4284,
813
+ "step": 2380
814
+ },
815
+ {
816
+ "epoch": 6.69,
817
+ "learning_rate": 0.001,
818
+ "loss": 0.3699,
819
+ "step": 2400
820
+ },
821
+ {
822
+ "epoch": 6.74,
823
+ "learning_rate": 0.001,
824
+ "loss": 0.3705,
825
+ "step": 2420
826
+ },
827
+ {
828
+ "epoch": 6.8,
829
+ "learning_rate": 0.001,
830
+ "loss": 0.2841,
831
+ "step": 2440
832
+ },
833
+ {
834
+ "epoch": 6.85,
835
+ "learning_rate": 0.001,
836
+ "loss": 0.2687,
837
+ "step": 2460
838
+ },
839
+ {
840
+ "epoch": 6.91,
841
+ "learning_rate": 0.001,
842
+ "loss": 0.3294,
843
+ "step": 2480
844
+ },
845
+ {
846
+ "epoch": 6.96,
847
+ "learning_rate": 0.001,
848
+ "loss": 0.3531,
849
+ "step": 2500
850
+ },
851
+ {
852
+ "epoch": 7.0,
853
+ "eval_f1": 0.0,
854
+ "eval_loss": 0.3420043885707855,
855
+ "eval_runtime": 20.5195,
856
+ "eval_samples_per_second": 93.374,
857
+ "eval_steps_per_second": 1.462,
858
+ "step": 2513
859
+ },
860
+ {
861
+ "epoch": 7.02,
862
+ "learning_rate": 0.001,
863
+ "loss": 0.3396,
864
+ "step": 2520
865
+ },
866
+ {
867
+ "epoch": 7.08,
868
+ "learning_rate": 0.001,
869
+ "loss": 0.3824,
870
+ "step": 2540
871
+ },
872
+ {
873
+ "epoch": 7.13,
874
+ "learning_rate": 0.001,
875
+ "loss": 0.2518,
876
+ "step": 2560
877
+ },
878
+ {
879
+ "epoch": 7.19,
880
+ "learning_rate": 0.001,
881
+ "loss": 0.3822,
882
+ "step": 2580
883
+ },
884
+ {
885
+ "epoch": 7.24,
886
+ "learning_rate": 0.001,
887
+ "loss": 0.3969,
888
+ "step": 2600
889
+ },
890
+ {
891
+ "epoch": 7.3,
892
+ "learning_rate": 0.001,
893
+ "loss": 0.2551,
894
+ "step": 2620
895
+ },
896
+ {
897
+ "epoch": 7.35,
898
+ "learning_rate": 0.001,
899
+ "loss": 0.3387,
900
+ "step": 2640
901
+ },
902
+ {
903
+ "epoch": 7.41,
904
+ "learning_rate": 0.001,
905
+ "loss": 0.3761,
906
+ "step": 2660
907
+ },
908
+ {
909
+ "epoch": 7.47,
910
+ "learning_rate": 0.001,
911
+ "loss": 0.3899,
912
+ "step": 2680
913
+ },
914
+ {
915
+ "epoch": 7.52,
916
+ "learning_rate": 0.001,
917
+ "loss": 0.3691,
918
+ "step": 2700
919
+ },
920
+ {
921
+ "epoch": 7.58,
922
+ "learning_rate": 0.001,
923
+ "loss": 0.3172,
924
+ "step": 2720
925
+ },
926
+ {
927
+ "epoch": 7.63,
928
+ "learning_rate": 0.001,
929
+ "loss": 0.3358,
930
+ "step": 2740
931
+ },
932
+ {
933
+ "epoch": 7.69,
934
+ "learning_rate": 0.001,
935
+ "loss": 0.3459,
936
+ "step": 2760
937
+ },
938
+ {
939
+ "epoch": 7.74,
940
+ "learning_rate": 0.001,
941
+ "loss": 0.3347,
942
+ "step": 2780
943
+ },
944
+ {
945
+ "epoch": 7.8,
946
+ "learning_rate": 0.001,
947
+ "loss": 0.3459,
948
+ "step": 2800
949
+ },
950
+ {
951
+ "epoch": 7.86,
952
+ "learning_rate": 0.001,
953
+ "loss": 0.3797,
954
+ "step": 2820
955
+ },
956
+ {
957
+ "epoch": 7.91,
958
+ "learning_rate": 0.001,
959
+ "loss": 0.3721,
960
+ "step": 2840
961
+ },
962
+ {
963
+ "epoch": 7.97,
964
+ "learning_rate": 0.001,
965
+ "loss": 0.3892,
966
+ "step": 2860
967
+ },
968
+ {
969
+ "epoch": 8.0,
970
+ "eval_f1": 0.0,
971
+ "eval_loss": 0.34283891320228577,
972
+ "eval_runtime": 20.5477,
973
+ "eval_samples_per_second": 93.247,
974
+ "eval_steps_per_second": 1.46,
975
+ "step": 2872
976
+ },
977
+ {
978
+ "epoch": 8.02,
979
+ "learning_rate": 0.001,
980
+ "loss": 0.3234,
981
+ "step": 2880
982
+ },
983
+ {
984
+ "epoch": 8.08,
985
+ "learning_rate": 0.001,
986
+ "loss": 0.3979,
987
+ "step": 2900
988
+ },
989
+ {
990
+ "epoch": 8.13,
991
+ "learning_rate": 0.001,
992
+ "loss": 0.4032,
993
+ "step": 2920
994
+ },
995
+ {
996
+ "epoch": 8.19,
997
+ "learning_rate": 0.001,
998
+ "loss": 0.3787,
999
+ "step": 2940
1000
+ },
1001
+ {
1002
+ "epoch": 8.25,
1003
+ "learning_rate": 0.001,
1004
+ "loss": 0.3144,
1005
+ "step": 2960
1006
+ },
1007
+ {
1008
+ "epoch": 8.3,
1009
+ "learning_rate": 0.001,
1010
+ "loss": 0.4071,
1011
+ "step": 2980
1012
+ },
1013
+ {
1014
+ "epoch": 8.36,
1015
+ "learning_rate": 0.001,
1016
+ "loss": 0.3192,
1017
+ "step": 3000
1018
+ },
1019
+ {
1020
+ "epoch": 8.41,
1021
+ "learning_rate": 0.001,
1022
+ "loss": 0.3194,
1023
+ "step": 3020
1024
+ },
1025
+ {
1026
+ "epoch": 8.47,
1027
+ "learning_rate": 0.001,
1028
+ "loss": 0.3468,
1029
+ "step": 3040
1030
+ },
1031
+ {
1032
+ "epoch": 8.52,
1033
+ "learning_rate": 0.001,
1034
+ "loss": 0.325,
1035
+ "step": 3060
1036
+ },
1037
+ {
1038
+ "epoch": 8.58,
1039
+ "learning_rate": 0.001,
1040
+ "loss": 0.3631,
1041
+ "step": 3080
1042
+ },
1043
+ {
1044
+ "epoch": 8.64,
1045
+ "learning_rate": 0.001,
1046
+ "loss": 0.3464,
1047
+ "step": 3100
1048
+ },
1049
+ {
1050
+ "epoch": 8.69,
1051
+ "learning_rate": 0.001,
1052
+ "loss": 0.3378,
1053
+ "step": 3120
1054
+ },
1055
+ {
1056
+ "epoch": 8.75,
1057
+ "learning_rate": 0.001,
1058
+ "loss": 0.3808,
1059
+ "step": 3140
1060
+ },
1061
+ {
1062
+ "epoch": 8.8,
1063
+ "learning_rate": 0.001,
1064
+ "loss": 0.3668,
1065
+ "step": 3160
1066
+ },
1067
+ {
1068
+ "epoch": 8.86,
1069
+ "learning_rate": 0.001,
1070
+ "loss": 0.3045,
1071
+ "step": 3180
1072
+ },
1073
+ {
1074
+ "epoch": 8.91,
1075
+ "learning_rate": 0.001,
1076
+ "loss": 0.2805,
1077
+ "step": 3200
1078
+ },
1079
+ {
1080
+ "epoch": 8.97,
1081
+ "learning_rate": 0.001,
1082
+ "loss": 0.3706,
1083
+ "step": 3220
1084
+ },
1085
  {
1086
  "epoch": 9.0,
1087
+ "eval_f1": 0.0,
1088
+ "eval_loss": 0.3420598804950714,
1089
+ "eval_runtime": 20.5266,
1090
+ "eval_samples_per_second": 93.342,
1091
+ "eval_steps_per_second": 1.462,
1092
+ "step": 3231
1093
  },
1094
  {
1095
  "epoch": 9.03,
1096
+ "learning_rate": 0.001,
1097
+ "loss": 0.3502,
1098
+ "step": 3240
1099
+ },
1100
+ {
1101
+ "epoch": 9.08,
1102
+ "learning_rate": 0.001,
1103
+ "loss": 0.3414,
1104
+ "step": 3260
1105
  },
1106
  {
1107
  "epoch": 9.14,
1108
+ "learning_rate": 0.001,
1109
+ "loss": 0.4037,
1110
+ "step": 3280
1111
+ },
1112
+ {
1113
+ "epoch": 9.19,
1114
+ "learning_rate": 0.001,
1115
+ "loss": 0.3548,
1116
+ "step": 3300
1117
  },
1118
  {
1119
  "epoch": 9.25,
1120
+ "learning_rate": 0.001,
1121
+ "loss": 0.3426,
1122
+ "step": 3320
1123
+ },
1124
+ {
1125
+ "epoch": 9.3,
1126
+ "learning_rate": 0.001,
1127
+ "loss": 0.3614,
1128
+ "step": 3340
1129
  },
1130
  {
1131
  "epoch": 9.36,
1132
+ "learning_rate": 0.001,
1133
+ "loss": 0.2505,
1134
+ "step": 3360
1135
+ },
1136
+ {
1137
+ "epoch": 9.42,
1138
+ "learning_rate": 0.001,
1139
+ "loss": 0.402,
1140
+ "step": 3380
1141
  },
1142
  {
1143
  "epoch": 9.47,
1144
+ "learning_rate": 0.001,
1145
+ "loss": 0.3029,
1146
+ "step": 3400
1147
+ },
1148
+ {
1149
+ "epoch": 9.53,
1150
+ "learning_rate": 0.001,
1151
+ "loss": 0.2799,
1152
+ "step": 3420
1153
  },
1154
  {
1155
  "epoch": 9.58,
1156
+ "learning_rate": 0.001,
1157
+ "loss": 0.3046,
1158
+ "step": 3440
1159
+ },
1160
+ {
1161
+ "epoch": 9.64,
1162
+ "learning_rate": 0.001,
1163
+ "loss": 0.3707,
1164
+ "step": 3460
1165
  },
1166
  {
1167
  "epoch": 9.69,
1168
+ "learning_rate": 0.001,
1169
+ "loss": 0.3417,
1170
+ "step": 3480
1171
+ },
1172
+ {
1173
+ "epoch": 9.75,
1174
+ "learning_rate": 0.001,
1175
+ "loss": 0.3826,
1176
+ "step": 3500
1177
  },
1178
  {
1179
  "epoch": 9.81,
1180
+ "learning_rate": 0.001,
1181
+ "loss": 0.3658,
1182
+ "step": 3520
1183
  },
1184
  {
1185
+ "epoch": 9.86,
1186
+ "learning_rate": 0.001,
1187
+ "loss": 0.3185,
1188
+ "step": 3540
1189
  },
1190
  {
1191
+ "epoch": 9.92,
1192
+ "learning_rate": 0.001,
1193
+ "loss": 0.3596,
1194
+ "step": 3560
 
 
 
1195
  },
1196
  {
1197
  "epoch": 9.97,
1198
+ "learning_rate": 0.001,
1199
+ "loss": 0.3863,
1200
+ "step": 3580
1201
+ },
1202
+ {
1203
+ "epoch": 10.0,
1204
+ "eval_f1": 0.0,
1205
+ "eval_loss": 0.34448280930519104,
1206
+ "eval_runtime": 20.4875,
1207
+ "eval_samples_per_second": 93.52,
1208
+ "eval_steps_per_second": 1.464,
1209
+ "step": 3590
1210
+ },
1211
+ {
1212
+ "epoch": 10.0,
1213
+ "step": 3590,
1214
+ "total_flos": 3792562624069632.0,
1215
+ "train_loss": 0.3510875488058106,
1216
+ "train_runtime": 3949.5247,
1217
+ "train_samples_per_second": 14.541,
1218
+ "train_steps_per_second": 0.909
1219
  }
1220
  ],
1221
  "logging_steps": 20,
1222
+ "max_steps": 3590,
1223
  "num_input_tokens_seen": 0,
1224
  "num_train_epochs": 10,
1225
  "save_steps": 500,
1226
+ "total_flos": 3792562624069632.0,
1227
  "train_batch_size": 2,
1228
  "trial_name": null,
1229
  "trial_params": null