caracena committed on
Commit
e9314ad
1 Parent(s): 1d4c154

update model to 10 epochs

Browse files
README.md CHANGED
@@ -31,12 +31,12 @@ More information needed
31
 
32
  The following hyperparameters were used during training:
33
  - learning_rate: 5e-05
34
- - train_batch_size: 16
35
- - eval_batch_size: 8
36
  - seed: 42
37
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
38
  - lr_scheduler_type: linear
39
- - num_epochs: 3.0
40
 
41
  ### Training results
42
 
 
31
 
32
  The following hyperparameters were used during training:
33
  - learning_rate: 5e-05
34
+ - train_batch_size: 32
35
+ - eval_batch_size: 16
36
  - seed: 42
37
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
38
  - lr_scheduler_type: linear
39
+ - num_epochs: 10.0
40
 
41
  ### Training results
42
 
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 3.0,
3
- "train_loss": 5.50125317320702,
4
- "train_runtime": 8519.4079,
5
  "train_samples": 253650,
6
- "train_samples_per_second": 89.32,
7
- "train_steps_per_second": 5.583
8
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "train_loss": 2.0376459330797827,
4
+ "train_runtime": 6961.2818,
5
  "train_samples": 253650,
6
+ "train_samples_per_second": 364.373,
7
+ "train_steps_per_second": 11.387
8
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b23e5d04438510574be93f00f1e3eb2e7176a24cf75c58d060a59996b7b4ff7
3
  size 439605291
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a683a2d389dfd89b799daa6e7f82b778995c5c2d083c9afe6ec12ad006283d1
3
  size 439605291
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 3.0,
3
- "train_loss": 5.50125317320702,
4
- "train_runtime": 8519.4079,
5
  "train_samples": 253650,
6
- "train_samples_per_second": 89.32,
7
- "train_steps_per_second": 5.583
8
  }
 
1
  {
2
+ "epoch": 10.0,
3
+ "train_loss": 2.0376459330797827,
4
+ "train_runtime": 6961.2818,
5
  "train_samples": 253650,
6
+ "train_samples_per_second": 364.373,
7
+ "train_steps_per_second": 11.387
8
  }
trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.0,
5
- "global_step": 47562,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -578,18 +578,396 @@
578
  "step": 47500
579
  },
580
  {
581
- "epoch": 3.0,
582
- "step": 47562,
583
- "total_flos": 2.00286824366592e+17,
584
- "train_loss": 5.50125317320702,
585
- "train_runtime": 8519.4079,
586
- "train_samples_per_second": 89.32,
587
- "train_steps_per_second": 5.583
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
588
  }
589
  ],
590
- "max_steps": 47562,
591
- "num_train_epochs": 3,
592
- "total_flos": 2.00286824366592e+17,
593
  "trial_name": null,
594
  "trial_params": null
595
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 10.0,
5
+ "global_step": 79270,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
578
  "step": 47500
579
  },
580
  {
581
+ "epoch": 6.06,
582
+ "learning_rate": 1.9723729027374795e-05,
583
+ "loss": 5.3624,
584
+ "step": 48000
585
+ },
586
+ {
587
+ "epoch": 6.12,
588
+ "learning_rate": 1.9408351204743284e-05,
589
+ "loss": 5.3565,
590
+ "step": 48500
591
+ },
592
+ {
593
+ "epoch": 6.18,
594
+ "learning_rate": 1.909297338211177e-05,
595
+ "loss": 5.3609,
596
+ "step": 49000
597
+ },
598
+ {
599
+ "epoch": 6.24,
600
+ "learning_rate": 1.877759555948026e-05,
601
+ "loss": 5.3611,
602
+ "step": 49500
603
+ },
604
+ {
605
+ "epoch": 6.31,
606
+ "learning_rate": 1.8462217736848747e-05,
607
+ "loss": 5.3563,
608
+ "step": 50000
609
+ },
610
+ {
611
+ "epoch": 6.37,
612
+ "learning_rate": 1.8146839914217233e-05,
613
+ "loss": 5.3578,
614
+ "step": 50500
615
+ },
616
+ {
617
+ "epoch": 6.43,
618
+ "learning_rate": 1.783146209158572e-05,
619
+ "loss": 5.3396,
620
+ "step": 51000
621
+ },
622
+ {
623
+ "epoch": 6.5,
624
+ "learning_rate": 1.7516084268954207e-05,
625
+ "loss": 5.3496,
626
+ "step": 51500
627
+ },
628
+ {
629
+ "epoch": 6.56,
630
+ "learning_rate": 1.7200706446322696e-05,
631
+ "loss": 5.3486,
632
+ "step": 52000
633
+ },
634
+ {
635
+ "epoch": 6.62,
636
+ "learning_rate": 1.688532862369118e-05,
637
+ "loss": 5.3449,
638
+ "step": 52500
639
+ },
640
+ {
641
+ "epoch": 6.69,
642
+ "learning_rate": 1.656995080105967e-05,
643
+ "loss": 5.336,
644
+ "step": 53000
645
+ },
646
+ {
647
+ "epoch": 6.75,
648
+ "learning_rate": 1.6254572978428156e-05,
649
+ "loss": 5.3289,
650
+ "step": 53500
651
+ },
652
+ {
653
+ "epoch": 6.81,
654
+ "learning_rate": 1.5939195155796645e-05,
655
+ "loss": 5.3335,
656
+ "step": 54000
657
+ },
658
+ {
659
+ "epoch": 6.88,
660
+ "learning_rate": 1.5623817333165134e-05,
661
+ "loss": 5.3298,
662
+ "step": 54500
663
+ },
664
+ {
665
+ "epoch": 6.94,
666
+ "learning_rate": 1.530843951053362e-05,
667
+ "loss": 5.3302,
668
+ "step": 55000
669
+ },
670
+ {
671
+ "epoch": 7.0,
672
+ "learning_rate": 1.4993061687902108e-05,
673
+ "loss": 5.3275,
674
+ "step": 55500
675
+ },
676
+ {
677
+ "epoch": 7.06,
678
+ "learning_rate": 1.4677683865270595e-05,
679
+ "loss": 5.3219,
680
+ "step": 56000
681
+ },
682
+ {
683
+ "epoch": 7.13,
684
+ "learning_rate": 1.4362306042639082e-05,
685
+ "loss": 5.3179,
686
+ "step": 56500
687
+ },
688
+ {
689
+ "epoch": 7.19,
690
+ "learning_rate": 1.4046928220007568e-05,
691
+ "loss": 5.3064,
692
+ "step": 57000
693
+ },
694
+ {
695
+ "epoch": 7.25,
696
+ "learning_rate": 1.3731550397376058e-05,
697
+ "loss": 5.312,
698
+ "step": 57500
699
+ },
700
+ {
701
+ "epoch": 7.32,
702
+ "learning_rate": 1.3416172574744546e-05,
703
+ "loss": 5.3166,
704
+ "step": 58000
705
+ },
706
+ {
707
+ "epoch": 7.38,
708
+ "learning_rate": 1.3100794752113033e-05,
709
+ "loss": 5.308,
710
+ "step": 58500
711
+ },
712
+ {
713
+ "epoch": 7.44,
714
+ "learning_rate": 1.278541692948152e-05,
715
+ "loss": 5.2994,
716
+ "step": 59000
717
+ },
718
+ {
719
+ "epoch": 7.51,
720
+ "learning_rate": 1.2470039106850007e-05,
721
+ "loss": 5.2991,
722
+ "step": 59500
723
+ },
724
+ {
725
+ "epoch": 7.57,
726
+ "learning_rate": 1.2154661284218494e-05,
727
+ "loss": 5.2917,
728
+ "step": 60000
729
+ },
730
+ {
731
+ "epoch": 7.63,
732
+ "learning_rate": 1.1839283461586981e-05,
733
+ "loss": 5.2962,
734
+ "step": 60500
735
+ },
736
+ {
737
+ "epoch": 7.7,
738
+ "learning_rate": 1.1523905638955469e-05,
739
+ "loss": 5.2927,
740
+ "step": 61000
741
+ },
742
+ {
743
+ "epoch": 7.76,
744
+ "learning_rate": 1.1208527816323956e-05,
745
+ "loss": 5.2901,
746
+ "step": 61500
747
+ },
748
+ {
749
+ "epoch": 7.82,
750
+ "learning_rate": 1.0893149993692445e-05,
751
+ "loss": 5.2726,
752
+ "step": 62000
753
+ },
754
+ {
755
+ "epoch": 7.88,
756
+ "learning_rate": 1.0577772171060932e-05,
757
+ "loss": 5.2586,
758
+ "step": 62500
759
+ },
760
+ {
761
+ "epoch": 7.95,
762
+ "learning_rate": 1.0262394348429419e-05,
763
+ "loss": 5.2408,
764
+ "step": 63000
765
+ },
766
+ {
767
+ "epoch": 8.01,
768
+ "learning_rate": 9.947016525797906e-06,
769
+ "loss": 5.2086,
770
+ "step": 63500
771
+ },
772
+ {
773
+ "epoch": 8.07,
774
+ "learning_rate": 9.631638703166395e-06,
775
+ "loss": 5.1942,
776
+ "step": 64000
777
+ },
778
+ {
779
+ "epoch": 8.14,
780
+ "learning_rate": 9.31626088053488e-06,
781
+ "loss": 5.1737,
782
+ "step": 64500
783
+ },
784
+ {
785
+ "epoch": 8.2,
786
+ "learning_rate": 9.000883057903368e-06,
787
+ "loss": 5.1362,
788
+ "step": 65000
789
+ },
790
+ {
791
+ "epoch": 8.26,
792
+ "learning_rate": 8.685505235271855e-06,
793
+ "loss": 5.1235,
794
+ "step": 65500
795
+ },
796
+ {
797
+ "epoch": 8.33,
798
+ "learning_rate": 8.370127412640344e-06,
799
+ "loss": 5.0998,
800
+ "step": 66000
801
+ },
802
+ {
803
+ "epoch": 8.39,
804
+ "learning_rate": 8.054749590008831e-06,
805
+ "loss": 5.0742,
806
+ "step": 66500
807
+ },
808
+ {
809
+ "epoch": 8.45,
810
+ "learning_rate": 7.739371767377318e-06,
811
+ "loss": 5.0369,
812
+ "step": 67000
813
+ },
814
+ {
815
+ "epoch": 8.52,
816
+ "learning_rate": 7.423993944745806e-06,
817
+ "loss": 5.0226,
818
+ "step": 67500
819
+ },
820
+ {
821
+ "epoch": 8.58,
822
+ "learning_rate": 7.108616122114293e-06,
823
+ "loss": 4.9907,
824
+ "step": 68000
825
+ },
826
+ {
827
+ "epoch": 8.64,
828
+ "learning_rate": 6.7932382994827806e-06,
829
+ "loss": 4.9742,
830
+ "step": 68500
831
+ },
832
+ {
833
+ "epoch": 8.7,
834
+ "learning_rate": 6.477860476851268e-06,
835
+ "loss": 4.9379,
836
+ "step": 69000
837
+ },
838
+ {
839
+ "epoch": 8.77,
840
+ "learning_rate": 6.162482654219756e-06,
841
+ "loss": 4.9064,
842
+ "step": 69500
843
+ },
844
+ {
845
+ "epoch": 8.83,
846
+ "learning_rate": 5.847104831588243e-06,
847
+ "loss": 4.8912,
848
+ "step": 70000
849
+ },
850
+ {
851
+ "epoch": 8.89,
852
+ "learning_rate": 5.53172700895673e-06,
853
+ "loss": 4.8666,
854
+ "step": 70500
855
+ },
856
+ {
857
+ "epoch": 8.96,
858
+ "learning_rate": 5.216349186325218e-06,
859
+ "loss": 4.835,
860
+ "step": 71000
861
+ },
862
+ {
863
+ "epoch": 9.02,
864
+ "learning_rate": 4.900971363693705e-06,
865
+ "loss": 4.8144,
866
+ "step": 71500
867
+ },
868
+ {
869
+ "epoch": 9.08,
870
+ "learning_rate": 4.5855935410621925e-06,
871
+ "loss": 4.7944,
872
+ "step": 72000
873
+ },
874
+ {
875
+ "epoch": 9.15,
876
+ "learning_rate": 4.27021571843068e-06,
877
+ "loss": 4.7744,
878
+ "step": 72500
879
+ },
880
+ {
881
+ "epoch": 9.21,
882
+ "learning_rate": 3.954837895799168e-06,
883
+ "loss": 4.7491,
884
+ "step": 73000
885
+ },
886
+ {
887
+ "epoch": 9.27,
888
+ "learning_rate": 3.6394600731676553e-06,
889
+ "loss": 4.723,
890
+ "step": 73500
891
+ },
892
+ {
893
+ "epoch": 9.34,
894
+ "learning_rate": 3.3240822505361425e-06,
895
+ "loss": 4.7129,
896
+ "step": 74000
897
+ },
898
+ {
899
+ "epoch": 9.4,
900
+ "learning_rate": 3.0087044279046297e-06,
901
+ "loss": 4.6943,
902
+ "step": 74500
903
+ },
904
+ {
905
+ "epoch": 9.46,
906
+ "learning_rate": 2.6933266052731173e-06,
907
+ "loss": 4.6951,
908
+ "step": 75000
909
+ },
910
+ {
911
+ "epoch": 9.52,
912
+ "learning_rate": 2.3779487826416044e-06,
913
+ "loss": 4.6809,
914
+ "step": 75500
915
+ },
916
+ {
917
+ "epoch": 9.59,
918
+ "learning_rate": 2.062570960010092e-06,
919
+ "loss": 4.6689,
920
+ "step": 76000
921
+ },
922
+ {
923
+ "epoch": 9.65,
924
+ "learning_rate": 1.7471931373785799e-06,
925
+ "loss": 4.6579,
926
+ "step": 76500
927
+ },
928
+ {
929
+ "epoch": 9.71,
930
+ "learning_rate": 1.431815314747067e-06,
931
+ "loss": 4.6334,
932
+ "step": 77000
933
+ },
934
+ {
935
+ "epoch": 9.78,
936
+ "learning_rate": 1.1164374921155544e-06,
937
+ "loss": 4.6372,
938
+ "step": 77500
939
+ },
940
+ {
941
+ "epoch": 9.84,
942
+ "learning_rate": 8.01059669484042e-07,
943
+ "loss": 4.6378,
944
+ "step": 78000
945
+ },
946
+ {
947
+ "epoch": 9.9,
948
+ "learning_rate": 4.856818468525293e-07,
949
+ "loss": 4.6323,
950
+ "step": 78500
951
+ },
952
+ {
953
+ "epoch": 9.97,
954
+ "learning_rate": 1.7030402422101678e-07,
955
+ "loss": 4.6307,
956
+ "step": 79000
957
+ },
958
+ {
959
+ "epoch": 10.0,
960
+ "step": 79270,
961
+ "total_flos": 4.675970241591091e+17,
962
+ "train_loss": 2.0376459330797827,
963
+ "train_runtime": 6961.2818,
964
+ "train_samples_per_second": 364.373,
965
+ "train_steps_per_second": 11.387
966
  }
967
  ],
968
+ "max_steps": 79270,
969
+ "num_train_epochs": 10,
970
+ "total_flos": 4.675970241591091e+17,
971
  "trial_name": null,
972
  "trial_params": null
973
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2228c7c3560e92685b52e0da5da9796867d6757cfb03c5a32dfc8586d33d60ee
3
  size 3247
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3078cc244d536a8e922884b143b7b3668c2b157ea7e14f35d8933d887ab2d074
3
  size 3247