AlekseyKorshuk committed on
Commit
df35c33
1 Parent(s): 5aeb850

huggingartists

Browse files
README.md CHANGED
@@ -45,15 +45,15 @@ from datasets import load_dataset
45
  dataset = load_dataset("huggingartists/big-baby-tape")
46
  ```
47
 
48
- [Explore the data](https://wandb.ai/huggingartists/huggingartists/runs/2oh33los/artifacts), which is tracked with [W&B artifacts](https://docs.wandb.com/artifacts) at every step of the pipeline.
49
 
50
  ## Training procedure
51
 
52
  The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on Big Baby Tape's lyrics.
53
 
54
- Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists/runs/1s6lnq7u) for full transparency and reproducibility.
55
 
56
- At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/runs/1s6lnq7u/artifacts) is logged and versioned.
57
 
58
  ## How to use
59
 
45
  dataset = load_dataset("huggingartists/big-baby-tape")
46
  ```
47
 
48
+ [Explore the data](https://wandb.ai/huggingartists/huggingartists/runs/1mu9ki6z/artifacts), which is tracked with [W&B artifacts](https://docs.wandb.com/artifacts) at every step of the pipeline.
49
 
50
  ## Training procedure
51
 
52
  The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on Big Baby Tape's lyrics.
53
 
54
+ Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists/runs/30qklxvh) for full transparency and reproducibility.
55
 
56
+ At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/runs/30qklxvh/artifacts) is logged and versioned.
57
 
58
  ## How to use
59
 
evaluation.txt CHANGED
@@ -1 +1 @@
1
- {"eval_loss": 1.7459986209869385, "eval_runtime": 9.6814, "eval_samples_per_second": 20.142, "eval_steps_per_second": 2.582, "epoch": 4.0}
1
+ {"eval_loss": 1.5916674137115479, "eval_runtime": 9.8629, "eval_samples_per_second": 20.075, "eval_steps_per_second": 2.535, "epoch": 13.0}
flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb5c95a97e20f7c95d543dc5984fc5037b700d9f4b39e1b5e4f6b58ce0293f31
3
  size 497764120
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b81b2217929fc3a04adc28aeef96d8c22dca42ee762849488f148e9c66ff8ebb
3
  size 497764120
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:234a4acde292f927cb5081f20b7a098d4310346b486b808347df772453d38fbe
3
  size 995604017
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c93cb1dbf3bafab16cffecaf5835427ece275345f8190c3861fb64882677a2c7
3
  size 995604017
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:97121acb26a6fa4c7fb4a741eb682f5cf8f1b3bb9f337fe691b8c152067e55fe
3
  size 510403817
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efd707232768bf963bbfa80d00e4a1ac406411c38942ae6da42f491a699e0b1d
3
  size 510403817
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:077dc7f31533b68ea237703ffc91a5ac35fd4765522f824d8d2330befe10761d
3
  size 14567
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55264648aba19a33aed9e5dcadfb3f6be836fe87263f0b60fcb9a1183bf613ea
3
  size 14567
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bac8ceca19c5c4936fbf77b46507ea6d5819a16be52083fb75390b76c3994a44
3
  size 623
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:316708ce40514505582ee090ddaa6c8e8624da3a3fe6edd6790da2e26bd960ad
3
  size 623
trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "best_metric": 1.7459986209869385,
3
- "best_model_checkpoint": "output/big-baby-tape/checkpoint-520",
4
- "epoch": 4.0,
5
- "global_step": 520,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -662,11 +662,667 @@
662
  "eval_samples_per_second": 20.192,
663
  "eval_steps_per_second": 2.589,
664
  "step": 520
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
665
  }
666
  ],
667
- "max_steps": 520,
668
- "num_train_epochs": 4,
669
- "total_flos": 542964842496000.0,
670
  "trial_name": null,
671
  "trial_params": null
672
  }
1
  {
2
+ "best_metric": 1.5916674137115479,
3
+ "best_model_checkpoint": "output/big-baby-tape/checkpoint-1040",
4
+ "epoch": 8.0,
5
+ "global_step": 1040,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
662
  "eval_samples_per_second": 20.192,
663
  "eval_steps_per_second": 2.589,
664
  "step": 520
665
+ },
666
+ {
667
+ "epoch": 4.04,
668
+ "learning_rate": 5.001712368734899e-07,
669
+ "loss": 1.7425,
670
+ "step": 525
671
+ },
672
+ {
673
+ "epoch": 4.08,
674
+ "learning_rate": 1.9933913245728015e-06,
675
+ "loss": 1.6693,
676
+ "step": 530
677
+ },
678
+ {
679
+ "epoch": 4.12,
680
+ "learning_rate": 4.457885751780535e-06,
681
+ "loss": 1.5825,
682
+ "step": 535
683
+ },
684
+ {
685
+ "epoch": 4.15,
686
+ "learning_rate": 7.857716640189824e-06,
687
+ "loss": 1.6875,
688
+ "step": 540
689
+ },
690
+ {
691
+ "epoch": 4.19,
692
+ "learning_rate": 1.2143306799695106e-05,
693
+ "loss": 1.6501,
694
+ "step": 545
695
+ },
696
+ {
697
+ "epoch": 4.23,
698
+ "learning_rate": 1.725216267546245e-05,
699
+ "loss": 1.5837,
700
+ "step": 550
701
+ },
702
+ {
703
+ "epoch": 4.27,
704
+ "learning_rate": 2.3109785644681495e-05,
705
+ "loss": 1.5563,
706
+ "step": 555
707
+ },
708
+ {
709
+ "epoch": 4.31,
710
+ "learning_rate": 2.963075837424263e-05,
711
+ "loss": 1.5985,
712
+ "step": 560
713
+ },
714
+ {
715
+ "epoch": 4.35,
716
+ "learning_rate": 3.6719990397797463e-05,
717
+ "loss": 1.606,
718
+ "step": 565
719
+ },
720
+ {
721
+ "epoch": 4.38,
722
+ "learning_rate": 4.4274104748882125e-05,
723
+ "loss": 1.6791,
724
+ "step": 570
725
+ },
726
+ {
727
+ "epoch": 4.42,
728
+ "learning_rate": 5.2182945429873444e-05,
729
+ "loss": 1.7505,
730
+ "step": 575
731
+ },
732
+ {
733
+ "epoch": 4.46,
734
+ "learning_rate": 6.033118373448483e-05,
735
+ "loss": 1.5154,
736
+ "step": 580
737
+ },
738
+ {
739
+ "epoch": 4.5,
740
+ "learning_rate": 6.859999999999984e-05,
741
+ "loss": 1.671,
742
+ "step": 585
743
+ },
744
+ {
745
+ "epoch": 4.54,
746
+ "learning_rate": 7.686881626551508e-05,
747
+ "loss": 1.5294,
748
+ "step": 590
749
+ },
750
+ {
751
+ "epoch": 4.58,
752
+ "learning_rate": 8.501705457012647e-05,
753
+ "loss": 1.5787,
754
+ "step": 595
755
+ },
756
+ {
757
+ "epoch": 4.62,
758
+ "learning_rate": 9.292589525111778e-05,
759
+ "loss": 1.7159,
760
+ "step": 600
761
+ },
762
+ {
763
+ "epoch": 4.65,
764
+ "learning_rate": 0.00010048000960220244,
765
+ "loss": 1.6876,
766
+ "step": 605
767
+ },
768
+ {
769
+ "epoch": 4.69,
770
+ "learning_rate": 0.0001075692416257573,
771
+ "loss": 1.6451,
772
+ "step": 610
773
+ },
774
+ {
775
+ "epoch": 4.73,
776
+ "learning_rate": 0.00011409021435531843,
777
+ "loss": 1.7096,
778
+ "step": 615
779
+ },
780
+ {
781
+ "epoch": 4.77,
782
+ "learning_rate": 0.00011994783732453749,
783
+ "loss": 1.6926,
784
+ "step": 620
785
+ },
786
+ {
787
+ "epoch": 4.81,
788
+ "learning_rate": 0.00012505669320030485,
789
+ "loss": 1.6774,
790
+ "step": 625
791
+ },
792
+ {
793
+ "epoch": 4.85,
794
+ "learning_rate": 0.00012934228335981013,
795
+ "loss": 1.769,
796
+ "step": 630
797
+ },
798
+ {
799
+ "epoch": 4.88,
800
+ "learning_rate": 0.00013274211424821943,
801
+ "loss": 1.6677,
802
+ "step": 635
803
+ },
804
+ {
805
+ "epoch": 4.92,
806
+ "learning_rate": 0.00013520660867542716,
807
+ "loss": 1.6415,
808
+ "step": 640
809
+ },
810
+ {
811
+ "epoch": 4.96,
812
+ "learning_rate": 0.00013669982876312649,
813
+ "loss": 1.6811,
814
+ "step": 645
815
+ },
816
+ {
817
+ "epoch": 5.0,
818
+ "learning_rate": 0.0001372,
819
+ "loss": 1.6017,
820
+ "step": 650
821
+ },
822
+ {
823
+ "epoch": 5.0,
824
+ "eval_loss": 1.6280437707901,
825
+ "eval_runtime": 9.7539,
826
+ "eval_samples_per_second": 20.3,
827
+ "eval_steps_per_second": 2.563,
828
+ "step": 650
829
+ },
830
+ {
831
+ "epoch": 5.04,
832
+ "learning_rate": 0.00013669982876312649,
833
+ "loss": 1.5983,
834
+ "step": 655
835
+ },
836
+ {
837
+ "epoch": 5.08,
838
+ "learning_rate": 0.0001352066086754272,
839
+ "loss": 1.6281,
840
+ "step": 660
841
+ },
842
+ {
843
+ "epoch": 5.12,
844
+ "learning_rate": 0.00013274211424821946,
845
+ "loss": 1.6583,
846
+ "step": 665
847
+ },
848
+ {
849
+ "epoch": 5.15,
850
+ "learning_rate": 0.00012934228335981018,
851
+ "loss": 1.5553,
852
+ "step": 670
853
+ },
854
+ {
855
+ "epoch": 5.19,
856
+ "learning_rate": 0.0001250566932003049,
857
+ "loss": 1.669,
858
+ "step": 675
859
+ },
860
+ {
861
+ "epoch": 5.23,
862
+ "learning_rate": 0.00011994783732453757,
863
+ "loss": 1.5035,
864
+ "step": 680
865
+ },
866
+ {
867
+ "epoch": 5.27,
868
+ "learning_rate": 0.00011409021435531852,
869
+ "loss": 1.5271,
870
+ "step": 685
871
+ },
872
+ {
873
+ "epoch": 5.31,
874
+ "learning_rate": 0.00010756924162575738,
875
+ "loss": 1.6283,
876
+ "step": 690
877
+ },
878
+ {
879
+ "epoch": 5.35,
880
+ "learning_rate": 0.00010048000960220254,
881
+ "loss": 1.5816,
882
+ "step": 695
883
+ },
884
+ {
885
+ "epoch": 5.38,
886
+ "learning_rate": 9.292589525111789e-05,
887
+ "loss": 1.537,
888
+ "step": 700
889
+ },
890
+ {
891
+ "epoch": 5.42,
892
+ "learning_rate": 8.501705457012655e-05,
893
+ "loss": 1.5076,
894
+ "step": 705
895
+ },
896
+ {
897
+ "epoch": 5.46,
898
+ "learning_rate": 7.686881626551518e-05,
899
+ "loss": 1.5273,
900
+ "step": 710
901
+ },
902
+ {
903
+ "epoch": 5.5,
904
+ "learning_rate": 6.859999999999993e-05,
905
+ "loss": 1.6255,
906
+ "step": 715
907
+ },
908
+ {
909
+ "epoch": 5.54,
910
+ "learning_rate": 6.033118373448493e-05,
911
+ "loss": 1.5228,
912
+ "step": 720
913
+ },
914
+ {
915
+ "epoch": 5.58,
916
+ "learning_rate": 5.2182945429873545e-05,
917
+ "loss": 1.4525,
918
+ "step": 725
919
+ },
920
+ {
921
+ "epoch": 5.62,
922
+ "learning_rate": 4.427410474888221e-05,
923
+ "loss": 1.5599,
924
+ "step": 730
925
+ },
926
+ {
927
+ "epoch": 5.65,
928
+ "learning_rate": 3.671999039779755e-05,
929
+ "loss": 1.7274,
930
+ "step": 735
931
+ },
932
+ {
933
+ "epoch": 5.69,
934
+ "learning_rate": 2.9630758374242713e-05,
935
+ "loss": 1.5442,
936
+ "step": 740
937
+ },
938
+ {
939
+ "epoch": 5.73,
940
+ "learning_rate": 2.3109785644681573e-05,
941
+ "loss": 1.4565,
942
+ "step": 745
943
+ },
944
+ {
945
+ "epoch": 5.77,
946
+ "learning_rate": 1.725216267546251e-05,
947
+ "loss": 1.4942,
948
+ "step": 750
949
+ },
950
+ {
951
+ "epoch": 5.81,
952
+ "learning_rate": 1.2143306799695167e-05,
953
+ "loss": 1.6139,
954
+ "step": 755
955
+ },
956
+ {
957
+ "epoch": 5.85,
958
+ "learning_rate": 7.857716640189868e-06,
959
+ "loss": 1.6282,
960
+ "step": 760
961
+ },
962
+ {
963
+ "epoch": 5.88,
964
+ "learning_rate": 4.457885751780565e-06,
965
+ "loss": 1.6596,
966
+ "step": 765
967
+ },
968
+ {
969
+ "epoch": 5.92,
970
+ "learning_rate": 1.9933913245728244e-06,
971
+ "loss": 1.5722,
972
+ "step": 770
973
+ },
974
+ {
975
+ "epoch": 5.96,
976
+ "learning_rate": 5.001712368735127e-07,
977
+ "loss": 1.5391,
978
+ "step": 775
979
+ },
980
+ {
981
+ "epoch": 6.0,
982
+ "learning_rate": 0.0,
983
+ "loss": 1.6218,
984
+ "step": 780
985
+ },
986
+ {
987
+ "epoch": 6.0,
988
+ "eval_loss": 1.5917974710464478,
989
+ "eval_runtime": 9.7962,
990
+ "eval_samples_per_second": 20.212,
991
+ "eval_steps_per_second": 2.552,
992
+ "step": 780
993
+ },
994
+ {
995
+ "epoch": 6.04,
996
+ "learning_rate": 5.001712368734975e-07,
997
+ "loss": 1.4457,
998
+ "step": 785
999
+ },
1000
+ {
1001
+ "epoch": 6.08,
1002
+ "learning_rate": 1.9933913245728015e-06,
1003
+ "loss": 1.5599,
1004
+ "step": 790
1005
+ },
1006
+ {
1007
+ "epoch": 6.12,
1008
+ "learning_rate": 4.457885751780527e-06,
1009
+ "loss": 1.4377,
1010
+ "step": 795
1011
+ },
1012
+ {
1013
+ "epoch": 6.15,
1014
+ "learning_rate": 7.857716640189815e-06,
1015
+ "loss": 1.4766,
1016
+ "step": 800
1017
+ },
1018
+ {
1019
+ "epoch": 6.19,
1020
+ "learning_rate": 1.2143306799695099e-05,
1021
+ "loss": 1.4534,
1022
+ "step": 805
1023
+ },
1024
+ {
1025
+ "epoch": 6.23,
1026
+ "learning_rate": 1.7252162675462436e-05,
1027
+ "loss": 1.4231,
1028
+ "step": 810
1029
+ },
1030
+ {
1031
+ "epoch": 6.27,
1032
+ "learning_rate": 2.310978564468148e-05,
1033
+ "loss": 1.5066,
1034
+ "step": 815
1035
+ },
1036
+ {
1037
+ "epoch": 6.31,
1038
+ "learning_rate": 2.963075837424261e-05,
1039
+ "loss": 1.4435,
1040
+ "step": 820
1041
+ },
1042
+ {
1043
+ "epoch": 6.35,
1044
+ "learning_rate": 3.671999039779744e-05,
1045
+ "loss": 1.5639,
1046
+ "step": 825
1047
+ },
1048
+ {
1049
+ "epoch": 6.38,
1050
+ "learning_rate": 4.4274104748882104e-05,
1051
+ "loss": 1.4327,
1052
+ "step": 830
1053
+ },
1054
+ {
1055
+ "epoch": 6.42,
1056
+ "learning_rate": 5.218294542987343e-05,
1057
+ "loss": 1.5161,
1058
+ "step": 835
1059
+ },
1060
+ {
1061
+ "epoch": 6.46,
1062
+ "learning_rate": 6.0331183734484816e-05,
1063
+ "loss": 1.412,
1064
+ "step": 840
1065
+ },
1066
+ {
1067
+ "epoch": 6.5,
1068
+ "learning_rate": 6.859999999999982e-05,
1069
+ "loss": 1.5963,
1070
+ "step": 845
1071
+ },
1072
+ {
1073
+ "epoch": 6.54,
1074
+ "learning_rate": 7.686881626551506e-05,
1075
+ "loss": 1.4996,
1076
+ "step": 850
1077
+ },
1078
+ {
1079
+ "epoch": 6.58,
1080
+ "learning_rate": 8.501705457012646e-05,
1081
+ "loss": 1.4342,
1082
+ "step": 855
1083
+ },
1084
+ {
1085
+ "epoch": 6.62,
1086
+ "learning_rate": 9.292589525111778e-05,
1087
+ "loss": 1.5237,
1088
+ "step": 860
1089
+ },
1090
+ {
1091
+ "epoch": 6.65,
1092
+ "learning_rate": 0.00010048000960220244,
1093
+ "loss": 1.5488,
1094
+ "step": 865
1095
+ },
1096
+ {
1097
+ "epoch": 6.69,
1098
+ "learning_rate": 0.00010756924162575728,
1099
+ "loss": 1.5456,
1100
+ "step": 870
1101
+ },
1102
+ {
1103
+ "epoch": 6.73,
1104
+ "learning_rate": 0.00011409021435531843,
1105
+ "loss": 1.4942,
1106
+ "step": 875
1107
+ },
1108
+ {
1109
+ "epoch": 6.77,
1110
+ "learning_rate": 0.00011994783732453747,
1111
+ "loss": 1.5108,
1112
+ "step": 880
1113
+ },
1114
+ {
1115
+ "epoch": 6.81,
1116
+ "learning_rate": 0.00012505669320030482,
1117
+ "loss": 1.5004,
1118
+ "step": 885
1119
+ },
1120
+ {
1121
+ "epoch": 6.85,
1122
+ "learning_rate": 0.00012934228335981013,
1123
+ "loss": 1.5133,
1124
+ "step": 890
1125
+ },
1126
+ {
1127
+ "epoch": 6.88,
1128
+ "learning_rate": 0.00013274211424821943,
1129
+ "loss": 1.5261,
1130
+ "step": 895
1131
+ },
1132
+ {
1133
+ "epoch": 6.92,
1134
+ "learning_rate": 0.00013520660867542716,
1135
+ "loss": 1.5521,
1136
+ "step": 900
1137
+ },
1138
+ {
1139
+ "epoch": 6.96,
1140
+ "learning_rate": 0.00013669982876312649,
1141
+ "loss": 1.6079,
1142
+ "step": 905
1143
+ },
1144
+ {
1145
+ "epoch": 7.0,
1146
+ "learning_rate": 0.0001372,
1147
+ "loss": 1.546,
1148
+ "step": 910
1149
+ },
1150
+ {
1151
+ "epoch": 7.0,
1152
+ "eval_loss": 1.618756890296936,
1153
+ "eval_runtime": 9.7835,
1154
+ "eval_samples_per_second": 20.238,
1155
+ "eval_steps_per_second": 2.555,
1156
+ "step": 910
1157
+ },
1158
+ {
1159
+ "epoch": 7.04,
1160
+ "learning_rate": 0.00013669982876312649,
1161
+ "loss": 1.2732,
1162
+ "step": 915
1163
+ },
1164
+ {
1165
+ "epoch": 7.08,
1166
+ "learning_rate": 0.0001352066086754272,
1167
+ "loss": 1.4751,
1168
+ "step": 920
1169
+ },
1170
+ {
1171
+ "epoch": 7.12,
1172
+ "learning_rate": 0.00013274211424821948,
1173
+ "loss": 1.5095,
1174
+ "step": 925
1175
+ },
1176
+ {
1177
+ "epoch": 7.15,
1178
+ "learning_rate": 0.0001293422833598103,
1179
+ "loss": 1.4858,
1180
+ "step": 930
1181
+ },
1182
+ {
1183
+ "epoch": 7.19,
1184
+ "learning_rate": 0.0001250566932003049,
1185
+ "loss": 1.5896,
1186
+ "step": 935
1187
+ },
1188
+ {
1189
+ "epoch": 7.23,
1190
+ "learning_rate": 0.00011994783732453773,
1191
+ "loss": 1.3257,
1192
+ "step": 940
1193
+ },
1194
+ {
1195
+ "epoch": 7.27,
1196
+ "learning_rate": 0.00011409021435531852,
1197
+ "loss": 1.5044,
1198
+ "step": 945
1199
+ },
1200
+ {
1201
+ "epoch": 7.31,
1202
+ "learning_rate": 0.00010756924162575719,
1203
+ "loss": 1.389,
1204
+ "step": 950
1205
+ },
1206
+ {
1207
+ "epoch": 7.35,
1208
+ "learning_rate": 0.00010048000960220255,
1209
+ "loss": 1.4434,
1210
+ "step": 955
1211
+ },
1212
+ {
1213
+ "epoch": 7.38,
1214
+ "learning_rate": 9.29258952511179e-05,
1215
+ "loss": 1.4208,
1216
+ "step": 960
1217
+ },
1218
+ {
1219
+ "epoch": 7.42,
1220
+ "learning_rate": 8.501705457012658e-05,
1221
+ "loss": 1.4817,
1222
+ "step": 965
1223
+ },
1224
+ {
1225
+ "epoch": 7.46,
1226
+ "learning_rate": 7.68688162655152e-05,
1227
+ "loss": 1.5339,
1228
+ "step": 970
1229
+ },
1230
+ {
1231
+ "epoch": 7.5,
1232
+ "learning_rate": 6.859999999999995e-05,
1233
+ "loss": 1.5614,
1234
+ "step": 975
1235
+ },
1236
+ {
1237
+ "epoch": 7.54,
1238
+ "learning_rate": 6.0331183734484945e-05,
1239
+ "loss": 1.4163,
1240
+ "step": 980
1241
+ },
1242
+ {
1243
+ "epoch": 7.58,
1244
+ "learning_rate": 5.2182945429873566e-05,
1245
+ "loss": 1.3478,
1246
+ "step": 985
1247
+ },
1248
+ {
1249
+ "epoch": 7.62,
1250
+ "learning_rate": 4.4274104748882226e-05,
1251
+ "loss": 1.502,
1252
+ "step": 990
1253
+ },
1254
+ {
1255
+ "epoch": 7.65,
1256
+ "learning_rate": 3.6719990397797565e-05,
1257
+ "loss": 1.52,
1258
+ "step": 995
1259
+ },
1260
+ {
1261
+ "epoch": 7.69,
1262
+ "learning_rate": 2.9630758374242923e-05,
1263
+ "loss": 1.3148,
1264
+ "step": 1000
1265
+ },
1266
+ {
1267
+ "epoch": 7.73,
1268
+ "learning_rate": 2.310978564468158e-05,
1269
+ "loss": 1.5192,
1270
+ "step": 1005
1271
+ },
1272
+ {
1273
+ "epoch": 7.77,
1274
+ "learning_rate": 1.7252162675462368e-05,
1275
+ "loss": 1.4457,
1276
+ "step": 1010
1277
+ },
1278
+ {
1279
+ "epoch": 7.81,
1280
+ "learning_rate": 1.2143306799695174e-05,
1281
+ "loss": 1.3958,
1282
+ "step": 1015
1283
+ },
1284
+ {
1285
+ "epoch": 7.85,
1286
+ "learning_rate": 7.85771664018977e-06,
1287
+ "loss": 1.5431,
1288
+ "step": 1020
1289
+ },
1290
+ {
1291
+ "epoch": 7.88,
1292
+ "learning_rate": 4.457885751780573e-06,
1293
+ "loss": 1.4051,
1294
+ "step": 1025
1295
+ },
1296
+ {
1297
+ "epoch": 7.92,
1298
+ "learning_rate": 1.993391324572832e-06,
1299
+ "loss": 1.4924,
1300
+ "step": 1030
1301
+ },
1302
+ {
1303
+ "epoch": 7.96,
1304
+ "learning_rate": 5.001712368735127e-07,
1305
+ "loss": 1.4193,
1306
+ "step": 1035
1307
+ },
1308
+ {
1309
+ "epoch": 8.0,
1310
+ "learning_rate": 0.0,
1311
+ "loss": 1.4695,
1312
+ "step": 1040
1313
+ },
1314
+ {
1315
+ "epoch": 8.0,
1316
+ "eval_loss": 1.5916674137115479,
1317
+ "eval_runtime": 9.7988,
1318
+ "eval_samples_per_second": 20.206,
1319
+ "eval_steps_per_second": 2.551,
1320
+ "step": 1040
1321
  }
1322
  ],
1323
+ "max_steps": 1690,
1324
+ "num_train_epochs": 13,
1325
+ "total_flos": 1084361932800000.0,
1326
  "trial_name": null,
1327
  "trial_params": null
1328
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5b5708c7447842968121331f651a0667504a3cf69e6dddb04ddfb6f7d888825b
3
  size 2671
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7de331ed63430c4c904d774190a38a9a01f51742c221c11fc99341c51bd96bba
3
  size 2671