AlekseyKorshuk committed on
Commit
28a3ccf
1 Parent(s): b4c3898

huggingartists

Browse files
README.md CHANGED
@@ -45,15 +45,15 @@ from datasets import load_dataset
45
  dataset = load_dataset("huggingartists/andre-3000")
46
  ```
47
 
48
- [Explore the data](https://wandb.ai/huggingartists/huggingartists/runs/zdji17w8/artifacts), which is tracked with [W&B artifacts](https://docs.wandb.com/artifacts) at every step of the pipeline.
49
 
50
  ## Training procedure
51
 
52
  The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on André 3000's lyrics.
53
 
54
- Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists/runs/1x8qakj9) for full transparency and reproducibility.
55
 
56
- At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/runs/1x8qakj9/artifacts) is logged and versioned.
57
 
58
  ## How to use
59
 
45
  dataset = load_dataset("huggingartists/andre-3000")
46
  ```
47
 
48
+ [Explore the data](https://wandb.ai/huggingartists/huggingartists/runs/2aj9iybn/artifacts), which is tracked with [W&B artifacts](https://docs.wandb.com/artifacts) at every step of the pipeline.
49
 
50
  ## Training procedure
51
 
52
  The model is based on a pre-trained [GPT-2](https://huggingface.co/gpt2) which is fine-tuned on André 3000's lyrics.
53
 
54
+ Hyperparameters and metrics are recorded in the [W&B training run](https://wandb.ai/huggingartists/huggingartists/runs/3lbg2cit) for full transparency and reproducibility.
55
 
56
+ At the end of training, [the final model](https://wandb.ai/huggingartists/huggingartists/runs/3lbg2cit/artifacts) is logged and versioned.
57
 
58
  ## How to use
59
 
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "gpt2",
3
  "activation_function": "gelu_new",
4
  "architectures": [
5
  "GPT2LMHeadModel"
1
  {
2
+ "_name_or_path": "huggingartists/andre-3000",
3
  "activation_function": "gelu_new",
4
  "architectures": [
5
  "GPT2LMHeadModel"
evaluation.txt CHANGED
@@ -1 +1 @@
1
- {"eval_loss": 3.5885772705078125, "eval_runtime": 3.0977, "eval_samples_per_second": 43.258, "eval_steps_per_second": 5.488, "epoch": 10.0}
1
+ {"eval_loss": 2.7857184410095215, "eval_runtime": 3.2483, "eval_samples_per_second": 43.715, "eval_steps_per_second": 5.541, "epoch": 105.0}
flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d4c5fc3549feddb90f7e822ada817b97afa10f79c935226937d817c046cf9f0d
3
  size 497764120
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ca2d40ae75a2d9347d2b27e73aece5aa55fffa7ae760d8a75b258044ec799c4
3
  size 497764120
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1fd9963abebba007a6bd7e6d0556ca7196371944164c15c125dc13d92de7783
3
  size 995604017
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f6ec27a8db3a3d8e12f1403c8e1ddd7d9d8f7c369361110364668a7d3905bcc
3
  size 995604017
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b1f5d517dcc771d45d75c794a20a60c90fd1be2cddf41b2acd8053771a778089
3
  size 510403817
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:644cac333a83af4607d4d90810953f70e30bdc7159f4a9dc40f8b242c88c2770
3
  size 510403817
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d62244ae79eb401974a845e8e52733528244c631389781aa6c66540ff1a2ca27
3
- size 14567
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16e34fba951f86dbe52880106da1b406b7bf5d467c83625264b1d27faf3245c0
3
+ size 14503
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eab14c8a89edd442bfb86b7ee9c03bcf272aae22b0b5ce034a32fd2886c2bc24
3
  size 623
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4215fb2fd58e7d9127b8dee8649641af0662fd1b11cda980f7e7dc26e6aa301f
3
  size 623
tokenizer_config.json CHANGED
@@ -1 +1 @@
1
- {"unk_token": "<|endoftext|>", "bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "add_prefix_space": false, "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "gpt2", "tokenizer_class": "GPT2Tokenizer"}
1
+ {"unk_token": "<|endoftext|>", "bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "add_prefix_space": false, "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "huggingartists/andre-3000", "tokenizer_class": "GPT2Tokenizer"}
trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "best_metric": 3.5885772705078125,
3
- "best_model_checkpoint": "output/andre-3000/checkpoint-500",
4
- "epoch": 5.0,
5
- "global_step": 500,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -646,11 +646,639 @@
646
  "eval_samples_per_second": 43.288,
647
  "eval_steps_per_second": 5.492,
648
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
649
  }
650
  ],
651
- "max_steps": 1000,
652
- "num_train_epochs": 10,
653
- "total_flos": 521277603840000.0,
654
  "trial_name": null,
655
  "trial_params": null
656
  }
1
  {
2
+ "best_metric": 2.7857184410095215,
3
+ "best_model_checkpoint": "output/andre-3000/checkpoint-990",
4
+ "epoch": 10.0,
5
+ "global_step": 990,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
646
  "eval_samples_per_second": 43.288,
647
  "eval_steps_per_second": 5.492,
648
  "step": 500
649
+ },
650
+ {
651
+ "epoch": 5.1,
652
+ "learning_rate": 3.4251213229711433e-06,
653
+ "loss": 2.7141,
654
+ "step": 505
655
+ },
656
+ {
657
+ "epoch": 5.15,
658
+ "learning_rate": 7.625888222272204e-06,
659
+ "loss": 3.1031,
660
+ "step": 510
661
+ },
662
+ {
663
+ "epoch": 5.2,
664
+ "learning_rate": 1.3358460333369401e-05,
665
+ "loss": 2.7596,
666
+ "step": 515
667
+ },
668
+ {
669
+ "epoch": 5.25,
670
+ "learning_rate": 2.0478822703346315e-05,
671
+ "loss": 2.747,
672
+ "step": 520
673
+ },
674
+ {
675
+ "epoch": 5.3,
676
+ "learning_rate": 2.880809600341567e-05,
677
+ "loss": 2.8618,
678
+ "step": 525
679
+ },
680
+ {
681
+ "epoch": 5.35,
682
+ "learning_rate": 3.8137030375243876e-05,
683
+ "loss": 2.811,
684
+ "step": 530
685
+ },
686
+ {
687
+ "epoch": 5.4,
688
+ "learning_rate": 4.823126225248023e-05,
689
+ "loss": 2.8491,
690
+ "step": 535
691
+ },
692
+ {
693
+ "epoch": 5.45,
694
+ "learning_rate": 5.883720209445242e-05,
695
+ "loss": 2.5856,
696
+ "step": 540
697
+ },
698
+ {
699
+ "epoch": 5.51,
700
+ "learning_rate": 6.968840511906774e-05,
701
+ "loss": 2.9258,
702
+ "step": 545
703
+ },
704
+ {
705
+ "epoch": 5.56,
706
+ "learning_rate": 8.051226498795124e-05,
707
+ "loss": 2.9732,
708
+ "step": 550
709
+ },
710
+ {
711
+ "epoch": 5.61,
712
+ "learning_rate": 9.10368622835751e-05,
713
+ "loss": 2.9297,
714
+ "step": 555
715
+ },
716
+ {
717
+ "epoch": 5.66,
718
+ "learning_rate": 0.0001009977957294059,
719
+ "loss": 2.5333,
720
+ "step": 560
721
+ },
722
+ {
723
+ "epoch": 5.71,
724
+ "learning_rate": 0.00011014482453764393,
725
+ "loss": 2.7163,
726
+ "step": 565
727
+ },
728
+ {
729
+ "epoch": 5.76,
730
+ "learning_rate": 0.00011824815501400774,
731
+ "loss": 2.575,
732
+ "step": 570
733
+ },
734
+ {
735
+ "epoch": 5.81,
736
+ "learning_rate": 0.00012510421348608654,
737
+ "loss": 2.6619,
738
+ "step": 575
739
+ },
740
+ {
741
+ "epoch": 5.86,
742
+ "learning_rate": 0.0001305407605264622,
743
+ "loss": 2.8522,
744
+ "step": 580
745
+ },
746
+ {
747
+ "epoch": 5.91,
748
+ "learning_rate": 0.00013442121798995453,
749
+ "loss": 2.9609,
750
+ "step": 585
751
+ },
752
+ {
753
+ "epoch": 5.96,
754
+ "learning_rate": 0.00013664810016019257,
755
+ "loss": 2.7212,
756
+ "step": 590
757
+ },
758
+ {
759
+ "epoch": 6.0,
760
+ "eval_loss": 2.8283281326293945,
761
+ "eval_runtime": 3.2672,
762
+ "eval_samples_per_second": 43.462,
763
+ "eval_steps_per_second": 5.509,
764
+ "step": 594
765
+ },
766
+ {
767
+ "epoch": 6.01,
768
+ "learning_rate": 0.0001371654628074865,
769
+ "loss": 2.8262,
770
+ "step": 595
771
+ },
772
+ {
773
+ "epoch": 6.06,
774
+ "learning_rate": 0.00013596030863222166,
775
+ "loss": 2.45,
776
+ "step": 600
777
+ },
778
+ {
779
+ "epoch": 6.11,
780
+ "learning_rate": 0.00013306291378591335,
781
+ "loss": 2.7517,
782
+ "step": 605
783
+ },
784
+ {
785
+ "epoch": 6.16,
786
+ "learning_rate": 0.00012854606726698723,
787
+ "loss": 2.4939,
788
+ "step": 610
789
+ },
790
+ {
791
+ "epoch": 6.21,
792
+ "learning_rate": 0.00012252324229935523,
793
+ "loss": 2.6832,
794
+ "step": 615
795
+ },
796
+ {
797
+ "epoch": 6.26,
798
+ "learning_rate": 0.0001151457456328192,
799
+ "loss": 2.5246,
800
+ "step": 620
801
+ },
802
+ {
803
+ "epoch": 6.31,
804
+ "learning_rate": 0.00010659891638121517,
805
+ "loss": 2.8753,
806
+ "step": 625
807
+ },
808
+ {
809
+ "epoch": 6.36,
810
+ "learning_rate": 9.70974698919295e-05,
811
+ "loss": 2.8246,
812
+ "step": 630
813
+ },
814
+ {
815
+ "epoch": 6.41,
816
+ "learning_rate": 8.688010361913636e-05,
817
+ "loss": 2.5406,
818
+ "step": 635
819
+ },
820
+ {
821
+ "epoch": 6.46,
822
+ "learning_rate": 7.620350051320941e-05,
823
+ "loss": 2.3994,
824
+ "step": 640
825
+ },
826
+ {
827
+ "epoch": 6.52,
828
+ "learning_rate": 6.53358805744912e-05,
829
+ "loss": 2.4749,
830
+ "step": 645
831
+ },
832
+ {
833
+ "epoch": 6.57,
834
+ "learning_rate": 5.4550262570727935e-05,
835
+ "loss": 2.2935,
836
+ "step": 650
837
+ },
838
+ {
839
+ "epoch": 6.62,
840
+ "learning_rate": 4.4117605198797704e-05,
841
+ "loss": 2.142,
842
+ "step": 655
843
+ },
844
+ {
845
+ "epoch": 6.67,
846
+ "learning_rate": 3.429999999999998e-05,
847
+ "loss": 2.6776,
848
+ "step": 660
849
+ },
850
+ {
851
+ "epoch": 6.72,
852
+ "learning_rate": 2.5344087038001816e-05,
853
+ "loss": 2.6633,
854
+ "step": 665
855
+ },
856
+ {
857
+ "epoch": 6.77,
858
+ "learning_rate": 1.7474858752243343e-05,
859
+ "loss": 2.7227,
860
+ "step": 670
861
+ },
862
+ {
863
+ "epoch": 6.82,
864
+ "learning_rate": 1.0890007647780993e-05,
865
+ "loss": 2.7123,
866
+ "step": 675
867
+ },
868
+ {
869
+ "epoch": 6.87,
870
+ "learning_rate": 5.754959820160079e-06,
871
+ "loss": 2.7949,
872
+ "step": 680
873
+ },
874
+ {
875
+ "epoch": 6.92,
876
+ "learning_rate": 2.1987190842099613e-06,
877
+ "loss": 2.6913,
878
+ "step": 685
879
+ },
880
+ {
881
+ "epoch": 6.97,
882
+ "learning_rate": 3.106261114864048e-07,
883
+ "loss": 2.5163,
884
+ "step": 690
885
+ },
886
+ {
887
+ "epoch": 7.0,
888
+ "eval_loss": 2.7900161743164062,
889
+ "eval_runtime": 3.2724,
890
+ "eval_samples_per_second": 43.393,
891
+ "eval_steps_per_second": 5.501,
892
+ "step": 693
893
+ },
894
+ {
895
+ "epoch": 7.02,
896
+ "learning_rate": 1.3811399402871997e-07,
897
+ "loss": 2.5678,
898
+ "step": 695
899
+ },
900
+ {
901
+ "epoch": 7.07,
902
+ "learning_rate": 1.6855166196610614e-06,
903
+ "loss": 2.4848,
904
+ "step": 700
905
+ },
906
+ {
907
+ "epoch": 7.12,
908
+ "learning_rate": 4.913959795097361e-06,
909
+ "loss": 2.603,
910
+ "step": 705
911
+ },
912
+ {
913
+ "epoch": 7.17,
914
+ "learning_rate": 9.742337852080565e-06,
915
+ "loss": 2.26,
916
+ "step": 710
917
+ },
918
+ {
919
+ "epoch": 7.22,
920
+ "learning_rate": 1.604935120203803e-05,
921
+ "loss": 2.3438,
922
+ "step": 715
923
+ },
924
+ {
925
+ "epoch": 7.27,
926
+ "learning_rate": 2.367655365135328e-05,
927
+ "loss": 2.4659,
928
+ "step": 720
929
+ },
930
+ {
931
+ "epoch": 7.32,
932
+ "learning_rate": 3.243233292191947e-05,
933
+ "loss": 2.1382,
934
+ "step": 725
935
+ },
936
+ {
937
+ "epoch": 7.37,
938
+ "learning_rate": 4.2096724377451216e-05,
939
+ "loss": 2.2814,
940
+ "step": 730
941
+ },
942
+ {
943
+ "epoch": 7.42,
944
+ "learning_rate": 5.242693702405326e-05,
945
+ "loss": 2.212,
946
+ "step": 735
947
+ },
948
+ {
949
+ "epoch": 7.47,
950
+ "learning_rate": 6.316345295962419e-05,
951
+ "loss": 2.4038,
952
+ "step": 740
953
+ },
954
+ {
955
+ "epoch": 7.53,
956
+ "learning_rate": 7.403654704037569e-05,
957
+ "loss": 2.4106,
958
+ "step": 745
959
+ },
960
+ {
961
+ "epoch": 7.58,
962
+ "learning_rate": 8.477306297594661e-05,
963
+ "loss": 2.4768,
964
+ "step": 750
965
+ },
966
+ {
967
+ "epoch": 7.63,
968
+ "learning_rate": 9.510327562254866e-05,
969
+ "loss": 2.3888,
970
+ "step": 755
971
+ },
972
+ {
973
+ "epoch": 7.68,
974
+ "learning_rate": 0.00010476766707808043,
975
+ "loss": 2.4749,
976
+ "step": 760
977
+ },
978
+ {
979
+ "epoch": 7.73,
980
+ "learning_rate": 0.00011352344634864663,
981
+ "loss": 2.5513,
982
+ "step": 765
983
+ },
984
+ {
985
+ "epoch": 7.78,
986
+ "learning_rate": 0.00012115064879796188,
987
+ "loss": 2.5604,
988
+ "step": 770
989
+ },
990
+ {
991
+ "epoch": 7.83,
992
+ "learning_rate": 0.00012745766214791938,
993
+ "loss": 2.3697,
994
+ "step": 775
995
+ },
996
+ {
997
+ "epoch": 7.88,
998
+ "learning_rate": 0.0001322860402049026,
999
+ "loss": 2.3862,
1000
+ "step": 780
1001
+ },
1002
+ {
1003
+ "epoch": 7.93,
1004
+ "learning_rate": 0.0001355144833803389,
1005
+ "loss": 2.323,
1006
+ "step": 785
1007
+ },
1008
+ {
1009
+ "epoch": 7.98,
1010
+ "learning_rate": 0.00013706188600597126,
1011
+ "loss": 2.6325,
1012
+ "step": 790
1013
+ },
1014
+ {
1015
+ "epoch": 8.0,
1016
+ "eval_loss": 2.807237148284912,
1017
+ "eval_runtime": 3.2933,
1018
+ "eval_samples_per_second": 43.118,
1019
+ "eval_steps_per_second": 5.466,
1020
+ "step": 792
1021
+ },
1022
+ {
1023
+ "epoch": 8.03,
1024
+ "learning_rate": 0.00013688937388851358,
1025
+ "loss": 2.3013,
1026
+ "step": 795
1027
+ },
1028
+ {
1029
+ "epoch": 8.08,
1030
+ "learning_rate": 0.00013500128091579002,
1031
+ "loss": 2.5725,
1032
+ "step": 800
1033
+ },
1034
+ {
1035
+ "epoch": 8.13,
1036
+ "learning_rate": 0.00013144504017983998,
1037
+ "loss": 2.2264,
1038
+ "step": 805
1039
+ },
1040
+ {
1041
+ "epoch": 8.18,
1042
+ "learning_rate": 0.00012630999235221907,
1043
+ "loss": 1.9343,
1044
+ "step": 810
1045
+ },
1046
+ {
1047
+ "epoch": 8.23,
1048
+ "learning_rate": 0.0001197251412477569,
1049
+ "loss": 2.2098,
1050
+ "step": 815
1051
+ },
1052
+ {
1053
+ "epoch": 8.28,
1054
+ "learning_rate": 0.00011185591296199847,
1055
+ "loss": 1.9414,
1056
+ "step": 820
1057
+ },
1058
+ {
1059
+ "epoch": 8.33,
1060
+ "learning_rate": 0.00010289999999999993,
1061
+ "loss": 2.5007,
1062
+ "step": 825
1063
+ },
1064
+ {
1065
+ "epoch": 8.38,
1066
+ "learning_rate": 9.308239480120242e-05,
1067
+ "loss": 2.2473,
1068
+ "step": 830
1069
+ },
1070
+ {
1071
+ "epoch": 8.43,
1072
+ "learning_rate": 8.264973742927219e-05,
1073
+ "loss": 2.648,
1074
+ "step": 835
1075
+ },
1076
+ {
1077
+ "epoch": 8.48,
1078
+ "learning_rate": 7.186411942550894e-05,
1079
+ "loss": 2.4495,
1080
+ "step": 840
1081
+ },
1082
+ {
1083
+ "epoch": 8.54,
1084
+ "learning_rate": 6.0996499486790955e-05,
1085
+ "loss": 2.3626,
1086
+ "step": 845
1087
+ },
1088
+ {
1089
+ "epoch": 8.59,
1090
+ "learning_rate": 5.0319896380863535e-05,
1091
+ "loss": 2.3541,
1092
+ "step": 850
1093
+ },
1094
+ {
1095
+ "epoch": 8.64,
1096
+ "learning_rate": 4.0102530108070616e-05,
1097
+ "loss": 2.1531,
1098
+ "step": 855
1099
+ },
1100
+ {
1101
+ "epoch": 8.69,
1102
+ "learning_rate": 3.0601083618784945e-05,
1103
+ "loss": 2.3576,
1104
+ "step": 860
1105
+ },
1106
+ {
1107
+ "epoch": 8.74,
1108
+ "learning_rate": 2.20542543671809e-05,
1109
+ "loss": 1.9389,
1110
+ "step": 865
1111
+ },
1112
+ {
1113
+ "epoch": 8.79,
1114
+ "learning_rate": 1.4676757700644984e-05,
1115
+ "loss": 2.3032,
1116
+ "step": 870
1117
+ },
1118
+ {
1119
+ "epoch": 8.84,
1120
+ "learning_rate": 8.653932733012724e-06,
1121
+ "loss": 2.5286,
1122
+ "step": 875
1123
+ },
1124
+ {
1125
+ "epoch": 8.89,
1126
+ "learning_rate": 4.137086214086705e-06,
1127
+ "loss": 2.1202,
1128
+ "step": 880
1129
+ },
1130
+ {
1131
+ "epoch": 8.94,
1132
+ "learning_rate": 1.2396913677783503e-06,
1133
+ "loss": 2.24,
1134
+ "step": 885
1135
+ },
1136
+ {
1137
+ "epoch": 8.99,
1138
+ "learning_rate": 3.4537192513510953e-08,
1139
+ "loss": 2.1317,
1140
+ "step": 890
1141
+ },
1142
+ {
1143
+ "epoch": 9.0,
1144
+ "eval_loss": 2.797985315322876,
1145
+ "eval_runtime": 3.2929,
1146
+ "eval_samples_per_second": 43.124,
1147
+ "eval_steps_per_second": 5.466,
1148
+ "step": 891
1149
+ },
1150
+ {
1151
+ "epoch": 9.04,
1152
+ "learning_rate": 5.51899839807432e-07,
1153
+ "loss": 2.1029,
1154
+ "step": 895
1155
+ },
1156
+ {
1157
+ "epoch": 9.09,
1158
+ "learning_rate": 2.778782010045488e-06,
1159
+ "loss": 1.9706,
1160
+ "step": 900
1161
+ },
1162
+ {
1163
+ "epoch": 9.14,
1164
+ "learning_rate": 6.6592394735377576e-06,
1165
+ "loss": 2.1198,
1166
+ "step": 905
1167
+ },
1168
+ {
1169
+ "epoch": 9.19,
1170
+ "learning_rate": 1.2095786513913378e-05,
1171
+ "loss": 2.1092,
1172
+ "step": 910
1173
+ },
1174
+ {
1175
+ "epoch": 9.24,
1176
+ "learning_rate": 1.895184498599217e-05,
1177
+ "loss": 2.1326,
1178
+ "step": 915
1179
+ },
1180
+ {
1181
+ "epoch": 9.29,
1182
+ "learning_rate": 2.7055175462355963e-05,
1183
+ "loss": 2.0587,
1184
+ "step": 920
1185
+ },
1186
+ {
1187
+ "epoch": 9.34,
1188
+ "learning_rate": 3.6202204270593973e-05,
1189
+ "loss": 1.9377,
1190
+ "step": 925
1191
+ },
1192
+ {
1193
+ "epoch": 9.39,
1194
+ "learning_rate": 4.6163137716425034e-05,
1195
+ "loss": 2.0585,
1196
+ "step": 930
1197
+ },
1198
+ {
1199
+ "epoch": 9.44,
1200
+ "learning_rate": 5.6687735012048636e-05,
1201
+ "loss": 1.9919,
1202
+ "step": 935
1203
+ },
1204
+ {
1205
+ "epoch": 9.49,
1206
+ "learning_rate": 6.751159488093214e-05,
1207
+ "loss": 1.8915,
1208
+ "step": 940
1209
+ },
1210
+ {
1211
+ "epoch": 9.55,
1212
+ "learning_rate": 7.836279790554722e-05,
1213
+ "loss": 1.871,
1214
+ "step": 945
1215
+ },
1216
+ {
1217
+ "epoch": 9.6,
1218
+ "learning_rate": 8.896873774751943e-05,
1219
+ "loss": 2.1734,
1220
+ "step": 950
1221
+ },
1222
+ {
1223
+ "epoch": 9.65,
1224
+ "learning_rate": 9.906296962475623e-05,
1225
+ "loss": 2.1848,
1226
+ "step": 955
1227
+ },
1228
+ {
1229
+ "epoch": 9.7,
1230
+ "learning_rate": 0.00010839190399658423,
1231
+ "loss": 2.1742,
1232
+ "step": 960
1233
+ },
1234
+ {
1235
+ "epoch": 9.75,
1236
+ "learning_rate": 0.00011672117729665359,
1237
+ "loss": 2.3443,
1238
+ "step": 965
1239
+ },
1240
+ {
1241
+ "epoch": 9.8,
1242
+ "learning_rate": 0.00012384153966663053,
1243
+ "loss": 1.857,
1244
+ "step": 970
1245
+ },
1246
+ {
1247
+ "epoch": 9.85,
1248
+ "learning_rate": 0.00012957411177772762,
1249
+ "loss": 2.2031,
1250
+ "step": 975
1251
+ },
1252
+ {
1253
+ "epoch": 9.9,
1254
+ "learning_rate": 0.0001337748786770289,
1255
+ "loss": 2.1931,
1256
+ "step": 980
1257
+ },
1258
+ {
1259
+ "epoch": 9.95,
1260
+ "learning_rate": 0.00013633830776320065,
1261
+ "loss": 2.0811,
1262
+ "step": 985
1263
+ },
1264
+ {
1265
+ "epoch": 10.0,
1266
+ "learning_rate": 0.0001372,
1267
+ "loss": 2.3998,
1268
+ "step": 990
1269
+ },
1270
+ {
1271
+ "epoch": 10.0,
1272
+ "eval_loss": 2.7857184410095215,
1273
+ "eval_runtime": 3.2779,
1274
+ "eval_samples_per_second": 43.32,
1275
+ "eval_steps_per_second": 5.491,
1276
+ "step": 990
1277
  }
1278
  ],
1279
+ "max_steps": 10395,
1280
+ "num_train_epochs": 105,
1281
+ "total_flos": 1032103526400000.0,
1282
  "trial_name": null,
1283
  "trial_params": null
1284
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c5308c09e205af506486abe9769533704d74f4eee1e24a6f594dd6e3b76fa381
3
  size 2863
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87b339441b1f6019e8540c7c71ded5ff035d2f0e1f074c763f744b0da4bc0c37
3
  size 2863