tyzhu committed
Commit fb75b3a · verified · 1 Parent(s): 700558e

End of training

Files changed (6)
  1. README.md +14 -2
  2. all_results.json +12 -12
  3. eval_results.json +7 -7
  4. tokenizer.json +1 -6
  5. train_results.json +6 -6
  6. trainer_state.json +690 -12
README.md CHANGED
@@ -3,11 +3,23 @@ license: other
 base_model: Qwen/Qwen1.5-4B
 tags:
 - generated_from_trainer
+datasets:
+- tyzhu/lmind_hotpot_train8000_eval7405_v1_docidx
 metrics:
 - accuracy
 model-index:
 - name: lmind_hotpot_train8000_eval7405_v1_docidx_Qwen_Qwen1.5-4B_lora2
-  results: []
+  results:
+  - task:
+      name: Causal Language Modeling
+      type: text-generation
+    dataset:
+      name: tyzhu/lmind_hotpot_train8000_eval7405_v1_docidx
+      type: tyzhu/lmind_hotpot_train8000_eval7405_v1_docidx
+    metrics:
+    - name: Accuracy
+      type: accuracy
+      value: 0.7890842332613391
 library_name: peft
 ---

@@ -16,7 +28,7 @@ should probably proofread and complete it, then remove this comment. -->

 # lmind_hotpot_train8000_eval7405_v1_docidx_Qwen_Qwen1.5-4B_lora2

-This model is a fine-tuned version of [Qwen/Qwen1.5-4B](https://huggingface.co/Qwen/Qwen1.5-4B) on an unknown dataset.
+This model is a fine-tuned version of [Qwen/Qwen1.5-4B](https://huggingface.co/Qwen/Qwen1.5-4B) on the tyzhu/lmind_hotpot_train8000_eval7405_v1_docidx dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.7825
 - Accuracy: 0.7891
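The updated card marks this repo as a `peft` adapter on Qwen/Qwen1.5-4B. A minimal usage sketch, assuming the LoRA weights are hosted in this repo under the model name shown above (the repo id and prompt are illustrative, not taken from the diff):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-4B")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-4B")

# Attach the LoRA adapter on top of the frozen base model
# (repo id assumed from the model name in the card above).
model = PeftModel.from_pretrained(
    base, "tyzhu/lmind_hotpot_train8000_eval7405_v1_docidx_Qwen_Qwen1.5-4B_lora2"
)

prompt = "Which magazine was started first, Arthur's Magazine or First for Women?"
inputs = tokenizer(prompt, return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0]))
```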
all_results.json CHANGED
@@ -1,16 +1,16 @@
 {
-    "epoch": 9.997021149836163,
-    "eval_accuracy": 0.7691922246220302,
-    "eval_loss": 1.0623269081115723,
-    "eval_runtime": 7.649,
+    "epoch": 19.997021149836165,
+    "eval_accuracy": 0.7890842332613391,
+    "eval_loss": 0.7825167179107666,
+    "eval_runtime": 7.775,
     "eval_samples": 500,
-    "eval_samples_per_second": 65.368,
-    "eval_steps_per_second": 8.236,
-    "perplexity": 2.8930951295301637,
-    "total_flos": 6.866381543623885e+17,
-    "train_loss": 1.1092717030903723,
-    "train_runtime": 19337.1025,
+    "eval_samples_per_second": 64.309,
+    "eval_steps_per_second": 8.103,
+    "perplexity": 2.186969330199743,
+    "total_flos": 1.3732763132881797e+18,
+    "train_loss": 0.18793870911126484,
+    "train_runtime": 19785.1606,
     "train_samples": 26854,
-    "train_samples_per_second": 13.887,
-    "train_steps_per_second": 0.434
+    "train_samples_per_second": 27.146,
+    "train_steps_per_second": 0.848
 }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
 {
-    "epoch": 9.997021149836163,
-    "eval_accuracy": 0.7691922246220302,
-    "eval_loss": 1.0623269081115723,
-    "eval_runtime": 7.649,
+    "epoch": 19.997021149836165,
+    "eval_accuracy": 0.7890842332613391,
+    "eval_loss": 0.7825167179107666,
+    "eval_runtime": 7.775,
     "eval_samples": 500,
-    "eval_samples_per_second": 65.368,
-    "eval_steps_per_second": 8.236,
-    "perplexity": 2.8930951295301637
+    "eval_samples_per_second": 64.309,
+    "eval_steps_per_second": 8.103,
+    "perplexity": 2.186969330199743
 }
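The updated `perplexity` field is simply the exponential of the new `eval_loss`; a quick check of the arithmetic using the values above:

```python
import math

eval_loss = 0.7825167179107666
print(math.exp(eval_loss))  # ~2.18697, matching the reported "perplexity"
```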
tokenizer.json CHANGED
@@ -1,11 +1,6 @@
 {
   "version": "1.0",
-  "truncation": {
-    "direction": "Right",
-    "max_length": 1024,
-    "strategy": "LongestFirst",
-    "stride": 0
-  },
+  "truncation": null,
   "padding": null,
   "added_tokens": [
     {
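The only tokenizer change is that the serialized truncation block (right truncation at 1024 tokens) is cleared to `null`. A sketch of the equivalent operation with the `tokenizers` library (illustrative only; the file path is assumed):

```python
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")

# The old file carried this truncation config; clearing it makes
# the "truncation" field serialize as null, as in the diff above.
tok.enable_truncation(max_length=1024, strategy="longest_first",
                      direction="right", stride=0)
tok.no_truncation()
tok.save("tokenizer.json")
```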
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
-    "epoch": 9.997021149836163,
-    "total_flos": 6.866381543623885e+17,
-    "train_loss": 1.1092717030903723,
-    "train_runtime": 19337.1025,
+    "epoch": 19.997021149836165,
+    "total_flos": 1.3732763132881797e+18,
+    "train_loss": 0.18793870911126484,
+    "train_runtime": 19785.1606,
     "train_samples": 26854,
-    "train_samples_per_second": 13.887,
-    "train_steps_per_second": 0.434
+    "train_samples_per_second": 27.146,
+    "train_steps_per_second": 0.848
 }
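The updated throughput numbers are consistent with 20 epochs over 26854 samples and 16780 optimizer steps in the reported runtime; a quick sanity check:

```python
train_runtime = 19785.1606  # seconds, from train_results.json above

print(20 * 26854 / train_runtime)  # ~27.15 -> "train_samples_per_second"
print(16780 / train_runtime)       # ~0.848 -> "train_steps_per_second"
```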
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 9.997021149836163,
+  "epoch": 19.997021149836165,
   "eval_steps": 500,
-  "global_step": 8390,
+  "global_step": 16780,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -680,21 +680,699 @@
       "step": 8390
     },
     {
-      "epoch": 9.997021149836163,
-      "step": 8390,
-      "total_flos": 6.866381543623885e+17,
-      "train_loss": 1.1092717030903723,
-      "train_runtime": 19337.1025,
-      "train_samples_per_second": 13.887,
-      "train_steps_per_second": 0.434
+ "epoch": 10.011915400655347,
684
+ "grad_norm": 1.1237155199050903,
685
+ "learning_rate": 0.0001,
686
+ "loss": 0.5067,
687
+ "step": 8400
688
+ },
689
+ {
690
+ "epoch": 10.131069407208818,
691
+ "grad_norm": 1.2166295051574707,
692
+ "learning_rate": 0.0001,
693
+ "loss": 0.5138,
694
+ "step": 8500
695
+ },
696
+ {
697
+ "epoch": 10.250223413762289,
698
+ "grad_norm": 1.1621054410934448,
699
+ "learning_rate": 0.0001,
700
+ "loss": 0.5314,
701
+ "step": 8600
702
+ },
703
+ {
704
+ "epoch": 10.369377420315757,
705
+ "grad_norm": 1.082357406616211,
706
+ "learning_rate": 0.0001,
707
+ "loss": 0.5358,
708
+ "step": 8700
709
+ },
710
+ {
711
+ "epoch": 10.488531426869228,
712
+ "grad_norm": 1.3300856351852417,
713
+ "learning_rate": 0.0001,
714
+ "loss": 0.5464,
715
+ "step": 8800
716
+ },
717
+ {
718
+ "epoch": 10.607685433422699,
719
+ "grad_norm": 1.1997504234313965,
720
+ "learning_rate": 0.0001,
721
+ "loss": 0.5422,
722
+ "step": 8900
723
+ },
724
+ {
725
+ "epoch": 10.72683943997617,
726
+ "grad_norm": 1.4044774770736694,
727
+ "learning_rate": 0.0001,
728
+ "loss": 0.5567,
729
+ "step": 9000
730
+ },
731
+ {
732
+ "epoch": 10.84599344652964,
733
+ "grad_norm": 1.215906023979187,
734
+ "learning_rate": 0.0001,
735
+ "loss": 0.562,
736
+ "step": 9100
737
+ },
738
+ {
739
+ "epoch": 10.96514745308311,
740
+ "grad_norm": 1.2198461294174194,
741
+ "learning_rate": 0.0001,
742
+ "loss": 0.5784,
743
+ "step": 9200
744
+ },
745
+ {
746
+ "epoch": 10.999702114983616,
747
+ "eval_accuracy": 0.7730842332613391,
748
+ "eval_loss": 1.010130763053894,
749
+ "eval_runtime": 7.8978,
750
+ "eval_samples_per_second": 63.309,
751
+ "eval_steps_per_second": 7.977,
752
+ "step": 9229
753
+ },
754
+ {
755
+ "epoch": 11.08430145963658,
756
+ "grad_norm": 1.3466765880584717,
757
+ "learning_rate": 0.0001,
758
+ "loss": 0.4876,
759
+ "step": 9300
760
+ },
761
+ {
762
+ "epoch": 11.20345546619005,
763
+ "grad_norm": 1.3371703624725342,
764
+ "learning_rate": 0.0001,
765
+ "loss": 0.4666,
766
+ "step": 9400
767
+ },
768
+ {
769
+ "epoch": 11.32260947274352,
770
+ "grad_norm": 1.3903799057006836,
771
+ "learning_rate": 0.0001,
772
+ "loss": 0.4819,
773
+ "step": 9500
774
+ },
775
+ {
776
+ "epoch": 11.441763479296991,
777
+ "grad_norm": 1.261116623878479,
778
+ "learning_rate": 0.0001,
779
+ "loss": 0.4857,
780
+ "step": 9600
781
+ },
782
+ {
783
+ "epoch": 11.560917485850462,
784
+ "grad_norm": 1.4429560899734497,
785
+ "learning_rate": 0.0001,
786
+ "loss": 0.4941,
787
+ "step": 9700
788
+ },
789
+ {
790
+ "epoch": 11.680071492403933,
791
+ "grad_norm": 1.268157958984375,
792
+ "learning_rate": 0.0001,
793
+ "loss": 0.4913,
794
+ "step": 9800
795
+ },
796
+ {
797
+ "epoch": 11.799225498957403,
798
+ "grad_norm": 1.3515466451644897,
799
+ "learning_rate": 0.0001,
800
+ "loss": 0.5074,
801
+ "step": 9900
802
+ },
803
+ {
804
+ "epoch": 11.918379505510872,
805
+ "grad_norm": 1.343897819519043,
806
+ "learning_rate": 0.0001,
807
+ "loss": 0.5071,
808
+ "step": 10000
809
+ },
810
+ {
811
+ "epoch": 11.999404229967233,
812
+ "eval_accuracy": 0.7760388768898488,
813
+ "eval_loss": 0.9537739753723145,
814
+ "eval_runtime": 7.6808,
815
+ "eval_samples_per_second": 65.098,
816
+ "eval_steps_per_second": 8.202,
817
+ "step": 10068
818
+ },
819
+ {
820
+ "epoch": 12.037533512064343,
821
+ "grad_norm": 1.2503083944320679,
822
+ "learning_rate": 0.0001,
823
+ "loss": 0.4805,
824
+ "step": 10100
825
+ },
826
+ {
827
+ "epoch": 12.156687518617813,
828
+ "grad_norm": 1.1671448945999146,
829
+ "learning_rate": 0.0001,
830
+ "loss": 0.4098,
831
+ "step": 10200
832
+ },
833
+ {
834
+ "epoch": 12.275841525171284,
835
+ "grad_norm": 1.2349199056625366,
836
+ "learning_rate": 0.0001,
837
+ "loss": 0.4222,
838
+ "step": 10300
839
+ },
840
+ {
841
+ "epoch": 12.394995531724755,
842
+ "grad_norm": 1.3856853246688843,
843
+ "learning_rate": 0.0001,
844
+ "loss": 0.4282,
845
+ "step": 10400
846
+ },
847
+ {
848
+ "epoch": 12.514149538278225,
849
+ "grad_norm": 1.2162753343582153,
850
+ "learning_rate": 0.0001,
851
+ "loss": 0.4353,
852
+ "step": 10500
853
+ },
854
+ {
855
+ "epoch": 12.633303544831694,
856
+ "grad_norm": 1.4368740320205688,
857
+ "learning_rate": 0.0001,
858
+ "loss": 0.4506,
859
+ "step": 10600
860
+ },
861
+ {
862
+ "epoch": 12.752457551385165,
863
+ "grad_norm": 1.2878433465957642,
864
+ "learning_rate": 0.0001,
865
+ "loss": 0.4607,
866
+ "step": 10700
867
+ },
868
+ {
869
+ "epoch": 12.871611557938635,
870
+ "grad_norm": 1.3356980085372925,
871
+ "learning_rate": 0.0001,
872
+ "loss": 0.4651,
873
+ "step": 10800
874
+ },
875
+ {
876
+ "epoch": 12.990765564492106,
877
+ "grad_norm": 1.4646358489990234,
878
+ "learning_rate": 0.0001,
879
+ "loss": 0.4734,
880
+ "step": 10900
881
+ },
882
+ {
883
+ "epoch": 12.99910634495085,
884
+ "eval_accuracy": 0.7790539956803456,
885
+ "eval_loss": 0.9292365908622742,
886
+ "eval_runtime": 7.7765,
887
+ "eval_samples_per_second": 64.296,
888
+ "eval_steps_per_second": 8.101,
889
+ "step": 10907
890
+ },
891
+ {
892
+ "epoch": 13.109919571045577,
893
+ "grad_norm": 1.0571186542510986,
894
+ "learning_rate": 0.0001,
895
+ "loss": 0.3875,
896
+ "step": 11000
897
+ },
898
+ {
899
+ "epoch": 13.229073577599047,
900
+ "grad_norm": 1.5074485540390015,
901
+ "learning_rate": 0.0001,
902
+ "loss": 0.3734,
903
+ "step": 11100
904
+ },
905
+ {
906
+ "epoch": 13.348227584152518,
907
+ "grad_norm": 1.0904532670974731,
908
+ "learning_rate": 0.0001,
909
+ "loss": 0.3906,
910
+ "step": 11200
911
+ },
912
+ {
913
+ "epoch": 13.467381590705987,
914
+ "grad_norm": 1.1496778726577759,
915
+ "learning_rate": 0.0001,
916
+ "loss": 0.3999,
917
+ "step": 11300
918
+ },
919
+ {
920
+ "epoch": 13.586535597259457,
921
+ "grad_norm": 1.4542232751846313,
922
+ "learning_rate": 0.0001,
923
+ "loss": 0.4048,
924
+ "step": 11400
925
+ },
926
+ {
927
+ "epoch": 13.705689603812928,
928
+ "grad_norm": 1.6107604503631592,
929
+ "learning_rate": 0.0001,
930
+ "loss": 0.4098,
931
+ "step": 11500
932
+ },
933
+ {
934
+ "epoch": 13.824843610366399,
935
+ "grad_norm": 1.6126165390014648,
936
+ "learning_rate": 0.0001,
937
+ "loss": 0.4141,
938
+ "step": 11600
939
+ },
940
+ {
941
+ "epoch": 13.94399761691987,
942
+ "grad_norm": 1.5488628149032593,
943
+ "learning_rate": 0.0001,
944
+ "loss": 0.4302,
945
+ "step": 11700
946
+ },
947
+ {
948
+ "epoch": 14.0,
949
+ "eval_accuracy": 0.7808639308855292,
950
+ "eval_loss": 0.8845791220664978,
951
+ "eval_runtime": 7.816,
952
+ "eval_samples_per_second": 63.971,
953
+ "eval_steps_per_second": 8.06,
954
+ "step": 11747
955
+ },
956
+ {
957
+ "epoch": 14.06315162347334,
958
+ "grad_norm": 1.1683179140090942,
959
+ "learning_rate": 0.0001,
960
+ "loss": 0.3842,
961
+ "step": 11800
962
+ },
963
+ {
964
+ "epoch": 14.182305630026809,
965
+ "grad_norm": 1.4228167533874512,
966
+ "learning_rate": 0.0001,
967
+ "loss": 0.3452,
968
+ "step": 11900
969
+ },
970
+ {
971
+ "epoch": 14.30145963658028,
972
+ "grad_norm": 1.3090318441390991,
973
+ "learning_rate": 0.0001,
974
+ "loss": 0.3518,
975
+ "step": 12000
976
+ },
977
+ {
978
+ "epoch": 14.42061364313375,
979
+ "grad_norm": 1.3104828596115112,
980
+ "learning_rate": 0.0001,
981
+ "loss": 0.3632,
982
+ "step": 12100
983
+ },
984
+ {
985
+ "epoch": 14.53976764968722,
986
+ "grad_norm": 1.5828286409378052,
987
+ "learning_rate": 0.0001,
988
+ "loss": 0.3737,
989
+ "step": 12200
990
+ },
991
+ {
992
+ "epoch": 14.658921656240691,
993
+ "grad_norm": 1.3281217813491821,
994
+ "learning_rate": 0.0001,
995
+ "loss": 0.3705,
996
+ "step": 12300
997
+ },
998
+ {
999
+ "epoch": 14.778075662794162,
1000
+ "grad_norm": 1.753090500831604,
1001
+ "learning_rate": 0.0001,
1002
+ "loss": 0.3839,
1003
+ "step": 12400
1004
+ },
1005
+ {
1006
+ "epoch": 14.897229669347631,
1007
+ "grad_norm": 1.3213167190551758,
1008
+ "learning_rate": 0.0001,
1009
+ "loss": 0.3917,
1010
+ "step": 12500
1011
+ },
1012
+ {
1013
+ "epoch": 14.999702114983616,
1014
+ "eval_accuracy": 0.7832613390928725,
1015
+ "eval_loss": 0.8536106944084167,
1016
+ "eval_runtime": 7.6693,
1017
+ "eval_samples_per_second": 65.195,
1018
+ "eval_steps_per_second": 8.215,
1019
+ "step": 12586
1020
+ },
1021
+ {
1022
+ "epoch": 15.016383675901102,
1023
+ "grad_norm": 1.5647796392440796,
1024
+ "learning_rate": 0.0001,
1025
+ "loss": 0.3796,
1026
+ "step": 12600
1027
+ },
1028
+ {
1029
+ "epoch": 15.135537682454572,
1030
+ "grad_norm": 1.903051495552063,
1031
+ "learning_rate": 0.0001,
1032
+ "loss": 0.3098,
1033
+ "step": 12700
1034
+ },
1035
+ {
1036
+ "epoch": 15.254691689008043,
1037
+ "grad_norm": 1.4900842905044556,
1038
+ "learning_rate": 0.0001,
1039
+ "loss": 0.3291,
1040
+ "step": 12800
1041
+ },
1042
+ {
1043
+ "epoch": 15.373845695561513,
1044
+ "grad_norm": 1.3253552913665771,
1045
+ "learning_rate": 0.0001,
1046
+ "loss": 0.3284,
1047
+ "step": 12900
1048
+ },
1049
+ {
1050
+ "epoch": 15.492999702114984,
1051
+ "grad_norm": 1.582220196723938,
1052
+ "learning_rate": 0.0001,
1053
+ "loss": 0.345,
1054
+ "step": 13000
1055
+ },
1056
+ {
1057
+ "epoch": 15.612153708668455,
1058
+ "grad_norm": 1.4743067026138306,
1059
+ "learning_rate": 0.0001,
1060
+ "loss": 0.3482,
1061
+ "step": 13100
1062
+ },
1063
+ {
1064
+ "epoch": 15.731307715221924,
1065
+ "grad_norm": 1.5687114000320435,
1066
+ "learning_rate": 0.0001,
1067
+ "loss": 0.3573,
1068
+ "step": 13200
1069
+ },
1070
+ {
1071
+ "epoch": 15.850461721775394,
1072
+ "grad_norm": 1.5427637100219727,
1073
+ "learning_rate": 0.0001,
1074
+ "loss": 0.3531,
1075
+ "step": 13300
1076
+ },
1077
+ {
1078
+ "epoch": 15.969615728328865,
1079
+ "grad_norm": 1.621741533279419,
1080
+ "learning_rate": 0.0001,
1081
+ "loss": 0.3632,
1082
+ "step": 13400
1083
+ },
1084
+ {
1085
+ "epoch": 15.999404229967233,
1086
+ "eval_accuracy": 0.7846133909287257,
1087
+ "eval_loss": 0.8468108177185059,
1088
+ "eval_runtime": 7.8749,
1089
+ "eval_samples_per_second": 63.493,
1090
+ "eval_steps_per_second": 8.0,
1091
+ "step": 13425
1092
+ },
1093
+ {
1094
+ "epoch": 16.088769734882334,
1095
+ "grad_norm": 1.3480048179626465,
1096
+ "learning_rate": 0.0001,
1097
+ "loss": 0.3116,
1098
+ "step": 13500
1099
+ },
1100
+ {
1101
+ "epoch": 16.207923741435806,
1102
+ "grad_norm": 1.3218774795532227,
1103
+ "learning_rate": 0.0001,
1104
+ "loss": 0.2957,
1105
+ "step": 13600
1106
+ },
1107
+ {
1108
+ "epoch": 16.327077747989275,
1109
+ "grad_norm": 1.5867496728897095,
1110
+ "learning_rate": 0.0001,
1111
+ "loss": 0.3087,
1112
+ "step": 13700
1113
+ },
1114
+ {
1115
+ "epoch": 16.446231754542747,
1116
+ "grad_norm": 1.3426684141159058,
1117
+ "learning_rate": 0.0001,
1118
+ "loss": 0.315,
1119
+ "step": 13800
1120
+ },
1121
+ {
1122
+ "epoch": 16.565385761096216,
1123
+ "grad_norm": 1.5667626857757568,
1124
+ "learning_rate": 0.0001,
1125
+ "loss": 0.3212,
1126
+ "step": 13900
1127
+ },
1128
+ {
1129
+ "epoch": 16.68453976764969,
1130
+ "grad_norm": 1.3792177438735962,
1131
+ "learning_rate": 0.0001,
1132
+ "loss": 0.3258,
1133
+ "step": 14000
1134
+ },
1135
+ {
1136
+ "epoch": 16.803693774203158,
1137
+ "grad_norm": 1.5244312286376953,
1138
+ "learning_rate": 0.0001,
1139
+ "loss": 0.3337,
1140
+ "step": 14100
1141
+ },
1142
+ {
1143
+ "epoch": 16.922847780756626,
1144
+ "grad_norm": 1.598897933959961,
1145
+ "learning_rate": 0.0001,
1146
+ "loss": 0.3351,
1147
+ "step": 14200
1148
+ },
1149
+ {
1150
+ "epoch": 16.99910634495085,
1151
+ "eval_accuracy": 0.7862505399568035,
1152
+ "eval_loss": 0.8243688941001892,
1153
+ "eval_runtime": 7.7339,
1154
+ "eval_samples_per_second": 64.651,
1155
+ "eval_steps_per_second": 8.146,
1156
+ "step": 14264
1157
+ },
1158
+ {
1159
+ "epoch": 17.0420017873101,
1160
+ "grad_norm": 1.5057127475738525,
1161
+ "learning_rate": 0.0001,
1162
+ "loss": 0.3195,
1163
+ "step": 14300
1164
+ },
1165
+ {
1166
+ "epoch": 17.161155793863568,
1167
+ "grad_norm": 1.175302505493164,
1168
+ "learning_rate": 0.0001,
1169
+ "loss": 0.2775,
1170
+ "step": 14400
1171
+ },
1172
+ {
1173
+ "epoch": 17.28030980041704,
1174
+ "grad_norm": 1.4136508703231812,
1175
+ "learning_rate": 0.0001,
1176
+ "loss": 0.289,
1177
+ "step": 14500
1178
+ },
1179
+ {
1180
+ "epoch": 17.39946380697051,
1181
+ "grad_norm": 1.5237888097763062,
1182
+ "learning_rate": 0.0001,
1183
+ "loss": 0.2897,
1184
+ "step": 14600
1185
+ },
1186
+ {
1187
+ "epoch": 17.51861781352398,
1188
+ "grad_norm": 1.4952021837234497,
1189
+ "learning_rate": 0.0001,
1190
+ "loss": 0.3002,
1191
+ "step": 14700
1192
+ },
1193
+ {
1194
+ "epoch": 17.63777182007745,
1195
+ "grad_norm": 1.4359500408172607,
1196
+ "learning_rate": 0.0001,
1197
+ "loss": 0.2994,
1198
+ "step": 14800
1199
+ },
1200
+ {
1201
+ "epoch": 17.75692582663092,
1202
+ "grad_norm": 1.2760846614837646,
1203
+ "learning_rate": 0.0001,
1204
+ "loss": 0.3085,
1205
+ "step": 14900
1206
+ },
1207
+ {
1208
+ "epoch": 17.87607983318439,
1209
+ "grad_norm": 1.4437624216079712,
1210
+ "learning_rate": 0.0001,
1211
+ "loss": 0.3141,
1212
+ "step": 15000
1213
+ },
1214
+ {
1215
+ "epoch": 17.99523383973786,
1216
+ "grad_norm": 1.4819003343582153,
1217
+ "learning_rate": 0.0001,
1218
+ "loss": 0.3186,
1219
+ "step": 15100
1220
+ },
1221
+ {
1222
+ "epoch": 18.0,
1223
+ "eval_accuracy": 0.7870842332613391,
1224
+ "eval_loss": 0.8095716834068298,
1225
+ "eval_runtime": 7.8907,
1226
+ "eval_samples_per_second": 63.366,
1227
+ "eval_steps_per_second": 7.984,
1228
+ "step": 15104
1229
+ },
1230
+ {
1231
+ "epoch": 18.114387846291333,
1232
+ "grad_norm": 1.4048779010772705,
1233
+ "learning_rate": 0.0001,
1234
+ "loss": 0.2551,
1235
+ "step": 15200
1236
+ },
1237
+ {
1238
+ "epoch": 18.2335418528448,
1239
+ "grad_norm": 1.2755182981491089,
1240
+ "learning_rate": 0.0001,
1241
+ "loss": 0.2625,
1242
+ "step": 15300
1243
+ },
1244
+ {
1245
+ "epoch": 18.35269585939827,
1246
+ "grad_norm": 1.5804539918899536,
1247
+ "learning_rate": 0.0001,
1248
+ "loss": 0.2733,
1249
+ "step": 15400
1250
+ },
1251
+ {
1252
+ "epoch": 18.471849865951743,
1253
+ "grad_norm": 1.2414181232452393,
1254
+ "learning_rate": 0.0001,
1255
+ "loss": 0.2827,
1256
+ "step": 15500
1257
+ },
1258
+ {
1259
+ "epoch": 18.591003872505212,
1260
+ "grad_norm": 1.2644073963165283,
1261
+ "learning_rate": 0.0001,
1262
+ "loss": 0.2873,
1263
+ "step": 15600
1264
+ },
1265
+ {
1266
+ "epoch": 18.710157879058684,
1267
+ "grad_norm": 1.8947105407714844,
1268
+ "learning_rate": 0.0001,
1269
+ "loss": 0.2913,
1270
+ "step": 15700
1271
+ },
1272
+ {
1273
+ "epoch": 18.829311885612153,
1274
+ "grad_norm": 1.429527759552002,
1275
+ "learning_rate": 0.0001,
1276
+ "loss": 0.2977,
1277
+ "step": 15800
1278
+ },
1279
+ {
1280
+ "epoch": 18.948465892165625,
1281
+ "grad_norm": 1.8058485984802246,
1282
+ "learning_rate": 0.0001,
1283
+ "loss": 0.2957,
1284
+ "step": 15900
1285
+ },
1286
+ {
1287
+ "epoch": 18.999702114983616,
1288
+ "eval_accuracy": 0.7885053995680346,
1289
+ "eval_loss": 0.7864968776702881,
1290
+ "eval_runtime": 7.7324,
1291
+ "eval_samples_per_second": 64.663,
1292
+ "eval_steps_per_second": 8.148,
1293
+ "step": 15943
1294
+ },
1295
+ {
1296
+ "epoch": 19.067619898719094,
1297
+ "grad_norm": 1.3445817232131958,
1298
+ "learning_rate": 0.0001,
1299
+ "loss": 0.2748,
1300
+ "step": 16000
1301
+ },
1302
+ {
1303
+ "epoch": 19.186773905272563,
1304
+ "grad_norm": 1.358742356300354,
1305
+ "learning_rate": 0.0001,
1306
+ "loss": 0.2494,
1307
+ "step": 16100
1308
+ },
1309
+ {
1310
+ "epoch": 19.305927911826036,
1311
+ "grad_norm": 1.2323551177978516,
1312
+ "learning_rate": 0.0001,
1313
+ "loss": 0.255,
1314
+ "step": 16200
1315
+ },
1316
+ {
1317
+ "epoch": 19.425081918379504,
1318
+ "grad_norm": 1.210010290145874,
1319
+ "learning_rate": 0.0001,
1320
+ "loss": 0.2592,
1321
+ "step": 16300
1322
+ },
1323
+ {
1324
+ "epoch": 19.544235924932977,
1325
+ "grad_norm": 1.4125585556030273,
1326
+ "learning_rate": 0.0001,
1327
+ "loss": 0.2664,
1328
+ "step": 16400
1329
+ },
1330
+ {
1331
+ "epoch": 19.663389931486446,
1332
+ "grad_norm": 1.5337769985198975,
1333
+ "learning_rate": 0.0001,
1334
+ "loss": 0.2741,
1335
+ "step": 16500
1336
+ },
1337
+ {
1338
+ "epoch": 19.782543938039918,
1339
+ "grad_norm": 1.4104398488998413,
1340
+ "learning_rate": 0.0001,
1341
+ "loss": 0.277,
1342
+ "step": 16600
1343
+ },
1344
+ {
1345
+ "epoch": 19.901697944593387,
1346
+ "grad_norm": 1.6847435235977173,
1347
+ "learning_rate": 0.0001,
1348
+ "loss": 0.2858,
1349
+ "step": 16700
1350
+ },
1351
+ {
1352
+ "epoch": 19.997021149836165,
1353
+ "eval_accuracy": 0.7890842332613391,
1354
+ "eval_loss": 0.7825167179107666,
1355
+ "eval_runtime": 7.7619,
1356
+ "eval_samples_per_second": 64.417,
1357
+ "eval_steps_per_second": 8.117,
1358
+ "step": 16780
1359
+ },
1360
+ {
1361
+ "epoch": 19.997021149836165,
1362
+ "step": 16780,
1363
+ "total_flos": 1.3732763132881797e+18,
1364
+ "train_loss": 0.18793870911126484,
1365
+ "train_runtime": 19785.1606,
1366
+ "train_samples_per_second": 27.146,
1367
+ "train_steps_per_second": 0.848
     }
   ],
   "logging_steps": 100,
-  "max_steps": 8390,
+  "max_steps": 16780,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 10,
+  "num_train_epochs": 20,
   "save_steps": 500,
-  "total_flos": 6.866381543623885e+17,
+  "total_flos": 1.3732763132881797e+18,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null