MeedoSam committed on
Commit
cdb1cf0
1 Parent(s): 5116de3

Uploaded checkpoint-1500

Browse files
Files changed (5) hide show
  1. model.safetensors +1 -1
  2. optimizer.pt +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +361 -3
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a8911772aaa706974b723f2bf7d3b98b7c2ae73c0dbc7dddde6a2d848e652d94
3
  size 2836579040
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d10c905060a8fb9799b74c25d481c34611c9de9b480817812d33857c534c228a
3
  size 2836579040
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6570b389af75084935ca1b3962d21e1284890e5644db6f6a2f34573d9091689c
3
  size 5673376169
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:977e579e2ef1ecdd4b4a2a07787051dc204b437a958f6bb3774f7b810fdf5fba
3
  size 5673376169
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d532e7d9583dca1ddde0e710f735c5380d765e13138b2f1a520634f9ce1c4336
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba1ab4afddc1b42e59e07ab68c21af6f77eaf33ebe920a8a196bc4d34ce85d64
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e7730c869a76c8e036e0c188b1763b2a2fec511ae6277c0d9eb703a1bcc3fee9
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b5ba5f4da1b25af3b36501cdb417cae3a94cdecabb4332617c829913a9d9c0a
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.05,
5
  "eval_steps": 500,
6
- "global_step": 1000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -723,6 +723,364 @@
723
  "eval_samples_per_second": 15.395,
724
  "eval_steps_per_second": 15.395,
725
  "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
726
  }
727
  ],
728
  "logging_steps": 10,
@@ -730,7 +1088,7 @@
730
  "num_input_tokens_seen": 0,
731
  "num_train_epochs": 1,
732
  "save_steps": 500,
733
- "total_flos": 1.613922041856e+16,
734
  "train_batch_size": 1,
735
  "trial_name": null,
736
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.075,
5
  "eval_steps": 500,
6
+ "global_step": 1500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
723
  "eval_samples_per_second": 15.395,
724
  "eval_steps_per_second": 15.395,
725
  "step": 1000
726
+ },
727
+ {
728
+ "epoch": 0.05,
729
+ "grad_norm": 65.0,
730
+ "learning_rate": 7.260000000000001e-07,
731
+ "loss": 1.2767,
732
+ "step": 1010
733
+ },
734
+ {
735
+ "epoch": 0.05,
736
+ "grad_norm": 63.25,
737
+ "learning_rate": 7.186666666666667e-07,
738
+ "loss": 1.2266,
739
+ "step": 1020
740
+ },
741
+ {
742
+ "epoch": 0.05,
743
+ "grad_norm": 63.75,
744
+ "learning_rate": 7.113333333333333e-07,
745
+ "loss": 1.2458,
746
+ "step": 1030
747
+ },
748
+ {
749
+ "epoch": 0.05,
750
+ "grad_norm": 63.0,
751
+ "learning_rate": 7.040000000000001e-07,
752
+ "loss": 1.1768,
753
+ "step": 1040
754
+ },
755
+ {
756
+ "epoch": 0.05,
757
+ "grad_norm": 62.75,
758
+ "learning_rate": 6.966666666666666e-07,
759
+ "loss": 1.1844,
760
+ "step": 1050
761
+ },
762
+ {
763
+ "epoch": 0.05,
764
+ "grad_norm": 62.25,
765
+ "learning_rate": 6.893333333333334e-07,
766
+ "loss": 1.186,
767
+ "step": 1060
768
+ },
769
+ {
770
+ "epoch": 0.05,
771
+ "grad_norm": 62.75,
772
+ "learning_rate": 6.82e-07,
773
+ "loss": 1.1551,
774
+ "step": 1070
775
+ },
776
+ {
777
+ "epoch": 0.05,
778
+ "grad_norm": 60.25,
779
+ "learning_rate": 6.746666666666667e-07,
780
+ "loss": 1.1218,
781
+ "step": 1080
782
+ },
783
+ {
784
+ "epoch": 0.05,
785
+ "grad_norm": 61.25,
786
+ "learning_rate": 6.673333333333334e-07,
787
+ "loss": 1.0962,
788
+ "step": 1090
789
+ },
790
+ {
791
+ "epoch": 0.06,
792
+ "grad_norm": 60.5,
793
+ "learning_rate": 6.6e-07,
794
+ "loss": 1.1204,
795
+ "step": 1100
796
+ },
797
+ {
798
+ "epoch": 0.06,
799
+ "grad_norm": 58.25,
800
+ "learning_rate": 6.526666666666667e-07,
801
+ "loss": 1.0833,
802
+ "step": 1110
803
+ },
804
+ {
805
+ "epoch": 0.06,
806
+ "grad_norm": 57.25,
807
+ "learning_rate": 6.453333333333334e-07,
808
+ "loss": 1.0743,
809
+ "step": 1120
810
+ },
811
+ {
812
+ "epoch": 0.06,
813
+ "grad_norm": 60.25,
814
+ "learning_rate": 6.38e-07,
815
+ "loss": 1.0764,
816
+ "step": 1130
817
+ },
818
+ {
819
+ "epoch": 0.06,
820
+ "grad_norm": 56.0,
821
+ "learning_rate": 6.306666666666668e-07,
822
+ "loss": 1.0315,
823
+ "step": 1140
824
+ },
825
+ {
826
+ "epoch": 0.06,
827
+ "grad_norm": 59.25,
828
+ "learning_rate": 6.233333333333333e-07,
829
+ "loss": 1.0791,
830
+ "step": 1150
831
+ },
832
+ {
833
+ "epoch": 0.06,
834
+ "grad_norm": 60.0,
835
+ "learning_rate": 6.160000000000001e-07,
836
+ "loss": 1.0443,
837
+ "step": 1160
838
+ },
839
+ {
840
+ "epoch": 0.06,
841
+ "grad_norm": 60.75,
842
+ "learning_rate": 6.086666666666667e-07,
843
+ "loss": 1.0472,
844
+ "step": 1170
845
+ },
846
+ {
847
+ "epoch": 0.06,
848
+ "grad_norm": 59.75,
849
+ "learning_rate": 6.013333333333334e-07,
850
+ "loss": 1.0422,
851
+ "step": 1180
852
+ },
853
+ {
854
+ "epoch": 0.06,
855
+ "grad_norm": 58.5,
856
+ "learning_rate": 5.94e-07,
857
+ "loss": 1.051,
858
+ "step": 1190
859
+ },
860
+ {
861
+ "epoch": 0.06,
862
+ "grad_norm": 57.25,
863
+ "learning_rate": 5.866666666666667e-07,
864
+ "loss": 1.0104,
865
+ "step": 1200
866
+ },
867
+ {
868
+ "epoch": 0.06,
869
+ "grad_norm": 58.5,
870
+ "learning_rate": 5.793333333333333e-07,
871
+ "loss": 1.0429,
872
+ "step": 1210
873
+ },
874
+ {
875
+ "epoch": 0.06,
876
+ "grad_norm": 60.25,
877
+ "learning_rate": 5.720000000000001e-07,
878
+ "loss": 1.0135,
879
+ "step": 1220
880
+ },
881
+ {
882
+ "epoch": 0.06,
883
+ "grad_norm": 58.0,
884
+ "learning_rate": 5.646666666666667e-07,
885
+ "loss": 1.0441,
886
+ "step": 1230
887
+ },
888
+ {
889
+ "epoch": 0.06,
890
+ "grad_norm": 57.25,
891
+ "learning_rate": 5.573333333333335e-07,
892
+ "loss": 1.0202,
893
+ "step": 1240
894
+ },
895
+ {
896
+ "epoch": 0.06,
897
+ "grad_norm": 54.5,
898
+ "learning_rate": 5.5e-07,
899
+ "loss": 0.9915,
900
+ "step": 1250
901
+ },
902
+ {
903
+ "epoch": 0.06,
904
+ "grad_norm": 56.75,
905
+ "learning_rate": 5.426666666666667e-07,
906
+ "loss": 1.0085,
907
+ "step": 1260
908
+ },
909
+ {
910
+ "epoch": 0.06,
911
+ "grad_norm": 58.75,
912
+ "learning_rate": 5.353333333333334e-07,
913
+ "loss": 1.0114,
914
+ "step": 1270
915
+ },
916
+ {
917
+ "epoch": 0.06,
918
+ "grad_norm": 56.0,
919
+ "learning_rate": 5.28e-07,
920
+ "loss": 1.0092,
921
+ "step": 1280
922
+ },
923
+ {
924
+ "epoch": 0.06,
925
+ "grad_norm": 54.25,
926
+ "learning_rate": 5.206666666666666e-07,
927
+ "loss": 0.9634,
928
+ "step": 1290
929
+ },
930
+ {
931
+ "epoch": 0.07,
932
+ "grad_norm": 54.0,
933
+ "learning_rate": 5.133333333333333e-07,
934
+ "loss": 0.9588,
935
+ "step": 1300
936
+ },
937
+ {
938
+ "epoch": 0.07,
939
+ "grad_norm": 56.5,
940
+ "learning_rate": 5.06e-07,
941
+ "loss": 0.9885,
942
+ "step": 1310
943
+ },
944
+ {
945
+ "epoch": 0.07,
946
+ "grad_norm": 55.75,
947
+ "learning_rate": 4.986666666666667e-07,
948
+ "loss": 1.024,
949
+ "step": 1320
950
+ },
951
+ {
952
+ "epoch": 0.07,
953
+ "grad_norm": 54.5,
954
+ "learning_rate": 4.913333333333334e-07,
955
+ "loss": 0.9812,
956
+ "step": 1330
957
+ },
958
+ {
959
+ "epoch": 0.07,
960
+ "grad_norm": 55.5,
961
+ "learning_rate": 4.84e-07,
962
+ "loss": 0.9859,
963
+ "step": 1340
964
+ },
965
+ {
966
+ "epoch": 0.07,
967
+ "grad_norm": 57.5,
968
+ "learning_rate": 4.766666666666667e-07,
969
+ "loss": 0.9629,
970
+ "step": 1350
971
+ },
972
+ {
973
+ "epoch": 0.07,
974
+ "grad_norm": 54.25,
975
+ "learning_rate": 4.693333333333334e-07,
976
+ "loss": 0.9386,
977
+ "step": 1360
978
+ },
979
+ {
980
+ "epoch": 0.07,
981
+ "grad_norm": 54.75,
982
+ "learning_rate": 4.62e-07,
983
+ "loss": 1.0033,
984
+ "step": 1370
985
+ },
986
+ {
987
+ "epoch": 0.07,
988
+ "grad_norm": 56.5,
989
+ "learning_rate": 4.5466666666666666e-07,
990
+ "loss": 0.9642,
991
+ "step": 1380
992
+ },
993
+ {
994
+ "epoch": 0.07,
995
+ "grad_norm": 53.5,
996
+ "learning_rate": 4.4733333333333334e-07,
997
+ "loss": 0.9745,
998
+ "step": 1390
999
+ },
1000
+ {
1001
+ "epoch": 0.07,
1002
+ "grad_norm": 54.75,
1003
+ "learning_rate": 4.4e-07,
1004
+ "loss": 0.9625,
1005
+ "step": 1400
1006
+ },
1007
+ {
1008
+ "epoch": 0.07,
1009
+ "grad_norm": 56.25,
1010
+ "learning_rate": 4.3266666666666665e-07,
1011
+ "loss": 0.9617,
1012
+ "step": 1410
1013
+ },
1014
+ {
1015
+ "epoch": 0.07,
1016
+ "grad_norm": 56.5,
1017
+ "learning_rate": 4.2533333333333333e-07,
1018
+ "loss": 0.9724,
1019
+ "step": 1420
1020
+ },
1021
+ {
1022
+ "epoch": 0.07,
1023
+ "grad_norm": 54.5,
1024
+ "learning_rate": 4.18e-07,
1025
+ "loss": 0.9962,
1026
+ "step": 1430
1027
+ },
1028
+ {
1029
+ "epoch": 0.07,
1030
+ "grad_norm": 53.0,
1031
+ "learning_rate": 4.106666666666667e-07,
1032
+ "loss": 0.9469,
1033
+ "step": 1440
1034
+ },
1035
+ {
1036
+ "epoch": 0.07,
1037
+ "grad_norm": 54.75,
1038
+ "learning_rate": 4.033333333333333e-07,
1039
+ "loss": 0.9777,
1040
+ "step": 1450
1041
+ },
1042
+ {
1043
+ "epoch": 0.07,
1044
+ "grad_norm": 57.0,
1045
+ "learning_rate": 3.96e-07,
1046
+ "loss": 0.9498,
1047
+ "step": 1460
1048
+ },
1049
+ {
1050
+ "epoch": 0.07,
1051
+ "grad_norm": 57.0,
1052
+ "learning_rate": 3.886666666666667e-07,
1053
+ "loss": 0.9229,
1054
+ "step": 1470
1055
+ },
1056
+ {
1057
+ "epoch": 0.07,
1058
+ "grad_norm": 57.5,
1059
+ "learning_rate": 3.8133333333333336e-07,
1060
+ "loss": 0.9686,
1061
+ "step": 1480
1062
+ },
1063
+ {
1064
+ "epoch": 0.07,
1065
+ "grad_norm": 59.0,
1066
+ "learning_rate": 3.7400000000000004e-07,
1067
+ "loss": 0.9718,
1068
+ "step": 1490
1069
+ },
1070
+ {
1071
+ "epoch": 0.07,
1072
+ "grad_norm": 55.0,
1073
+ "learning_rate": 3.6666666666666667e-07,
1074
+ "loss": 0.9614,
1075
+ "step": 1500
1076
+ },
1077
+ {
1078
+ "epoch": 0.07,
1079
+ "eval_loss": 0.9728732705116272,
1080
+ "eval_runtime": 65.1341,
1081
+ "eval_samples_per_second": 15.353,
1082
+ "eval_steps_per_second": 15.353,
1083
+ "step": 1500
1084
  }
1085
  ],
1086
  "logging_steps": 10,
 
1088
  "num_input_tokens_seen": 0,
1089
  "num_train_epochs": 1,
1090
  "save_steps": 500,
1091
+ "total_flos": 2.420883062784e+16,
1092
  "train_batch_size": 1,
1093
  "trial_name": null,
1094
  "trial_params": null