kiddothe2b commited on
Commit
5d8ab3b
1 Parent(s): cb30f37

Training in progress, step 19200

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c8c311bed380f6c5231042dd3172757e53d32d3926ff696064a6f7e652b2260
3
  size 745634697
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d041fc9e6d55c4a7915f0599d0972686813610cfd5a2d83bd76580f7087c5ca
3
  size 745634697
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:01755e0768402bffb76f967e33df76e23d5b263c52bad4b9110c9a221b45c611
3
  size 372832803
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7a084a9b3d69be038d1f70310127204c769b6f31132335d3c43f2359a442b86
3
  size 372832803
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c6f9685929a5db844ce472a185dad9d0c6482918c842f5a9b7670626b6da045
3
  size 15523
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac85050f1de5a3da93b15d68ec19d08f9c128973d47940d52332ce7a8a430098
3
  size 15523
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a63c18679f872f561021a84d9bfcd3fad0c807bcef87d1a807b9818f9895c1f
3
  size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12a6154fa53f0286557ec7a9b6bf6b9f5b2fb01f4345510fa7b96c5e44005857
3
  size 623
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.2,
5
- "global_step": 12800,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -792,11 +792,404 @@
792
  "eval_samples_per_second": 45.406,
793
  "eval_steps_per_second": 2.838,
794
  "step": 12800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
795
  }
796
  ],
797
  "max_steps": 64000,
798
  "num_train_epochs": 9223372036854775807,
799
- "total_flos": 6.76983528751104e+16,
800
  "trial_name": null,
801
  "trial_params": null
802
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.3,
5
+ "global_step": 19200,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
792
  "eval_samples_per_second": 45.406,
793
  "eval_steps_per_second": 2.838,
794
  "step": 12800
795
+ },
796
+ {
797
+ "epoch": 0.2,
798
+ "learning_rate": 0.001,
799
+ "loss": 8.0474,
800
+ "step": 12900
801
+ },
802
+ {
803
+ "epoch": 0.2,
804
+ "learning_rate": 0.001,
805
+ "loss": 8.054,
806
+ "step": 13000
807
+ },
808
+ {
809
+ "epoch": 0.2,
810
+ "learning_rate": 0.001,
811
+ "loss": 8.0506,
812
+ "step": 13100
813
+ },
814
+ {
815
+ "epoch": 0.21,
816
+ "learning_rate": 0.001,
817
+ "loss": 8.0485,
818
+ "step": 13200
819
+ },
820
+ {
821
+ "epoch": 0.21,
822
+ "learning_rate": 0.001,
823
+ "loss": 8.0485,
824
+ "step": 13300
825
+ },
826
+ {
827
+ "epoch": 0.21,
828
+ "learning_rate": 0.001,
829
+ "loss": 8.0515,
830
+ "step": 13400
831
+ },
832
+ {
833
+ "epoch": 0.21,
834
+ "learning_rate": 0.001,
835
+ "loss": 8.0295,
836
+ "step": 13500
837
+ },
838
+ {
839
+ "epoch": 0.21,
840
+ "learning_rate": 0.001,
841
+ "loss": 8.0456,
842
+ "step": 13600
843
+ },
844
+ {
845
+ "epoch": 0.21,
846
+ "learning_rate": 0.001,
847
+ "loss": 8.0278,
848
+ "step": 13700
849
+ },
850
+ {
851
+ "epoch": 0.22,
852
+ "learning_rate": 0.001,
853
+ "loss": 8.0358,
854
+ "step": 13800
855
+ },
856
+ {
857
+ "epoch": 0.22,
858
+ "learning_rate": 0.001,
859
+ "loss": 8.0513,
860
+ "step": 13900
861
+ },
862
+ {
863
+ "epoch": 0.22,
864
+ "learning_rate": 0.001,
865
+ "loss": 8.0443,
866
+ "step": 14000
867
+ },
868
+ {
869
+ "epoch": 0.22,
870
+ "learning_rate": 0.001,
871
+ "loss": 8.0657,
872
+ "step": 14100
873
+ },
874
+ {
875
+ "epoch": 0.22,
876
+ "learning_rate": 0.001,
877
+ "loss": 8.0381,
878
+ "step": 14200
879
+ },
880
+ {
881
+ "epoch": 0.22,
882
+ "learning_rate": 0.001,
883
+ "loss": 8.047,
884
+ "step": 14300
885
+ },
886
+ {
887
+ "epoch": 0.23,
888
+ "learning_rate": 0.001,
889
+ "loss": 8.0441,
890
+ "step": 14400
891
+ },
892
+ {
893
+ "epoch": 0.23,
894
+ "learning_rate": 0.001,
895
+ "loss": 8.0293,
896
+ "step": 14500
897
+ },
898
+ {
899
+ "epoch": 0.23,
900
+ "learning_rate": 0.001,
901
+ "loss": 8.0308,
902
+ "step": 14600
903
+ },
904
+ {
905
+ "epoch": 0.23,
906
+ "learning_rate": 0.001,
907
+ "loss": 8.0353,
908
+ "step": 14700
909
+ },
910
+ {
911
+ "epoch": 0.23,
912
+ "learning_rate": 0.001,
913
+ "loss": 8.0098,
914
+ "step": 14800
915
+ },
916
+ {
917
+ "epoch": 0.23,
918
+ "learning_rate": 0.001,
919
+ "loss": 8.0422,
920
+ "step": 14900
921
+ },
922
+ {
923
+ "epoch": 0.23,
924
+ "learning_rate": 0.001,
925
+ "loss": 8.0427,
926
+ "step": 15000
927
+ },
928
+ {
929
+ "epoch": 0.24,
930
+ "learning_rate": 0.001,
931
+ "loss": 8.0323,
932
+ "step": 15100
933
+ },
934
+ {
935
+ "epoch": 0.24,
936
+ "learning_rate": 0.001,
937
+ "loss": 8.035,
938
+ "step": 15200
939
+ },
940
+ {
941
+ "epoch": 0.24,
942
+ "learning_rate": 0.001,
943
+ "loss": 8.0574,
944
+ "step": 15300
945
+ },
946
+ {
947
+ "epoch": 0.24,
948
+ "learning_rate": 0.001,
949
+ "loss": 8.0365,
950
+ "step": 15400
951
+ },
952
+ {
953
+ "epoch": 0.24,
954
+ "learning_rate": 0.001,
955
+ "loss": 8.0341,
956
+ "step": 15500
957
+ },
958
+ {
959
+ "epoch": 0.24,
960
+ "learning_rate": 0.001,
961
+ "loss": 8.0484,
962
+ "step": 15600
963
+ },
964
+ {
965
+ "epoch": 0.25,
966
+ "learning_rate": 0.001,
967
+ "loss": 8.0409,
968
+ "step": 15700
969
+ },
970
+ {
971
+ "epoch": 0.25,
972
+ "learning_rate": 0.001,
973
+ "loss": 8.0363,
974
+ "step": 15800
975
+ },
976
+ {
977
+ "epoch": 0.25,
978
+ "learning_rate": 0.001,
979
+ "loss": 8.0415,
980
+ "step": 15900
981
+ },
982
+ {
983
+ "epoch": 0.25,
984
+ "learning_rate": 0.001,
985
+ "loss": 8.043,
986
+ "step": 16000
987
+ },
988
+ {
989
+ "epoch": 0.25,
990
+ "learning_rate": 0.001,
991
+ "loss": 8.0307,
992
+ "step": 16100
993
+ },
994
+ {
995
+ "epoch": 0.25,
996
+ "learning_rate": 0.001,
997
+ "loss": 8.0179,
998
+ "step": 16200
999
+ },
1000
+ {
1001
+ "epoch": 0.25,
1002
+ "learning_rate": 0.001,
1003
+ "loss": 8.0279,
1004
+ "step": 16300
1005
+ },
1006
+ {
1007
+ "epoch": 0.26,
1008
+ "learning_rate": 0.001,
1009
+ "loss": 8.0552,
1010
+ "step": 16400
1011
+ },
1012
+ {
1013
+ "epoch": 0.26,
1014
+ "learning_rate": 0.001,
1015
+ "loss": 8.0437,
1016
+ "step": 16500
1017
+ },
1018
+ {
1019
+ "epoch": 0.26,
1020
+ "learning_rate": 0.001,
1021
+ "loss": 8.0356,
1022
+ "step": 16600
1023
+ },
1024
+ {
1025
+ "epoch": 0.26,
1026
+ "learning_rate": 0.001,
1027
+ "loss": 8.0427,
1028
+ "step": 16700
1029
+ },
1030
+ {
1031
+ "epoch": 0.26,
1032
+ "learning_rate": 0.001,
1033
+ "loss": 8.0472,
1034
+ "step": 16800
1035
+ },
1036
+ {
1037
+ "epoch": 0.26,
1038
+ "learning_rate": 0.001,
1039
+ "loss": 8.043,
1040
+ "step": 16900
1041
+ },
1042
+ {
1043
+ "epoch": 0.27,
1044
+ "learning_rate": 0.001,
1045
+ "loss": 8.0567,
1046
+ "step": 17000
1047
+ },
1048
+ {
1049
+ "epoch": 0.27,
1050
+ "learning_rate": 0.001,
1051
+ "loss": 8.0474,
1052
+ "step": 17100
1053
+ },
1054
+ {
1055
+ "epoch": 0.27,
1056
+ "learning_rate": 0.001,
1057
+ "loss": 8.0295,
1058
+ "step": 17200
1059
+ },
1060
+ {
1061
+ "epoch": 0.27,
1062
+ "learning_rate": 0.001,
1063
+ "loss": 8.0364,
1064
+ "step": 17300
1065
+ },
1066
+ {
1067
+ "epoch": 0.27,
1068
+ "learning_rate": 0.001,
1069
+ "loss": 8.0474,
1070
+ "step": 17400
1071
+ },
1072
+ {
1073
+ "epoch": 0.27,
1074
+ "learning_rate": 0.001,
1075
+ "loss": 8.0282,
1076
+ "step": 17500
1077
+ },
1078
+ {
1079
+ "epoch": 0.28,
1080
+ "learning_rate": 0.001,
1081
+ "loss": 8.0525,
1082
+ "step": 17600
1083
+ },
1084
+ {
1085
+ "epoch": 0.28,
1086
+ "learning_rate": 0.001,
1087
+ "loss": 8.0464,
1088
+ "step": 17700
1089
+ },
1090
+ {
1091
+ "epoch": 0.28,
1092
+ "learning_rate": 0.001,
1093
+ "loss": 8.0516,
1094
+ "step": 17800
1095
+ },
1096
+ {
1097
+ "epoch": 0.28,
1098
+ "learning_rate": 0.001,
1099
+ "loss": 8.0322,
1100
+ "step": 17900
1101
+ },
1102
+ {
1103
+ "epoch": 0.28,
1104
+ "learning_rate": 0.001,
1105
+ "loss": 8.0376,
1106
+ "step": 18000
1107
+ },
1108
+ {
1109
+ "epoch": 0.28,
1110
+ "learning_rate": 0.001,
1111
+ "loss": 8.0235,
1112
+ "step": 18100
1113
+ },
1114
+ {
1115
+ "epoch": 0.28,
1116
+ "learning_rate": 0.001,
1117
+ "loss": 8.0316,
1118
+ "step": 18200
1119
+ },
1120
+ {
1121
+ "epoch": 0.29,
1122
+ "learning_rate": 0.001,
1123
+ "loss": 8.0319,
1124
+ "step": 18300
1125
+ },
1126
+ {
1127
+ "epoch": 0.29,
1128
+ "learning_rate": 0.001,
1129
+ "loss": 8.0393,
1130
+ "step": 18400
1131
+ },
1132
+ {
1133
+ "epoch": 0.29,
1134
+ "learning_rate": 0.001,
1135
+ "loss": 8.0248,
1136
+ "step": 18500
1137
+ },
1138
+ {
1139
+ "epoch": 0.29,
1140
+ "learning_rate": 0.001,
1141
+ "loss": 8.0463,
1142
+ "step": 18600
1143
+ },
1144
+ {
1145
+ "epoch": 0.29,
1146
+ "learning_rate": 0.001,
1147
+ "loss": 8.024,
1148
+ "step": 18700
1149
+ },
1150
+ {
1151
+ "epoch": 0.29,
1152
+ "learning_rate": 0.001,
1153
+ "loss": 8.0062,
1154
+ "step": 18800
1155
+ },
1156
+ {
1157
+ "epoch": 0.3,
1158
+ "learning_rate": 0.001,
1159
+ "loss": 8.0302,
1160
+ "step": 18900
1161
+ },
1162
+ {
1163
+ "epoch": 0.3,
1164
+ "learning_rate": 0.001,
1165
+ "loss": 8.054,
1166
+ "step": 19000
1167
+ },
1168
+ {
1169
+ "epoch": 0.3,
1170
+ "learning_rate": 0.001,
1171
+ "loss": 8.0448,
1172
+ "step": 19100
1173
+ },
1174
+ {
1175
+ "epoch": 0.3,
1176
+ "learning_rate": 0.001,
1177
+ "loss": 8.0465,
1178
+ "step": 19200
1179
+ },
1180
+ {
1181
+ "epoch": 0.3,
1182
+ "eval_accuracy": 0.03331597495163198,
1183
+ "eval_loss": 8.039088249206543,
1184
+ "eval_runtime": 7133.5004,
1185
+ "eval_samples_per_second": 45.96,
1186
+ "eval_steps_per_second": 2.873,
1187
+ "step": 19200
1188
  }
1189
  ],
1190
  "max_steps": 64000,
1191
  "num_train_epochs": 9223372036854775807,
1192
+ "total_flos": 1.015475293126656e+17,
1193
  "trial_name": null,
1194
  "trial_params": null
1195
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:01755e0768402bffb76f967e33df76e23d5b263c52bad4b9110c9a221b45c611
3
  size 372832803
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7a084a9b3d69be038d1f70310127204c769b6f31132335d3c43f2359a442b86
3
  size 372832803