Bingsu committed
Commit f51794d
1 parent: c70d807

Training in progress, step 40000

last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:991640a131f2a0a32a17ba1af542f31b7776932281bd0a73639dd3a4960e3a40
+oid sha256:88a75430873048d1c55c1dfe2ccd053acab1fd15a8dcedf646948c68737b2e5a
 size 100170757
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c10b0dd9b3e24c2c1ca2db9a9e924f901a4d183202a5c32479436a975f462f9d
+oid sha256:484db98ddb6ed91b4ea5f15498a85d452cda43038e8130ac0e093318895988c2
 size 146774203
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f754532050c5b1775c36eee5da06c337e5bc03296f22630efbd5a1c263b25446
+oid sha256:88f2a279490e0c3b8efb66c61c4617ccf8fdba17e1e5c57bdf6bbd4c4665d937
 size 14503
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dd2f5c3de2046b6ec35a993f60879cf7288b2cb7906fdbd23f2869d9429fbe1b
+oid sha256:b8a8e57edadd6965f548754431358c0af914fc344d18e1de094a5af2d8025b1c
 size 246897640
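
The four files above are Git LFS pointer stubs: the oid sha256: line records the SHA-256 digest of the real binary, so this commit only swaps the old digest for the new one while the reported size stays the same. A minimal sketch for checking a locally fetched file against its pointer (assuming the actual binaries have been pulled, e.g. with git lfs pull):

    import hashlib

    def lfs_oid(path: str) -> str:
        """Return the SHA-256 digest that a Git LFS pointer records as its oid."""
        digest = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                digest.update(chunk)
        return digest.hexdigest()

    # The result should match the new oid shown above for optimizer.pt.
    print(lfs_oid("last-checkpoint/optimizer.pt"))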
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.1289213579716373,
-  "global_step": 30000,
+  "epoch": 0.17189514396218306,
+  "global_step": 40000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -906,11 +906,311 @@
       "learning_rate": 0.00013167302452368236,
       "loss": 3.5359,
       "step": 30000
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 0.0001328582549574664,
+      "loss": 3.5252,
+      "step": 30200
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 0.00013405027824677038,
+      "loss": 3.5104,
+      "step": 30400
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 0.00013524907347614926,
+      "loss": 3.5253,
+      "step": 30600
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 0.00013645461961133603,
+      "loss": 3.5249,
+      "step": 30800
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 0.00013766689549961136,
+      "loss": 3.5374,
+      "step": 31000
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 0.00013888587987017427,
+      "loss": 3.5093,
+      "step": 31200
+    },
+    {
+      "epoch": 0.13,
+      "learning_rate": 0.00014011155133451586,
+      "loss": 3.5465,
+      "step": 31400
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 0.00014134388838679408,
+      "loss": 3.5362,
+      "step": 31600
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 0.00014258286940421164,
+      "loss": 3.5233,
+      "step": 31800
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 0.00014382847264739456,
+      "loss": 3.5425,
+      "step": 32000
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 0.00014508067626077482,
+      "loss": 3.5178,
+      "step": 32200
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 0.00014633945827297273,
+      "loss": 3.5288,
+      "step": 32400
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 0.00014760479659718304,
+      "loss": 3.52,
+      "step": 32600
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 0.00014887666903156218,
+      "loss": 3.5121,
+      "step": 32800
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 0.0001501550532596183,
+      "loss": 3.5191,
+      "step": 33000
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 0.00015143992685060208,
+      "loss": 3.5097,
+      "step": 33200
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 0.00015273126725990098,
+      "loss": 3.5034,
+      "step": 33400
+    },
+    {
+      "epoch": 0.14,
+      "learning_rate": 0.00015402905182943438,
+      "loss": 3.484,
+      "step": 33600
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 0.00015533325778805166,
+      "loss": 3.5111,
+      "step": 33800
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 0.00015664386225193092,
+      "loss": 3.5041,
+      "step": 34000
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 0.00015796084222498145,
+      "loss": 3.5126,
+      "step": 34200
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 0.0001592841745992464,
+      "loss": 3.4939,
+      "step": 34400
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 0.00016061383615530893,
+      "loss": 3.4728,
+      "step": 34600
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 0.0001619498035626989,
+      "loss": 3.4671,
+      "step": 34800
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 0.00016329205338030289,
+      "loss": 3.5058,
+      "step": 35000
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 0.00016464056205677522,
+      "loss": 3.4973,
+      "step": 35200
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 0.0001659953059309513,
+      "loss": 3.488,
+      "step": 35400
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 0.00016735626123226218,
+      "loss": 3.4953,
+      "step": 35600
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 0.00016872340408115283,
+      "loss": 3.4772,
+      "step": 35800
+    },
+    {
+      "epoch": 0.15,
+      "learning_rate": 0.00017009671048950003,
+      "loss": 3.4866,
+      "step": 36000
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 0.00017147615636103365,
+      "loss": 3.4752,
+      "step": 36200
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 0.00017286171749175986,
+      "loss": 3.4878,
+      "step": 36400
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 0.0001742533695703849,
+      "loss": 3.468,
+      "step": 36600
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 0.0001756510881787427,
+      "loss": 3.4708,
+      "step": 36800
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 0.00017705484879222265,
+      "loss": 3.4879,
+      "step": 37000
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 0.0001784646267801997,
+      "loss": 3.4787,
+      "step": 37200
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 0.00017988039740646764,
+      "loss": 3.4673,
+      "step": 37400
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 0.00018130213582967188,
+      "loss": 3.4794,
+      "step": 37600
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 0.00018272981710374596,
+      "loss": 3.4561,
+      "step": 37800
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 0.00018416341617834915,
+      "loss": 3.4599,
+      "step": 38000
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 0.00018560290789930596,
+      "loss": 3.454,
+      "step": 38200
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 0.00018704826700904756,
+      "loss": 3.4628,
+      "step": 38400
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 0.00018849946814705483,
+      "loss": 3.4557,
+      "step": 38600
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 0.0001899564858503036,
+      "loss": 3.4584,
+      "step": 38800
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 0.00019141929455371092,
+      "loss": 3.4492,
+      "step": 39000
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 0.00019288786859058442,
+      "loss": 3.4641,
+      "step": 39200
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 0.00019436218219307173,
+      "loss": 3.4665,
+      "step": 39400
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 0.00019584220949261325,
+      "loss": 3.4503,
+      "step": 39600
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 0.00019732792452039607,
+      "loss": 3.4438,
+      "step": 39800
+    },
+    {
+      "epoch": 0.17,
+      "learning_rate": 0.00019881930120780906,
+      "loss": 3.4454,
+      "step": 40000
     }
   ],
   "max_steps": 500000,
   "num_train_epochs": 3,
-  "total_flos": 4.781489946624e+16,
+  "total_flos": 6.375319928832e+16,
   "trial_name": null,
   "trial_params": null
 }
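
trainer_state.json tracks training progress: the top-level epoch, global_step and total_flos fields updated by this commit, plus a log_history list with one entry (epoch, learning_rate, loss, step) per logging interval. A minimal sketch for inspecting the committed state locally (path assumes the checkpoint directory is checked out):

    import json

    # Load the training state saved alongside the step-40000 checkpoint.
    with open("last-checkpoint/trainer_state.json") as f:
        state = json.load(f)

    # Top-level progress fields, as updated by this commit.
    print(state["global_step"], state["epoch"], state["total_flos"])

    # The most recent log_history entries: step, loss and learning rate.
    for entry in state["log_history"][-3:]:
        print(entry["step"], entry["loss"], entry["learning_rate"])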
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c10b0dd9b3e24c2c1ca2db9a9e924f901a4d183202a5c32479436a975f462f9d
+oid sha256:484db98ddb6ed91b4ea5f15498a85d452cda43038e8130ac0e093318895988c2
 size 146774203