wcyat committed
Commit 74b9ee9 · verified · 1 Parent(s): 1a234a1

Training in progress, step 1500, checkpoint

last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c6efd0944d3d8e93526a7d42677046f68dceb25c5d54a4154f27e3b3d5f2710b
+ oid sha256:ee4e2d84e38cb73fc215ea160b72d8c65fb8ef814b9ce18e21001c4fceb2971e
  size 410636248
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3acc7a1c93cddadaff8c85e17129d69e68eeeb2c1d34c27bee1a7891be700262
+ oid sha256:db5189682206216f1eb6c749f032237c2b81ba6a5a04045dadba360a7ff157fd
  size 821393658
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a348450b4301ff77668e6e7031dfdf4901888d4300593c1bec37e44424f879c2
+ oid sha256:70171b5cf4c5d65f8d1801780619a5ef6eaa49858f51f2d188c1b4ae2878778d
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e98b9b6a2ef31cc34f2869daae0c056a45fbf12e7ae930da879a4172e2bd47dc
+ oid sha256:0db746135b039e0910ffb0031dacdf1ae5245ec4309f44b2f467d1af6778d5b2
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": 0.20959021151065826,
  "best_model_checkpoint": "./results/checkpoint-640",
- "epoch": 3.076923076923077,
+ "epoch": 4.615384615384615,
  "eval_steps": 20,
- "global_step": 1000,
+ "global_step": 1500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -807,6 +807,406 @@
  "eval_samples_per_second": 34.023,
  "eval_steps_per_second": 8.619,
  "step": 1000
810
+ },
811
+ {
812
+ "epoch": 3.1384615384615384,
813
+ "grad_norm": 0.00659064669162035,
814
+ "learning_rate": 7.446153846153846e-06,
815
+ "loss": 0.0003,
816
+ "step": 1020
817
+ },
818
+ {
819
+ "epoch": 3.1384615384615384,
820
+ "eval_accuracy": 0.9133333333333333,
821
+ "eval_loss": 0.49057698249816895,
822
+ "eval_runtime": 4.386,
823
+ "eval_samples_per_second": 34.2,
824
+ "eval_steps_per_second": 8.664,
825
+ "step": 1020
826
+ },
827
+ {
828
+ "epoch": 3.2,
829
+ "grad_norm": 0.043470922857522964,
830
+ "learning_rate": 7.2000000000000005e-06,
831
+ "loss": 0.0003,
832
+ "step": 1040
833
+ },
834
+ {
835
+ "epoch": 3.2,
836
+ "eval_accuracy": 0.9133333333333333,
837
+ "eval_loss": 0.48904499411582947,
838
+ "eval_runtime": 4.5644,
839
+ "eval_samples_per_second": 32.863,
840
+ "eval_steps_per_second": 8.325,
841
+ "step": 1040
842
+ },
843
+ {
844
+ "epoch": 3.2615384615384615,
845
+ "grad_norm": 0.005845973733812571,
846
+ "learning_rate": 6.953846153846154e-06,
847
+ "loss": 0.0642,
848
+ "step": 1060
849
+ },
850
+ {
851
+ "epoch": 3.2615384615384615,
852
+ "eval_accuracy": 0.9333333333333333,
853
+ "eval_loss": 0.34622737765312195,
854
+ "eval_runtime": 4.642,
855
+ "eval_samples_per_second": 32.313,
856
+ "eval_steps_per_second": 8.186,
857
+ "step": 1060
858
+ },
859
+ {
860
+ "epoch": 3.3230769230769233,
861
+ "grad_norm": 0.015024982392787933,
862
+ "learning_rate": 6.707692307692308e-06,
863
+ "loss": 0.0003,
864
+ "step": 1080
865
+ },
866
+ {
867
+ "epoch": 3.3230769230769233,
868
+ "eval_accuracy": 0.9466666666666667,
869
+ "eval_loss": 0.3094027638435364,
870
+ "eval_runtime": 4.5041,
871
+ "eval_samples_per_second": 33.303,
872
+ "eval_steps_per_second": 8.437,
873
+ "step": 1080
874
+ },
875
+ {
876
+ "epoch": 3.3846153846153846,
877
+ "grad_norm": 0.004358809906989336,
878
+ "learning_rate": 6.461538461538463e-06,
879
+ "loss": 0.0003,
880
+ "step": 1100
881
+ },
882
+ {
883
+ "epoch": 3.3846153846153846,
884
+ "eval_accuracy": 0.94,
885
+ "eval_loss": 0.32816219329833984,
886
+ "eval_runtime": 4.3854,
887
+ "eval_samples_per_second": 34.204,
888
+ "eval_steps_per_second": 8.665,
889
+ "step": 1100
890
+ },
891
+ {
892
+ "epoch": 3.4461538461538463,
893
+ "grad_norm": 0.003102461341768503,
894
+ "learning_rate": 6.215384615384615e-06,
895
+ "loss": 0.1037,
896
+ "step": 1120
897
+ },
898
+ {
899
+ "epoch": 3.4461538461538463,
900
+ "eval_accuracy": 0.9333333333333333,
901
+ "eval_loss": 0.38093501329421997,
902
+ "eval_runtime": 4.3086,
903
+ "eval_samples_per_second": 34.814,
904
+ "eval_steps_per_second": 8.82,
905
+ "step": 1120
906
+ },
907
+ {
908
+ "epoch": 3.5076923076923077,
909
+ "grad_norm": 1.403334379196167,
910
+ "learning_rate": 5.9692307692307695e-06,
911
+ "loss": 0.0006,
912
+ "step": 1140
913
+ },
914
+ {
915
+ "epoch": 3.5076923076923077,
916
+ "eval_accuracy": 0.9266666666666666,
917
+ "eval_loss": 0.44484424591064453,
918
+ "eval_runtime": 4.3328,
919
+ "eval_samples_per_second": 34.62,
920
+ "eval_steps_per_second": 8.77,
921
+ "step": 1140
922
+ },
923
+ {
924
+ "epoch": 3.569230769230769,
925
+ "grad_norm": 0.0068647353909909725,
926
+ "learning_rate": 5.723076923076923e-06,
927
+ "loss": 0.0942,
928
+ "step": 1160
929
+ },
930
+ {
931
+ "epoch": 3.569230769230769,
932
+ "eval_accuracy": 0.8866666666666667,
933
+ "eval_loss": 0.6030946969985962,
934
+ "eval_runtime": 4.4045,
935
+ "eval_samples_per_second": 34.056,
936
+ "eval_steps_per_second": 8.627,
937
+ "step": 1160
938
+ },
939
+ {
940
+ "epoch": 3.6307692307692307,
941
+ "grad_norm": 0.00668348977342248,
942
+ "learning_rate": 5.476923076923077e-06,
943
+ "loss": 0.0003,
944
+ "step": 1180
945
+ },
946
+ {
947
+ "epoch": 3.6307692307692307,
948
+ "eval_accuracy": 0.8866666666666667,
949
+ "eval_loss": 0.4964194595813751,
950
+ "eval_runtime": 4.4815,
951
+ "eval_samples_per_second": 33.471,
952
+ "eval_steps_per_second": 8.479,
953
+ "step": 1180
954
+ },
955
+ {
956
+ "epoch": 3.6923076923076925,
957
+ "grad_norm": 0.009669807739555836,
958
+ "learning_rate": 5.230769230769232e-06,
959
+ "loss": 0.0007,
960
+ "step": 1200
961
+ },
962
+ {
963
+ "epoch": 3.6923076923076925,
964
+ "eval_accuracy": 0.8866666666666667,
965
+ "eval_loss": 0.5268967151641846,
966
+ "eval_runtime": 4.5142,
967
+ "eval_samples_per_second": 33.228,
968
+ "eval_steps_per_second": 8.418,
969
+ "step": 1200
970
+ },
971
+ {
972
+ "epoch": 3.753846153846154,
973
+ "grad_norm": 0.00334552931599319,
974
+ "learning_rate": 4.984615384615385e-06,
975
+ "loss": 0.0887,
976
+ "step": 1220
977
+ },
978
+ {
979
+ "epoch": 3.753846153846154,
980
+ "eval_accuracy": 0.8866666666666667,
981
+ "eval_loss": 0.49135467410087585,
982
+ "eval_runtime": 4.4888,
983
+ "eval_samples_per_second": 33.417,
984
+ "eval_steps_per_second": 8.466,
985
+ "step": 1220
986
+ },
987
+ {
988
+ "epoch": 3.815384615384615,
989
+ "grad_norm": 0.005591992288827896,
990
+ "learning_rate": 4.738461538461539e-06,
991
+ "loss": 0.0003,
992
+ "step": 1240
993
+ },
994
+ {
995
+ "epoch": 3.815384615384615,
996
+ "eval_accuracy": 0.9266666666666666,
997
+ "eval_loss": 0.395882248878479,
998
+ "eval_runtime": 4.4355,
999
+ "eval_samples_per_second": 33.818,
1000
+ "eval_steps_per_second": 8.567,
1001
+ "step": 1240
1002
+ },
1003
+ {
1004
+ "epoch": 3.876923076923077,
1005
+ "grad_norm": 0.02592817321419716,
1006
+ "learning_rate": 4.492307692307693e-06,
1007
+ "loss": 0.0008,
1008
+ "step": 1260
1009
+ },
1010
+ {
1011
+ "epoch": 3.876923076923077,
1012
+ "eval_accuracy": 0.9266666666666666,
1013
+ "eval_loss": 0.42400792241096497,
1014
+ "eval_runtime": 4.4143,
1015
+ "eval_samples_per_second": 33.98,
1016
+ "eval_steps_per_second": 8.608,
1017
+ "step": 1260
1018
+ },
1019
+ {
1020
+ "epoch": 3.9384615384615387,
1021
+ "grad_norm": 0.0067848521284759045,
1022
+ "learning_rate": 4.246153846153846e-06,
1023
+ "loss": 0.0003,
1024
+ "step": 1280
1025
+ },
1026
+ {
1027
+ "epoch": 3.9384615384615387,
1028
+ "eval_accuracy": 0.92,
1029
+ "eval_loss": 0.43341755867004395,
1030
+ "eval_runtime": 4.3946,
1031
+ "eval_samples_per_second": 34.133,
1032
+ "eval_steps_per_second": 8.647,
1033
+ "step": 1280
1034
+ },
1035
+ {
1036
+ "epoch": 4.0,
1037
+ "grad_norm": 0.004882505163550377,
1038
+ "learning_rate": 4.000000000000001e-06,
1039
+ "loss": 0.0003,
1040
+ "step": 1300
1041
+ },
1042
+ {
1043
+ "epoch": 4.0,
1044
+ "eval_accuracy": 0.9266666666666666,
1045
+ "eval_loss": 0.42421114444732666,
1046
+ "eval_runtime": 4.4076,
1047
+ "eval_samples_per_second": 34.032,
1048
+ "eval_steps_per_second": 8.621,
1049
+ "step": 1300
1050
+ },
1051
+ {
1052
+ "epoch": 4.061538461538461,
1053
+ "grad_norm": 0.006306629162281752,
1054
+ "learning_rate": 3.753846153846154e-06,
1055
+ "loss": 0.0002,
1056
+ "step": 1320
1057
+ },
1058
+ {
1059
+ "epoch": 4.061538461538461,
1060
+ "eval_accuracy": 0.9266666666666666,
1061
+ "eval_loss": 0.42182713747024536,
1062
+ "eval_runtime": 4.4367,
1063
+ "eval_samples_per_second": 33.809,
1064
+ "eval_steps_per_second": 8.565,
1065
+ "step": 1320
1066
+ },
1067
+ {
1068
+ "epoch": 4.123076923076923,
1069
+ "grad_norm": 0.10213489085435867,
1070
+ "learning_rate": 3.507692307692308e-06,
1071
+ "loss": 0.0003,
1072
+ "step": 1340
1073
+ },
1074
+ {
1075
+ "epoch": 4.123076923076923,
1076
+ "eval_accuracy": 0.9266666666666666,
1077
+ "eval_loss": 0.41865241527557373,
1078
+ "eval_runtime": 4.4189,
1079
+ "eval_samples_per_second": 33.945,
1080
+ "eval_steps_per_second": 8.599,
1081
+ "step": 1340
1082
+ },
1083
+ {
1084
+ "epoch": 4.184615384615385,
1085
+ "grad_norm": 0.007508518174290657,
1086
+ "learning_rate": 3.2615384615384615e-06,
1087
+ "loss": 0.0002,
1088
+ "step": 1360
1089
+ },
1090
+ {
1091
+ "epoch": 4.184615384615385,
1092
+ "eval_accuracy": 0.9266666666666666,
1093
+ "eval_loss": 0.41028958559036255,
1094
+ "eval_runtime": 4.491,
1095
+ "eval_samples_per_second": 33.4,
1096
+ "eval_steps_per_second": 8.461,
1097
+ "step": 1360
1098
+ },
1099
+ {
1100
+ "epoch": 4.246153846153846,
1101
+ "grad_norm": 0.004084484186023474,
1102
+ "learning_rate": 3.0153846153846154e-06,
1103
+ "loss": 0.0002,
1104
+ "step": 1380
1105
+ },
1106
+ {
1107
+ "epoch": 4.246153846153846,
1108
+ "eval_accuracy": 0.9266666666666666,
1109
+ "eval_loss": 0.4090527892112732,
1110
+ "eval_runtime": 4.4734,
1111
+ "eval_samples_per_second": 33.532,
1112
+ "eval_steps_per_second": 8.495,
1113
+ "step": 1380
1114
+ },
1115
+ {
1116
+ "epoch": 4.3076923076923075,
1117
+ "grad_norm": 0.00797939207404852,
1118
+ "learning_rate": 2.7692307692307697e-06,
1119
+ "loss": 0.0002,
1120
+ "step": 1400
1121
+ },
1122
+ {
1123
+ "epoch": 4.3076923076923075,
1124
+ "eval_accuracy": 0.9266666666666666,
1125
+ "eval_loss": 0.4111497402191162,
1126
+ "eval_runtime": 4.4612,
1127
+ "eval_samples_per_second": 33.624,
1128
+ "eval_steps_per_second": 8.518,
1129
+ "step": 1400
1130
+ },
1131
+ {
1132
+ "epoch": 4.36923076923077,
1133
+ "grad_norm": 0.0033812555484473705,
1134
+ "learning_rate": 2.523076923076923e-06,
1135
+ "loss": 0.0003,
1136
+ "step": 1420
1137
+ },
1138
+ {
1139
+ "epoch": 4.36923076923077,
1140
+ "eval_accuracy": 0.9266666666666666,
1141
+ "eval_loss": 0.40916740894317627,
1142
+ "eval_runtime": 4.4676,
1143
+ "eval_samples_per_second": 33.575,
1144
+ "eval_steps_per_second": 8.506,
1145
+ "step": 1420
1146
+ },
1147
+ {
1148
+ "epoch": 4.430769230769231,
1149
+ "grad_norm": 0.005078181624412537,
1150
+ "learning_rate": 2.276923076923077e-06,
1151
+ "loss": 0.0003,
1152
+ "step": 1440
1153
+ },
1154
+ {
1155
+ "epoch": 4.430769230769231,
1156
+ "eval_accuracy": 0.9333333333333333,
1157
+ "eval_loss": 0.3990814685821533,
1158
+ "eval_runtime": 4.4277,
1159
+ "eval_samples_per_second": 33.877,
1160
+ "eval_steps_per_second": 8.582,
1161
+ "step": 1440
1162
+ },
1163
+ {
1164
+ "epoch": 4.492307692307692,
1165
+ "grad_norm": 0.003911417443305254,
1166
+ "learning_rate": 2.030769230769231e-06,
1167
+ "loss": 0.0002,
1168
+ "step": 1460
1169
+ },
1170
+ {
1171
+ "epoch": 4.492307692307692,
1172
+ "eval_accuracy": 0.9333333333333333,
1173
+ "eval_loss": 0.39907512068748474,
1174
+ "eval_runtime": 4.4194,
1175
+ "eval_samples_per_second": 33.941,
1176
+ "eval_steps_per_second": 8.598,
1177
+ "step": 1460
1178
+ },
1179
+ {
1180
+ "epoch": 4.553846153846154,
1181
+ "grad_norm": 0.005002380348742008,
1182
+ "learning_rate": 1.7846153846153846e-06,
1183
+ "loss": 0.0002,
1184
+ "step": 1480
1185
+ },
1186
+ {
1187
+ "epoch": 4.553846153846154,
1188
+ "eval_accuracy": 0.9333333333333333,
1189
+ "eval_loss": 0.39858585596084595,
1190
+ "eval_runtime": 4.4069,
1191
+ "eval_samples_per_second": 34.037,
1192
+ "eval_steps_per_second": 8.623,
1193
+ "step": 1480
1194
+ },
1195
+ {
1196
+ "epoch": 4.615384615384615,
1197
+ "grad_norm": 0.005623374599963427,
1198
+ "learning_rate": 1.5384615384615387e-06,
1199
+ "loss": 0.0004,
1200
+ "step": 1500
1201
+ },
1202
+ {
1203
+ "epoch": 4.615384615384615,
1204
+ "eval_accuracy": 0.9333333333333333,
1205
+ "eval_loss": 0.4055434763431549,
1206
+ "eval_runtime": 4.4211,
1207
+ "eval_samples_per_second": 33.928,
1208
+ "eval_steps_per_second": 8.595,
1209
+ "step": 1500
  }
  ],
  "logging_steps": 20,
@@ -826,7 +1226,7 @@
  "attributes": {}
  }
  },
- "total_flos": 916853639059440.0,
+ "total_flos": 1380510818592000.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null