Training in progress, step 51000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 301235464
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b71ed16611cd95fe8479b9b5158a65681e32cd86fc06fd6104792dca5e0ea90c
|
| 3 |
size 301235464
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 602335994
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e4d0121bc94172a095cdea5c65ddbc39cc2a2d68c3e7dea1521191e5bf66d6e4
|
| 3 |
size 602335994
|
last-checkpoint/rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:871241677306799dd94bb012f99e77b35a49885274956fc7cf6b8c017fdd6180
|
| 3 |
size 14244
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:38e628546b6b3793b4db9c04b0c48bd7f457b5c91e760c9c29b133754fb90815
|
| 3 |
size 1064
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 0.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -8908,11 +8908,189 @@
|
|
| 8908 |
"eval_steps_per_second": 23.298,
|
| 8909 |
"num_input_tokens_seen": 13107195456,
|
| 8910 |
"step": 50000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8911 |
}
|
| 8912 |
],
|
| 8913 |
"logging_steps": 50,
|
| 8914 |
"max_steps": 70000,
|
| 8915 |
-
"num_input_tokens_seen":
|
| 8916 |
"num_train_epochs": 1,
|
| 8917 |
"save_steps": 1000,
|
| 8918 |
"stateful_callbacks": {
|
|
@@ -8927,7 +9105,7 @@
|
|
| 8927 |
"attributes": {}
|
| 8928 |
}
|
| 8929 |
},
|
| 8930 |
-
"total_flos": 3.
|
| 8931 |
"train_batch_size": 64,
|
| 8932 |
"trial_name": null,
|
| 8933 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 0.24327128325601918,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 51000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 8908 |
"eval_steps_per_second": 23.298,
|
| 8909 |
"num_input_tokens_seen": 13107195456,
|
| 8910 |
"step": 50000
|
| 8911 |
+
},
|
| 8912 |
+
{
|
| 8913 |
+
"epoch": 0.23873975935223057,
|
| 8914 |
+
"grad_norm": 0.5069316029548645,
|
| 8915 |
+
"learning_rate": 0.001,
|
| 8916 |
+
"loss": 2.6591,
|
| 8917 |
+
"num_input_tokens_seen": 13120302656,
|
| 8918 |
+
"step": 50050
|
| 8919 |
+
},
|
| 8920 |
+
{
|
| 8921 |
+
"epoch": 0.23897826061032473,
|
| 8922 |
+
"grad_norm": 0.21306034922599792,
|
| 8923 |
+
"learning_rate": 0.001,
|
| 8924 |
+
"loss": 2.6455,
|
| 8925 |
+
"num_input_tokens_seen": 13133409856,
|
| 8926 |
+
"step": 50100
|
| 8927 |
+
},
|
| 8928 |
+
{
|
| 8929 |
+
"epoch": 0.23921676186841886,
|
| 8930 |
+
"grad_norm": 0.2045888900756836,
|
| 8931 |
+
"learning_rate": 0.001,
|
| 8932 |
+
"loss": 2.6227,
|
| 8933 |
+
"num_input_tokens_seen": 13146517056,
|
| 8934 |
+
"step": 50150
|
| 8935 |
+
},
|
| 8936 |
+
{
|
| 8937 |
+
"epoch": 0.239455263126513,
|
| 8938 |
+
"grad_norm": 0.2335623949766159,
|
| 8939 |
+
"learning_rate": 0.001,
|
| 8940 |
+
"loss": 2.6097,
|
| 8941 |
+
"num_input_tokens_seen": 13159624256,
|
| 8942 |
+
"step": 50200
|
| 8943 |
+
},
|
| 8944 |
+
{
|
| 8945 |
+
"epoch": 0.23969376438460713,
|
| 8946 |
+
"grad_norm": 0.19884036481380463,
|
| 8947 |
+
"learning_rate": 0.001,
|
| 8948 |
+
"loss": 2.6189,
|
| 8949 |
+
"num_input_tokens_seen": 13172731456,
|
| 8950 |
+
"step": 50250
|
| 8951 |
+
},
|
| 8952 |
+
{
|
| 8953 |
+
"epoch": 0.23993226564270126,
|
| 8954 |
+
"grad_norm": 0.21080589294433594,
|
| 8955 |
+
"learning_rate": 0.001,
|
| 8956 |
+
"loss": 2.6057,
|
| 8957 |
+
"num_input_tokens_seen": 13185838656,
|
| 8958 |
+
"step": 50300
|
| 8959 |
+
},
|
| 8960 |
+
{
|
| 8961 |
+
"epoch": 0.2401707669007954,
|
| 8962 |
+
"grad_norm": 0.21613669395446777,
|
| 8963 |
+
"learning_rate": 0.001,
|
| 8964 |
+
"loss": 2.6045,
|
| 8965 |
+
"num_input_tokens_seen": 13198945856,
|
| 8966 |
+
"step": 50350
|
| 8967 |
+
},
|
| 8968 |
+
{
|
| 8969 |
+
"epoch": 0.24040926815888955,
|
| 8970 |
+
"grad_norm": 0.2029023915529251,
|
| 8971 |
+
"learning_rate": 0.001,
|
| 8972 |
+
"loss": 2.6127,
|
| 8973 |
+
"num_input_tokens_seen": 13212053056,
|
| 8974 |
+
"step": 50400
|
| 8975 |
+
},
|
| 8976 |
+
{
|
| 8977 |
+
"epoch": 0.24064776941698368,
|
| 8978 |
+
"grad_norm": 0.2275777906179428,
|
| 8979 |
+
"learning_rate": 0.001,
|
| 8980 |
+
"loss": 2.6149,
|
| 8981 |
+
"num_input_tokens_seen": 13225160256,
|
| 8982 |
+
"step": 50450
|
| 8983 |
+
},
|
| 8984 |
+
{
|
| 8985 |
+
"epoch": 0.2408862706750778,
|
| 8986 |
+
"grad_norm": 0.3332397937774658,
|
| 8987 |
+
"learning_rate": 0.001,
|
| 8988 |
+
"loss": 2.6013,
|
| 8989 |
+
"num_input_tokens_seen": 13238267456,
|
| 8990 |
+
"step": 50500
|
| 8991 |
+
},
|
| 8992 |
+
{
|
| 8993 |
+
"epoch": 0.2408862706750778,
|
| 8994 |
+
"eval_loss": 2.5022270679473877,
|
| 8995 |
+
"eval_runtime": 53.5942,
|
| 8996 |
+
"eval_samples_per_second": 93.294,
|
| 8997 |
+
"eval_steps_per_second": 23.323,
|
| 8998 |
+
"num_input_tokens_seen": 13238267456,
|
| 8999 |
+
"step": 50500
|
| 9000 |
+
},
|
| 9001 |
+
{
|
| 9002 |
+
"epoch": 0.24112477193317194,
|
| 9003 |
+
"grad_norm": 0.2197851538658142,
|
| 9004 |
+
"learning_rate": 0.001,
|
| 9005 |
+
"loss": 2.6326,
|
| 9006 |
+
"num_input_tokens_seen": 13251374656,
|
| 9007 |
+
"step": 50550
|
| 9008 |
+
},
|
| 9009 |
+
{
|
| 9010 |
+
"epoch": 0.24136327319126608,
|
| 9011 |
+
"grad_norm": 0.2201780080795288,
|
| 9012 |
+
"learning_rate": 0.001,
|
| 9013 |
+
"loss": 2.6265,
|
| 9014 |
+
"num_input_tokens_seen": 13264481856,
|
| 9015 |
+
"step": 50600
|
| 9016 |
+
},
|
| 9017 |
+
{
|
| 9018 |
+
"epoch": 0.2416017744493602,
|
| 9019 |
+
"grad_norm": 0.2196362316608429,
|
| 9020 |
+
"learning_rate": 0.001,
|
| 9021 |
+
"loss": 2.6272,
|
| 9022 |
+
"num_input_tokens_seen": 13277589056,
|
| 9023 |
+
"step": 50650
|
| 9024 |
+
},
|
| 9025 |
+
{
|
| 9026 |
+
"epoch": 0.24184027570745437,
|
| 9027 |
+
"grad_norm": 0.2234160453081131,
|
| 9028 |
+
"learning_rate": 0.001,
|
| 9029 |
+
"loss": 2.6178,
|
| 9030 |
+
"num_input_tokens_seen": 13290696256,
|
| 9031 |
+
"step": 50700
|
| 9032 |
+
},
|
| 9033 |
+
{
|
| 9034 |
+
"epoch": 0.2420787769655485,
|
| 9035 |
+
"grad_norm": 0.24019016325473785,
|
| 9036 |
+
"learning_rate": 0.001,
|
| 9037 |
+
"loss": 2.6142,
|
| 9038 |
+
"num_input_tokens_seen": 13303803456,
|
| 9039 |
+
"step": 50750
|
| 9040 |
+
},
|
| 9041 |
+
{
|
| 9042 |
+
"epoch": 0.24231727822364263,
|
| 9043 |
+
"grad_norm": 0.21481236815452576,
|
| 9044 |
+
"learning_rate": 0.001,
|
| 9045 |
+
"loss": 2.6149,
|
| 9046 |
+
"num_input_tokens_seen": 13316910656,
|
| 9047 |
+
"step": 50800
|
| 9048 |
+
},
|
| 9049 |
+
{
|
| 9050 |
+
"epoch": 0.24255577948173676,
|
| 9051 |
+
"grad_norm": 0.20477178692817688,
|
| 9052 |
+
"learning_rate": 0.001,
|
| 9053 |
+
"loss": 2.5977,
|
| 9054 |
+
"num_input_tokens_seen": 13330017856,
|
| 9055 |
+
"step": 50850
|
| 9056 |
+
},
|
| 9057 |
+
{
|
| 9058 |
+
"epoch": 0.2427942807398309,
|
| 9059 |
+
"grad_norm": 0.20742499828338623,
|
| 9060 |
+
"learning_rate": 0.001,
|
| 9061 |
+
"loss": 2.6153,
|
| 9062 |
+
"num_input_tokens_seen": 13343125056,
|
| 9063 |
+
"step": 50900
|
| 9064 |
+
},
|
| 9065 |
+
{
|
| 9066 |
+
"epoch": 0.24303278199792505,
|
| 9067 |
+
"grad_norm": 0.21933062374591827,
|
| 9068 |
+
"learning_rate": 0.001,
|
| 9069 |
+
"loss": 2.5966,
|
| 9070 |
+
"num_input_tokens_seen": 13356232256,
|
| 9071 |
+
"step": 50950
|
| 9072 |
+
},
|
| 9073 |
+
{
|
| 9074 |
+
"epoch": 0.24327128325601918,
|
| 9075 |
+
"grad_norm": 0.3282420337200165,
|
| 9076 |
+
"learning_rate": 0.001,
|
| 9077 |
+
"loss": 2.6063,
|
| 9078 |
+
"num_input_tokens_seen": 13369339456,
|
| 9079 |
+
"step": 51000
|
| 9080 |
+
},
|
| 9081 |
+
{
|
| 9082 |
+
"epoch": 0.24327128325601918,
|
| 9083 |
+
"eval_loss": 2.4981296062469482,
|
| 9084 |
+
"eval_runtime": 53.5536,
|
| 9085 |
+
"eval_samples_per_second": 93.364,
|
| 9086 |
+
"eval_steps_per_second": 23.341,
|
| 9087 |
+
"num_input_tokens_seen": 13369339456,
|
| 9088 |
+
"step": 51000
|
| 9089 |
}
|
| 9090 |
],
|
| 9091 |
"logging_steps": 50,
|
| 9092 |
"max_steps": 70000,
|
| 9093 |
+
"num_input_tokens_seen": 13369339456,
|
| 9094 |
"num_train_epochs": 1,
|
| 9095 |
"save_steps": 1000,
|
| 9096 |
"stateful_callbacks": {
|
|
|
|
| 9105 |
"attributes": {}
|
| 9106 |
}
|
| 9107 |
},
|
| 9108 |
+
"total_flos": 3.5764287892330906e+18,
|
| 9109 |
"train_batch_size": 64,
|
| 9110 |
"trial_name": null,
|
| 9111 |
"trial_params": null
|