|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9895227008149012, |
|
"eval_steps": 500, |
|
"global_step": 321, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.09313154831199069, |
|
"grad_norm": 6.410240671476086, |
|
"learning_rate": 5e-06, |
|
"loss": 1.0737, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.18626309662398138, |
|
"grad_norm": 2.110670339648224, |
|
"learning_rate": 5e-06, |
|
"loss": 0.853, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.27939464493597205, |
|
"grad_norm": 2.0454490264727103, |
|
"learning_rate": 5e-06, |
|
"loss": 0.808, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.37252619324796277, |
|
"grad_norm": 1.9173512178087246, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7705, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.46565774155995343, |
|
"grad_norm": 1.9696383768635166, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7493, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.5587892898719441, |
|
"grad_norm": 1.7929306896649821, |
|
"learning_rate": 5e-06, |
|
"loss": 0.723, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.6519208381839348, |
|
"grad_norm": 1.4541049068033436, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7029, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.7450523864959255, |
|
"grad_norm": 1.4546232801341403, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7045, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.8381839348079162, |
|
"grad_norm": 0.9836741916976506, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6913, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.9313154831199069, |
|
"grad_norm": 0.9546295476288479, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6853, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.9965075669383003, |
|
"eval_loss": 0.6684155464172363, |
|
"eval_runtime": 72.6852, |
|
"eval_samples_per_second": 39.788, |
|
"eval_steps_per_second": 0.633, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 1.0244470314318976, |
|
"grad_norm": 1.2182872648581187, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7062, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.1175785797438882, |
|
"grad_norm": 1.4541964745779468, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6098, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.210710128055879, |
|
"grad_norm": 0.9495507104405151, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6019, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.3038416763678695, |
|
"grad_norm": 1.8134473930318298, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6043, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.3969732246798603, |
|
"grad_norm": 1.113009214401007, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6023, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.490104772991851, |
|
"grad_norm": 2.029132520416773, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6021, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.5832363213038416, |
|
"grad_norm": 1.6144281176922932, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6002, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.6763678696158324, |
|
"grad_norm": 1.2068997504682133, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5935, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.7694994179278232, |
|
"grad_norm": 1.001966561421665, |
|
"learning_rate": 5e-06, |
|
"loss": 0.589, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.8626309662398137, |
|
"grad_norm": 0.7426199844782707, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5954, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.9557625145518043, |
|
"grad_norm": 1.0472743759335235, |
|
"learning_rate": 5e-06, |
|
"loss": 0.591, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.9930151338766007, |
|
"eval_loss": 0.6565061807632446, |
|
"eval_runtime": 76.8161, |
|
"eval_samples_per_second": 37.648, |
|
"eval_steps_per_second": 0.599, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 2.0488940628637953, |
|
"grad_norm": 1.1047685289937579, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5986, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.142025611175786, |
|
"grad_norm": 1.214031879234503, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5179, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.2351571594877764, |
|
"grad_norm": 0.8187191061599544, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5178, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.328288707799767, |
|
"grad_norm": 0.8726515587132341, |
|
"learning_rate": 5e-06, |
|
"loss": 0.513, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.421420256111758, |
|
"grad_norm": 0.8422751030451243, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5137, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.5145518044237485, |
|
"grad_norm": 0.9018678534732393, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5263, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.607683352735739, |
|
"grad_norm": 0.875500810626841, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5201, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.70081490104773, |
|
"grad_norm": 0.8624139031471002, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5181, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.7939464493597206, |
|
"grad_norm": 0.8654410079756801, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5197, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.887077997671711, |
|
"grad_norm": 1.255098018906923, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5274, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.980209545983702, |
|
"grad_norm": 0.8384998281941215, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5134, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.9895227008149012, |
|
"eval_loss": 0.6689090132713318, |
|
"eval_runtime": 75.5704, |
|
"eval_samples_per_second": 38.269, |
|
"eval_steps_per_second": 0.609, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 2.9895227008149012, |
|
"step": 321, |
|
"total_flos": 537477576130560.0, |
|
"train_loss": 0.6323956919607715, |
|
"train_runtime": 10362.5439, |
|
"train_samples_per_second": 15.907, |
|
"train_steps_per_second": 0.031 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 321, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 537477576130560.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|