{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.06798339773865751,
  "eval_steps": 25,
  "global_step": 475,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "grad_norm": 0.6931123733520508,
      "learning_rate": 0.0001988606301829365,
      "loss": 3.3633,
      "step": 25
    },
    {
      "epoch": 0.0,
      "eval_loss": 2.791308879852295,
      "eval_runtime": 1766.1893,
      "eval_samples_per_second": 4.028,
      "eval_steps_per_second": 0.504,
      "step": 25
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.8134792447090149,
      "learning_rate": 0.00019527921915838827,
      "loss": 2.8231,
      "step": 50
    },
    {
      "epoch": 0.01,
      "eval_loss": 2.6491129398345947,
      "eval_runtime": 1766.0434,
      "eval_samples_per_second": 4.028,
      "eval_steps_per_second": 0.504,
      "step": 50
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.7844312191009521,
      "learning_rate": 0.00018934232439329787,
      "loss": 2.7094,
      "step": 75
    },
    {
      "epoch": 0.01,
      "eval_loss": 2.595423698425293,
      "eval_runtime": 1770.4727,
      "eval_samples_per_second": 4.018,
      "eval_steps_per_second": 0.503,
      "step": 75
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.7818688154220581,
      "learning_rate": 0.00018119671723205706,
      "loss": 2.6759,
      "step": 100
    },
    {
      "epoch": 0.01,
      "eval_loss": 2.5623528957366943,
      "eval_runtime": 1764.724,
      "eval_samples_per_second": 4.031,
      "eval_steps_per_second": 0.504,
      "step": 100
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.9374421238899231,
      "learning_rate": 0.00017104377259560488,
      "loss": 2.6361,
      "step": 125
    },
    {
      "epoch": 0.02,
      "eval_loss": 2.5327322483062744,
      "eval_runtime": 1766.346,
      "eval_samples_per_second": 4.028,
      "eval_steps_per_second": 0.504,
      "step": 125
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.8157618641853333,
      "learning_rate": 0.00015913449060989774,
      "loss": 2.6192,
      "step": 150
    },
    {
      "epoch": 0.02,
      "eval_loss": 2.5096042156219482,
      "eval_runtime": 1765.5704,
      "eval_samples_per_second": 4.029,
      "eval_steps_per_second": 0.504,
      "step": 150
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.8904098272323608,
      "learning_rate": 0.00014576329140480925,
      "loss": 2.5547,
      "step": 175
    },
    {
      "epoch": 0.03,
      "eval_loss": 2.508963108062744,
      "eval_runtime": 1767.6823,
      "eval_samples_per_second": 4.024,
      "eval_steps_per_second": 0.503,
      "step": 175
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.8415258526802063,
      "learning_rate": 0.00013126073648784688,
      "loss": 2.5803,
      "step": 200
    },
    {
      "epoch": 0.03,
      "eval_loss": 2.484498977661133,
      "eval_runtime": 1764.6855,
      "eval_samples_per_second": 4.031,
      "eval_steps_per_second": 0.504,
      "step": 200
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.8273302912712097,
      "learning_rate": 0.00011598535663418884,
      "loss": 2.5275,
      "step": 225
    },
    {
      "epoch": 0.03,
      "eval_loss": 2.4777684211730957,
      "eval_runtime": 1764.516,
      "eval_samples_per_second": 4.032,
      "eval_steps_per_second": 0.504,
      "step": 225
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.8461028933525085,
      "learning_rate": 0.000100314788323161,
      "loss": 2.5504,
      "step": 250
    },
    {
      "epoch": 0.04,
      "eval_loss": 2.4680240154266357,
      "eval_runtime": 1764.412,
      "eval_samples_per_second": 4.032,
      "eval_steps_per_second": 0.504,
      "step": 250
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.9055793881416321,
      "learning_rate": 8.46364378453242e-05,
      "loss": 2.5447,
      "step": 275
    },
    {
      "epoch": 0.04,
      "eval_loss": 2.4592506885528564,
      "eval_runtime": 1763.6878,
      "eval_samples_per_second": 4.034,
      "eval_steps_per_second": 0.505,
      "step": 275
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.0102213621139526,
      "learning_rate": 6.933790388122256e-05,
      "loss": 2.5308,
      "step": 300
    },
    {
      "epoch": 0.04,
      "eval_loss": 2.457402467727661,
      "eval_runtime": 1763.3112,
      "eval_samples_per_second": 4.034,
      "eval_steps_per_second": 0.505,
      "step": 300
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.9688016176223755,
      "learning_rate": 5.479739532388526e-05,
      "loss": 2.5243,
      "step": 325
    },
    {
      "epoch": 0.05,
      "eval_loss": 2.4439914226531982,
      "eval_runtime": 1763.1627,
      "eval_samples_per_second": 4.035,
      "eval_steps_per_second": 0.505,
      "step": 325
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.9817106127738953,
      "learning_rate": 4.137438123475662e-05,
      "loss": 2.4967,
      "step": 350
    },
    {
      "epoch": 0.05,
      "eval_loss": 2.4434900283813477,
      "eval_runtime": 1762.4256,
      "eval_samples_per_second": 4.036,
      "eval_steps_per_second": 0.505,
      "step": 350
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.9285837411880493,
      "learning_rate": 2.9400704083950335e-05,
      "loss": 2.4979,
      "step": 375
    },
    {
      "epoch": 0.05,
      "eval_loss": 2.4380979537963867,
      "eval_runtime": 1761.6148,
      "eval_samples_per_second": 4.038,
      "eval_steps_per_second": 0.505,
      "step": 375
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.8453341722488403,
      "learning_rate": 1.917237597245065e-05,
      "loss": 2.5183,
      "step": 400
    },
    {
      "epoch": 0.06,
      "eval_loss": 2.437168598175049,
      "eval_runtime": 1763.4013,
      "eval_samples_per_second": 4.034,
      "eval_steps_per_second": 0.505,
      "step": 400
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.091811180114746,
      "learning_rate": 1.0942260649272229e-05,
      "loss": 2.4867,
      "step": 425
    },
    {
      "epoch": 0.06,
      "eval_loss": 2.4306600093841553,
      "eval_runtime": 1759.0574,
      "eval_samples_per_second": 4.044,
      "eval_steps_per_second": 0.506,
      "step": 425
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.9597497582435608,
      "learning_rate": 4.91382223804836e-06,
      "loss": 2.474,
      "step": 450
    },
    {
      "epoch": 0.06,
      "eval_loss": 2.4304585456848145,
      "eval_runtime": 1763.2619,
      "eval_samples_per_second": 4.035,
      "eval_steps_per_second": 0.505,
      "step": 450
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.9459497928619385,
      "learning_rate": 1.2360952164325311e-06,
      "loss": 2.4979,
      "step": 475
    },
    {
      "epoch": 0.07,
      "eval_loss": 2.4304075241088867,
      "eval_runtime": 1762.5181,
      "eval_samples_per_second": 4.036,
      "eval_steps_per_second": 0.505,
      "step": 475
    }
  ],
  "logging_steps": 25,
  "max_steps": 500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 25,
  "total_flos": 8045660258064000.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}