|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.3192612137203166, |
|
"eval_steps": 25, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.4790619611740112, |
|
"learning_rate": 2.3797595190380762e-05, |
|
"loss": 0.9713, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 0.7396946549415588, |
|
"eval_runtime": 75.3443, |
|
"eval_samples_per_second": 2.535, |
|
"eval_steps_per_second": 0.319, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.359966516494751, |
|
"learning_rate": 2.2545090180360722e-05, |
|
"loss": 0.6963, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"eval_loss": 0.6846508979797363, |
|
"eval_runtime": 75.2251, |
|
"eval_samples_per_second": 2.539, |
|
"eval_steps_per_second": 0.319, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.3525006771087646, |
|
"learning_rate": 2.1292585170340683e-05, |
|
"loss": 0.6809, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 0.6628683805465698, |
|
"eval_runtime": 75.0453, |
|
"eval_samples_per_second": 2.545, |
|
"eval_steps_per_second": 0.32, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.3805192708969116, |
|
"learning_rate": 2.0040080160320643e-05, |
|
"loss": 0.6596, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"eval_loss": 0.6511640548706055, |
|
"eval_runtime": 75.0686, |
|
"eval_samples_per_second": 2.544, |
|
"eval_steps_per_second": 0.32, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.2107349634170532, |
|
"learning_rate": 1.87875751503006e-05, |
|
"loss": 0.6552, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"eval_loss": 0.6428065896034241, |
|
"eval_runtime": 75.0266, |
|
"eval_samples_per_second": 2.546, |
|
"eval_steps_per_second": 0.32, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.327737808227539, |
|
"learning_rate": 1.7535070140280564e-05, |
|
"loss": 0.6363, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_loss": 0.6354109048843384, |
|
"eval_runtime": 75.1344, |
|
"eval_samples_per_second": 2.542, |
|
"eval_steps_per_second": 0.319, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.2277601957321167, |
|
"learning_rate": 1.628256513026052e-05, |
|
"loss": 0.6441, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"eval_loss": 0.6307269930839539, |
|
"eval_runtime": 75.0535, |
|
"eval_samples_per_second": 2.545, |
|
"eval_steps_per_second": 0.32, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.3676594495773315, |
|
"learning_rate": 1.5030060120240483e-05, |
|
"loss": 0.6317, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"eval_loss": 0.6252032518386841, |
|
"eval_runtime": 75.0153, |
|
"eval_samples_per_second": 2.546, |
|
"eval_steps_per_second": 0.32, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.2495057582855225, |
|
"learning_rate": 1.3777555110220442e-05, |
|
"loss": 0.6222, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"eval_loss": 0.6214942336082458, |
|
"eval_runtime": 75.0103, |
|
"eval_samples_per_second": 2.546, |
|
"eval_steps_per_second": 0.32, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.2535713911056519, |
|
"learning_rate": 1.25250501002004e-05, |
|
"loss": 0.6127, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"eval_loss": 0.6172995567321777, |
|
"eval_runtime": 75.023, |
|
"eval_samples_per_second": 2.546, |
|
"eval_steps_per_second": 0.32, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.3162541389465332, |
|
"learning_rate": 1.1272545090180361e-05, |
|
"loss": 0.6017, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"eval_loss": 0.614778459072113, |
|
"eval_runtime": 75.3906, |
|
"eval_samples_per_second": 2.533, |
|
"eval_steps_per_second": 0.318, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.3146350383758545, |
|
"learning_rate": 1.0020040080160322e-05, |
|
"loss": 0.6201, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"eval_loss": 0.6113688945770264, |
|
"eval_runtime": 75.4374, |
|
"eval_samples_per_second": 2.532, |
|
"eval_steps_per_second": 0.318, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.311963438987732, |
|
"learning_rate": 8.767535070140282e-06, |
|
"loss": 0.5961, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"eval_loss": 0.6093372702598572, |
|
"eval_runtime": 75.3949, |
|
"eval_samples_per_second": 2.533, |
|
"eval_steps_per_second": 0.318, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.3103692531585693, |
|
"learning_rate": 7.515030060120242e-06, |
|
"loss": 0.6044, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"eval_loss": 0.606224775314331, |
|
"eval_runtime": 75.3575, |
|
"eval_samples_per_second": 2.535, |
|
"eval_steps_per_second": 0.318, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.4562299251556396, |
|
"learning_rate": 6.2625250501002e-06, |
|
"loss": 0.6064, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"eval_loss": 0.6035271286964417, |
|
"eval_runtime": 75.3932, |
|
"eval_samples_per_second": 2.533, |
|
"eval_steps_per_second": 0.318, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 1.3848822116851807, |
|
"learning_rate": 5.010020040080161e-06, |
|
"loss": 0.5763, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"eval_loss": 0.6053218841552734, |
|
"eval_runtime": 75.3983, |
|
"eval_samples_per_second": 2.533, |
|
"eval_steps_per_second": 0.318, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 1.466248631477356, |
|
"learning_rate": 3.757515030060121e-06, |
|
"loss": 0.5589, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"eval_loss": 0.6024670600891113, |
|
"eval_runtime": 75.3786, |
|
"eval_samples_per_second": 2.534, |
|
"eval_steps_per_second": 0.318, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 1.4298962354660034, |
|
"learning_rate": 2.5050100200400804e-06, |
|
"loss": 0.5595, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"eval_loss": 0.6021297574043274, |
|
"eval_runtime": 75.3606, |
|
"eval_samples_per_second": 2.534, |
|
"eval_steps_per_second": 0.318, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 1.395114779472351, |
|
"learning_rate": 1.2525050100200402e-06, |
|
"loss": 0.5525, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_loss": 0.6009101867675781, |
|
"eval_runtime": 75.3723, |
|
"eval_samples_per_second": 2.534, |
|
"eval_steps_per_second": 0.318, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 1.4652175903320312, |
|
"learning_rate": 0.0, |
|
"loss": 0.5541, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"eval_loss": 0.6004253625869751, |
|
"eval_runtime": 75.3761, |
|
"eval_samples_per_second": 2.534, |
|
"eval_steps_per_second": 0.318, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 250, |
|
"total_flos": 7.87141357369344e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|