|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 2206, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.7927429676055908, |
|
"learning_rate": 4.773345421577516e-05, |
|
"loss": 0.6604, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.7291208505630493, |
|
"learning_rate": 4.546690843155032e-05, |
|
"loss": 0.5917, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.7045320868492126, |
|
"learning_rate": 4.320036264732548e-05, |
|
"loss": 0.5638, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.7321786880493164, |
|
"learning_rate": 4.095648232094288e-05, |
|
"loss": 0.5409, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.7257340550422668, |
|
"learning_rate": 3.868993653671804e-05, |
|
"loss": 0.5217, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.7644676566123962, |
|
"learning_rate": 3.64233907524932e-05, |
|
"loss": 0.5, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.7164526581764221, |
|
"learning_rate": 3.415684496826836e-05, |
|
"loss": 0.4851, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.7228084802627563, |
|
"learning_rate": 3.189029918404352e-05, |
|
"loss": 0.4703, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.7583528161048889, |
|
"learning_rate": 2.9623753399818678e-05, |
|
"loss": 0.456, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.7674237489700317, |
|
"learning_rate": 2.7357207615593838e-05, |
|
"loss": 0.447, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.7338166236877441, |
|
"learning_rate": 2.5090661831368994e-05, |
|
"loss": 0.4357, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.7415379285812378, |
|
"learning_rate": 2.2824116047144154e-05, |
|
"loss": 0.4275, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.7678395509719849, |
|
"learning_rate": 2.0557570262919314e-05, |
|
"loss": 0.4136, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.7428087592124939, |
|
"learning_rate": 1.829102447869447e-05, |
|
"loss": 0.4104, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.7453845143318176, |
|
"learning_rate": 1.602447869446963e-05, |
|
"loss": 0.3989, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.6997073888778687, |
|
"learning_rate": 1.3757932910244787e-05, |
|
"loss": 0.3961, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.7334680557250977, |
|
"learning_rate": 1.1491387126019947e-05, |
|
"loss": 0.3879, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.7411799430847168, |
|
"learning_rate": 9.224841341795105e-06, |
|
"loss": 0.3799, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.818085253238678, |
|
"learning_rate": 6.958295557570263e-06, |
|
"loss": 0.3782, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.7673262357711792, |
|
"learning_rate": 4.71441523118767e-06, |
|
"loss": 0.3711, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.7118935585021973, |
|
"learning_rate": 2.447869446962829e-06, |
|
"loss": 0.3629, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.8014984726905823, |
|
"learning_rate": 1.813236627379873e-07, |
|
"loss": 0.3695, |
|
"step": 2200 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 2206, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 2206, |
|
"total_flos": 6.205756045755679e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|