|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.7171029042667623, |
|
"eval_steps": 2000, |
|
"global_step": 2000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.5916264057159424, |
|
"learning_rate": 1e-06, |
|
"loss": 1.0776, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 3.6381211280822754, |
|
"learning_rate": 9.898989898989898e-07, |
|
"loss": 0.8345, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 3.9932849407196045, |
|
"learning_rate": 9.7989898989899e-07, |
|
"loss": 0.6863, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 2.8963606357574463, |
|
"learning_rate": 9.697979797979798e-07, |
|
"loss": 0.6659, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 4.000125885009766, |
|
"learning_rate": 9.598989898989899e-07, |
|
"loss": 0.6552, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 3.7767927646636963, |
|
"learning_rate": 9.497979797979798e-07, |
|
"loss": 0.6583, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 3.204336166381836, |
|
"learning_rate": 9.396969696969696e-07, |
|
"loss": 0.6229, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 3.798582077026367, |
|
"learning_rate": 9.295959595959596e-07, |
|
"loss": 0.6138, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 3.92914080619812, |
|
"learning_rate": 9.194949494949495e-07, |
|
"loss": 0.6266, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 3.982314109802246, |
|
"learning_rate": 9.093939393939394e-07, |
|
"loss": 0.6311, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 3.385096311569214, |
|
"learning_rate": 8.992929292929292e-07, |
|
"loss": 0.6141, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 3.015211343765259, |
|
"learning_rate": 8.891919191919191e-07, |
|
"loss": 0.6369, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 5.285658359527588, |
|
"learning_rate": 8.790909090909091e-07, |
|
"loss": 0.6442, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 4.69854736328125, |
|
"learning_rate": 8.68989898989899e-07, |
|
"loss": 0.6419, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 2.95841121673584, |
|
"learning_rate": 8.58989898989899e-07, |
|
"loss": 0.6575, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 2.4997267723083496, |
|
"learning_rate": 8.488888888888888e-07, |
|
"loss": 0.6346, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 4.126640319824219, |
|
"learning_rate": 8.387878787878787e-07, |
|
"loss": 0.6005, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 3.040174961090088, |
|
"learning_rate": 8.286868686868687e-07, |
|
"loss": 0.6434, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 4.870083332061768, |
|
"learning_rate": 8.185858585858586e-07, |
|
"loss": 0.6202, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 4.307583808898926, |
|
"learning_rate": 8.084848484848484e-07, |
|
"loss": 0.6552, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"eval_loss": 0.746256411075592, |
|
"eval_runtime": 199.9457, |
|
"eval_samples_per_second": 5.001, |
|
"eval_steps_per_second": 1.25, |
|
"step": 2000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 10000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 2000, |
|
"total_flos": 1.88564197343232e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|