{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 200,
  "global_step": 10000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.2,
      "eval_loss": 0.03862990066409111,
      "eval_runtime": 37.7121,
      "eval_samples_per_second": 424.267,
      "eval_steps_per_second": 6.629,
      "step": 200
    },
    {
      "epoch": 0.4,
      "eval_loss": 0.0523061603307724,
      "eval_runtime": 37.8838,
      "eval_samples_per_second": 422.344,
      "eval_steps_per_second": 6.599,
      "step": 400
    },
    {
      "epoch": 0.5,
      "grad_norm": 72862.21875,
      "learning_rate": 4.9692208514878444e-05,
      "loss": 0.1425,
      "step": 500
    },
    {
      "epoch": 0.6,
      "eval_loss": 1.0542395114898682,
      "eval_runtime": 37.8634,
      "eval_samples_per_second": 422.571,
      "eval_steps_per_second": 6.603,
      "step": 600
    },
    {
      "epoch": 0.8,
      "eval_loss": 1.0459295511245728,
      "eval_runtime": 37.8174,
      "eval_samples_per_second": 423.086,
      "eval_steps_per_second": 6.611,
      "step": 800
    },
    {
      "epoch": 1.0,
      "grad_norm": 4.641770839691162,
      "learning_rate": 4.877641290737884e-05,
      "loss": 0.9177,
      "step": 1000
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.3677258789539337,
      "eval_runtime": 37.6734,
      "eval_samples_per_second": 424.703,
      "eval_steps_per_second": 6.636,
      "step": 1000
    },
    {
      "epoch": 1.2,
      "eval_loss": 0.02963975816965103,
      "eval_runtime": 37.6097,
      "eval_samples_per_second": 425.422,
      "eval_steps_per_second": 6.647,
      "step": 1200
    },
    {
      "epoch": 1.4,
      "eval_loss": 0.028412258252501488,
      "eval_runtime": 37.7795,
      "eval_samples_per_second": 423.51,
      "eval_steps_per_second": 6.617,
      "step": 1400
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.17703795433044434,
      "learning_rate": 4.72751631047092e-05,
      "loss": 0.0421,
      "step": 1500
    },
    {
      "epoch": 1.6,
      "eval_loss": 0.02752041630446911,
      "eval_runtime": 37.7317,
      "eval_samples_per_second": 424.046,
      "eval_steps_per_second": 6.626,
      "step": 1600
    },
    {
      "epoch": 1.8,
      "eval_loss": 0.027357231825590134,
      "eval_runtime": 37.9192,
      "eval_samples_per_second": 421.95,
      "eval_steps_per_second": 6.593,
      "step": 1800
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.14987020194530487,
      "learning_rate": 4.522542485937369e-05,
      "loss": 0.029,
      "step": 2000
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.026410279795527458,
      "eval_runtime": 37.883,
      "eval_samples_per_second": 422.353,
      "eval_steps_per_second": 6.599,
      "step": 2000
    },
    {
      "epoch": 2.2,
      "eval_loss": 0.025838496163487434,
      "eval_runtime": 37.6759,
      "eval_samples_per_second": 424.674,
      "eval_steps_per_second": 6.636,
      "step": 2200
    },
    {
      "epoch": 2.4,
      "eval_loss": 0.025557253509759903,
      "eval_runtime": 37.6691,
      "eval_samples_per_second": 424.751,
      "eval_steps_per_second": 6.637,
      "step": 2400
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.21350397169589996,
      "learning_rate": 4.267766952966369e-05,
      "loss": 0.0276,
      "step": 2500
    },
    {
      "epoch": 2.6,
      "eval_loss": 0.025391312316060066,
      "eval_runtime": 37.9185,
      "eval_samples_per_second": 421.958,
      "eval_steps_per_second": 6.593,
      "step": 2600
    },
    {
      "epoch": 2.8,
      "eval_loss": 0.025234265252947807,
      "eval_runtime": 37.6569,
      "eval_samples_per_second": 424.889,
      "eval_steps_per_second": 6.639,
      "step": 2800
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.09507149457931519,
      "learning_rate": 3.969463130731183e-05,
      "loss": 0.0265,
      "step": 3000
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.025119660422205925,
      "eval_runtime": 37.7071,
      "eval_samples_per_second": 424.323,
      "eval_steps_per_second": 6.63,
      "step": 3000
    },
    {
      "epoch": 3.2,
      "eval_loss": 0.02474472112953663,
      "eval_runtime": 37.8472,
      "eval_samples_per_second": 422.753,
      "eval_steps_per_second": 6.606,
      "step": 3200
    },
    {
      "epoch": 3.4,
      "eval_loss": 0.02474530041217804,
      "eval_runtime": 37.964,
      "eval_samples_per_second": 421.452,
      "eval_steps_per_second": 6.585,
      "step": 3400
    },
    {
      "epoch": 3.5,
      "grad_norm": 0.21493718028068542,
      "learning_rate": 3.634976249348867e-05,
      "loss": 0.0256,
      "step": 3500
    },
    {
      "epoch": 3.6,
      "eval_loss": 0.024641884490847588,
      "eval_runtime": 37.7028,
      "eval_samples_per_second": 424.372,
      "eval_steps_per_second": 6.631,
      "step": 3600
    },
    {
      "epoch": 3.8,
      "eval_loss": 0.02516881749033928,
      "eval_runtime": 37.6697,
      "eval_samples_per_second": 424.745,
      "eval_steps_per_second": 6.637,
      "step": 3800
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.11537094414234161,
      "learning_rate": 3.272542485937369e-05,
      "loss": 0.0262,
      "step": 4000
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.024907398968935013,
      "eval_runtime": 37.8134,
      "eval_samples_per_second": 423.13,
      "eval_steps_per_second": 6.611,
      "step": 4000
    },
    {
      "epoch": 4.2,
      "eval_loss": 0.024411499500274658,
      "eval_runtime": 37.768,
      "eval_samples_per_second": 423.639,
      "eval_steps_per_second": 6.619,
      "step": 4200
    },
    {
      "epoch": 4.4,
      "eval_loss": 0.02421731874346733,
      "eval_runtime": 37.8841,
      "eval_samples_per_second": 422.34,
      "eval_steps_per_second": 6.599,
      "step": 4400
    },
    {
      "epoch": 4.5,
      "grad_norm": 0.15358753502368927,
      "learning_rate": 2.8910861626005776e-05,
      "loss": 0.0255,
      "step": 4500
    },
    {
      "epoch": 4.6,
      "eval_loss": 0.0241916012018919,
      "eval_runtime": 37.7974,
      "eval_samples_per_second": 423.31,
      "eval_steps_per_second": 6.614,
      "step": 4600
    },
    {
      "epoch": 4.8,
      "eval_loss": 0.024809282273054123,
      "eval_runtime": 37.9668,
      "eval_samples_per_second": 421.42,
      "eval_steps_per_second": 6.585,
      "step": 4800
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.21434639394283295,
      "learning_rate": 2.5e-05,
      "loss": 0.0251,
      "step": 5000
    },
    {
      "epoch": 5.0,
      "eval_loss": 0.023995770141482353,
      "eval_runtime": 37.8979,
      "eval_samples_per_second": 422.187,
      "eval_steps_per_second": 6.597,
      "step": 5000
    },
    {
      "epoch": 5.2,
      "eval_loss": 0.024207767099142075,
      "eval_runtime": 37.6737,
      "eval_samples_per_second": 424.7,
      "eval_steps_per_second": 6.636,
      "step": 5200
    },
    {
      "epoch": 5.4,
      "eval_loss": 0.023853810504078865,
      "eval_runtime": 37.6942,
      "eval_samples_per_second": 424.469,
      "eval_steps_per_second": 6.632,
      "step": 5400
    },
    {
      "epoch": 5.5,
      "grad_norm": 0.13689687848091125,
      "learning_rate": 2.1089138373994223e-05,
      "loss": 0.0254,
      "step": 5500
    },
    {
      "epoch": 5.6,
      "eval_loss": 0.02390187606215477,
      "eval_runtime": 37.8063,
      "eval_samples_per_second": 423.21,
      "eval_steps_per_second": 6.613,
      "step": 5600
    },
    {
      "epoch": 5.8,
      "eval_loss": 0.02373598702251911,
      "eval_runtime": 37.708,
      "eval_samples_per_second": 424.313,
      "eval_steps_per_second": 6.63,
      "step": 5800
    },
    {
      "epoch": 6.0,
      "grad_norm": 0.14604102075099945,
      "learning_rate": 1.7274575140626318e-05,
      "loss": 0.0244,
      "step": 6000
    },
    {
      "epoch": 6.0,
      "eval_loss": 0.02371423877775669,
      "eval_runtime": 37.6971,
      "eval_samples_per_second": 424.436,
      "eval_steps_per_second": 6.632,
      "step": 6000
    },
    {
      "epoch": 6.2,
      "eval_loss": 0.02387085184454918,
      "eval_runtime": 37.6715,
      "eval_samples_per_second": 424.724,
      "eval_steps_per_second": 6.636,
      "step": 6200
    },
    {
      "epoch": 6.4,
      "eval_loss": 0.023735951632261276,
      "eval_runtime": 37.9359,
      "eval_samples_per_second": 421.764,
      "eval_steps_per_second": 6.59,
      "step": 6400
    },
    {
      "epoch": 6.5,
      "grad_norm": 0.12262556701898575,
      "learning_rate": 1.3650237506511331e-05,
      "loss": 0.0244,
      "step": 6500
    },
    {
      "epoch": 6.6,
      "eval_loss": 0.023663455620408058,
      "eval_runtime": 37.8483,
      "eval_samples_per_second": 422.74,
      "eval_steps_per_second": 6.605,
      "step": 6600
    },
    {
      "epoch": 6.8,
      "eval_loss": 0.023848820477724075,
      "eval_runtime": 37.7446,
      "eval_samples_per_second": 423.902,
      "eval_steps_per_second": 6.623,
      "step": 6800
    },
    {
      "epoch": 7.0,
      "grad_norm": 0.28356611728668213,
      "learning_rate": 1.0305368692688174e-05,
      "loss": 0.0246,
      "step": 7000
    },
    {
      "epoch": 7.0,
      "eval_loss": 0.02359016053378582,
      "eval_runtime": 37.7474,
      "eval_samples_per_second": 423.87,
      "eval_steps_per_second": 6.623,
      "step": 7000
    },
    {
      "epoch": 7.2,
      "eval_loss": 0.023543963208794594,
      "eval_runtime": 37.7642,
      "eval_samples_per_second": 423.681,
      "eval_steps_per_second": 6.62,
      "step": 7200
    },
    {
      "epoch": 7.4,
      "eval_loss": 0.023517215624451637,
      "eval_runtime": 37.9453,
      "eval_samples_per_second": 421.66,
      "eval_steps_per_second": 6.588,
      "step": 7400
    },
    {
      "epoch": 7.5,
      "grad_norm": 0.10998739302158356,
      "learning_rate": 7.3223304703363135e-06,
      "loss": 0.0242,
      "step": 7500
    },
    {
      "epoch": 7.6,
      "eval_loss": 0.023526180535554886,
      "eval_runtime": 37.925,
      "eval_samples_per_second": 421.885,
      "eval_steps_per_second": 6.592,
      "step": 7600
    },
    {
      "epoch": 7.8,
      "eval_loss": 0.02347410097718239,
      "eval_runtime": 37.6793,
      "eval_samples_per_second": 424.636,
      "eval_steps_per_second": 6.635,
      "step": 7800
    },
    {
      "epoch": 8.0,
      "grad_norm": 0.0942414253950119,
      "learning_rate": 4.7745751406263165e-06,
      "loss": 0.0244,
      "step": 8000
    },
    {
      "epoch": 8.0,
      "eval_loss": 0.023570595309138298,
      "eval_runtime": 37.876,
      "eval_samples_per_second": 422.432,
      "eval_steps_per_second": 6.6,
      "step": 8000
    },
    {
      "epoch": 8.2,
      "eval_loss": 0.023448683321475983,
      "eval_runtime": 37.7532,
      "eval_samples_per_second": 423.805,
      "eval_steps_per_second": 6.622,
      "step": 8200
    },
    {
      "epoch": 8.4,
      "eval_loss": 0.02345592901110649,
      "eval_runtime": 37.6951,
      "eval_samples_per_second": 424.459,
      "eval_steps_per_second": 6.632,
      "step": 8400
    },
    {
      "epoch": 8.5,
      "grad_norm": 0.1675085425376892,
      "learning_rate": 2.7248368952908053e-06,
      "loss": 0.0246,
      "step": 8500
    },
    {
      "epoch": 8.6,
      "eval_loss": 0.023442383855581284,
      "eval_runtime": 37.8284,
      "eval_samples_per_second": 422.963,
      "eval_steps_per_second": 6.609,
      "step": 8600
    },
    {
      "epoch": 8.8,
      "eval_loss": 0.023424193263053894,
      "eval_runtime": 37.8096,
      "eval_samples_per_second": 423.173,
      "eval_steps_per_second": 6.612,
      "step": 8800
    },
    {
      "epoch": 9.0,
      "grad_norm": 0.19085237383842468,
      "learning_rate": 1.2235870926211619e-06,
      "loss": 0.0237,
      "step": 9000
    },
    {
      "epoch": 9.0,
      "eval_loss": 0.023412322625517845,
      "eval_runtime": 37.9142,
      "eval_samples_per_second": 422.006,
      "eval_steps_per_second": 6.594,
      "step": 9000
    },
    {
      "epoch": 9.2,
      "eval_loss": 0.023400841280817986,
      "eval_runtime": 37.7841,
      "eval_samples_per_second": 423.458,
      "eval_steps_per_second": 6.617,
      "step": 9200
    },
    {
      "epoch": 9.4,
      "eval_loss": 0.023405231535434723,
      "eval_runtime": 37.8666,
      "eval_samples_per_second": 422.536,
      "eval_steps_per_second": 6.602,
      "step": 9400
    },
    {
      "epoch": 9.5,
      "grad_norm": 0.16063953936100006,
      "learning_rate": 3.077914851215585e-07,
      "loss": 0.0241,
      "step": 9500
    },
    {
      "epoch": 9.6,
      "eval_loss": 0.023399699479341507,
      "eval_runtime": 37.8663,
      "eval_samples_per_second": 422.539,
      "eval_steps_per_second": 6.602,
      "step": 9600
    },
    {
      "epoch": 9.8,
      "eval_loss": 0.023402543738484383,
      "eval_runtime": 37.6977,
      "eval_samples_per_second": 424.429,
      "eval_steps_per_second": 6.632,
      "step": 9800
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.11751583963632584,
      "learning_rate": 0.0,
      "loss": 0.024,
      "step": 10000
    },
    {
      "epoch": 10.0,
      "eval_loss": 0.023403601720929146,
      "eval_runtime": 37.692,
      "eval_samples_per_second": 424.493,
      "eval_steps_per_second": 6.633,
      "step": 10000
    }
  ],
  "logging_steps": 500,
  "max_steps": 10000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 1000,
  "total_flos": 4.180672512e+16,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}