|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.999360204734485, |
|
"eval_steps": 500, |
|
"global_step": 781, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03198976327575176, |
|
"grad_norm": 0.16216526925563812, |
|
"learning_rate": 6.329113924050633e-05, |
|
"loss": 0.9047, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.06397952655150352, |
|
"grad_norm": 0.22862769663333893, |
|
"learning_rate": 0.00012658227848101267, |
|
"loss": 0.7593, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.09596928982725528, |
|
"grad_norm": 0.2344927191734314, |
|
"learning_rate": 0.00018987341772151899, |
|
"loss": 0.6731, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.12795905310300704, |
|
"grad_norm": 0.2598811686038971, |
|
"learning_rate": 0.00019401709401709402, |
|
"loss": 0.6604, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1599488163787588, |
|
"grad_norm": 0.2400255650281906, |
|
"learning_rate": 0.0001868945868945869, |
|
"loss": 0.6304, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.19193857965451055, |
|
"grad_norm": 0.24225808680057526, |
|
"learning_rate": 0.00017977207977207978, |
|
"loss": 0.6428, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.22392834293026231, |
|
"grad_norm": 0.22859220206737518, |
|
"learning_rate": 0.00017264957264957268, |
|
"loss": 0.6154, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.2559181062060141, |
|
"grad_norm": 0.23886021971702576, |
|
"learning_rate": 0.00016552706552706555, |
|
"loss": 0.6001, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.28790786948176583, |
|
"grad_norm": 0.22877049446105957, |
|
"learning_rate": 0.00015840455840455842, |
|
"loss": 0.5983, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.3198976327575176, |
|
"grad_norm": 0.24990104138851166, |
|
"learning_rate": 0.00015128205128205128, |
|
"loss": 0.5789, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.35188739603326935, |
|
"grad_norm": 0.2319009006023407, |
|
"learning_rate": 0.00014415954415954415, |
|
"loss": 0.5851, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.3838771593090211, |
|
"grad_norm": 0.22513625025749207, |
|
"learning_rate": 0.00013703703703703705, |
|
"loss": 0.5974, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.41586692258477287, |
|
"grad_norm": 0.2516462504863739, |
|
"learning_rate": 0.00012991452991452992, |
|
"loss": 0.5811, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.44785668586052463, |
|
"grad_norm": 0.23952844738960266, |
|
"learning_rate": 0.00012279202279202279, |
|
"loss": 0.5885, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.4798464491362764, |
|
"grad_norm": 0.26743239164352417, |
|
"learning_rate": 0.00011566951566951567, |
|
"loss": 0.5812, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.5118362124120281, |
|
"grad_norm": 0.24359311163425446, |
|
"learning_rate": 0.00010854700854700855, |
|
"loss": 0.5853, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5438259756877799, |
|
"grad_norm": 0.26046106219291687, |
|
"learning_rate": 0.00010142450142450144, |
|
"loss": 0.5717, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.5758157389635317, |
|
"grad_norm": 0.2767123878002167, |
|
"learning_rate": 9.430199430199431e-05, |
|
"loss": 0.5711, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.6078055022392834, |
|
"grad_norm": 0.2743181884288788, |
|
"learning_rate": 8.717948717948718e-05, |
|
"loss": 0.5786, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.6397952655150352, |
|
"grad_norm": 0.2655166983604431, |
|
"learning_rate": 8.005698005698006e-05, |
|
"loss": 0.563, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6717850287907869, |
|
"grad_norm": 0.2630331814289093, |
|
"learning_rate": 7.293447293447295e-05, |
|
"loss": 0.5688, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.7037747920665387, |
|
"grad_norm": 0.27685314416885376, |
|
"learning_rate": 6.581196581196581e-05, |
|
"loss": 0.5687, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.7357645553422905, |
|
"grad_norm": 0.2695849537849426, |
|
"learning_rate": 5.868945868945869e-05, |
|
"loss": 0.5592, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.7677543186180422, |
|
"grad_norm": 0.25096848607063293, |
|
"learning_rate": 5.156695156695157e-05, |
|
"loss": 0.5477, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.799744081893794, |
|
"grad_norm": 0.2821820378303528, |
|
"learning_rate": 4.4444444444444447e-05, |
|
"loss": 0.5574, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.8317338451695457, |
|
"grad_norm": 0.26849839091300964, |
|
"learning_rate": 3.732193732193732e-05, |
|
"loss": 0.5662, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.8637236084452975, |
|
"grad_norm": 0.27688708901405334, |
|
"learning_rate": 3.01994301994302e-05, |
|
"loss": 0.5485, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.8957133717210493, |
|
"grad_norm": 0.2868192195892334, |
|
"learning_rate": 2.307692307692308e-05, |
|
"loss": 0.5493, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.927703134996801, |
|
"grad_norm": 0.28862541913986206, |
|
"learning_rate": 1.5954415954415954e-05, |
|
"loss": 0.5494, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.9596928982725528, |
|
"grad_norm": 0.2842100262641907, |
|
"learning_rate": 8.831908831908831e-06, |
|
"loss": 0.5582, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.9916826615483045, |
|
"grad_norm": 0.2615242302417755, |
|
"learning_rate": 1.7094017094017097e-06, |
|
"loss": 0.5641, |
|
"step": 775 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 781, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"total_flos": 2.5396281704290714e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|