{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.011189437171310284,
  "eval_steps": 500,
  "global_step": 400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0005594718585655141,
      "grad_norm": 0.87109375,
      "learning_rate": 1.3986013986013987e-08,
      "loss": 1.4848,
      "step": 20
    },
    {
      "epoch": 0.0011189437171310282,
      "grad_norm": 1.0,
      "learning_rate": 2.7972027972027974e-08,
      "loss": 1.4825,
      "step": 40
    },
    {
      "epoch": 0.0016784155756965425,
      "grad_norm": 0.94921875,
      "learning_rate": 4.195804195804196e-08,
      "loss": 1.452,
      "step": 60
    },
    {
      "epoch": 0.0022378874342620565,
      "grad_norm": 0.7265625,
      "learning_rate": 5.594405594405595e-08,
      "loss": 1.5023,
      "step": 80
    },
    {
      "epoch": 0.002797359292827571,
      "grad_norm": 0.640625,
      "learning_rate": 6.993006993006993e-08,
      "loss": 1.5079,
      "step": 100
    },
    {
      "epoch": 0.003356831151393085,
      "grad_norm": 0.81640625,
      "learning_rate": 8.391608391608393e-08,
      "loss": 1.4874,
      "step": 120
    },
    {
      "epoch": 0.003916303009958599,
      "grad_norm": 0.75390625,
      "learning_rate": 9.790209790209792e-08,
      "loss": 1.4273,
      "step": 140
    },
    {
      "epoch": 0.004475774868524113,
      "grad_norm": 0.8828125,
      "learning_rate": 1.118881118881119e-07,
      "loss": 1.4738,
      "step": 160
    },
    {
      "epoch": 0.005035246727089627,
      "grad_norm": 0.83203125,
      "learning_rate": 1.258741258741259e-07,
      "loss": 1.4768,
      "step": 180
    },
    {
      "epoch": 0.005594718585655142,
      "grad_norm": 0.67578125,
      "learning_rate": 1.3986013986013987e-07,
      "loss": 1.4834,
      "step": 200
    },
    {
      "epoch": 0.006154190444220655,
      "grad_norm": 0.98828125,
      "learning_rate": 1.5384615384615387e-07,
      "loss": 1.5255,
      "step": 220
    },
    {
      "epoch": 0.00671366230278617,
      "grad_norm": 0.70703125,
      "learning_rate": 1.6783216783216785e-07,
      "loss": 1.4849,
      "step": 240
    },
    {
      "epoch": 0.007273134161351684,
      "grad_norm": 0.9140625,
      "learning_rate": 1.8181818181818183e-07,
      "loss": 1.5231,
      "step": 260
    },
    {
      "epoch": 0.007832606019917199,
      "grad_norm": 0.87109375,
      "learning_rate": 1.9580419580419583e-07,
      "loss": 1.4558,
      "step": 280
    },
    {
      "epoch": 0.008392077878482713,
      "grad_norm": 1.2890625,
      "learning_rate": 2.097902097902098e-07,
      "loss": 1.5004,
      "step": 300
    },
    {
      "epoch": 0.008951549737048226,
      "grad_norm": 0.765625,
      "learning_rate": 2.237762237762238e-07,
      "loss": 1.4348,
      "step": 320
    },
    {
      "epoch": 0.00951102159561374,
      "grad_norm": 0.76953125,
      "learning_rate": 2.3776223776223777e-07,
      "loss": 1.4752,
      "step": 340
    },
    {
      "epoch": 0.010070493454179255,
      "grad_norm": 1.015625,
      "learning_rate": 2.517482517482518e-07,
      "loss": 1.4899,
      "step": 360
    },
    {
      "epoch": 0.01062996531274477,
      "grad_norm": 0.78515625,
      "learning_rate": 2.6573426573426575e-07,
      "loss": 1.4307,
      "step": 380
    },
    {
      "epoch": 0.011189437171310284,
      "grad_norm": 0.71875,
      "learning_rate": 2.7972027972027973e-07,
      "loss": 1.4233,
      "step": 400
    }
  ],
  "logging_steps": 20,
  "max_steps": 35748,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "total_flos": 7.36836603346944e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}