|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 50, |
|
"global_step": 587, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.017035775127768313, |
|
"grad_norm": 1.6640625, |
|
"learning_rate": 5e-06, |
|
"loss": 0.3494, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.034071550255536626, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 1e-05, |
|
"loss": 0.3494, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05110732538330494, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.3523, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.06814310051107325, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 2e-05, |
|
"loss": 0.3428, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.08517887563884156, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.359, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.08517887563884156, |
|
"eval_loss": 0.3472045063972473, |
|
"eval_runtime": 44.9607, |
|
"eval_samples_per_second": 3.27, |
|
"eval_steps_per_second": 3.27, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.10221465076660988, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 3e-05, |
|
"loss": 0.3494, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.11925042589437819, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 3.5e-05, |
|
"loss": 0.3527, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.1362862010221465, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 4e-05, |
|
"loss": 0.349, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.15332197614991483, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.3432, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.17035775127768313, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 5e-05, |
|
"loss": 0.3252, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.17035775127768313, |
|
"eval_loss": 0.35169535875320435, |
|
"eval_runtime": 45.0372, |
|
"eval_samples_per_second": 3.264, |
|
"eval_steps_per_second": 3.264, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18739352640545145, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 4.897330595482547e-05, |
|
"loss": 0.3191, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.20442930153321975, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 4.7946611909650925e-05, |
|
"loss": 0.289, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.22146507666098808, |
|
"grad_norm": 3.765625, |
|
"learning_rate": 4.691991786447639e-05, |
|
"loss": 0.2395, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.23850085178875638, |
|
"grad_norm": 9.5625, |
|
"learning_rate": 4.5893223819301853e-05, |
|
"loss": 0.3263, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2555366269165247, |
|
"grad_norm": 4.84375, |
|
"learning_rate": 4.486652977412731e-05, |
|
"loss": 0.1648, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.2555366269165247, |
|
"eval_loss": 0.611720085144043, |
|
"eval_runtime": 44.8864, |
|
"eval_samples_per_second": 3.275, |
|
"eval_steps_per_second": 3.275, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.272572402044293, |
|
"grad_norm": 21.125, |
|
"learning_rate": 4.383983572895277e-05, |
|
"loss": 0.163, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.28960817717206133, |
|
"grad_norm": 7.25, |
|
"learning_rate": 4.281314168377823e-05, |
|
"loss": 0.4368, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.30664395229982966, |
|
"grad_norm": 29.625, |
|
"learning_rate": 4.17864476386037e-05, |
|
"loss": 0.2398, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.32367972742759793, |
|
"grad_norm": 15.3125, |
|
"learning_rate": 4.075975359342916e-05, |
|
"loss": 0.1639, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.34071550255536626, |
|
"grad_norm": 0.119140625, |
|
"learning_rate": 3.973305954825462e-05, |
|
"loss": 0.2869, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.34071550255536626, |
|
"eval_loss": 0.6622754335403442, |
|
"eval_runtime": 44.5126, |
|
"eval_samples_per_second": 3.302, |
|
"eval_steps_per_second": 3.302, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3577512776831346, |
|
"grad_norm": 0.021484375, |
|
"learning_rate": 3.8706365503080084e-05, |
|
"loss": 0.1419, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.3747870528109029, |
|
"grad_norm": 9.6875, |
|
"learning_rate": 3.767967145790555e-05, |
|
"loss": 0.1224, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.39182282793867124, |
|
"grad_norm": 0.0693359375, |
|
"learning_rate": 3.6652977412731007e-05, |
|
"loss": 0.3138, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.4088586030664395, |
|
"grad_norm": 6.8125, |
|
"learning_rate": 3.562628336755647e-05, |
|
"loss": 0.1766, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.42589437819420783, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 3.459958932238193e-05, |
|
"loss": 0.1243, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.42589437819420783, |
|
"eval_loss": 0.6063656210899353, |
|
"eval_runtime": 43.927, |
|
"eval_samples_per_second": 3.346, |
|
"eval_steps_per_second": 3.346, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.44293015332197616, |
|
"grad_norm": 7.59375, |
|
"learning_rate": 3.357289527720739e-05, |
|
"loss": 0.1272, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.4599659284497445, |
|
"grad_norm": 28.375, |
|
"learning_rate": 3.254620123203286e-05, |
|
"loss": 0.4251, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.47700170357751276, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 3.1519507186858315e-05, |
|
"loss": 0.4472, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.4940374787052811, |
|
"grad_norm": 40.25, |
|
"learning_rate": 3.049281314168378e-05, |
|
"loss": 0.4209, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5110732538330494, |
|
"grad_norm": 0.031494140625, |
|
"learning_rate": 2.9466119096509244e-05, |
|
"loss": 0.1857, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5110732538330494, |
|
"eval_loss": 0.7174944281578064, |
|
"eval_runtime": 43.835, |
|
"eval_samples_per_second": 3.353, |
|
"eval_steps_per_second": 3.353, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5281090289608177, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 2.8439425051334705e-05, |
|
"loss": 0.2324, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.545144804088586, |
|
"grad_norm": 0.0184326171875, |
|
"learning_rate": 2.7412731006160163e-05, |
|
"loss": 0.1289, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.5621805792163543, |
|
"grad_norm": 0.1201171875, |
|
"learning_rate": 2.6386036960985628e-05, |
|
"loss": 0.1295, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.5792163543441227, |
|
"grad_norm": 0.005859375, |
|
"learning_rate": 2.5359342915811092e-05, |
|
"loss": 0.393, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.596252129471891, |
|
"grad_norm": 23.375, |
|
"learning_rate": 2.433264887063655e-05, |
|
"loss": 0.3171, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.596252129471891, |
|
"eval_loss": 0.7911351919174194, |
|
"eval_runtime": 43.7143, |
|
"eval_samples_per_second": 3.363, |
|
"eval_steps_per_second": 3.363, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.6132879045996593, |
|
"grad_norm": 0.0242919921875, |
|
"learning_rate": 2.3305954825462014e-05, |
|
"loss": 0.535, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.6303236797274276, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 2.2279260780287475e-05, |
|
"loss": 0.2197, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.6473594548551959, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 2.125256673511294e-05, |
|
"loss": 0.3187, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.6643952299829642, |
|
"grad_norm": 7.8125, |
|
"learning_rate": 2.02258726899384e-05, |
|
"loss": 0.1459, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.6814310051107325, |
|
"grad_norm": 39.25, |
|
"learning_rate": 1.919917864476386e-05, |
|
"loss": 0.5212, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6814310051107325, |
|
"eval_loss": 0.7614322304725647, |
|
"eval_runtime": 43.7037, |
|
"eval_samples_per_second": 3.364, |
|
"eval_steps_per_second": 3.364, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6984667802385008, |
|
"grad_norm": 3.640625, |
|
"learning_rate": 1.8172484599589323e-05, |
|
"loss": 0.1773, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.7155025553662692, |
|
"grad_norm": 0.005645751953125, |
|
"learning_rate": 1.7145790554414784e-05, |
|
"loss": 0.2599, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.7325383304940375, |
|
"grad_norm": 30.5, |
|
"learning_rate": 1.611909650924025e-05, |
|
"loss": 0.3468, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.7495741056218058, |
|
"grad_norm": 5.5, |
|
"learning_rate": 1.5092402464065708e-05, |
|
"loss": 0.2754, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.7666098807495741, |
|
"grad_norm": 32.0, |
|
"learning_rate": 1.406570841889117e-05, |
|
"loss": 0.3287, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.7666098807495741, |
|
"eval_loss": 0.8178677558898926, |
|
"eval_runtime": 43.7442, |
|
"eval_samples_per_second": 3.36, |
|
"eval_steps_per_second": 3.36, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.7836456558773425, |
|
"grad_norm": 3.0, |
|
"learning_rate": 1.3039014373716632e-05, |
|
"loss": 0.1921, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.8006814310051107, |
|
"grad_norm": 28.125, |
|
"learning_rate": 1.2012320328542096e-05, |
|
"loss": 0.4395, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.817717206132879, |
|
"grad_norm": 4.25, |
|
"learning_rate": 1.0985626283367557e-05, |
|
"loss": 0.2133, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.8347529812606473, |
|
"grad_norm": 7.25, |
|
"learning_rate": 9.95893223819302e-06, |
|
"loss": 0.4362, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.8517887563884157, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 8.932238193018481e-06, |
|
"loss": 0.418, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.8517887563884157, |
|
"eval_loss": 0.8019587993621826, |
|
"eval_runtime": 43.8445, |
|
"eval_samples_per_second": 3.353, |
|
"eval_steps_per_second": 3.353, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.868824531516184, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 7.905544147843944e-06, |
|
"loss": 0.236, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.8858603066439523, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 6.878850102669406e-06, |
|
"loss": 0.5388, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.9028960817717206, |
|
"grad_norm": 0.00543212890625, |
|
"learning_rate": 5.852156057494867e-06, |
|
"loss": 0.0928, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.919931856899489, |
|
"grad_norm": 10.9375, |
|
"learning_rate": 4.825462012320329e-06, |
|
"loss": 0.1343, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.9369676320272572, |
|
"grad_norm": 6.59375, |
|
"learning_rate": 3.7987679671457908e-06, |
|
"loss": 0.099, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.9369676320272572, |
|
"eval_loss": 0.8314433097839355, |
|
"eval_runtime": 43.8265, |
|
"eval_samples_per_second": 3.354, |
|
"eval_steps_per_second": 3.354, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.9540034071550255, |
|
"grad_norm": 32.75, |
|
"learning_rate": 2.7720739219712527e-06, |
|
"loss": 0.1544, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.9710391822827938, |
|
"grad_norm": 22.75, |
|
"learning_rate": 1.7453798767967144e-06, |
|
"loss": 0.2434, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.9880749574105622, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 7.186858316221766e-07, |
|
"loss": 0.1046, |
|
"step": 580 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 587, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8836792572744000.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|