{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 587, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017035775127768313, "grad_norm": 1.6640625, "learning_rate": 5e-06, "loss": 0.3494, "step": 10 }, { "epoch": 0.034071550255536626, "grad_norm": 1.203125, "learning_rate": 1e-05, "loss": 0.3494, "step": 20 }, { "epoch": 0.05110732538330494, "grad_norm": 1.2421875, "learning_rate": 1.5e-05, "loss": 0.3523, "step": 30 }, { "epoch": 0.06814310051107325, "grad_norm": 1.53125, "learning_rate": 2e-05, "loss": 0.3428, "step": 40 }, { "epoch": 0.08517887563884156, "grad_norm": 1.3046875, "learning_rate": 2.5e-05, "loss": 0.359, "step": 50 }, { "epoch": 0.08517887563884156, "eval_loss": 0.3472045063972473, "eval_runtime": 44.9607, "eval_samples_per_second": 3.27, "eval_steps_per_second": 3.27, "step": 50 }, { "epoch": 0.10221465076660988, "grad_norm": 1.21875, "learning_rate": 3e-05, "loss": 0.3494, "step": 60 }, { "epoch": 0.11925042589437819, "grad_norm": 1.2734375, "learning_rate": 3.5e-05, "loss": 0.3527, "step": 70 }, { "epoch": 0.1362862010221465, "grad_norm": 0.64453125, "learning_rate": 4e-05, "loss": 0.349, "step": 80 }, { "epoch": 0.15332197614991483, "grad_norm": 1.0703125, "learning_rate": 4.5e-05, "loss": 0.3432, "step": 90 }, { "epoch": 0.17035775127768313, "grad_norm": 1.40625, "learning_rate": 5e-05, "loss": 0.3252, "step": 100 }, { "epoch": 0.17035775127768313, "eval_loss": 0.35169535875320435, "eval_runtime": 45.0372, "eval_samples_per_second": 3.264, "eval_steps_per_second": 3.264, "step": 100 }, { "epoch": 0.18739352640545145, "grad_norm": 1.2890625, "learning_rate": 4.897330595482547e-05, "loss": 0.3191, "step": 110 }, { "epoch": 0.20442930153321975, "grad_norm": 2.28125, "learning_rate": 4.7946611909650925e-05, "loss": 0.289, "step": 120 }, { "epoch": 0.22146507666098808, "grad_norm": 3.765625, "learning_rate": 4.691991786447639e-05, "loss": 0.2395, "step": 130 }, { "epoch": 0.23850085178875638, "grad_norm": 9.5625, "learning_rate": 4.5893223819301853e-05, "loss": 0.3263, "step": 140 }, { "epoch": 0.2555366269165247, "grad_norm": 4.84375, "learning_rate": 4.486652977412731e-05, "loss": 0.1648, "step": 150 }, { "epoch": 0.2555366269165247, "eval_loss": 0.611720085144043, "eval_runtime": 44.8864, "eval_samples_per_second": 3.275, "eval_steps_per_second": 3.275, "step": 150 }, { "epoch": 0.272572402044293, "grad_norm": 21.125, "learning_rate": 4.383983572895277e-05, "loss": 0.163, "step": 160 }, { "epoch": 0.28960817717206133, "grad_norm": 7.25, "learning_rate": 4.281314168377823e-05, "loss": 0.4368, "step": 170 }, { "epoch": 0.30664395229982966, "grad_norm": 29.625, "learning_rate": 4.17864476386037e-05, "loss": 0.2398, "step": 180 }, { "epoch": 0.32367972742759793, "grad_norm": 15.3125, "learning_rate": 4.075975359342916e-05, "loss": 0.1639, "step": 190 }, { "epoch": 0.34071550255536626, "grad_norm": 0.119140625, "learning_rate": 3.973305954825462e-05, "loss": 0.2869, "step": 200 }, { "epoch": 0.34071550255536626, "eval_loss": 0.6622754335403442, "eval_runtime": 44.5126, "eval_samples_per_second": 3.302, "eval_steps_per_second": 3.302, "step": 200 }, { "epoch": 0.3577512776831346, "grad_norm": 0.021484375, "learning_rate": 3.8706365503080084e-05, "loss": 0.1419, "step": 210 }, { "epoch": 0.3747870528109029, "grad_norm": 9.6875, "learning_rate": 3.767967145790555e-05, "loss": 0.1224, "step": 220 }, { "epoch": 0.39182282793867124, "grad_norm": 0.0693359375, "learning_rate": 3.6652977412731007e-05, "loss": 0.3138, "step": 230 }, { "epoch": 0.4088586030664395, "grad_norm": 6.8125, "learning_rate": 3.562628336755647e-05, "loss": 0.1766, "step": 240 }, { "epoch": 0.42589437819420783, "grad_norm": 0.07421875, "learning_rate": 3.459958932238193e-05, "loss": 0.1243, "step": 250 }, { "epoch": 0.42589437819420783, "eval_loss": 0.6063656210899353, "eval_runtime": 43.927, "eval_samples_per_second": 3.346, "eval_steps_per_second": 3.346, "step": 250 }, { "epoch": 0.44293015332197616, "grad_norm": 7.59375, "learning_rate": 3.357289527720739e-05, "loss": 0.1272, "step": 260 }, { "epoch": 0.4599659284497445, "grad_norm": 28.375, "learning_rate": 3.254620123203286e-05, "loss": 0.4251, "step": 270 }, { "epoch": 0.47700170357751276, "grad_norm": 0.462890625, "learning_rate": 3.1519507186858315e-05, "loss": 0.4472, "step": 280 }, { "epoch": 0.4940374787052811, "grad_norm": 40.25, "learning_rate": 3.049281314168378e-05, "loss": 0.4209, "step": 290 }, { "epoch": 0.5110732538330494, "grad_norm": 0.031494140625, "learning_rate": 2.9466119096509244e-05, "loss": 0.1857, "step": 300 }, { "epoch": 0.5110732538330494, "eval_loss": 0.7174944281578064, "eval_runtime": 43.835, "eval_samples_per_second": 3.353, "eval_steps_per_second": 3.353, "step": 300 }, { "epoch": 0.5281090289608177, "grad_norm": 3.015625, "learning_rate": 2.8439425051334705e-05, "loss": 0.2324, "step": 310 }, { "epoch": 0.545144804088586, "grad_norm": 0.0184326171875, "learning_rate": 2.7412731006160163e-05, "loss": 0.1289, "step": 320 }, { "epoch": 0.5621805792163543, "grad_norm": 0.1201171875, "learning_rate": 2.6386036960985628e-05, "loss": 0.1295, "step": 330 }, { "epoch": 0.5792163543441227, "grad_norm": 0.005859375, "learning_rate": 2.5359342915811092e-05, "loss": 0.393, "step": 340 }, { "epoch": 0.596252129471891, "grad_norm": 23.375, "learning_rate": 2.433264887063655e-05, "loss": 0.3171, "step": 350 }, { "epoch": 0.596252129471891, "eval_loss": 0.7911351919174194, "eval_runtime": 43.7143, "eval_samples_per_second": 3.363, "eval_steps_per_second": 3.363, "step": 350 }, { "epoch": 0.6132879045996593, "grad_norm": 0.0242919921875, "learning_rate": 2.3305954825462014e-05, "loss": 0.535, "step": 360 }, { "epoch": 0.6303236797274276, "grad_norm": 0.50390625, "learning_rate": 2.2279260780287475e-05, "loss": 0.2197, "step": 370 }, { "epoch": 0.6473594548551959, "grad_norm": 2.53125, "learning_rate": 2.125256673511294e-05, "loss": 0.3187, "step": 380 }, { "epoch": 0.6643952299829642, "grad_norm": 7.8125, "learning_rate": 2.02258726899384e-05, "loss": 0.1459, "step": 390 }, { "epoch": 0.6814310051107325, "grad_norm": 39.25, "learning_rate": 1.919917864476386e-05, "loss": 0.5212, "step": 400 }, { "epoch": 0.6814310051107325, "eval_loss": 0.7614322304725647, "eval_runtime": 43.7037, "eval_samples_per_second": 3.364, "eval_steps_per_second": 3.364, "step": 400 }, { "epoch": 0.6984667802385008, "grad_norm": 3.640625, "learning_rate": 1.8172484599589323e-05, "loss": 0.1773, "step": 410 }, { "epoch": 0.7155025553662692, "grad_norm": 0.005645751953125, "learning_rate": 1.7145790554414784e-05, "loss": 0.2599, "step": 420 }, { "epoch": 0.7325383304940375, "grad_norm": 30.5, "learning_rate": 1.611909650924025e-05, "loss": 0.3468, "step": 430 }, { "epoch": 0.7495741056218058, "grad_norm": 5.5, "learning_rate": 1.5092402464065708e-05, "loss": 0.2754, "step": 440 }, { "epoch": 0.7666098807495741, "grad_norm": 32.0, "learning_rate": 1.406570841889117e-05, "loss": 0.3287, "step": 450 }, { "epoch": 0.7666098807495741, "eval_loss": 0.8178677558898926, "eval_runtime": 43.7442, "eval_samples_per_second": 3.36, "eval_steps_per_second": 3.36, "step": 450 }, { "epoch": 0.7836456558773425, "grad_norm": 3.0, "learning_rate": 1.3039014373716632e-05, "loss": 0.1921, "step": 460 }, { "epoch": 0.8006814310051107, "grad_norm": 28.125, "learning_rate": 1.2012320328542096e-05, "loss": 0.4395, "step": 470 }, { "epoch": 0.817717206132879, "grad_norm": 4.25, "learning_rate": 1.0985626283367557e-05, "loss": 0.2133, "step": 480 }, { "epoch": 0.8347529812606473, "grad_norm": 7.25, "learning_rate": 9.95893223819302e-06, "loss": 0.4362, "step": 490 }, { "epoch": 0.8517887563884157, "grad_norm": 5.78125, "learning_rate": 8.932238193018481e-06, "loss": 0.418, "step": 500 }, { "epoch": 0.8517887563884157, "eval_loss": 0.8019587993621826, "eval_runtime": 43.8445, "eval_samples_per_second": 3.353, "eval_steps_per_second": 3.353, "step": 500 }, { "epoch": 0.868824531516184, "grad_norm": 5.84375, "learning_rate": 7.905544147843944e-06, "loss": 0.236, "step": 510 }, { "epoch": 0.8858603066439523, "grad_norm": 9.0625, "learning_rate": 6.878850102669406e-06, "loss": 0.5388, "step": 520 }, { "epoch": 0.9028960817717206, "grad_norm": 0.00543212890625, "learning_rate": 5.852156057494867e-06, "loss": 0.0928, "step": 530 }, { "epoch": 0.919931856899489, "grad_norm": 10.9375, "learning_rate": 4.825462012320329e-06, "loss": 0.1343, "step": 540 }, { "epoch": 0.9369676320272572, "grad_norm": 6.59375, "learning_rate": 3.7987679671457908e-06, "loss": 0.099, "step": 550 }, { "epoch": 0.9369676320272572, "eval_loss": 0.8314433097839355, "eval_runtime": 43.8265, "eval_samples_per_second": 3.354, "eval_steps_per_second": 3.354, "step": 550 }, { "epoch": 0.9540034071550255, "grad_norm": 32.75, "learning_rate": 2.7720739219712527e-06, "loss": 0.1544, "step": 560 }, { "epoch": 0.9710391822827938, "grad_norm": 22.75, "learning_rate": 1.7453798767967144e-06, "loss": 0.2434, "step": 570 }, { "epoch": 0.9880749574105622, "grad_norm": 0.33203125, "learning_rate": 7.186858316221766e-07, "loss": 0.1046, "step": 580 } ], "logging_steps": 10, "max_steps": 587, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8836792572744000.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }