{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.08265497808390733, "eval_steps": 9, "global_step": 99, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008348987685243164, "eval_loss": 1.8743113279342651, "eval_runtime": 174.6656, "eval_samples_per_second": 11.554, "eval_steps_per_second": 1.448, "step": 1 }, { "epoch": 0.002504696305572949, "grad_norm": 0.5196024179458618, "learning_rate": 3e-05, "loss": 1.8288, "step": 3 }, { "epoch": 0.005009392611145898, "grad_norm": 0.5937802195549011, "learning_rate": 6e-05, "loss": 1.8789, "step": 6 }, { "epoch": 0.007514088916718848, "grad_norm": 0.5395702123641968, "learning_rate": 9e-05, "loss": 1.8458, "step": 9 }, { "epoch": 0.007514088916718848, "eval_loss": 1.7832880020141602, "eval_runtime": 176.3872, "eval_samples_per_second": 11.441, "eval_steps_per_second": 1.434, "step": 9 }, { "epoch": 0.010018785222291797, "grad_norm": 0.46189063787460327, "learning_rate": 9.987820251299122e-05, "loss": 1.7233, "step": 12 }, { "epoch": 0.012523481527864746, "grad_norm": 0.6529078483581543, "learning_rate": 9.924038765061042e-05, "loss": 1.6317, "step": 15 }, { "epoch": 0.015028177833437696, "grad_norm": 1.0299454927444458, "learning_rate": 9.806308479691595e-05, "loss": 1.6251, "step": 18 }, { "epoch": 0.015028177833437696, "eval_loss": 1.5364857912063599, "eval_runtime": 176.4427, "eval_samples_per_second": 11.437, "eval_steps_per_second": 1.434, "step": 18 }, { "epoch": 0.017532874139010644, "grad_norm": 0.4936962127685547, "learning_rate": 9.635919272833938e-05, "loss": 1.4772, "step": 21 }, { "epoch": 0.020037570444583593, "grad_norm": 0.5406275987625122, "learning_rate": 9.414737964294636e-05, "loss": 1.3968, "step": 24 }, { "epoch": 0.022542266750156543, "grad_norm": 0.5169019103050232, "learning_rate": 9.145187862775209e-05, "loss": 1.4935, "step": 27 }, { "epoch": 0.022542266750156543, "eval_loss": 1.4429470300674438, "eval_runtime": 176.5514, "eval_samples_per_second": 11.43, "eval_steps_per_second": 1.433, "step": 27 }, { "epoch": 0.025046963055729492, "grad_norm": 0.4290275275707245, "learning_rate": 8.83022221559489e-05, "loss": 1.4523, "step": 30 }, { "epoch": 0.027551659361302442, "grad_norm": 0.36365261673927307, "learning_rate": 8.473291852294987e-05, "loss": 1.4082, "step": 33 }, { "epoch": 0.03005635566687539, "grad_norm": 0.3774980902671814, "learning_rate": 8.07830737662829e-05, "loss": 1.4659, "step": 36 }, { "epoch": 0.03005635566687539, "eval_loss": 1.391577124595642, "eval_runtime": 176.3986, "eval_samples_per_second": 11.44, "eval_steps_per_second": 1.434, "step": 36 }, { "epoch": 0.03256105197244834, "grad_norm": 0.47076305747032166, "learning_rate": 7.649596321166024e-05, "loss": 1.3466, "step": 39 }, { "epoch": 0.03506574827802129, "grad_norm": 0.4052177369594574, "learning_rate": 7.191855733945387e-05, "loss": 1.4102, "step": 42 }, { "epoch": 0.03757044458359424, "grad_norm": 0.44520846009254456, "learning_rate": 6.710100716628344e-05, "loss": 1.3796, "step": 45 }, { "epoch": 0.03757044458359424, "eval_loss": 1.3682787418365479, "eval_runtime": 176.4236, "eval_samples_per_second": 11.438, "eval_steps_per_second": 1.434, "step": 45 }, { "epoch": 0.040075140889167186, "grad_norm": 0.44555261731147766, "learning_rate": 6.209609477998338e-05, "loss": 1.4148, "step": 48 }, { "epoch": 0.042579837194740136, "grad_norm": 0.5481745004653931, "learning_rate": 5.695865504800327e-05, "loss": 1.3406, "step": 51 }, { "epoch": 0.045084533500313086, "grad_norm": 0.47268688678741455, "learning_rate": 5.174497483512506e-05, "loss": 1.3965, "step": 54 }, { "epoch": 0.045084533500313086, "eval_loss": 1.3560230731964111, "eval_runtime": 176.478, "eval_samples_per_second": 11.435, "eval_steps_per_second": 1.434, "step": 54 }, { "epoch": 0.047589229805886035, "grad_norm": 0.3834378719329834, "learning_rate": 4.6512176312793736e-05, "loss": 1.3326, "step": 57 }, { "epoch": 0.050093926111458985, "grad_norm": 0.467364102602005, "learning_rate": 4.131759111665349e-05, "loss": 1.3105, "step": 60 }, { "epoch": 0.052598622417031934, "grad_norm": 0.4773382246494293, "learning_rate": 3.6218132209150045e-05, "loss": 1.3209, "step": 63 }, { "epoch": 0.052598622417031934, "eval_loss": 1.3461052179336548, "eval_runtime": 176.5447, "eval_samples_per_second": 11.431, "eval_steps_per_second": 1.433, "step": 63 }, { "epoch": 0.055103318722604884, "grad_norm": 0.4290798604488373, "learning_rate": 3.12696703292044e-05, "loss": 1.3209, "step": 66 }, { "epoch": 0.057608015028177834, "grad_norm": 0.38160884380340576, "learning_rate": 2.6526421860705473e-05, "loss": 1.3099, "step": 69 }, { "epoch": 0.06011271133375078, "grad_norm": 0.4193227291107178, "learning_rate": 2.2040354826462668e-05, "loss": 1.3537, "step": 72 }, { "epoch": 0.06011271133375078, "eval_loss": 1.341897964477539, "eval_runtime": 176.4889, "eval_samples_per_second": 11.434, "eval_steps_per_second": 1.434, "step": 72 }, { "epoch": 0.06261740763932373, "grad_norm": 0.38362041115760803, "learning_rate": 1.7860619515673033e-05, "loss": 1.2482, "step": 75 }, { "epoch": 0.06512210394489668, "grad_norm": 0.34432926774024963, "learning_rate": 1.4033009983067452e-05, "loss": 1.3452, "step": 78 }, { "epoch": 0.06762680025046963, "grad_norm": 0.331984281539917, "learning_rate": 1.0599462319663905e-05, "loss": 1.275, "step": 81 }, { "epoch": 0.06762680025046963, "eval_loss": 1.3398759365081787, "eval_runtime": 176.5136, "eval_samples_per_second": 11.433, "eval_steps_per_second": 1.433, "step": 81 }, { "epoch": 0.07013149655604257, "grad_norm": 0.38239288330078125, "learning_rate": 7.597595192178702e-06, "loss": 1.3075, "step": 84 }, { "epoch": 0.07263619286161553, "grad_norm": 0.35065001249313354, "learning_rate": 5.060297685041659e-06, "loss": 1.3749, "step": 87 }, { "epoch": 0.07514088916718847, "grad_norm": 0.33161821961402893, "learning_rate": 3.0153689607045845e-06, "loss": 1.3186, "step": 90 }, { "epoch": 0.07514088916718847, "eval_loss": 1.3385422229766846, "eval_runtime": 176.3071, "eval_samples_per_second": 11.446, "eval_steps_per_second": 1.435, "step": 90 }, { "epoch": 0.07764558547276143, "grad_norm": 0.3372178375720978, "learning_rate": 1.4852136862001764e-06, "loss": 1.3194, "step": 93 }, { "epoch": 0.08015028177833437, "grad_norm": 0.35166051983833313, "learning_rate": 4.865965629214819e-07, "loss": 1.3366, "step": 96 }, { "epoch": 0.08265497808390733, "grad_norm": 0.33704036474227905, "learning_rate": 3.04586490452119e-08, "loss": 1.3919, "step": 99 }, { "epoch": 0.08265497808390733, "eval_loss": 1.3384730815887451, "eval_runtime": 176.3253, "eval_samples_per_second": 11.445, "eval_steps_per_second": 1.435, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0918272387199795e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }