{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 252, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11904761904761904, "grad_norm": 8.735864265285194, "learning_rate": 5.263157894736842e-07, "loss": 0.8721, "step": 10 }, { "epoch": 0.23809523809523808, "grad_norm": 3.0586867449133153, "learning_rate": 1.0526315789473683e-06, "loss": 0.7667, "step": 20 }, { "epoch": 0.35714285714285715, "grad_norm": 1.3919491548026783, "learning_rate": 1.5789473684210526e-06, "loss": 0.6797, "step": 30 }, { "epoch": 0.47619047619047616, "grad_norm": 1.3015106207743352, "learning_rate": 1.9996767546702485e-06, "loss": 0.6273, "step": 40 }, { "epoch": 0.5952380952380952, "grad_norm": 3.14961074638027, "learning_rate": 1.988392397752233e-06, "loss": 0.5986, "step": 50 }, { "epoch": 0.7142857142857143, "grad_norm": 2.02851221384901, "learning_rate": 1.961223330122206e-06, "loss": 0.581, "step": 60 }, { "epoch": 0.8333333333333334, "grad_norm": 2.523772811799601, "learning_rate": 1.9187540279759314e-06, "loss": 0.5693, "step": 70 }, { "epoch": 0.9523809523809523, "grad_norm": 1.7965113435726119, "learning_rate": 1.861898114721218e-06, "loss": 0.5579, "step": 80 }, { "epoch": 1.0, "eval_loss": 0.06908344477415085, "eval_runtime": 89.6395, "eval_samples_per_second": 201.953, "eval_steps_per_second": 0.402, "step": 84 }, { "epoch": 1.0714285714285714, "grad_norm": 2.0945916853208053, "learning_rate": 1.7918787065996015e-06, "loss": 0.5453, "step": 90 }, { "epoch": 1.1904761904761905, "grad_norm": 1.9610907060341172, "learning_rate": 1.7102021003248955e-06, "loss": 0.5353, "step": 100 }, { "epoch": 1.3095238095238095, "grad_norm": 1.7318461716843294, "learning_rate": 1.6186253687848507e-06, "loss": 0.5312, "step": 110 }, { "epoch": 1.4285714285714286, "grad_norm": 1.5023311888601967, "learning_rate": 1.5191185619053519e-06, "loss": 0.5261, "step": 120 }, { "epoch": 1.5476190476190477, "grad_norm": 2.103909947624852, "learning_rate": 1.4138223258333096e-06, "loss": 0.5196, "step": 130 }, { "epoch": 1.6666666666666665, "grad_norm": 1.4806727889130753, "learning_rate": 1.3050018521581279e-06, "loss": 0.5163, "step": 140 }, { "epoch": 1.7857142857142856, "grad_norm": 1.7118523951460423, "learning_rate": 1.1949981478418721e-06, "loss": 0.5127, "step": 150 }, { "epoch": 1.9047619047619047, "grad_norm": 2.1882237413509573, "learning_rate": 1.0861776741666901e-06, "loss": 0.508, "step": 160 }, { "epoch": 2.0, "eval_loss": 0.06450411677360535, "eval_runtime": 89.5353, "eval_samples_per_second": 202.188, "eval_steps_per_second": 0.402, "step": 168 }, { "epoch": 2.0238095238095237, "grad_norm": 1.7432600091420465, "learning_rate": 9.80881438094648e-07, "loss": 0.5039, "step": 170 }, { "epoch": 2.142857142857143, "grad_norm": 1.3437595427602773, "learning_rate": 8.813746312151494e-07, "loss": 0.4936, "step": 180 }, { "epoch": 2.261904761904762, "grad_norm": 1.3903164766916085, "learning_rate": 7.897978996751046e-07, "loss": 0.4913, "step": 190 }, { "epoch": 2.380952380952381, "grad_norm": 1.4779105389820446, "learning_rate": 7.081212934003984e-07, "loss": 0.487, "step": 200 }, { "epoch": 2.5, "grad_norm": 0.8301326396528811, "learning_rate": 6.381018852787821e-07, "loss": 0.4852, "step": 210 }, { "epoch": 2.619047619047619, "grad_norm": 0.710602110495643, "learning_rate": 5.812459720240681e-07, "loss": 0.4866, "step": 220 }, { "epoch": 2.738095238095238, "grad_norm": 0.6168443563524292, "learning_rate": 5.387766698777935e-07, "loss": 0.4847, "step": 230 }, { "epoch": 2.857142857142857, "grad_norm": 0.5597413961712824, "learning_rate": 5.116076022477671e-07, "loss": 0.483, "step": 240 }, { "epoch": 2.9761904761904763, "grad_norm": 0.5023687044086946, "learning_rate": 5.003232453297512e-07, "loss": 0.48, "step": 250 }, { "epoch": 3.0, "eval_loss": 0.06279128044843674, "eval_runtime": 88.7475, "eval_samples_per_second": 203.983, "eval_steps_per_second": 0.406, "step": 252 }, { "epoch": 3.0, "step": 252, "total_flos": 3375200049561600.0, "train_loss": 0.5529132849640317, "train_runtime": 15188.2312, "train_samples_per_second": 67.938, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 252, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3375200049561600.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }