{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 15375, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0975609756097561, "grad_norm": 19.442411422729492, "learning_rate": 4.8373983739837406e-05, "loss": 6.7559, "step": 500 }, { "epoch": 0.1951219512195122, "grad_norm": 22.672739028930664, "learning_rate": 4.6747967479674795e-05, "loss": 6.6932, "step": 1000 }, { "epoch": 0.2926829268292683, "grad_norm": 21.795516967773438, "learning_rate": 4.51219512195122e-05, "loss": 6.6652, "step": 1500 }, { "epoch": 0.3902439024390244, "grad_norm": 19.84381866455078, "learning_rate": 4.3495934959349595e-05, "loss": 6.6335, "step": 2000 }, { "epoch": 0.4878048780487805, "grad_norm": 14.22912883758545, "learning_rate": 4.186991869918699e-05, "loss": 6.6248, "step": 2500 }, { "epoch": 0.5853658536585366, "grad_norm": 14.391462326049805, "learning_rate": 4.0243902439024395e-05, "loss": 6.5929, "step": 3000 }, { "epoch": 0.6829268292682927, "grad_norm": 19.81720733642578, "learning_rate": 3.861788617886179e-05, "loss": 6.5589, "step": 3500 }, { "epoch": 0.7804878048780488, "grad_norm": 15.33761978149414, "learning_rate": 3.699186991869919e-05, "loss": 6.5327, "step": 4000 }, { "epoch": 0.8780487804878049, "grad_norm": 14.190281867980957, "learning_rate": 3.5365853658536584e-05, "loss": 6.5175, "step": 4500 }, { "epoch": 0.975609756097561, "grad_norm": 16.57828712463379, "learning_rate": 3.373983739837399e-05, "loss": 6.5137, "step": 5000 }, { "epoch": 1.0731707317073171, "grad_norm": 16.75761604309082, "learning_rate": 3.2113821138211384e-05, "loss": 6.495, "step": 5500 }, { "epoch": 1.170731707317073, "grad_norm": 18.840726852416992, "learning_rate": 3.048780487804878e-05, "loss": 6.4757, "step": 6000 }, { "epoch": 1.2682926829268293, "grad_norm": 17.630483627319336, "learning_rate": 2.886178861788618e-05, "loss": 6.4633, "step": 6500 }, { "epoch": 1.3658536585365852, "grad_norm": 16.721818923950195, "learning_rate": 2.7235772357723577e-05, "loss": 6.4462, "step": 7000 }, { "epoch": 1.4634146341463414, "grad_norm": 14.650636672973633, "learning_rate": 2.5609756097560977e-05, "loss": 6.4404, "step": 7500 }, { "epoch": 1.5609756097560976, "grad_norm": 13.825970649719238, "learning_rate": 2.3983739837398377e-05, "loss": 6.4326, "step": 8000 }, { "epoch": 1.6585365853658538, "grad_norm": 11.85326862335205, "learning_rate": 2.2357723577235773e-05, "loss": 6.4239, "step": 8500 }, { "epoch": 1.7560975609756098, "grad_norm": 13.92196273803711, "learning_rate": 2.073170731707317e-05, "loss": 6.4098, "step": 9000 }, { "epoch": 1.8536585365853657, "grad_norm": 12.077308654785156, "learning_rate": 1.9105691056910573e-05, "loss": 6.3987, "step": 9500 }, { "epoch": 1.951219512195122, "grad_norm": 12.406614303588867, "learning_rate": 1.747967479674797e-05, "loss": 6.3957, "step": 10000 }, { "epoch": 2.048780487804878, "grad_norm": 14.001736640930176, "learning_rate": 1.5853658536585366e-05, "loss": 6.3752, "step": 10500 }, { "epoch": 2.1463414634146343, "grad_norm": 12.691810607910156, "learning_rate": 1.4227642276422764e-05, "loss": 6.3566, "step": 11000 }, { "epoch": 2.2439024390243905, "grad_norm": 10.062420845031738, "learning_rate": 1.2601626016260162e-05, "loss": 6.3492, "step": 11500 }, { "epoch": 2.341463414634146, "grad_norm": 11.78906536102295, "learning_rate": 1.0975609756097562e-05, "loss": 6.3447, "step": 12000 }, { "epoch": 2.4390243902439024, "grad_norm": 13.368131637573242, "learning_rate": 9.34959349593496e-06, "loss": 6.339, "step": 12500 }, { "epoch": 2.5365853658536586, "grad_norm": 12.125652313232422, "learning_rate": 7.723577235772358e-06, "loss": 6.3305, "step": 13000 }, { "epoch": 2.6341463414634148, "grad_norm": 13.748695373535156, "learning_rate": 6.0975609756097564e-06, "loss": 6.3205, "step": 13500 }, { "epoch": 2.7317073170731705, "grad_norm": 13.787367820739746, "learning_rate": 4.471544715447155e-06, "loss": 6.3196, "step": 14000 }, { "epoch": 2.8292682926829267, "grad_norm": 15.013029098510742, "learning_rate": 2.8455284552845528e-06, "loss": 6.3116, "step": 14500 }, { "epoch": 2.926829268292683, "grad_norm": 15.244904518127441, "learning_rate": 1.2195121951219514e-06, "loss": 6.3107, "step": 15000 }, { "epoch": 3.0, "step": 15375, "total_flos": 5764753863475200.0, "train_loss": 6.457221655868903, "train_runtime": 1165.3935, "train_samples_per_second": 105.536, "train_steps_per_second": 13.193 } ], "logging_steps": 500, "max_steps": 15375, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5764753863475200.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }