{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.2650056625141564, "eval_steps": 20, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09060022650056625, "grad_norm": 4.824601173400879, "learning_rate": 1.9393939393939395e-05, "loss": 6.3414, "step": 20 }, { "epoch": 0.09060022650056625, "eval_loss": 5.462944984436035, "eval_runtime": 169.7392, "eval_samples_per_second": 2.327, "eval_steps_per_second": 0.583, "step": 20 }, { "epoch": 0.1812004530011325, "grad_norm": 3.5273354053497314, "learning_rate": 1.8585858585858588e-05, "loss": 4.9204, "step": 40 }, { "epoch": 0.1812004530011325, "eval_loss": 4.40663480758667, "eval_runtime": 169.7659, "eval_samples_per_second": 2.327, "eval_steps_per_second": 0.583, "step": 40 }, { "epoch": 0.2718006795016987, "grad_norm": 2.5483546257019043, "learning_rate": 1.7777777777777777e-05, "loss": 4.0241, "step": 60 }, { "epoch": 0.2718006795016987, "eval_loss": 3.6545450687408447, "eval_runtime": 169.7082, "eval_samples_per_second": 2.328, "eval_steps_per_second": 0.583, "step": 60 }, { "epoch": 0.362400906002265, "grad_norm": 1.4762500524520874, "learning_rate": 1.6969696969696972e-05, "loss": 3.4114, "step": 80 }, { "epoch": 0.362400906002265, "eval_loss": 3.1997323036193848, "eval_runtime": 169.459, "eval_samples_per_second": 2.331, "eval_steps_per_second": 0.584, "step": 80 }, { "epoch": 0.45300113250283125, "grad_norm": 1.2465909719467163, "learning_rate": 1.616161616161616e-05, "loss": 3.0527, "step": 100 }, { "epoch": 0.45300113250283125, "eval_loss": 2.9332973957061768, "eval_runtime": 169.4702, "eval_samples_per_second": 2.331, "eval_steps_per_second": 0.584, "step": 100 }, { "epoch": 0.5436013590033975, "grad_norm": 1.2837079763412476, "learning_rate": 1.5353535353535354e-05, "loss": 2.8401, "step": 120 }, { "epoch": 0.5436013590033975, "eval_loss": 2.765261173248291, "eval_runtime": 169.5654, "eval_samples_per_second": 2.329, "eval_steps_per_second": 0.584, "step": 120 }, { "epoch": 0.6342015855039638, "grad_norm": 1.069353699684143, "learning_rate": 1.4545454545454546e-05, "loss": 2.7202, "step": 140 }, { "epoch": 0.6342015855039638, "eval_loss": 2.654095411300659, "eval_runtime": 169.5632, "eval_samples_per_second": 2.33, "eval_steps_per_second": 0.584, "step": 140 }, { "epoch": 0.72480181200453, "grad_norm": 1.0665814876556396, "learning_rate": 1.3737373737373739e-05, "loss": 2.605, "step": 160 }, { "epoch": 0.72480181200453, "eval_loss": 2.576014995574951, "eval_runtime": 169.9075, "eval_samples_per_second": 2.325, "eval_steps_per_second": 0.583, "step": 160 }, { "epoch": 0.8154020385050963, "grad_norm": 1.076709508895874, "learning_rate": 1.2929292929292931e-05, "loss": 2.5533, "step": 180 }, { "epoch": 0.8154020385050963, "eval_loss": 2.519667148590088, "eval_runtime": 169.6071, "eval_samples_per_second": 2.329, "eval_steps_per_second": 0.584, "step": 180 }, { "epoch": 0.9060022650056625, "grad_norm": 1.0686030387878418, "learning_rate": 1.2121212121212122e-05, "loss": 2.5004, "step": 200 }, { "epoch": 0.9060022650056625, "eval_loss": 2.4773340225219727, "eval_runtime": 169.6567, "eval_samples_per_second": 2.328, "eval_steps_per_second": 0.584, "step": 200 }, { "epoch": 0.9966024915062288, "grad_norm": 1.1253015995025635, "learning_rate": 1.1313131313131314e-05, "loss": 2.4613, "step": 220 }, { "epoch": 0.9966024915062288, "eval_loss": 2.444694995880127, "eval_runtime": 169.7657, "eval_samples_per_second": 2.327, "eval_steps_per_second": 0.583, "step": 220 }, { "epoch": 1.087202718006795, "grad_norm": 1.1171083450317383, "learning_rate": 1.0505050505050507e-05, "loss": 2.4456, "step": 240 }, { "epoch": 1.087202718006795, "eval_loss": 2.4184916019439697, "eval_runtime": 169.7027, "eval_samples_per_second": 2.328, "eval_steps_per_second": 0.583, "step": 240 }, { "epoch": 1.1778029445073612, "grad_norm": 1.1789259910583496, "learning_rate": 9.696969696969698e-06, "loss": 2.4151, "step": 260 }, { "epoch": 1.1778029445073612, "eval_loss": 2.397007465362549, "eval_runtime": 169.6356, "eval_samples_per_second": 2.329, "eval_steps_per_second": 0.584, "step": 260 }, { "epoch": 1.2684031710079275, "grad_norm": 1.1507657766342163, "learning_rate": 8.888888888888888e-06, "loss": 2.3943, "step": 280 }, { "epoch": 1.2684031710079275, "eval_loss": 2.3794679641723633, "eval_runtime": 169.6152, "eval_samples_per_second": 2.329, "eval_steps_per_second": 0.584, "step": 280 }, { "epoch": 1.3590033975084936, "grad_norm": 1.1052231788635254, "learning_rate": 8.08080808080808e-06, "loss": 2.3621, "step": 300 }, { "epoch": 1.3590033975084936, "eval_loss": 2.3650312423706055, "eval_runtime": 169.6247, "eval_samples_per_second": 2.329, "eval_steps_per_second": 0.584, "step": 300 }, { "epoch": 1.44960362400906, "grad_norm": 1.156396508216858, "learning_rate": 7.272727272727273e-06, "loss": 2.3475, "step": 320 }, { "epoch": 1.44960362400906, "eval_loss": 2.3532702922821045, "eval_runtime": 169.6216, "eval_samples_per_second": 2.329, "eval_steps_per_second": 0.584, "step": 320 }, { "epoch": 1.5402038505096263, "grad_norm": 1.1324844360351562, "learning_rate": 6.464646464646466e-06, "loss": 2.339, "step": 340 }, { "epoch": 1.5402038505096263, "eval_loss": 2.342968702316284, "eval_runtime": 169.6943, "eval_samples_per_second": 2.328, "eval_steps_per_second": 0.583, "step": 340 }, { "epoch": 1.6308040770101924, "grad_norm": 1.2476097345352173, "learning_rate": 5.656565656565657e-06, "loss": 2.3247, "step": 360 }, { "epoch": 1.6308040770101924, "eval_loss": 2.334242343902588, "eval_runtime": 169.8944, "eval_samples_per_second": 2.325, "eval_steps_per_second": 0.583, "step": 360 }, { "epoch": 1.721404303510759, "grad_norm": 1.1416445970535278, "learning_rate": 4.848484848484849e-06, "loss": 2.3335, "step": 380 }, { "epoch": 1.721404303510759, "eval_loss": 2.326986789703369, "eval_runtime": 169.8042, "eval_samples_per_second": 2.326, "eval_steps_per_second": 0.583, "step": 380 }, { "epoch": 1.812004530011325, "grad_norm": 1.1464892625808716, "learning_rate": 4.04040404040404e-06, "loss": 2.3007, "step": 400 }, { "epoch": 1.812004530011325, "eval_loss": 2.321030378341675, "eval_runtime": 169.8884, "eval_samples_per_second": 2.325, "eval_steps_per_second": 0.583, "step": 400 }, { "epoch": 1.9026047565118913, "grad_norm": 1.1699483394622803, "learning_rate": 3.232323232323233e-06, "loss": 2.3095, "step": 420 }, { "epoch": 1.9026047565118913, "eval_loss": 2.3161513805389404, "eval_runtime": 169.7467, "eval_samples_per_second": 2.327, "eval_steps_per_second": 0.583, "step": 420 }, { "epoch": 1.9932049830124576, "grad_norm": 1.1978620290756226, "learning_rate": 2.4242424242424244e-06, "loss": 2.3093, "step": 440 }, { "epoch": 1.9932049830124576, "eval_loss": 2.312627077102661, "eval_runtime": 170.0514, "eval_samples_per_second": 2.323, "eval_steps_per_second": 0.582, "step": 440 }, { "epoch": 2.0838052095130237, "grad_norm": 1.23793363571167, "learning_rate": 1.6161616161616164e-06, "loss": 2.327, "step": 460 }, { "epoch": 2.0838052095130237, "eval_loss": 2.309922933578491, "eval_runtime": 169.6935, "eval_samples_per_second": 2.328, "eval_steps_per_second": 0.583, "step": 460 }, { "epoch": 2.17440543601359, "grad_norm": 1.2800756692886353, "learning_rate": 8.080808080808082e-07, "loss": 2.3005, "step": 480 }, { "epoch": 2.17440543601359, "eval_loss": 2.3085172176361084, "eval_runtime": 169.7305, "eval_samples_per_second": 2.327, "eval_steps_per_second": 0.583, "step": 480 }, { "epoch": 2.2650056625141564, "grad_norm": 1.1457535028457642, "learning_rate": 0.0, "loss": 2.2871, "step": 500 }, { "epoch": 2.2650056625141564, "eval_loss": 2.3081250190734863, "eval_runtime": 169.7694, "eval_samples_per_second": 2.327, "eval_steps_per_second": 0.583, "step": 500 } ], "logging_steps": 20, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.75343643596161e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }