{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999360204734485, "eval_steps": 500, "global_step": 781, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03198976327575176, "grad_norm": 0.16216526925563812, "learning_rate": 6.329113924050633e-05, "loss": 0.9047, "step": 25 }, { "epoch": 0.06397952655150352, "grad_norm": 0.22862769663333893, "learning_rate": 0.00012658227848101267, "loss": 0.7593, "step": 50 }, { "epoch": 0.09596928982725528, "grad_norm": 0.2344927191734314, "learning_rate": 0.00018987341772151899, "loss": 0.6731, "step": 75 }, { "epoch": 0.12795905310300704, "grad_norm": 0.2598811686038971, "learning_rate": 0.00019401709401709402, "loss": 0.6604, "step": 100 }, { "epoch": 0.1599488163787588, "grad_norm": 0.2400255650281906, "learning_rate": 0.0001868945868945869, "loss": 0.6304, "step": 125 }, { "epoch": 0.19193857965451055, "grad_norm": 0.24225808680057526, "learning_rate": 0.00017977207977207978, "loss": 0.6428, "step": 150 }, { "epoch": 0.22392834293026231, "grad_norm": 0.22859220206737518, "learning_rate": 0.00017264957264957268, "loss": 0.6154, "step": 175 }, { "epoch": 0.2559181062060141, "grad_norm": 0.23886021971702576, "learning_rate": 0.00016552706552706555, "loss": 0.6001, "step": 200 }, { "epoch": 0.28790786948176583, "grad_norm": 0.22877049446105957, "learning_rate": 0.00015840455840455842, "loss": 0.5983, "step": 225 }, { "epoch": 0.3198976327575176, "grad_norm": 0.24990104138851166, "learning_rate": 0.00015128205128205128, "loss": 0.5789, "step": 250 }, { "epoch": 0.35188739603326935, "grad_norm": 0.2319009006023407, "learning_rate": 0.00014415954415954415, "loss": 0.5851, "step": 275 }, { "epoch": 0.3838771593090211, "grad_norm": 0.22513625025749207, "learning_rate": 0.00013703703703703705, "loss": 0.5974, "step": 300 }, { "epoch": 0.41586692258477287, "grad_norm": 0.2516462504863739, "learning_rate": 0.00012991452991452992, "loss": 0.5811, "step": 325 }, { "epoch": 0.44785668586052463, "grad_norm": 0.23952844738960266, "learning_rate": 0.00012279202279202279, "loss": 0.5885, "step": 350 }, { "epoch": 0.4798464491362764, "grad_norm": 0.26743239164352417, "learning_rate": 0.00011566951566951567, "loss": 0.5812, "step": 375 }, { "epoch": 0.5118362124120281, "grad_norm": 0.24359311163425446, "learning_rate": 0.00010854700854700855, "loss": 0.5853, "step": 400 }, { "epoch": 0.5438259756877799, "grad_norm": 0.26046106219291687, "learning_rate": 0.00010142450142450144, "loss": 0.5717, "step": 425 }, { "epoch": 0.5758157389635317, "grad_norm": 0.2767123878002167, "learning_rate": 9.430199430199431e-05, "loss": 0.5711, "step": 450 }, { "epoch": 0.6078055022392834, "grad_norm": 0.2743181884288788, "learning_rate": 8.717948717948718e-05, "loss": 0.5786, "step": 475 }, { "epoch": 0.6397952655150352, "grad_norm": 0.2655166983604431, "learning_rate": 8.005698005698006e-05, "loss": 0.563, "step": 500 }, { "epoch": 0.6717850287907869, "grad_norm": 0.2630331814289093, "learning_rate": 7.293447293447295e-05, "loss": 0.5688, "step": 525 }, { "epoch": 0.7037747920665387, "grad_norm": 0.27685314416885376, "learning_rate": 6.581196581196581e-05, "loss": 0.5687, "step": 550 }, { "epoch": 0.7357645553422905, "grad_norm": 0.2695849537849426, "learning_rate": 5.868945868945869e-05, "loss": 0.5592, "step": 575 }, { "epoch": 0.7677543186180422, "grad_norm": 0.25096848607063293, "learning_rate": 5.156695156695157e-05, "loss": 0.5477, "step": 600 }, { "epoch": 0.799744081893794, "grad_norm": 
0.2821820378303528, "learning_rate": 4.4444444444444447e-05, "loss": 0.5574, "step": 625 }, { "epoch": 0.8317338451695457, "grad_norm": 0.26849839091300964, "learning_rate": 3.732193732193732e-05, "loss": 0.5662, "step": 650 }, { "epoch": 0.8637236084452975, "grad_norm": 0.27688708901405334, "learning_rate": 3.01994301994302e-05, "loss": 0.5485, "step": 675 }, { "epoch": 0.8957133717210493, "grad_norm": 0.2868192195892334, "learning_rate": 2.307692307692308e-05, "loss": 0.5493, "step": 700 }, { "epoch": 0.927703134996801, "grad_norm": 0.28862541913986206, "learning_rate": 1.5954415954415954e-05, "loss": 0.5494, "step": 725 }, { "epoch": 0.9596928982725528, "grad_norm": 0.2842100262641907, "learning_rate": 8.831908831908831e-06, "loss": 0.5582, "step": 750 }, { "epoch": 0.9916826615483045, "grad_norm": 0.2615242302417755, "learning_rate": 1.7094017094017097e-06, "loss": 0.5641, "step": 775 } ], "logging_steps": 25, "max_steps": 781, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 2.5396281704290714e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }