{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.06798339773865751, "eval_steps": 25, "global_step": 475, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.6931123733520508, "learning_rate": 0.0001988606301829365, "loss": 3.3633, "step": 25 }, { "epoch": 0.0, "eval_loss": 2.791308879852295, "eval_runtime": 1766.1893, "eval_samples_per_second": 4.028, "eval_steps_per_second": 0.504, "step": 25 }, { "epoch": 0.01, "grad_norm": 0.8134792447090149, "learning_rate": 0.00019527921915838827, "loss": 2.8231, "step": 50 }, { "epoch": 0.01, "eval_loss": 2.6491129398345947, "eval_runtime": 1766.0434, "eval_samples_per_second": 4.028, "eval_steps_per_second": 0.504, "step": 50 }, { "epoch": 0.01, "grad_norm": 0.7844312191009521, "learning_rate": 0.00018934232439329787, "loss": 2.7094, "step": 75 }, { "epoch": 0.01, "eval_loss": 2.595423698425293, "eval_runtime": 1770.4727, "eval_samples_per_second": 4.018, "eval_steps_per_second": 0.503, "step": 75 }, { "epoch": 0.01, "grad_norm": 0.7818688154220581, "learning_rate": 0.00018119671723205706, "loss": 2.6759, "step": 100 }, { "epoch": 0.01, "eval_loss": 2.5623528957366943, "eval_runtime": 1764.724, "eval_samples_per_second": 4.031, "eval_steps_per_second": 0.504, "step": 100 }, { "epoch": 0.02, "grad_norm": 0.9374421238899231, "learning_rate": 0.00017104377259560488, "loss": 2.6361, "step": 125 }, { "epoch": 0.02, "eval_loss": 2.5327322483062744, "eval_runtime": 1766.346, "eval_samples_per_second": 4.028, "eval_steps_per_second": 0.504, "step": 125 }, { "epoch": 0.02, "grad_norm": 0.8157618641853333, "learning_rate": 0.00015913449060989774, "loss": 2.6192, "step": 150 }, { "epoch": 0.02, "eval_loss": 2.5096042156219482, "eval_runtime": 1765.5704, "eval_samples_per_second": 4.029, "eval_steps_per_second": 0.504, "step": 150 }, { "epoch": 0.03, "grad_norm": 0.8904098272323608, "learning_rate": 0.00014576329140480925, "loss": 2.5547, "step": 175 }, { "epoch": 0.03, "eval_loss": 2.508963108062744, "eval_runtime": 1767.6823, "eval_samples_per_second": 4.024, "eval_steps_per_second": 0.503, "step": 175 }, { "epoch": 0.03, "grad_norm": 0.8415258526802063, "learning_rate": 0.00013126073648784688, "loss": 2.5803, "step": 200 }, { "epoch": 0.03, "eval_loss": 2.484498977661133, "eval_runtime": 1764.6855, "eval_samples_per_second": 4.031, "eval_steps_per_second": 0.504, "step": 200 }, { "epoch": 0.03, "grad_norm": 0.8273302912712097, "learning_rate": 0.00011598535663418884, "loss": 2.5275, "step": 225 }, { "epoch": 0.03, "eval_loss": 2.4777684211730957, "eval_runtime": 1764.516, "eval_samples_per_second": 4.032, "eval_steps_per_second": 0.504, "step": 225 }, { "epoch": 0.04, "grad_norm": 0.8461028933525085, "learning_rate": 0.000100314788323161, "loss": 2.5504, "step": 250 }, { "epoch": 0.04, "eval_loss": 2.4680240154266357, "eval_runtime": 1764.412, "eval_samples_per_second": 4.032, "eval_steps_per_second": 0.504, "step": 250 }, { "epoch": 0.04, "grad_norm": 0.9055793881416321, "learning_rate": 8.46364378453242e-05, "loss": 2.5447, "step": 275 }, { "epoch": 0.04, "eval_loss": 2.4592506885528564, "eval_runtime": 1763.6878, "eval_samples_per_second": 4.034, "eval_steps_per_second": 0.505, "step": 275 }, { "epoch": 0.04, "grad_norm": 1.0102213621139526, "learning_rate": 6.933790388122256e-05, "loss": 2.5308, "step": 300 }, { "epoch": 0.04, "eval_loss": 2.457402467727661, "eval_runtime": 1763.3112, "eval_samples_per_second": 4.034, "eval_steps_per_second": 0.505, "step": 300 }, { "epoch": 0.05, "grad_norm": 0.9688016176223755, "learning_rate": 5.479739532388526e-05, "loss": 2.5243, "step": 325 }, { "epoch": 0.05, "eval_loss": 2.4439914226531982, "eval_runtime": 1763.1627, "eval_samples_per_second": 4.035, "eval_steps_per_second": 0.505, "step": 325 }, { "epoch": 0.05, "grad_norm": 0.9817106127738953, "learning_rate": 4.137438123475662e-05, "loss": 2.4967, "step": 350 }, { "epoch": 0.05, "eval_loss": 2.4434900283813477, "eval_runtime": 1762.4256, "eval_samples_per_second": 4.036, "eval_steps_per_second": 0.505, "step": 350 }, { "epoch": 0.05, "grad_norm": 0.9285837411880493, "learning_rate": 2.9400704083950335e-05, "loss": 2.4979, "step": 375 }, { "epoch": 0.05, "eval_loss": 2.4380979537963867, "eval_runtime": 1761.6148, "eval_samples_per_second": 4.038, "eval_steps_per_second": 0.505, "step": 375 }, { "epoch": 0.06, "grad_norm": 0.8453341722488403, "learning_rate": 1.917237597245065e-05, "loss": 2.5183, "step": 400 }, { "epoch": 0.06, "eval_loss": 2.437168598175049, "eval_runtime": 1763.4013, "eval_samples_per_second": 4.034, "eval_steps_per_second": 0.505, "step": 400 }, { "epoch": 0.06, "grad_norm": 1.091811180114746, "learning_rate": 1.0942260649272229e-05, "loss": 2.4867, "step": 425 }, { "epoch": 0.06, "eval_loss": 2.4306600093841553, "eval_runtime": 1759.0574, "eval_samples_per_second": 4.044, "eval_steps_per_second": 0.506, "step": 425 }, { "epoch": 0.06, "grad_norm": 0.9597497582435608, "learning_rate": 4.91382223804836e-06, "loss": 2.474, "step": 450 }, { "epoch": 0.06, "eval_loss": 2.4304585456848145, "eval_runtime": 1763.2619, "eval_samples_per_second": 4.035, "eval_steps_per_second": 0.505, "step": 450 }, { "epoch": 0.07, "grad_norm": 0.9459497928619385, "learning_rate": 1.2360952164325311e-06, "loss": 2.4979, "step": 475 }, { "epoch": 0.07, "eval_loss": 2.4304075241088867, "eval_runtime": 1762.5181, "eval_samples_per_second": 4.036, "eval_steps_per_second": 0.505, "step": 475 } ], "logging_steps": 25, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "total_flos": 8045660258064000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }