{ "best_metric": null, "best_model_checkpoint": null, "epoch": 22.727272727272727, "eval_steps": 500, "global_step": 125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.18, "grad_norm": 159.0, "learning_rate": 1.5384615384615387e-05, "loss": 24.9691, "step": 1 }, { "epoch": 0.91, "grad_norm": 70.5, "learning_rate": 7.692307692307693e-05, "loss": 23.7344, "step": 5 }, { "epoch": 0.91, "eval_loss": 7.958380222320557, "eval_runtime": 0.5623, "eval_samples_per_second": 3.557, "eval_steps_per_second": 1.778, "step": 5 }, { "epoch": 1.82, "grad_norm": 10.75, "learning_rate": 0.00015384615384615385, "loss": 14.6026, "step": 10 }, { "epoch": 2.0, "eval_loss": 6.82885217666626, "eval_runtime": 0.5598, "eval_samples_per_second": 3.573, "eval_steps_per_second": 1.786, "step": 11 }, { "epoch": 2.73, "grad_norm": 2.90625, "learning_rate": 0.00019984268150178167, "loss": 10.8118, "step": 15 }, { "epoch": 2.91, "eval_loss": 6.418484687805176, "eval_runtime": 0.5774, "eval_samples_per_second": 3.464, "eval_steps_per_second": 1.732, "step": 16 }, { "epoch": 3.64, "grad_norm": 4.8125, "learning_rate": 0.00019807852804032305, "loss": 10.8598, "step": 20 }, { "epoch": 4.0, "eval_loss": 5.106105327606201, "eval_runtime": 0.5656, "eval_samples_per_second": 3.536, "eval_steps_per_second": 1.768, "step": 22 }, { "epoch": 4.55, "grad_norm": 16.625, "learning_rate": 0.00019438833303083678, "loss": 7.9354, "step": 25 }, { "epoch": 4.91, "eval_loss": 1.7010552883148193, "eval_runtime": 0.5782, "eval_samples_per_second": 3.459, "eval_steps_per_second": 1.73, "step": 27 }, { "epoch": 5.45, "grad_norm": 2.21875, "learning_rate": 0.00018884456359788724, "loss": 2.0354, "step": 30 }, { "epoch": 6.0, "eval_loss": 1.4460557699203491, "eval_runtime": 0.5682, "eval_samples_per_second": 3.52, "eval_steps_per_second": 1.76, "step": 33 }, { "epoch": 6.36, "grad_norm": 0.9140625, "learning_rate": 0.00018155608689592604, "loss": 1.4855, "step": 35 }, { "epoch": 6.91, "eval_loss": 1.3564677238464355, "eval_runtime": 0.5812, "eval_samples_per_second": 3.441, "eval_steps_per_second": 1.721, "step": 38 }, { "epoch": 7.27, "grad_norm": 1.203125, "learning_rate": 0.0001726660322034027, "loss": 1.326, "step": 40 }, { "epoch": 8.0, "eval_loss": 1.2935034036636353, "eval_runtime": 0.5674, "eval_samples_per_second": 3.525, "eval_steps_per_second": 1.762, "step": 44 }, { "epoch": 8.18, "grad_norm": 0.9296875, "learning_rate": 0.00016234898018587337, "loss": 1.1375, "step": 45 }, { "epoch": 8.91, "eval_loss": 1.269553542137146, "eval_runtime": 0.5872, "eval_samples_per_second": 3.406, "eval_steps_per_second": 1.703, "step": 49 }, { "epoch": 9.09, "grad_norm": 1.0625, "learning_rate": 0.00015080753452465296, "loss": 1.0376, "step": 50 }, { "epoch": 10.0, "grad_norm": 1.5390625, "learning_rate": 0.000138268343236509, "loss": 0.9091, "step": 55 }, { "epoch": 10.0, "eval_loss": 1.2716257572174072, "eval_runtime": 0.5653, "eval_samples_per_second": 3.538, "eval_steps_per_second": 1.769, "step": 55 }, { "epoch": 10.91, "grad_norm": 0.59375, "learning_rate": 0.0001249776478167227, "loss": 0.8111, "step": 60 }, { "epoch": 10.91, "eval_loss": 1.2860848903656006, "eval_runtime": 0.5837, "eval_samples_per_second": 3.427, "eval_steps_per_second": 1.713, "step": 60 }, { "epoch": 11.82, "grad_norm": 0.69140625, "learning_rate": 0.00011119644761033078, "loss": 0.689, "step": 65 }, { "epoch": 12.0, "eval_loss": 1.3148236274719238, "eval_runtime": 0.5682, "eval_samples_per_second": 3.52, "eval_steps_per_second": 1.76, "step": 66 }, { "epoch": 12.73, "grad_norm": 0.59375, "learning_rate": 9.719537437241312e-05, "loss": 0.6341, "step": 70 }, { "epoch": 12.91, "eval_loss": 1.3391039371490479, "eval_runtime": 0.7408, "eval_samples_per_second": 2.7, "eval_steps_per_second": 1.35, "step": 71 }, { "epoch": 13.64, "grad_norm": 0.8984375, "learning_rate": 8.324937766952638e-05, "loss": 0.5359, "step": 75 }, { "epoch": 14.0, "eval_loss": 1.4231812953948975, "eval_runtime": 0.5681, "eval_samples_per_second": 3.521, "eval_steps_per_second": 1.76, "step": 77 }, { "epoch": 14.55, "grad_norm": 1.0078125, "learning_rate": 6.963232548903853e-05, "loss": 0.4664, "step": 80 }, { "epoch": 14.91, "eval_loss": 1.510708212852478, "eval_runtime": 0.6308, "eval_samples_per_second": 3.171, "eval_steps_per_second": 1.585, "step": 82 }, { "epoch": 15.45, "grad_norm": 0.68359375, "learning_rate": 5.6611626088244194e-05, "loss": 0.3951, "step": 85 }, { "epoch": 16.0, "eval_loss": 1.6597082614898682, "eval_runtime": 0.5686, "eval_samples_per_second": 3.517, "eval_steps_per_second": 1.759, "step": 88 }, { "epoch": 16.36, "grad_norm": 0.67578125, "learning_rate": 4.444297669803981e-05, "loss": 0.3593, "step": 90 }, { "epoch": 16.91, "eval_loss": 1.9376537799835205, "eval_runtime": 0.5859, "eval_samples_per_second": 3.413, "eval_steps_per_second": 1.707, "step": 93 }, { "epoch": 17.27, "grad_norm": 0.53125, "learning_rate": 3.336534220479961e-05, "loss": 0.2802, "step": 95 }, { "epoch": 18.0, "eval_loss": 1.9024397134780884, "eval_runtime": 0.5686, "eval_samples_per_second": 3.518, "eval_steps_per_second": 1.759, "step": 99 }, { "epoch": 18.18, "grad_norm": 0.466796875, "learning_rate": 2.3596262417839255e-05, "loss": 0.2613, "step": 100 }, { "epoch": 18.91, "eval_loss": 2.098067283630371, "eval_runtime": 0.5848, "eval_samples_per_second": 3.42, "eval_steps_per_second": 1.71, "step": 104 }, { "epoch": 19.09, "grad_norm": 0.63671875, "learning_rate": 1.5327580077171587e-05, "loss": 0.2442, "step": 105 }, { "epoch": 20.0, "grad_norm": 0.51171875, "learning_rate": 8.72167349386811e-06, "loss": 0.2262, "step": 110 }, { "epoch": 20.0, "eval_loss": 2.1472132205963135, "eval_runtime": 0.5636, "eval_samples_per_second": 3.548, "eval_steps_per_second": 1.774, "step": 110 }, { "epoch": 20.91, "grad_norm": 0.34375, "learning_rate": 3.908267805490051e-06, "loss": 0.2169, "step": 115 }, { "epoch": 20.91, "eval_loss": 2.1632509231567383, "eval_runtime": 0.5774, "eval_samples_per_second": 3.464, "eval_steps_per_second": 1.732, "step": 115 }, { "epoch": 21.82, "grad_norm": 0.333984375, "learning_rate": 9.818874663554357e-07, "loss": 0.2232, "step": 120 }, { "epoch": 22.0, "eval_loss": 2.159508466720581, "eval_runtime": 0.5656, "eval_samples_per_second": 3.536, "eval_steps_per_second": 1.768, "step": 121 }, { "epoch": 22.73, "grad_norm": 0.46875, "learning_rate": 0.0, "loss": 0.2096, "step": 125 }, { "epoch": 22.73, "eval_loss": 2.161546468734741, "eval_runtime": 0.563, "eval_samples_per_second": 3.553, "eval_steps_per_second": 1.776, "step": 125 }, { "epoch": 22.73, "step": 125, "total_flos": 1.929524923995259e+17, "train_loss": 3.26698664855957, "train_runtime": 470.6237, "train_samples_per_second": 4.675, "train_steps_per_second": 0.266 } ], "logging_steps": 5, "max_steps": 125, "num_input_tokens_seen": 0, "num_train_epochs": 25, "save_steps": 100, "total_flos": 1.929524923995259e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }