{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 7, "global_step": 132, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.030303030303030304, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 2.2128, "step": 1 }, { "epoch": 0.21212121212121213, "grad_norm": 2.591930533784769, "learning_rate": 3.0000000000000004e-07, "loss": 2.2112, "step": 7 }, { "epoch": 0.21212121212121213, "eval_loss": 2.200307846069336, "eval_runtime": 142.2614, "eval_samples_per_second": 3.269, "eval_steps_per_second": 0.028, "step": 7 }, { "epoch": 0.42424242424242425, "grad_norm": 3.209520918668776, "learning_rate": 2.1e-06, "loss": 2.1943, "step": 14 }, { "epoch": 0.42424242424242425, "eval_loss": 2.147017478942871, "eval_runtime": 143.3053, "eval_samples_per_second": 3.245, "eval_steps_per_second": 0.028, "step": 14 }, { "epoch": 0.6363636363636364, "grad_norm": 2.1132781771027593, "learning_rate": 2.5095609265912853e-06, "loss": 2.1573, "step": 21 }, { "epoch": 0.6363636363636364, "eval_loss": 2.0779454708099365, "eval_runtime": 143.4582, "eval_samples_per_second": 3.241, "eval_steps_per_second": 0.028, "step": 21 }, { "epoch": 0.8484848484848485, "grad_norm": 1.9056335733865675, "learning_rate": 1.3197749551783641e-06, "loss": 2.0595, "step": 28 }, { "epoch": 0.8484848484848485, "eval_loss": 2.0294580459594727, "eval_runtime": 137.3982, "eval_samples_per_second": 3.384, "eval_steps_per_second": 0.029, "step": 28 }, { "epoch": 1.0606060606060606, "grad_norm": 2.1736242058284025, "learning_rate": 6.783887430182062e-07, "loss": 1.9961, "step": 35 }, { "epoch": 1.0606060606060606, "eval_loss": 2.0057485103607178, "eval_runtime": 138.1845, "eval_samples_per_second": 3.365, "eval_steps_per_second": 0.029, "step": 35 }, { "epoch": 1.2727272727272727, "grad_norm": 3.1170870533174737, "learning_rate": 4.1931673730025623e-07, "loss": 1.9332, "step": 42 }, { "epoch": 1.2727272727272727, "eval_loss": 1.9999465942382812, "eval_runtime": 139.0169, "eval_samples_per_second": 3.345, "eval_steps_per_second": 0.029, "step": 42 }, { "epoch": 1.4848484848484849, "grad_norm": 2.023380630221034, "learning_rate": 2.1759855432049637e-07, "loss": 1.9101, "step": 49 }, { "epoch": 1.4848484848484849, "eval_loss": 1.9939130544662476, "eval_runtime": 141.1776, "eval_samples_per_second": 3.294, "eval_steps_per_second": 0.028, "step": 49 }, { "epoch": 1.696969696969697, "grad_norm": 2.6277704707226777, "learning_rate": 1.2154440189415328e-07, "loss": 1.906, "step": 56 }, { "epoch": 1.696969696969697, "eval_loss": 1.990719199180603, "eval_runtime": 138.5039, "eval_samples_per_second": 3.357, "eval_steps_per_second": 0.029, "step": 56 }, { "epoch": 1.9090909090909092, "grad_norm": 2.0677909551788045, "learning_rate": 7.843503292553053e-08, "loss": 1.9054, "step": 63 }, { "epoch": 1.9090909090909092, "eval_loss": 1.9889289140701294, "eval_runtime": 142.8032, "eval_samples_per_second": 3.256, "eval_steps_per_second": 0.028, "step": 63 }, { "epoch": 2.121212121212121, "grad_norm": 1.4792478673423035, "learning_rate": 6.038521136361391e-08, "loss": 1.9037, "step": 70 }, { "epoch": 2.121212121212121, "eval_loss": 1.9878106117248535, "eval_runtime": 139.6825, "eval_samples_per_second": 3.329, "eval_steps_per_second": 0.029, "step": 70 }, { "epoch": 2.3333333333333335, "grad_norm": 5.077301685002183, "learning_rate": 5.342647186003563e-08, "loss": 1.8786, "step": 77 }, { "epoch": 2.3333333333333335, "eval_loss": 1.9871684312820435, "eval_runtime": 151.1266, "eval_samples_per_second": 3.077, "eval_steps_per_second": 0.026, "step": 77 }, { "epoch": 2.5454545454545454, "grad_norm": 1.6345215057101947, "learning_rate": 5.099824238664556e-08, "loss": 1.8962, "step": 84 }, { "epoch": 2.5454545454545454, "eval_loss": 1.9865626096725464, "eval_runtime": 140.1891, "eval_samples_per_second": 3.317, "eval_steps_per_second": 0.029, "step": 84 }, { "epoch": 2.757575757575758, "grad_norm": 1.7960257700653, "learning_rate": 5.024882880767712e-08, "loss": 1.8668, "step": 91 }, { "epoch": 2.757575757575758, "eval_loss": 1.9858595132827759, "eval_runtime": 141.572, "eval_samples_per_second": 3.285, "eval_steps_per_second": 0.028, "step": 91 }, { "epoch": 2.9696969696969697, "grad_norm": 2.888661327751688, "learning_rate": 5.0050722602692304e-08, "loss": 1.8988, "step": 98 }, { "epoch": 2.9696969696969697, "eval_loss": 1.984999179840088, "eval_runtime": 139.6459, "eval_samples_per_second": 3.33, "eval_steps_per_second": 0.029, "step": 98 }, { "epoch": 3.1818181818181817, "grad_norm": 1.5620758554693681, "learning_rate": 5.000789738737886e-08, "loss": 1.8966, "step": 105 }, { "epoch": 3.1818181818181817, "eval_loss": 1.9841891527175903, "eval_runtime": 144.3125, "eval_samples_per_second": 3.222, "eval_steps_per_second": 0.028, "step": 105 }, { "epoch": 3.393939393939394, "grad_norm": 2.0865509846490657, "learning_rate": 5.0000841090079794e-08, "loss": 1.8847, "step": 112 }, { "epoch": 3.393939393939394, "eval_loss": 1.983519196510315, "eval_runtime": 143.6862, "eval_samples_per_second": 3.236, "eval_steps_per_second": 0.028, "step": 112 }, { "epoch": 3.606060606060606, "grad_norm": 1.8436999654324047, "learning_rate": 5.000005037180778e-08, "loss": 1.8748, "step": 119 }, { "epoch": 3.606060606060606, "eval_loss": 1.9828811883926392, "eval_runtime": 141.3381, "eval_samples_per_second": 3.29, "eval_steps_per_second": 0.028, "step": 119 }, { "epoch": 3.8181818181818183, "grad_norm": 2.307936602188417, "learning_rate": 5.00000011344935e-08, "loss": 1.851, "step": 126 }, { "epoch": 3.8181818181818183, "eval_loss": 1.9823122024536133, "eval_runtime": 142.0143, "eval_samples_per_second": 3.274, "eval_steps_per_second": 0.028, "step": 126 }, { "epoch": 4.0, "step": 132, "total_flos": 130691559849984.0, "train_loss": 1.9532976656249075, "train_runtime": 25190.3368, "train_samples_per_second": 0.664, "train_steps_per_second": 0.005 } ], "logging_steps": 7, "max_steps": 132, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 27, "total_flos": 130691559849984.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }