{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9837587006960556, "eval_steps": 500, "global_step": 53, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 28.83467674255371, "learning_rate": 2e-05, "loss": 2.7878, "step": 1 }, { "epoch": 0.04, "grad_norm": 28.955718994140625, "learning_rate": 4e-05, "loss": 2.7826, "step": 2 }, { "epoch": 0.06, "grad_norm": 17.856950759887695, "learning_rate": 6e-05, "loss": 2.2388, "step": 3 }, { "epoch": 0.07, "grad_norm": 16.615158081054688, "learning_rate": 8e-05, "loss": 1.7809, "step": 4 }, { "epoch": 0.09, "grad_norm": 34.79571533203125, "learning_rate": 0.0001, "loss": 1.9714, "step": 5 }, { "epoch": 0.11, "grad_norm": 6.3113694190979, "learning_rate": 9.935064935064936e-05, "loss": 1.4032, "step": 6 }, { "epoch": 0.13, "grad_norm": 8.230330467224121, "learning_rate": 9.870129870129871e-05, "loss": 1.241, "step": 7 }, { "epoch": 0.15, "grad_norm": 17.19231605529785, "learning_rate": 9.805194805194806e-05, "loss": 1.2388, "step": 8 }, { "epoch": 0.17, "grad_norm": 6.900284767150879, "learning_rate": 9.74025974025974e-05, "loss": 1.0881, "step": 9 }, { "epoch": 0.19, "grad_norm": 2.7498185634613037, "learning_rate": 9.675324675324677e-05, "loss": 1.0071, "step": 10 }, { "epoch": 0.2, "grad_norm": 4.016067028045654, "learning_rate": 9.610389610389611e-05, "loss": 0.9962, "step": 11 }, { "epoch": 0.22, "grad_norm": 2.365187883377075, "learning_rate": 9.545454545454546e-05, "loss": 0.9928, "step": 12 }, { "epoch": 0.24, "grad_norm": 1.4348371028900146, "learning_rate": 9.480519480519481e-05, "loss": 0.9107, "step": 13 }, { "epoch": 0.26, "grad_norm": 1.8148932456970215, "learning_rate": 9.415584415584417e-05, "loss": 0.853, "step": 14 }, { "epoch": 0.28, "grad_norm": 1.9855575561523438, "learning_rate": 9.35064935064935e-05, "loss": 0.8598, "step": 15 }, { "epoch": 0.3, "grad_norm": 1.9960970878601074, "learning_rate": 9.285714285714286e-05, "loss": 0.8384, "step": 16 }, { "epoch": 0.32, "grad_norm": 1.4565762281417847, "learning_rate": 9.220779220779221e-05, "loss": 0.7895, "step": 17 }, { "epoch": 0.33, "grad_norm": 1.419858694076538, "learning_rate": 9.155844155844156e-05, "loss": 0.8162, "step": 18 }, { "epoch": 0.35, "grad_norm": 1.3651608228683472, "learning_rate": 9.090909090909092e-05, "loss": 0.7924, "step": 19 }, { "epoch": 0.37, "grad_norm": 1.1524626016616821, "learning_rate": 9.025974025974027e-05, "loss": 0.7484, "step": 20 }, { "epoch": 0.39, "grad_norm": 0.9174069166183472, "learning_rate": 8.961038961038961e-05, "loss": 0.7276, "step": 21 }, { "epoch": 0.41, "grad_norm": 1.1832919120788574, "learning_rate": 8.896103896103896e-05, "loss": 0.6658, "step": 22 }, { "epoch": 0.43, "grad_norm": 1.3024309873580933, "learning_rate": 8.831168831168831e-05, "loss": 0.7379, "step": 23 }, { "epoch": 0.45, "grad_norm": 1.1416062116622925, "learning_rate": 8.766233766233767e-05, "loss": 0.7202, "step": 24 }, { "epoch": 0.46, "grad_norm": 1.2058277130126953, "learning_rate": 8.701298701298701e-05, "loss": 0.6777, "step": 25 }, { "epoch": 0.48, "grad_norm": 1.0915583372116089, "learning_rate": 8.636363636363637e-05, "loss": 0.7063, "step": 26 }, { "epoch": 0.5, "grad_norm": 1.0384303331375122, "learning_rate": 8.571428571428571e-05, "loss": 0.6447, "step": 27 }, { "epoch": 0.52, "grad_norm": 1.131259560585022, "learning_rate": 8.506493506493507e-05, "loss": 0.6911, "step": 28 }, { "epoch": 0.54, "grad_norm": 1.1505099534988403, "learning_rate": 
8.441558441558442e-05, "loss": 0.6142, "step": 29 }, { "epoch": 0.56, "grad_norm": 2.6675026416778564, "learning_rate": 8.376623376623377e-05, "loss": 0.6181, "step": 30 }, { "epoch": 0.58, "grad_norm": 1.2021816968917847, "learning_rate": 8.311688311688312e-05, "loss": 0.6542, "step": 31 }, { "epoch": 0.59, "grad_norm": 3.363269090652466, "learning_rate": 8.246753246753248e-05, "loss": 0.5968, "step": 32 }, { "epoch": 0.61, "grad_norm": 3.8007972240448, "learning_rate": 8.181818181818183e-05, "loss": 0.5983, "step": 33 }, { "epoch": 0.63, "grad_norm": 3.570556163787842, "learning_rate": 8.116883116883117e-05, "loss": 0.6375, "step": 34 }, { "epoch": 0.65, "grad_norm": 4.428549766540527, "learning_rate": 8.051948051948052e-05, "loss": 0.6519, "step": 35 }, { "epoch": 0.67, "grad_norm": 3.7240254878997803, "learning_rate": 7.987012987012987e-05, "loss": 0.5935, "step": 36 }, { "epoch": 0.69, "grad_norm": 2.453350782394409, "learning_rate": 7.922077922077923e-05, "loss": 0.604, "step": 37 }, { "epoch": 0.71, "grad_norm": 3.341071844100952, "learning_rate": 7.857142857142858e-05, "loss": 0.5994, "step": 38 }, { "epoch": 0.72, "grad_norm": 3.4222161769866943, "learning_rate": 7.792207792207793e-05, "loss": 0.5731, "step": 39 }, { "epoch": 0.74, "grad_norm": 2.9063804149627686, "learning_rate": 7.727272727272727e-05, "loss": 0.5566, "step": 40 }, { "epoch": 0.76, "grad_norm": 3.170330762863159, "learning_rate": 7.662337662337662e-05, "loss": 0.5684, "step": 41 }, { "epoch": 0.78, "grad_norm": 2.8363170623779297, "learning_rate": 7.597402597402598e-05, "loss": 0.5936, "step": 42 }, { "epoch": 0.8, "grad_norm": 2.3944592475891113, "learning_rate": 7.532467532467533e-05, "loss": 0.5571, "step": 43 }, { "epoch": 0.82, "grad_norm": 2.816237688064575, "learning_rate": 7.467532467532467e-05, "loss": 0.5598, "step": 44 }, { "epoch": 0.84, "grad_norm": 2.57438325881958, "learning_rate": 7.402597402597404e-05, "loss": 0.5615, "step": 45 }, { "epoch": 0.85, "grad_norm": 2.3164243698120117, "learning_rate": 7.337662337662338e-05, "loss": 0.5381, "step": 46 }, { "epoch": 0.87, "grad_norm": 2.4226622581481934, "learning_rate": 7.272727272727273e-05, "loss": 0.5592, "step": 47 }, { "epoch": 0.89, "grad_norm": 1.5568475723266602, "learning_rate": 7.207792207792208e-05, "loss": 0.5729, "step": 48 }, { "epoch": 0.91, "grad_norm": 1.1320440769195557, "learning_rate": 7.142857142857143e-05, "loss": 0.5546, "step": 49 }, { "epoch": 0.93, "grad_norm": 2.220273733139038, "learning_rate": 7.077922077922077e-05, "loss": 0.5587, "step": 50 }, { "epoch": 0.95, "grad_norm": 1.8099664449691772, "learning_rate": 7.012987012987014e-05, "loss": 0.5403, "step": 51 }, { "epoch": 0.97, "grad_norm": 1.7992054224014282, "learning_rate": 6.948051948051948e-05, "loss": 0.5374, "step": 52 }, { "epoch": 0.98, "grad_norm": 1.5891600847244263, "learning_rate": 6.883116883116883e-05, "loss": 0.5188, "step": 53 } ], "logging_steps": 1, "max_steps": 159, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 5, "total_flos": 8.106404006854656e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }