{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 1, "global_step": 90, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1111111111111111, "grad_norm": NaN, "learning_rate": 0.0, "loss": 0.8046, "step": 1 }, { "epoch": 0.1111111111111111, "eval_accuracy": 0.4722222222222222, "eval_loss": 0.9468807578086853, "eval_runtime": 1.4964, "eval_samples_per_second": 48.117, "eval_steps_per_second": 3.341, "step": 1 }, { "epoch": 0.2222222222222222, "grad_norm": NaN, "learning_rate": 0.0, "loss": 0.834, "step": 2 }, { "epoch": 0.2222222222222222, "eval_accuracy": 0.4722222222222222, "eval_loss": 0.9468807578086853, "eval_runtime": 1.4798, "eval_samples_per_second": 48.657, "eval_steps_per_second": 3.379, "step": 2 }, { "epoch": 0.3333333333333333, "grad_norm": NaN, "learning_rate": 0.0, "loss": 0.864, "step": 3 }, { "epoch": 0.3333333333333333, "eval_accuracy": 0.4722222222222222, "eval_loss": 0.9468807578086853, "eval_runtime": 1.4711, "eval_samples_per_second": 48.945, "eval_steps_per_second": 3.399, "step": 3 }, { "epoch": 0.4444444444444444, "grad_norm": NaN, "learning_rate": 0.0, "loss": 0.6596, "step": 4 }, { "epoch": 0.4444444444444444, "eval_accuracy": 0.4722222222222222, "eval_loss": 0.9468807578086853, "eval_runtime": 1.4781, "eval_samples_per_second": 48.711, "eval_steps_per_second": 3.383, "step": 4 }, { "epoch": 0.5555555555555556, "grad_norm": 15.292896270751953, "learning_rate": 2.5e-05, "loss": 0.9107, "step": 5 }, { "epoch": 0.5555555555555556, "eval_accuracy": 0.4722222222222222, "eval_loss": 0.9468807578086853, "eval_runtime": 1.4681, "eval_samples_per_second": 49.042, "eval_steps_per_second": 3.406, "step": 5 }, { "epoch": 0.6666666666666666, "grad_norm": NaN, "learning_rate": 2.5e-05, "loss": 0.8285, "step": 6 }, { "epoch": 0.6666666666666666, "eval_accuracy": 0.4722222222222222, "eval_loss": 0.9468807578086853, "eval_runtime": 1.4656, "eval_samples_per_second": 49.125, "eval_steps_per_second": 3.411, "step": 6 }, { "epoch": 0.7777777777777778, "grad_norm": 38.5906867980957, "learning_rate": 5e-05, "loss": 0.8436, "step": 7 }, { "epoch": 0.7777777777777778, "eval_accuracy": 0.5416666666666666, "eval_loss": 0.8725653886795044, "eval_runtime": 1.466, "eval_samples_per_second": 49.113, "eval_steps_per_second": 3.411, "step": 7 }, { "epoch": 0.8888888888888888, "grad_norm": 11.768951416015625, "learning_rate": 4.943181818181818e-05, "loss": 0.8397, "step": 8 }, { "epoch": 0.8888888888888888, "eval_accuracy": 0.5138888888888888, "eval_loss": 0.89170241355896, "eval_runtime": 1.4681, "eval_samples_per_second": 49.044, "eval_steps_per_second": 3.406, "step": 8 }, { "epoch": 1.0, "grad_norm": 38.63240432739258, "learning_rate": 4.886363636363637e-05, "loss": 0.7961, "step": 9 }, { "epoch": 1.0, "eval_accuracy": 0.5833333333333334, "eval_loss": 0.9097018837928772, "eval_runtime": 1.4715, "eval_samples_per_second": 48.93, "eval_steps_per_second": 3.398, "step": 9 }, { "epoch": 1.1111111111111112, "grad_norm": 25.327926635742188, "learning_rate": 4.829545454545455e-05, "loss": 0.6728, "step": 10 }, { "epoch": 1.1111111111111112, "eval_accuracy": 0.5416666666666666, "eval_loss": 0.8724534511566162, "eval_runtime": 1.4733, "eval_samples_per_second": 48.871, "eval_steps_per_second": 3.394, "step": 10 }, { "epoch": 1.2222222222222223, "grad_norm": 41.665626525878906, "learning_rate": 4.772727272727273e-05, "loss": 0.8504, "step": 11 }, { "epoch": 1.2222222222222223, "eval_accuracy": 0.5277777777777778, "eval_loss": 0.8109580278396606, "eval_runtime": 1.4661, "eval_samples_per_second": 49.109, "eval_steps_per_second": 3.41, "step": 11 }, { "epoch": 1.3333333333333333, "grad_norm": 28.891475677490234, "learning_rate": 4.715909090909091e-05, "loss": 0.6348, "step": 12 }, { "epoch": 1.3333333333333333, "eval_accuracy": 0.5277777777777778, "eval_loss": 0.7847561240196228, "eval_runtime": 1.4756, "eval_samples_per_second": 48.794, "eval_steps_per_second": 3.388, "step": 12 }, { "epoch": 1.4444444444444444, "grad_norm": 15.667322158813477, "learning_rate": 4.659090909090909e-05, "loss": 0.7075, "step": 13 }, { "epoch": 1.4444444444444444, "eval_accuracy": 0.4861111111111111, "eval_loss": 0.8036965131759644, "eval_runtime": 1.4698, "eval_samples_per_second": 48.987, "eval_steps_per_second": 3.402, "step": 13 }, { "epoch": 1.5555555555555556, "grad_norm": 14.308302879333496, "learning_rate": 4.602272727272727e-05, "loss": 0.625, "step": 14 }, { "epoch": 1.5555555555555556, "eval_accuracy": 0.4444444444444444, "eval_loss": 0.844111979007721, "eval_runtime": 1.4649, "eval_samples_per_second": 49.152, "eval_steps_per_second": 3.413, "step": 14 }, { "epoch": 1.6666666666666665, "grad_norm": 33.23239517211914, "learning_rate": 4.545454545454546e-05, "loss": 0.7687, "step": 15 }, { "epoch": 1.6666666666666665, "eval_accuracy": 0.4722222222222222, "eval_loss": 0.8427666425704956, "eval_runtime": 1.4735, "eval_samples_per_second": 48.862, "eval_steps_per_second": 3.393, "step": 15 }, { "epoch": 1.7777777777777777, "grad_norm": 23.251686096191406, "learning_rate": 4.488636363636364e-05, "loss": 0.5379, "step": 16 }, { "epoch": 1.7777777777777777, "eval_accuracy": 0.5555555555555556, "eval_loss": 0.7749015092849731, "eval_runtime": 1.4807, "eval_samples_per_second": 48.625, "eval_steps_per_second": 3.377, "step": 16 }, { "epoch": 1.8888888888888888, "grad_norm": 11.318711280822754, "learning_rate": 4.431818181818182e-05, "loss": 0.5477, "step": 17 }, { "epoch": 1.8888888888888888, "eval_accuracy": 0.6805555555555556, "eval_loss": 0.68642258644104, "eval_runtime": 1.4816, "eval_samples_per_second": 48.595, "eval_steps_per_second": 3.375, "step": 17 }, { "epoch": 2.0, "grad_norm": 4.618575096130371, "learning_rate": 4.375e-05, "loss": 0.54, "step": 18 }, { "epoch": 2.0, "eval_accuracy": 0.7083333333333334, "eval_loss": 0.6404860019683838, "eval_runtime": 1.4695, "eval_samples_per_second": 48.996, "eval_steps_per_second": 3.402, "step": 18 }, { "epoch": 2.111111111111111, "grad_norm": 4.5261077880859375, "learning_rate": 4.318181818181819e-05, "loss": 0.4569, "step": 19 }, { "epoch": 2.111111111111111, "eval_accuracy": 0.6527777777777778, "eval_loss": 0.641287088394165, "eval_runtime": 1.4917, "eval_samples_per_second": 48.268, "eval_steps_per_second": 3.352, "step": 19 }, { "epoch": 2.2222222222222223, "grad_norm": 16.080116271972656, "learning_rate": 4.261363636363637e-05, "loss": 0.4211, "step": 20 }, { "epoch": 2.2222222222222223, "eval_accuracy": 0.6805555555555556, "eval_loss": 0.5904948115348816, "eval_runtime": 1.6079, "eval_samples_per_second": 44.778, "eval_steps_per_second": 3.11, "step": 20 }, { "epoch": 2.3333333333333335, "grad_norm": 10.328635215759277, "learning_rate": 4.204545454545455e-05, "loss": 0.4088, "step": 21 }, { "epoch": 2.3333333333333335, "eval_accuracy": 0.6666666666666666, "eval_loss": 0.5809131264686584, "eval_runtime": 1.4918, "eval_samples_per_second": 48.264, "eval_steps_per_second": 3.352, "step": 21 }, { "epoch": 2.4444444444444446, "grad_norm": 3.862455368041992, "learning_rate": 4.1477272727272734e-05, "loss": 0.3588, "step": 22 }, { "epoch": 2.4444444444444446, "eval_accuracy": 0.6388888888888888, "eval_loss": 0.5841391086578369, "eval_runtime": 1.4927, "eval_samples_per_second": 48.235, "eval_steps_per_second": 3.35, "step": 22 }, { "epoch": 2.5555555555555554, "grad_norm": 21.388599395751953, "learning_rate": 4.0909090909090915e-05, "loss": 0.3224, "step": 23 }, { "epoch": 2.5555555555555554, "eval_accuracy": 0.6805555555555556, "eval_loss": 0.5694243311882019, "eval_runtime": 1.4741, "eval_samples_per_second": 48.845, "eval_steps_per_second": 3.392, "step": 23 }, { "epoch": 2.6666666666666665, "grad_norm": 18.525726318359375, "learning_rate": 4.034090909090909e-05, "loss": 0.3886, "step": 24 }, { "epoch": 2.6666666666666665, "eval_accuracy": 0.6944444444444444, "eval_loss": 0.5775553584098816, "eval_runtime": 1.4808, "eval_samples_per_second": 48.621, "eval_steps_per_second": 3.376, "step": 24 }, { "epoch": 2.7777777777777777, "grad_norm": 8.507179260253906, "learning_rate": 3.9772727272727275e-05, "loss": 0.4483, "step": 25 }, { "epoch": 2.7777777777777777, "eval_accuracy": 0.7083333333333334, "eval_loss": 0.5739086866378784, "eval_runtime": 1.5173, "eval_samples_per_second": 47.452, "eval_steps_per_second": 3.295, "step": 25 }, { "epoch": 2.888888888888889, "grad_norm": 10.042552947998047, "learning_rate": 3.9204545454545456e-05, "loss": 0.2571, "step": 26 }, { "epoch": 2.888888888888889, "eval_accuracy": 0.6944444444444444, "eval_loss": 0.5965893864631653, "eval_runtime": 1.513, "eval_samples_per_second": 47.589, "eval_steps_per_second": 3.305, "step": 26 }, { "epoch": 3.0, "grad_norm": 4.674591541290283, "learning_rate": 3.8636363636363636e-05, "loss": 0.2023, "step": 27 }, { "epoch": 3.0, "eval_accuracy": 0.7222222222222222, "eval_loss": 0.6090915203094482, "eval_runtime": 1.4987, "eval_samples_per_second": 48.041, "eval_steps_per_second": 3.336, "step": 27 }, { "epoch": 3.111111111111111, "grad_norm": 4.555273532867432, "learning_rate": 3.8068181818181816e-05, "loss": 0.1835, "step": 28 }, { "epoch": 3.111111111111111, "eval_accuracy": 0.7222222222222222, "eval_loss": 0.6348134279251099, "eval_runtime": 1.5215, "eval_samples_per_second": 47.323, "eval_steps_per_second": 3.286, "step": 28 }, { "epoch": 3.2222222222222223, "grad_norm": 21.892303466796875, "learning_rate": 3.7500000000000003e-05, "loss": 0.2978, "step": 29 }, { "epoch": 3.2222222222222223, "eval_accuracy": 0.75, "eval_loss": 0.6449768543243408, "eval_runtime": 1.4942, "eval_samples_per_second": 48.187, "eval_steps_per_second": 3.346, "step": 29 }, { "epoch": 3.3333333333333335, "grad_norm": 3.3101179599761963, "learning_rate": 3.6931818181818184e-05, "loss": 0.1375, "step": 30 }, { "epoch": 3.3333333333333335, "eval_accuracy": 0.7361111111111112, "eval_loss": 0.6778775453567505, "eval_runtime": 1.581, "eval_samples_per_second": 45.542, "eval_steps_per_second": 3.163, "step": 30 }, { "epoch": 3.4444444444444446, "grad_norm": 5.253066062927246, "learning_rate": 3.6363636363636364e-05, "loss": 0.0773, "step": 31 }, { "epoch": 3.4444444444444446, "eval_accuracy": 0.75, "eval_loss": 0.6996697783470154, "eval_runtime": 1.4905, "eval_samples_per_second": 48.307, "eval_steps_per_second": 3.355, "step": 31 }, { "epoch": 3.5555555555555554, "grad_norm": 6.1245317459106445, "learning_rate": 3.579545454545455e-05, "loss": 0.1398, "step": 32 }, { "epoch": 3.5555555555555554, "eval_accuracy": 0.7361111111111112, "eval_loss": 0.7371202111244202, "eval_runtime": 1.5564, "eval_samples_per_second": 46.261, "eval_steps_per_second": 3.213, "step": 32 }, { "epoch": 3.6666666666666665, "grad_norm": 8.124380111694336, "learning_rate": 3.522727272727273e-05, "loss": 0.1665, "step": 33 }, { "epoch": 3.6666666666666665, "eval_accuracy": 0.7361111111111112, "eval_loss": 0.7838079929351807, "eval_runtime": 1.4902, "eval_samples_per_second": 48.315, "eval_steps_per_second": 3.355, "step": 33 }, { "epoch": 3.7777777777777777, "grad_norm": 5.12769079208374, "learning_rate": 3.465909090909091e-05, "loss": 0.0566, "step": 34 }, { "epoch": 3.7777777777777777, "eval_accuracy": 0.7222222222222222, "eval_loss": 0.823827862739563, "eval_runtime": 1.4808, "eval_samples_per_second": 48.623, "eval_steps_per_second": 3.377, "step": 34 }, { "epoch": 3.888888888888889, "grad_norm": 4.054475784301758, "learning_rate": 3.409090909090909e-05, "loss": 0.0416, "step": 35 }, { "epoch": 3.888888888888889, "eval_accuracy": 0.7083333333333334, "eval_loss": 0.8069868087768555, "eval_runtime": 1.4641, "eval_samples_per_second": 49.176, "eval_steps_per_second": 3.415, "step": 35 }, { "epoch": 4.0, "grad_norm": 2.9879519939422607, "learning_rate": 3.352272727272727e-05, "loss": 0.082, "step": 36 }, { "epoch": 4.0, "eval_accuracy": 0.7638888888888888, "eval_loss": 0.9339464902877808, "eval_runtime": 1.475, "eval_samples_per_second": 48.815, "eval_steps_per_second": 3.39, "step": 36 }, { "epoch": 4.111111111111111, "grad_norm": 5.574306488037109, "learning_rate": 3.295454545454545e-05, "loss": 0.0587, "step": 37 }, { "epoch": 4.111111111111111, "eval_accuracy": 0.75, "eval_loss": 0.898453414440155, "eval_runtime": 1.4751, "eval_samples_per_second": 48.809, "eval_steps_per_second": 3.39, "step": 37 }, { "epoch": 4.222222222222222, "grad_norm": 4.23170280456543, "learning_rate": 3.238636363636364e-05, "loss": 0.1462, "step": 38 }, { "epoch": 4.222222222222222, "eval_accuracy": 0.7361111111111112, "eval_loss": 0.8970211744308472, "eval_runtime": 1.496, "eval_samples_per_second": 48.127, "eval_steps_per_second": 3.342, "step": 38 }, { "epoch": 4.333333333333333, "grad_norm": 4.057853698730469, "learning_rate": 3.181818181818182e-05, "loss": 0.0474, "step": 39 }, { "epoch": 4.333333333333333, "eval_accuracy": 0.7083333333333334, "eval_loss": 0.9456102848052979, "eval_runtime": 1.4756, "eval_samples_per_second": 48.794, "eval_steps_per_second": 3.388, "step": 39 }, { "epoch": 4.444444444444445, "grad_norm": 1.9089412689208984, "learning_rate": 3.125e-05, "loss": 0.0247, "step": 40 }, { "epoch": 4.444444444444445, "eval_accuracy": 0.7361111111111112, "eval_loss": 1.0376629829406738, "eval_runtime": 1.4767, "eval_samples_per_second": 48.757, "eval_steps_per_second": 3.386, "step": 40 }, { "epoch": 4.555555555555555, "grad_norm": 8.005538940429688, "learning_rate": 3.068181818181818e-05, "loss": 0.0492, "step": 41 }, { "epoch": 4.555555555555555, "eval_accuracy": 0.7361111111111112, "eval_loss": 1.09256112575531, "eval_runtime": 1.4742, "eval_samples_per_second": 48.839, "eval_steps_per_second": 3.392, "step": 41 }, { "epoch": 4.666666666666667, "grad_norm": 7.321335792541504, "learning_rate": 3.0113636363636365e-05, "loss": 0.0541, "step": 42 }, { "epoch": 4.666666666666667, "eval_accuracy": 0.75, "eval_loss": 1.060943365097046, "eval_runtime": 1.4853, "eval_samples_per_second": 48.474, "eval_steps_per_second": 3.366, "step": 42 }, { "epoch": 4.777777777777778, "grad_norm": 3.932873487472534, "learning_rate": 2.954545454545455e-05, "loss": 0.0222, "step": 43 }, { "epoch": 4.777777777777778, "eval_accuracy": 0.7361111111111112, "eval_loss": 1.047785997390747, "eval_runtime": 1.49, "eval_samples_per_second": 48.321, "eval_steps_per_second": 3.356, "step": 43 }, { "epoch": 4.888888888888889, "grad_norm": 0.5964356660842896, "learning_rate": 2.8977272727272732e-05, "loss": 0.007, "step": 44 }, { "epoch": 4.888888888888889, "eval_accuracy": 0.7222222222222222, "eval_loss": 0.9296979904174805, "eval_runtime": 1.476, "eval_samples_per_second": 48.781, "eval_steps_per_second": 3.388, "step": 44 }, { "epoch": 5.0, "grad_norm": 5.5752363204956055, "learning_rate": 2.8409090909090912e-05, "loss": 0.0461, "step": 45 }, { "epoch": 5.0, "eval_accuracy": 0.75, "eval_loss": 0.8986176252365112, "eval_runtime": 1.5427, "eval_samples_per_second": 46.672, "eval_steps_per_second": 3.241, "step": 45 }, { "epoch": 5.111111111111111, "grad_norm": 0.39924749732017517, "learning_rate": 2.784090909090909e-05, "loss": 0.004, "step": 46 }, { "epoch": 5.111111111111111, "eval_accuracy": 0.7638888888888888, "eval_loss": 0.8829209804534912, "eval_runtime": 1.5029, "eval_samples_per_second": 47.908, "eval_steps_per_second": 3.327, "step": 46 }, { "epoch": 5.222222222222222, "grad_norm": 1.3180583715438843, "learning_rate": 2.7272727272727273e-05, "loss": 0.0085, "step": 47 }, { "epoch": 5.222222222222222, "eval_accuracy": 0.7638888888888888, "eval_loss": 0.9565764665603638, "eval_runtime": 1.4989, "eval_samples_per_second": 48.036, "eval_steps_per_second": 3.336, "step": 47 }, { "epoch": 5.333333333333333, "grad_norm": 0.30337390303611755, "learning_rate": 2.6704545454545453e-05, "loss": 0.0025, "step": 48 }, { "epoch": 5.333333333333333, "eval_accuracy": 0.7777777777777778, "eval_loss": 0.9712399244308472, "eval_runtime": 1.4897, "eval_samples_per_second": 48.33, "eval_steps_per_second": 3.356, "step": 48 }, { "epoch": 5.444444444444445, "grad_norm": 1.2145888805389404, "learning_rate": 2.6136363636363637e-05, "loss": 0.0115, "step": 49 }, { "epoch": 5.444444444444445, "eval_accuracy": 0.75, "eval_loss": 1.0557817220687866, "eval_runtime": 1.5821, "eval_samples_per_second": 45.51, "eval_steps_per_second": 3.16, "step": 49 }, { "epoch": 5.555555555555555, "grad_norm": 0.5874105095863342, "learning_rate": 2.5568181818181817e-05, "loss": 0.0063, "step": 50 }, { "epoch": 5.555555555555555, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.0096588134765625, "eval_runtime": 1.5483, "eval_samples_per_second": 46.502, "eval_steps_per_second": 3.229, "step": 50 }, { "epoch": 5.666666666666667, "grad_norm": 1.8710988759994507, "learning_rate": 2.5e-05, "loss": 0.0082, "step": 51 }, { "epoch": 5.666666666666667, "eval_accuracy": 0.7638888888888888, "eval_loss": 1.1002558469772339, "eval_runtime": 1.5485, "eval_samples_per_second": 46.496, "eval_steps_per_second": 3.229, "step": 51 }, { "epoch": 5.777777777777778, "grad_norm": 0.08881735056638718, "learning_rate": 2.4431818181818185e-05, "loss": 0.0005, "step": 52 }, { "epoch": 5.777777777777778, "eval_accuracy": 0.7638888888888888, "eval_loss": 1.111068844795227, "eval_runtime": 1.4791, "eval_samples_per_second": 48.677, "eval_steps_per_second": 3.38, "step": 52 }, { "epoch": 5.888888888888889, "grad_norm": 0.055561624467372894, "learning_rate": 2.3863636363636365e-05, "loss": 0.0004, "step": 53 }, { "epoch": 5.888888888888889, "eval_accuracy": 0.75, "eval_loss": 1.1046401262283325, "eval_runtime": 1.5013, "eval_samples_per_second": 47.957, "eval_steps_per_second": 3.33, "step": 53 }, { "epoch": 6.0, "grad_norm": 0.051147185266017914, "learning_rate": 2.3295454545454546e-05, "loss": 0.0004, "step": 54 }, { "epoch": 6.0, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.1201319694519043, "eval_runtime": 1.4825, "eval_samples_per_second": 48.566, "eval_steps_per_second": 3.373, "step": 54 }, { "epoch": 6.111111111111111, "grad_norm": 0.03255375474691391, "learning_rate": 2.272727272727273e-05, "loss": 0.0003, "step": 55 }, { "epoch": 6.111111111111111, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.1359165906906128, "eval_runtime": 1.4977, "eval_samples_per_second": 48.075, "eval_steps_per_second": 3.339, "step": 55 }, { "epoch": 6.222222222222222, "grad_norm": 0.0207963977009058, "learning_rate": 2.215909090909091e-05, "loss": 0.0001, "step": 56 }, { "epoch": 6.222222222222222, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.2502617835998535, "eval_runtime": 1.4856, "eval_samples_per_second": 48.464, "eval_steps_per_second": 3.366, "step": 56 }, { "epoch": 6.333333333333333, "grad_norm": 0.14410509169101715, "learning_rate": 2.1590909090909093e-05, "loss": 0.0006, "step": 57 }, { "epoch": 6.333333333333333, "eval_accuracy": 0.7916666666666666, "eval_loss": 1.1601958274841309, "eval_runtime": 1.4859, "eval_samples_per_second": 48.455, "eval_steps_per_second": 3.365, "step": 57 }, { "epoch": 6.444444444444445, "grad_norm": 0.054281365126371384, "learning_rate": 2.1022727272727274e-05, "loss": 0.0003, "step": 58 }, { "epoch": 6.444444444444445, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.2148370742797852, "eval_runtime": 1.4936, "eval_samples_per_second": 48.206, "eval_steps_per_second": 3.348, "step": 58 }, { "epoch": 6.555555555555555, "grad_norm": 0.02057286538183689, "learning_rate": 2.0454545454545457e-05, "loss": 0.0002, "step": 59 }, { "epoch": 6.555555555555555, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.3522080183029175, "eval_runtime": 1.4763, "eval_samples_per_second": 48.769, "eval_steps_per_second": 3.387, "step": 59 }, { "epoch": 6.666666666666667, "grad_norm": 0.0014641749439761043, "learning_rate": 1.9886363636363638e-05, "loss": 0.0, "step": 60 }, { "epoch": 6.666666666666667, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.2812823057174683, "eval_runtime": 1.4521, "eval_samples_per_second": 49.584, "eval_steps_per_second": 3.443, "step": 60 }, { "epoch": 6.777777777777778, "grad_norm": 0.018109608441591263, "learning_rate": 1.9318181818181818e-05, "loss": 0.0001, "step": 61 }, { "epoch": 6.777777777777778, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.253478765487671, "eval_runtime": 1.4921, "eval_samples_per_second": 48.254, "eval_steps_per_second": 3.351, "step": 61 }, { "epoch": 6.888888888888889, "grad_norm": 0.05381924659013748, "learning_rate": 1.8750000000000002e-05, "loss": 0.0002, "step": 62 }, { "epoch": 6.888888888888889, "eval_accuracy": 0.7916666666666666, "eval_loss": 1.3584959506988525, "eval_runtime": 1.4846, "eval_samples_per_second": 48.499, "eval_steps_per_second": 3.368, "step": 62 }, { "epoch": 7.0, "grad_norm": 0.002476187190040946, "learning_rate": 1.8181818181818182e-05, "loss": 0.0, "step": 63 }, { "epoch": 7.0, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.364111304283142, "eval_runtime": 1.4702, "eval_samples_per_second": 48.973, "eval_steps_per_second": 3.401, "step": 63 }, { "epoch": 7.111111111111111, "grad_norm": 0.014680763706564903, "learning_rate": 1.7613636363636366e-05, "loss": 0.0001, "step": 64 }, { "epoch": 7.111111111111111, "eval_accuracy": 0.7916666666666666, "eval_loss": 1.3435033559799194, "eval_runtime": 1.4658, "eval_samples_per_second": 49.118, "eval_steps_per_second": 3.411, "step": 64 }, { "epoch": 7.222222222222222, "grad_norm": 0.010304883122444153, "learning_rate": 1.7045454545454546e-05, "loss": 0.0, "step": 65 }, { "epoch": 7.222222222222222, "eval_accuracy": 0.7916666666666666, "eval_loss": 1.3150172233581543, "eval_runtime": 1.4673, "eval_samples_per_second": 49.071, "eval_steps_per_second": 3.408, "step": 65 }, { "epoch": 7.333333333333333, "grad_norm": 0.014999941922724247, "learning_rate": 1.6477272727272726e-05, "loss": 0.0001, "step": 66 }, { "epoch": 7.333333333333333, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.396562099456787, "eval_runtime": 1.4754, "eval_samples_per_second": 48.799, "eval_steps_per_second": 3.389, "step": 66 }, { "epoch": 7.444444444444445, "grad_norm": 0.021909315139055252, "learning_rate": 1.590909090909091e-05, "loss": 0.0001, "step": 67 }, { "epoch": 7.444444444444445, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.3331305980682373, "eval_runtime": 1.4793, "eval_samples_per_second": 48.672, "eval_steps_per_second": 3.38, "step": 67 }, { "epoch": 7.555555555555555, "grad_norm": 0.0006957874284125865, "learning_rate": 1.534090909090909e-05, "loss": 0.0, "step": 68 }, { "epoch": 7.555555555555555, "eval_accuracy": 0.7916666666666666, "eval_loss": 1.3853394985198975, "eval_runtime": 1.4743, "eval_samples_per_second": 48.838, "eval_steps_per_second": 3.392, "step": 68 }, { "epoch": 7.666666666666667, "grad_norm": 0.016373885795474052, "learning_rate": 1.4772727272727274e-05, "loss": 0.0001, "step": 69 }, { "epoch": 7.666666666666667, "eval_accuracy": 0.7916666666666666, "eval_loss": 1.342584490776062, "eval_runtime": 1.5274, "eval_samples_per_second": 47.14, "eval_steps_per_second": 3.274, "step": 69 }, { "epoch": 7.777777777777778, "grad_norm": 0.09788516908884048, "learning_rate": 1.4204545454545456e-05, "loss": 0.0002, "step": 70 }, { "epoch": 7.777777777777778, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.3891209363937378, "eval_runtime": 1.4849, "eval_samples_per_second": 48.488, "eval_steps_per_second": 3.367, "step": 70 }, { "epoch": 7.888888888888889, "grad_norm": 0.004993033595383167, "learning_rate": 1.3636363636363637e-05, "loss": 0.0, "step": 71 }, { "epoch": 7.888888888888889, "eval_accuracy": 0.7916666666666666, "eval_loss": 1.461037039756775, "eval_runtime": 1.489, "eval_samples_per_second": 48.354, "eval_steps_per_second": 3.358, "step": 71 }, { "epoch": 8.0, "grad_norm": 0.00147097313310951, "learning_rate": 1.3068181818181819e-05, "loss": 0.0, "step": 72 }, { "epoch": 8.0, "eval_accuracy": 0.8194444444444444, "eval_loss": 1.3761588335037231, "eval_runtime": 1.4965, "eval_samples_per_second": 48.111, "eval_steps_per_second": 3.341, "step": 72 }, { "epoch": 8.11111111111111, "grad_norm": 0.011906428262591362, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 73 }, { "epoch": 8.11111111111111, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.4502863883972168, "eval_runtime": 1.5145, "eval_samples_per_second": 47.539, "eval_steps_per_second": 3.301, "step": 73 }, { "epoch": 8.222222222222221, "grad_norm": 0.0011542687425389886, "learning_rate": 1.1931818181818183e-05, "loss": 0.0, "step": 74 }, { "epoch": 8.222222222222221, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.4473508596420288, "eval_runtime": 1.4978, "eval_samples_per_second": 48.069, "eval_steps_per_second": 3.338, "step": 74 }, { "epoch": 8.333333333333334, "grad_norm": 0.0013727850746363401, "learning_rate": 1.1363636363636365e-05, "loss": 0.0, "step": 75 }, { "epoch": 8.333333333333334, "eval_accuracy": 0.7916666666666666, "eval_loss": 1.4718106985092163, "eval_runtime": 1.4975, "eval_samples_per_second": 48.08, "eval_steps_per_second": 3.339, "step": 75 }, { "epoch": 8.444444444444445, "grad_norm": 0.005262918770313263, "learning_rate": 1.0795454545454547e-05, "loss": 0.0, "step": 76 }, { "epoch": 8.444444444444445, "eval_accuracy": 0.7916666666666666, "eval_loss": 1.3964359760284424, "eval_runtime": 1.4988, "eval_samples_per_second": 48.037, "eval_steps_per_second": 3.336, "step": 76 }, { "epoch": 8.555555555555555, "grad_norm": 0.011616203002631664, "learning_rate": 1.0227272727272729e-05, "loss": 0.0, "step": 77 }, { "epoch": 8.555555555555555, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.365304708480835, "eval_runtime": 1.5036, "eval_samples_per_second": 47.884, "eval_steps_per_second": 3.325, "step": 77 }, { "epoch": 8.666666666666666, "grad_norm": 0.0016829015221446753, "learning_rate": 9.659090909090909e-06, "loss": 0.0, "step": 78 }, { "epoch": 8.666666666666666, "eval_accuracy": 0.7916666666666666, "eval_loss": 1.3907890319824219, "eval_runtime": 1.5386, "eval_samples_per_second": 46.797, "eval_steps_per_second": 3.25, "step": 78 }, { "epoch": 8.777777777777779, "grad_norm": 0.007697275839745998, "learning_rate": 9.090909090909091e-06, "loss": 0.0, "step": 79 }, { "epoch": 8.777777777777779, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.2962956428527832, "eval_runtime": 1.5653, "eval_samples_per_second": 45.997, "eval_steps_per_second": 3.194, "step": 79 }, { "epoch": 8.88888888888889, "grad_norm": 0.0010299325222149491, "learning_rate": 8.522727272727273e-06, "loss": 0.0, "step": 80 }, { "epoch": 8.88888888888889, "eval_accuracy": 0.7916666666666666, "eval_loss": 1.383674144744873, "eval_runtime": 1.5762, "eval_samples_per_second": 45.681, "eval_steps_per_second": 3.172, "step": 80 }, { "epoch": 9.0, "grad_norm": 0.005102647002786398, "learning_rate": 7.954545454545455e-06, "loss": 0.0, "step": 81 }, { "epoch": 9.0, "eval_accuracy": 0.7916666666666666, "eval_loss": 1.3699817657470703, "eval_runtime": 1.4837, "eval_samples_per_second": 48.527, "eval_steps_per_second": 3.37, "step": 81 }, { "epoch": 9.11111111111111, "grad_norm": 0.0035217173863202333, "learning_rate": 7.386363636363637e-06, "loss": 0.0, "step": 82 }, { "epoch": 9.11111111111111, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.4067063331604004, "eval_runtime": 1.5387, "eval_samples_per_second": 46.792, "eval_steps_per_second": 3.249, "step": 82 }, { "epoch": 9.222222222222221, "grad_norm": 0.0014651613309979439, "learning_rate": 6.818181818181818e-06, "loss": 0.0, "step": 83 }, { "epoch": 9.222222222222221, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.3422017097473145, "eval_runtime": 1.4993, "eval_samples_per_second": 48.023, "eval_steps_per_second": 3.335, "step": 83 }, { "epoch": 9.333333333333334, "grad_norm": 0.002838965505361557, "learning_rate": 6.25e-06, "loss": 0.0, "step": 84 }, { "epoch": 9.333333333333334, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.4219313859939575, "eval_runtime": 1.5163, "eval_samples_per_second": 47.485, "eval_steps_per_second": 3.298, "step": 84 }, { "epoch": 9.444444444444445, "grad_norm": 0.010845585726201534, "learning_rate": 5.681818181818182e-06, "loss": 0.0, "step": 85 }, { "epoch": 9.444444444444445, "eval_accuracy": 0.7916666666666666, "eval_loss": 1.5043329000473022, "eval_runtime": 1.5102, "eval_samples_per_second": 47.675, "eval_steps_per_second": 3.311, "step": 85 }, { "epoch": 9.555555555555555, "grad_norm": 0.0013982527889311314, "learning_rate": 5.113636363636364e-06, "loss": 0.0, "step": 86 }, { "epoch": 9.555555555555555, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.4191205501556396, "eval_runtime": 1.537, "eval_samples_per_second": 46.846, "eval_steps_per_second": 3.253, "step": 86 }, { "epoch": 9.666666666666666, "grad_norm": 0.0016395855927839875, "learning_rate": 4.5454545454545455e-06, "loss": 0.0, "step": 87 }, { "epoch": 9.666666666666666, "eval_accuracy": 0.7916666666666666, "eval_loss": 1.4259694814682007, "eval_runtime": 1.4834, "eval_samples_per_second": 48.536, "eval_steps_per_second": 3.371, "step": 87 }, { "epoch": 9.777777777777779, "grad_norm": 0.0018825188744813204, "learning_rate": 3.9772727272727275e-06, "loss": 0.0, "step": 88 }, { "epoch": 9.777777777777779, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.4664974212646484, "eval_runtime": 1.4871, "eval_samples_per_second": 48.416, "eval_steps_per_second": 3.362, "step": 88 }, { "epoch": 9.88888888888889, "grad_norm": 0.00835789367556572, "learning_rate": 3.409090909090909e-06, "loss": 0.0, "step": 89 }, { "epoch": 9.88888888888889, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.4490399360656738, "eval_runtime": 1.5036, "eval_samples_per_second": 47.886, "eval_steps_per_second": 3.325, "step": 89 }, { "epoch": 10.0, "grad_norm": 0.001027365098707378, "learning_rate": 2.840909090909091e-06, "loss": 0.0, "step": 90 }, { "epoch": 10.0, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.3632524013519287, "eval_runtime": 1.484, "eval_samples_per_second": 48.518, "eval_steps_per_second": 3.369, "step": 90 }, { "epoch": 10.0, "step": 90, "total_flos": 5016736558481408.0, "train_loss": 0.20236469821797476, "train_runtime": 343.5318, "train_samples_per_second": 8.267, "train_steps_per_second": 0.262 } ], "logging_steps": 1, "max_steps": 90, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 5016736558481408.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }