{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 1, "global_step": 90, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1111111111111111, "grad_norm": 71.10752868652344, "learning_rate": 2.5e-05, "loss": 1.4086, "step": 1 }, { "epoch": 0.1111111111111111, "eval_accuracy": 0.5555555555555556, "eval_loss": 1.0401158332824707, "eval_runtime": 1.5415, "eval_samples_per_second": 46.709, "eval_steps_per_second": 3.244, "step": 1 }, { "epoch": 0.2222222222222222, "grad_norm": 70.05866241455078, "learning_rate": 5e-05, "loss": 1.2969, "step": 2 }, { "epoch": 0.2222222222222222, "eval_accuracy": 0.4722222222222222, "eval_loss": 0.957859456539154, "eval_runtime": 1.6886, "eval_samples_per_second": 42.639, "eval_steps_per_second": 2.961, "step": 2 }, { "epoch": 0.3333333333333333, "grad_norm": 51.52368927001953, "learning_rate": 4.943181818181818e-05, "loss": 1.0192, "step": 3 }, { "epoch": 0.3333333333333333, "eval_accuracy": 0.4166666666666667, "eval_loss": 1.2530348300933838, "eval_runtime": 1.7908, "eval_samples_per_second": 40.206, "eval_steps_per_second": 2.792, "step": 3 }, { "epoch": 0.4444444444444444, "grad_norm": 28.69840431213379, "learning_rate": 4.886363636363637e-05, "loss": 0.8252, "step": 4 }, { "epoch": 0.4444444444444444, "eval_accuracy": 0.4027777777777778, "eval_loss": 1.3205498456954956, "eval_runtime": 1.7073, "eval_samples_per_second": 42.172, "eval_steps_per_second": 2.929, "step": 4 }, { "epoch": 0.5555555555555556, "grad_norm": 26.98361587524414, "learning_rate": 4.829545454545455e-05, "loss": 0.8993, "step": 5 }, { "epoch": 0.5555555555555556, "eval_accuracy": 0.4166666666666667, "eval_loss": 1.1885122060775757, "eval_runtime": 1.7972, "eval_samples_per_second": 40.062, "eval_steps_per_second": 2.782, "step": 5 }, { "epoch": 0.6666666666666666, "grad_norm": 32.78493881225586, "learning_rate": 4.772727272727273e-05, "loss": 0.8969, "step": 6 }, { "epoch": 0.6666666666666666, "eval_accuracy": 0.4722222222222222, "eval_loss": 1.0518155097961426, "eval_runtime": 1.7895, "eval_samples_per_second": 40.235, "eval_steps_per_second": 2.794, "step": 6 }, { "epoch": 0.7777777777777778, "grad_norm": 35.246402740478516, "learning_rate": 4.715909090909091e-05, "loss": 0.7789, "step": 7 }, { "epoch": 0.7777777777777778, "eval_accuracy": 0.4027777777777778, "eval_loss": 0.95684814453125, "eval_runtime": 1.782, "eval_samples_per_second": 40.405, "eval_steps_per_second": 2.806, "step": 7 }, { "epoch": 0.8888888888888888, "grad_norm": 10.608202934265137, "learning_rate": 4.659090909090909e-05, "loss": 0.8172, "step": 8 }, { "epoch": 0.8888888888888888, "eval_accuracy": 0.4722222222222222, "eval_loss": 0.9124009609222412, "eval_runtime": 1.7, "eval_samples_per_second": 42.354, "eval_steps_per_second": 2.941, "step": 8 }, { "epoch": 1.0, "grad_norm": 20.640520095825195, "learning_rate": 4.602272727272727e-05, "loss": 0.9466, "step": 9 }, { "epoch": 1.0, "eval_accuracy": 0.4722222222222222, "eval_loss": 0.8895958662033081, "eval_runtime": 1.6914, "eval_samples_per_second": 42.569, "eval_steps_per_second": 2.956, "step": 9 }, { "epoch": 1.1111111111111112, "grad_norm": 16.80632972717285, "learning_rate": 4.545454545454546e-05, "loss": 0.7398, "step": 10 }, { "epoch": 1.1111111111111112, "eval_accuracy": 0.4722222222222222, "eval_loss": 0.8696831464767456, "eval_runtime": 1.7281, "eval_samples_per_second": 41.664, "eval_steps_per_second": 2.893, "step": 10 }, { "epoch": 1.2222222222222223, "grad_norm": 33.25798797607422, "learning_rate": 4.488636363636364e-05, "loss": 0.8393, "step": 11 }, { "epoch": 1.2222222222222223, "eval_accuracy": 0.4722222222222222, "eval_loss": 0.8458387851715088, "eval_runtime": 1.6403, "eval_samples_per_second": 43.894, "eval_steps_per_second": 3.048, "step": 11 }, { "epoch": 1.3333333333333333, "grad_norm": 18.833730697631836, "learning_rate": 4.431818181818182e-05, "loss": 0.627, "step": 12 }, { "epoch": 1.3333333333333333, "eval_accuracy": 0.4305555555555556, "eval_loss": 0.8363681435585022, "eval_runtime": 1.7646, "eval_samples_per_second": 40.803, "eval_steps_per_second": 2.834, "step": 12 }, { "epoch": 1.4444444444444444, "grad_norm": 11.44498062133789, "learning_rate": 4.375e-05, "loss": 0.5691, "step": 13 }, { "epoch": 1.4444444444444444, "eval_accuracy": 0.4166666666666667, "eval_loss": 0.8569539189338684, "eval_runtime": 1.6972, "eval_samples_per_second": 42.423, "eval_steps_per_second": 2.946, "step": 13 }, { "epoch": 1.5555555555555556, "grad_norm": 6.994156360626221, "learning_rate": 4.318181818181819e-05, "loss": 0.6796, "step": 14 }, { "epoch": 1.5555555555555556, "eval_accuracy": 0.4444444444444444, "eval_loss": 0.9024861454963684, "eval_runtime": 1.6366, "eval_samples_per_second": 43.995, "eval_steps_per_second": 3.055, "step": 14 }, { "epoch": 1.6666666666666665, "grad_norm": 23.839618682861328, "learning_rate": 4.261363636363637e-05, "loss": 0.6225, "step": 15 }, { "epoch": 1.6666666666666665, "eval_accuracy": 0.4722222222222222, "eval_loss": 0.9257575273513794, "eval_runtime": 1.6451, "eval_samples_per_second": 43.767, "eval_steps_per_second": 3.039, "step": 15 }, { "epoch": 1.7777777777777777, "grad_norm": 23.61461639404297, "learning_rate": 4.204545454545455e-05, "loss": 0.6299, "step": 16 }, { "epoch": 1.7777777777777777, "eval_accuracy": 0.4583333333333333, "eval_loss": 0.9147135615348816, "eval_runtime": 1.7411, "eval_samples_per_second": 41.353, "eval_steps_per_second": 2.872, "step": 16 }, { "epoch": 1.8888888888888888, "grad_norm": 15.60508918762207, "learning_rate": 4.1477272727272734e-05, "loss": 0.5009, "step": 17 }, { "epoch": 1.8888888888888888, "eval_accuracy": 0.4583333333333333, "eval_loss": 0.8712971806526184, "eval_runtime": 1.688, "eval_samples_per_second": 42.653, "eval_steps_per_second": 2.962, "step": 17 }, { "epoch": 2.0, "grad_norm": 14.364876747131348, "learning_rate": 4.0909090909090915e-05, "loss": 0.4875, "step": 18 }, { "epoch": 2.0, "eval_accuracy": 0.4305555555555556, "eval_loss": 0.8122490644454956, "eval_runtime": 1.6995, "eval_samples_per_second": 42.366, "eval_steps_per_second": 2.942, "step": 18 }, { "epoch": 2.111111111111111, "grad_norm": 9.64327621459961, "learning_rate": 4.034090909090909e-05, "loss": 0.4547, "step": 19 }, { "epoch": 2.111111111111111, "eval_accuracy": 0.4444444444444444, "eval_loss": 0.7631157636642456, "eval_runtime": 1.7098, "eval_samples_per_second": 42.11, "eval_steps_per_second": 2.924, "step": 19 }, { "epoch": 2.2222222222222223, "grad_norm": 8.928176879882812, "learning_rate": 3.9772727272727275e-05, "loss": 0.3933, "step": 20 }, { "epoch": 2.2222222222222223, "eval_accuracy": 0.5138888888888888, "eval_loss": 0.7445746660232544, "eval_runtime": 1.7154, "eval_samples_per_second": 41.973, "eval_steps_per_second": 2.915, "step": 20 }, { "epoch": 2.3333333333333335, "grad_norm": 12.450410842895508, "learning_rate": 3.9204545454545456e-05, "loss": 0.3994, "step": 21 }, { "epoch": 2.3333333333333335, "eval_accuracy": 0.5416666666666666, "eval_loss": 0.7361382246017456, "eval_runtime": 1.6893, "eval_samples_per_second": 42.621, "eval_steps_per_second": 2.96, "step": 21 }, { "epoch": 2.4444444444444446, "grad_norm": 9.766682624816895, "learning_rate": 3.8636363636363636e-05, "loss": 0.4413, "step": 22 }, { "epoch": 2.4444444444444446, "eval_accuracy": 0.5972222222222222, "eval_loss": 0.7318115234375, "eval_runtime": 1.6931, "eval_samples_per_second": 42.526, "eval_steps_per_second": 2.953, "step": 22 }, { "epoch": 2.5555555555555554, "grad_norm": 9.650227546691895, "learning_rate": 3.8068181818181816e-05, "loss": 0.4032, "step": 23 }, { "epoch": 2.5555555555555554, "eval_accuracy": 0.5972222222222222, "eval_loss": 0.7219780683517456, "eval_runtime": 1.6928, "eval_samples_per_second": 42.532, "eval_steps_per_second": 2.954, "step": 23 }, { "epoch": 2.6666666666666665, "grad_norm": 8.477367401123047, "learning_rate": 3.7500000000000003e-05, "loss": 0.4308, "step": 24 }, { "epoch": 2.6666666666666665, "eval_accuracy": 0.6527777777777778, "eval_loss": 0.7055935263633728, "eval_runtime": 1.7353, "eval_samples_per_second": 41.492, "eval_steps_per_second": 2.881, "step": 24 }, { "epoch": 2.7777777777777777, "grad_norm": 8.381991386413574, "learning_rate": 3.6931818181818184e-05, "loss": 0.3827, "step": 25 }, { "epoch": 2.7777777777777777, "eval_accuracy": 0.6666666666666666, "eval_loss": 0.6947495937347412, "eval_runtime": 1.69, "eval_samples_per_second": 42.604, "eval_steps_per_second": 2.959, "step": 25 }, { "epoch": 2.888888888888889, "grad_norm": 18.475757598876953, "learning_rate": 3.6363636363636364e-05, "loss": 0.3719, "step": 26 }, { "epoch": 2.888888888888889, "eval_accuracy": 0.6388888888888888, "eval_loss": 0.695068359375, "eval_runtime": 1.7654, "eval_samples_per_second": 40.785, "eval_steps_per_second": 2.832, "step": 26 }, { "epoch": 3.0, "grad_norm": 4.149219036102295, "learning_rate": 3.579545454545455e-05, "loss": 0.2756, "step": 27 }, { "epoch": 3.0, "eval_accuracy": 0.625, "eval_loss": 0.7118733525276184, "eval_runtime": 1.7076, "eval_samples_per_second": 42.165, "eval_steps_per_second": 2.928, "step": 27 }, { "epoch": 3.111111111111111, "grad_norm": 6.201833248138428, "learning_rate": 3.522727272727273e-05, "loss": 0.3048, "step": 28 }, { "epoch": 3.111111111111111, "eval_accuracy": 0.6388888888888888, "eval_loss": 0.7207743525505066, "eval_runtime": 1.6906, "eval_samples_per_second": 42.59, "eval_steps_per_second": 2.958, "step": 28 }, { "epoch": 3.2222222222222223, "grad_norm": 10.069761276245117, "learning_rate": 3.465909090909091e-05, "loss": 0.2833, "step": 29 }, { "epoch": 3.2222222222222223, "eval_accuracy": 0.6111111111111112, "eval_loss": 0.758392333984375, "eval_runtime": 1.689, "eval_samples_per_second": 42.629, "eval_steps_per_second": 2.96, "step": 29 }, { "epoch": 3.3333333333333335, "grad_norm": 14.6091947555542, "learning_rate": 3.409090909090909e-05, "loss": 0.2192, "step": 30 }, { "epoch": 3.3333333333333335, "eval_accuracy": 0.625, "eval_loss": 0.7625613808631897, "eval_runtime": 1.6873, "eval_samples_per_second": 42.671, "eval_steps_per_second": 2.963, "step": 30 }, { "epoch": 3.4444444444444446, "grad_norm": 12.838824272155762, "learning_rate": 3.352272727272727e-05, "loss": 0.159, "step": 31 }, { "epoch": 3.4444444444444446, "eval_accuracy": 0.6666666666666666, "eval_loss": 0.7382490634918213, "eval_runtime": 1.689, "eval_samples_per_second": 42.629, "eval_steps_per_second": 2.96, "step": 31 }, { "epoch": 3.5555555555555554, "grad_norm": 5.576656341552734, "learning_rate": 3.295454545454545e-05, "loss": 0.1545, "step": 32 }, { "epoch": 3.5555555555555554, "eval_accuracy": 0.6805555555555556, "eval_loss": 0.7160136103630066, "eval_runtime": 1.688, "eval_samples_per_second": 42.655, "eval_steps_per_second": 2.962, "step": 32 }, { "epoch": 3.6666666666666665, "grad_norm": 3.911334753036499, "learning_rate": 3.238636363636364e-05, "loss": 0.2048, "step": 33 }, { "epoch": 3.6666666666666665, "eval_accuracy": 0.6944444444444444, "eval_loss": 0.7091013789176941, "eval_runtime": 1.6904, "eval_samples_per_second": 42.594, "eval_steps_per_second": 2.958, "step": 33 }, { "epoch": 3.7777777777777777, "grad_norm": 4.566244125366211, "learning_rate": 3.181818181818182e-05, "loss": 0.1285, "step": 34 }, { "epoch": 3.7777777777777777, "eval_accuracy": 0.7361111111111112, "eval_loss": 0.7089945673942566, "eval_runtime": 1.6878, "eval_samples_per_second": 42.659, "eval_steps_per_second": 2.962, "step": 34 }, { "epoch": 3.888888888888889, "grad_norm": 4.949621677398682, "learning_rate": 3.125e-05, "loss": 0.0972, "step": 35 }, { "epoch": 3.888888888888889, "eval_accuracy": 0.7361111111111112, "eval_loss": 0.7177971601486206, "eval_runtime": 1.7101, "eval_samples_per_second": 42.104, "eval_steps_per_second": 2.924, "step": 35 }, { "epoch": 4.0, "grad_norm": 5.815789699554443, "learning_rate": 3.068181818181818e-05, "loss": 0.1396, "step": 36 }, { "epoch": 4.0, "eval_accuracy": 0.7361111111111112, "eval_loss": 0.7336485385894775, "eval_runtime": 1.704, "eval_samples_per_second": 42.252, "eval_steps_per_second": 2.934, "step": 36 }, { "epoch": 4.111111111111111, "grad_norm": 5.0971174240112305, "learning_rate": 3.0113636363636365e-05, "loss": 0.1058, "step": 37 }, { "epoch": 4.111111111111111, "eval_accuracy": 0.6944444444444444, "eval_loss": 0.7677508592605591, "eval_runtime": 1.6634, "eval_samples_per_second": 43.285, "eval_steps_per_second": 3.006, "step": 37 }, { "epoch": 4.222222222222222, "grad_norm": 3.026982545852661, "learning_rate": 2.954545454545455e-05, "loss": 0.1036, "step": 38 }, { "epoch": 4.222222222222222, "eval_accuracy": 0.6666666666666666, "eval_loss": 0.81514573097229, "eval_runtime": 1.7527, "eval_samples_per_second": 41.08, "eval_steps_per_second": 2.853, "step": 38 }, { "epoch": 4.333333333333333, "grad_norm": 3.815314531326294, "learning_rate": 2.8977272727272732e-05, "loss": 0.0651, "step": 39 }, { "epoch": 4.333333333333333, "eval_accuracy": 0.6944444444444444, "eval_loss": 0.8406186699867249, "eval_runtime": 1.7692, "eval_samples_per_second": 40.696, "eval_steps_per_second": 2.826, "step": 39 }, { "epoch": 4.444444444444445, "grad_norm": 2.897125244140625, "learning_rate": 2.8409090909090912e-05, "loss": 0.0643, "step": 40 }, { "epoch": 4.444444444444445, "eval_accuracy": 0.6944444444444444, "eval_loss": 0.8639495372772217, "eval_runtime": 1.7505, "eval_samples_per_second": 41.132, "eval_steps_per_second": 2.856, "step": 40 }, { "epoch": 4.555555555555555, "grad_norm": 8.826624870300293, "learning_rate": 2.784090909090909e-05, "loss": 0.0708, "step": 41 }, { "epoch": 4.555555555555555, "eval_accuracy": 0.7083333333333334, "eval_loss": 0.8449772000312805, "eval_runtime": 1.7389, "eval_samples_per_second": 41.404, "eval_steps_per_second": 2.875, "step": 41 }, { "epoch": 4.666666666666667, "grad_norm": 4.310912132263184, "learning_rate": 2.7272727272727273e-05, "loss": 0.0571, "step": 42 }, { "epoch": 4.666666666666667, "eval_accuracy": 0.7222222222222222, "eval_loss": 0.8496870398521423, "eval_runtime": 1.6953, "eval_samples_per_second": 42.471, "eval_steps_per_second": 2.949, "step": 42 }, { "epoch": 4.777777777777778, "grad_norm": 0.9157137870788574, "learning_rate": 2.6704545454545453e-05, "loss": 0.0101, "step": 43 }, { "epoch": 4.777777777777778, "eval_accuracy": 0.75, "eval_loss": 0.8314664363861084, "eval_runtime": 1.696, "eval_samples_per_second": 42.452, "eval_steps_per_second": 2.948, "step": 43 }, { "epoch": 4.888888888888889, "grad_norm": 4.54341983795166, "learning_rate": 2.6136363636363637e-05, "loss": 0.0345, "step": 44 }, { "epoch": 4.888888888888889, "eval_accuracy": 0.75, "eval_loss": 0.8536232709884644, "eval_runtime": 1.7637, "eval_samples_per_second": 40.822, "eval_steps_per_second": 2.835, "step": 44 }, { "epoch": 5.0, "grad_norm": 2.6652591228485107, "learning_rate": 2.5568181818181817e-05, "loss": 0.0194, "step": 45 }, { "epoch": 5.0, "eval_accuracy": 0.75, "eval_loss": 0.8400527834892273, "eval_runtime": 1.7745, "eval_samples_per_second": 40.575, "eval_steps_per_second": 2.818, "step": 45 }, { "epoch": 5.111111111111111, "grad_norm": 0.518764078617096, "learning_rate": 2.5e-05, "loss": 0.005, "step": 46 }, { "epoch": 5.111111111111111, "eval_accuracy": 0.75, "eval_loss": 0.8562726378440857, "eval_runtime": 1.7488, "eval_samples_per_second": 41.171, "eval_steps_per_second": 2.859, "step": 46 }, { "epoch": 5.222222222222222, "grad_norm": 0.5080448389053345, "learning_rate": 2.4431818181818185e-05, "loss": 0.0092, "step": 47 }, { "epoch": 5.222222222222222, "eval_accuracy": 0.75, "eval_loss": 0.8858435153961182, "eval_runtime": 1.7362, "eval_samples_per_second": 41.469, "eval_steps_per_second": 2.88, "step": 47 }, { "epoch": 5.333333333333333, "grad_norm": 1.4881385564804077, "learning_rate": 2.3863636363636365e-05, "loss": 0.0117, "step": 48 }, { "epoch": 5.333333333333333, "eval_accuracy": 0.7638888888888888, "eval_loss": 0.9373347163200378, "eval_runtime": 1.7561, "eval_samples_per_second": 40.999, "eval_steps_per_second": 2.847, "step": 48 }, { "epoch": 5.444444444444445, "grad_norm": 0.4491148293018341, "learning_rate": 2.3295454545454546e-05, "loss": 0.006, "step": 49 }, { "epoch": 5.444444444444445, "eval_accuracy": 0.7638888888888888, "eval_loss": 0.9894835352897644, "eval_runtime": 1.7035, "eval_samples_per_second": 42.267, "eval_steps_per_second": 2.935, "step": 49 }, { "epoch": 5.555555555555555, "grad_norm": 0.5586861371994019, "learning_rate": 2.272727272727273e-05, "loss": 0.0063, "step": 50 }, { "epoch": 5.555555555555555, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.0309796333312988, "eval_runtime": 1.7142, "eval_samples_per_second": 42.001, "eval_steps_per_second": 2.917, "step": 50 }, { "epoch": 5.666666666666667, "grad_norm": 1.2758698463439941, "learning_rate": 2.215909090909091e-05, "loss": 0.0057, "step": 51 }, { "epoch": 5.666666666666667, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.0968831777572632, "eval_runtime": 1.6931, "eval_samples_per_second": 42.526, "eval_steps_per_second": 2.953, "step": 51 }, { "epoch": 5.777777777777778, "grad_norm": 0.4022962152957916, "learning_rate": 2.1590909090909093e-05, "loss": 0.0024, "step": 52 }, { "epoch": 5.777777777777778, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.1657804250717163, "eval_runtime": 1.7525, "eval_samples_per_second": 41.083, "eval_steps_per_second": 2.853, "step": 52 }, { "epoch": 5.888888888888889, "grad_norm": 8.325399398803711, "learning_rate": 2.1022727272727274e-05, "loss": 0.0283, "step": 53 }, { "epoch": 5.888888888888889, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.1517949104309082, "eval_runtime": 1.7207, "eval_samples_per_second": 41.843, "eval_steps_per_second": 2.906, "step": 53 }, { "epoch": 6.0, "grad_norm": 0.6889083981513977, "learning_rate": 2.0454545454545457e-05, "loss": 0.0031, "step": 54 }, { "epoch": 6.0, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.131449580192566, "eval_runtime": 1.7267, "eval_samples_per_second": 41.698, "eval_steps_per_second": 2.896, "step": 54 }, { "epoch": 6.111111111111111, "grad_norm": 0.12238375097513199, "learning_rate": 1.9886363636363638e-05, "loss": 0.0006, "step": 55 }, { "epoch": 6.111111111111111, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.120394229888916, "eval_runtime": 1.7518, "eval_samples_per_second": 41.1, "eval_steps_per_second": 2.854, "step": 55 }, { "epoch": 6.222222222222222, "grad_norm": 0.04647090286016464, "learning_rate": 1.9318181818181818e-05, "loss": 0.0004, "step": 56 }, { "epoch": 6.222222222222222, "eval_accuracy": 0.8333333333333334, "eval_loss": 1.1473143100738525, "eval_runtime": 1.6957, "eval_samples_per_second": 42.461, "eval_steps_per_second": 2.949, "step": 56 }, { "epoch": 6.333333333333333, "grad_norm": 0.07002800703048706, "learning_rate": 1.8750000000000002e-05, "loss": 0.0004, "step": 57 }, { "epoch": 6.333333333333333, "eval_accuracy": 0.8333333333333334, "eval_loss": 1.1827261447906494, "eval_runtime": 1.6914, "eval_samples_per_second": 42.567, "eval_steps_per_second": 2.956, "step": 57 }, { "epoch": 6.444444444444445, "grad_norm": 0.057534411549568176, "learning_rate": 1.8181818181818182e-05, "loss": 0.0002, "step": 58 }, { "epoch": 6.444444444444445, "eval_accuracy": 0.8333333333333334, "eval_loss": 1.224190592765808, "eval_runtime": 1.6907, "eval_samples_per_second": 42.587, "eval_steps_per_second": 2.957, "step": 58 }, { "epoch": 6.555555555555555, "grad_norm": 0.1322898268699646, "learning_rate": 1.7613636363636366e-05, "loss": 0.0006, "step": 59 }, { "epoch": 6.555555555555555, "eval_accuracy": 0.8333333333333334, "eval_loss": 1.2592931985855103, "eval_runtime": 1.6878, "eval_samples_per_second": 42.659, "eval_steps_per_second": 2.962, "step": 59 }, { "epoch": 6.666666666666667, "grad_norm": 0.014222492463886738, "learning_rate": 1.7045454545454546e-05, "loss": 0.0001, "step": 60 }, { "epoch": 6.666666666666667, "eval_accuracy": 0.8333333333333334, "eval_loss": 1.281473159790039, "eval_runtime": 1.6928, "eval_samples_per_second": 42.532, "eval_steps_per_second": 2.954, "step": 60 }, { "epoch": 6.777777777777778, "grad_norm": 3.633732795715332, "learning_rate": 1.6477272727272726e-05, "loss": 0.0144, "step": 61 }, { "epoch": 6.777777777777778, "eval_accuracy": 0.8333333333333334, "eval_loss": 1.2948687076568604, "eval_runtime": 1.6886, "eval_samples_per_second": 42.64, "eval_steps_per_second": 2.961, "step": 61 }, { "epoch": 6.888888888888889, "grad_norm": 0.6753994226455688, "learning_rate": 1.590909090909091e-05, "loss": 0.0021, "step": 62 }, { "epoch": 6.888888888888889, "eval_accuracy": 0.8333333333333334, "eval_loss": 1.303892731666565, "eval_runtime": 1.691, "eval_samples_per_second": 42.579, "eval_steps_per_second": 2.957, "step": 62 }, { "epoch": 7.0, "grad_norm": 0.047468412667512894, "learning_rate": 1.534090909090909e-05, "loss": 0.0002, "step": 63 }, { "epoch": 7.0, "eval_accuracy": 0.8333333333333334, "eval_loss": 1.3037166595458984, "eval_runtime": 1.6895, "eval_samples_per_second": 42.615, "eval_steps_per_second": 2.959, "step": 63 }, { "epoch": 7.111111111111111, "grad_norm": 0.04554079473018646, "learning_rate": 1.4772727272727274e-05, "loss": 0.0001, "step": 64 }, { "epoch": 7.111111111111111, "eval_accuracy": 0.8194444444444444, "eval_loss": 1.3078640699386597, "eval_runtime": 1.6898, "eval_samples_per_second": 42.608, "eval_steps_per_second": 2.959, "step": 64 }, { "epoch": 7.222222222222222, "grad_norm": 0.010344511829316616, "learning_rate": 1.4204545454545456e-05, "loss": 0.0001, "step": 65 }, { "epoch": 7.222222222222222, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.3234918117523193, "eval_runtime": 1.6967, "eval_samples_per_second": 42.434, "eval_steps_per_second": 2.947, "step": 65 }, { "epoch": 7.333333333333333, "grad_norm": 0.006933713797479868, "learning_rate": 1.3636363636363637e-05, "loss": 0.0001, "step": 66 }, { "epoch": 7.333333333333333, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.3226008415222168, "eval_runtime": 1.6947, "eval_samples_per_second": 42.485, "eval_steps_per_second": 2.95, "step": 66 }, { "epoch": 7.444444444444445, "grad_norm": 0.004614418838173151, "learning_rate": 1.3068181818181819e-05, "loss": 0.0, "step": 67 }, { "epoch": 7.444444444444445, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.3420900106430054, "eval_runtime": 1.6925, "eval_samples_per_second": 42.54, "eval_steps_per_second": 2.954, "step": 67 }, { "epoch": 7.555555555555555, "grad_norm": 0.011573988012969494, "learning_rate": 1.25e-05, "loss": 0.0001, "step": 68 }, { "epoch": 7.555555555555555, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.3453483581542969, "eval_runtime": 1.6925, "eval_samples_per_second": 42.539, "eval_steps_per_second": 2.954, "step": 68 }, { "epoch": 7.666666666666667, "grad_norm": 0.05624835565686226, "learning_rate": 1.1931818181818183e-05, "loss": 0.0002, "step": 69 }, { "epoch": 7.666666666666667, "eval_accuracy": 0.8055555555555556, "eval_loss": 1.3481894731521606, "eval_runtime": 1.6884, "eval_samples_per_second": 42.643, "eval_steps_per_second": 2.961, "step": 69 }, { "epoch": 7.777777777777778, "grad_norm": 0.026880212128162384, "learning_rate": 1.1363636363636365e-05, "loss": 0.0001, "step": 70 }, { "epoch": 7.777777777777778, "eval_accuracy": 0.7916666666666666, "eval_loss": 1.3691877126693726, "eval_runtime": 1.689, "eval_samples_per_second": 42.628, "eval_steps_per_second": 2.96, "step": 70 }, { "epoch": 7.888888888888889, "grad_norm": 0.014328360557556152, "learning_rate": 1.0795454545454547e-05, "loss": 0.0001, "step": 71 }, { "epoch": 7.888888888888889, "eval_accuracy": 0.7916666666666666, "eval_loss": 1.3764379024505615, "eval_runtime": 1.692, "eval_samples_per_second": 42.552, "eval_steps_per_second": 2.955, "step": 71 }, { "epoch": 8.0, "grad_norm": 0.004069478716701269, "learning_rate": 1.0227272727272729e-05, "loss": 0.0, "step": 72 }, { "epoch": 8.0, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.3799124956130981, "eval_runtime": 1.6421, "eval_samples_per_second": 43.845, "eval_steps_per_second": 3.045, "step": 72 }, { "epoch": 8.11111111111111, "grad_norm": 0.029666263610124588, "learning_rate": 9.659090909090909e-06, "loss": 0.0001, "step": 73 }, { "epoch": 8.11111111111111, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.3902004957199097, "eval_runtime": 1.6942, "eval_samples_per_second": 42.498, "eval_steps_per_second": 2.951, "step": 73 }, { "epoch": 8.222222222222221, "grad_norm": 0.005981099791824818, "learning_rate": 9.090909090909091e-06, "loss": 0.0, "step": 74 }, { "epoch": 8.222222222222221, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.3968830108642578, "eval_runtime": 1.6929, "eval_samples_per_second": 42.531, "eval_steps_per_second": 2.954, "step": 74 }, { "epoch": 8.333333333333334, "grad_norm": 0.008150852285325527, "learning_rate": 8.522727272727273e-06, "loss": 0.0001, "step": 75 }, { "epoch": 8.333333333333334, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.4002034664154053, "eval_runtime": 1.692, "eval_samples_per_second": 42.554, "eval_steps_per_second": 2.955, "step": 75 }, { "epoch": 8.444444444444445, "grad_norm": 0.10769578814506531, "learning_rate": 7.954545454545455e-06, "loss": 0.0004, "step": 76 }, { "epoch": 8.444444444444445, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.4117597341537476, "eval_runtime": 1.6904, "eval_samples_per_second": 42.595, "eval_steps_per_second": 2.958, "step": 76 }, { "epoch": 8.555555555555555, "grad_norm": 0.026451965793967247, "learning_rate": 7.386363636363637e-06, "loss": 0.0001, "step": 77 }, { "epoch": 8.555555555555555, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.4056729078292847, "eval_runtime": 1.6873, "eval_samples_per_second": 42.672, "eval_steps_per_second": 2.963, "step": 77 }, { "epoch": 8.666666666666666, "grad_norm": 0.07714568078517914, "learning_rate": 6.818181818181818e-06, "loss": 0.0003, "step": 78 }, { "epoch": 8.666666666666666, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.4086921215057373, "eval_runtime": 1.6884, "eval_samples_per_second": 42.643, "eval_steps_per_second": 2.961, "step": 78 }, { "epoch": 8.777777777777779, "grad_norm": 0.03726133331656456, "learning_rate": 6.25e-06, "loss": 0.0001, "step": 79 }, { "epoch": 8.777777777777779, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.4101053476333618, "eval_runtime": 1.6929, "eval_samples_per_second": 42.53, "eval_steps_per_second": 2.954, "step": 79 }, { "epoch": 8.88888888888889, "grad_norm": 0.01760346069931984, "learning_rate": 5.681818181818182e-06, "loss": 0.0001, "step": 80 }, { "epoch": 8.88888888888889, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.4063137769699097, "eval_runtime": 1.6938, "eval_samples_per_second": 42.508, "eval_steps_per_second": 2.952, "step": 80 }, { "epoch": 9.0, "grad_norm": 0.05216860771179199, "learning_rate": 5.113636363636364e-06, "loss": 0.0003, "step": 81 }, { "epoch": 9.0, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.4048177003860474, "eval_runtime": 1.6922, "eval_samples_per_second": 42.549, "eval_steps_per_second": 2.955, "step": 81 }, { "epoch": 9.11111111111111, "grad_norm": 0.019384268671274185, "learning_rate": 4.5454545454545455e-06, "loss": 0.0001, "step": 82 }, { "epoch": 9.11111111111111, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.4030442237854004, "eval_runtime": 1.6903, "eval_samples_per_second": 42.595, "eval_steps_per_second": 2.958, "step": 82 }, { "epoch": 9.222222222222221, "grad_norm": 0.04530010744929314, "learning_rate": 3.9772727272727275e-06, "loss": 0.0002, "step": 83 }, { "epoch": 9.222222222222221, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.406543254852295, "eval_runtime": 1.6884, "eval_samples_per_second": 42.643, "eval_steps_per_second": 2.961, "step": 83 }, { "epoch": 9.333333333333334, "grad_norm": 0.006314022000879049, "learning_rate": 3.409090909090909e-06, "loss": 0.0, "step": 84 }, { "epoch": 9.333333333333334, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.4123640060424805, "eval_runtime": 1.6879, "eval_samples_per_second": 42.656, "eval_steps_per_second": 2.962, "step": 84 }, { "epoch": 9.444444444444445, "grad_norm": 0.010984798893332481, "learning_rate": 2.840909090909091e-06, "loss": 0.0001, "step": 85 }, { "epoch": 9.444444444444445, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.4055116176605225, "eval_runtime": 1.6923, "eval_samples_per_second": 42.547, "eval_steps_per_second": 2.955, "step": 85 }, { "epoch": 9.555555555555555, "grad_norm": 0.04161923751235008, "learning_rate": 2.2727272727272728e-06, "loss": 0.0002, "step": 86 }, { "epoch": 9.555555555555555, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.4036388397216797, "eval_runtime": 1.6941, "eval_samples_per_second": 42.5, "eval_steps_per_second": 2.951, "step": 86 }, { "epoch": 9.666666666666666, "grad_norm": 0.02397008240222931, "learning_rate": 1.7045454545454546e-06, "loss": 0.0001, "step": 87 }, { "epoch": 9.666666666666666, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.4052443504333496, "eval_runtime": 1.6935, "eval_samples_per_second": 42.517, "eval_steps_per_second": 2.953, "step": 87 }, { "epoch": 9.777777777777779, "grad_norm": 0.020388655364513397, "learning_rate": 1.1363636363636364e-06, "loss": 0.0001, "step": 88 }, { "epoch": 9.777777777777779, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.403831958770752, "eval_runtime": 1.693, "eval_samples_per_second": 42.529, "eval_steps_per_second": 2.953, "step": 88 }, { "epoch": 9.88888888888889, "grad_norm": 0.03793564811348915, "learning_rate": 5.681818181818182e-07, "loss": 0.0002, "step": 89 }, { "epoch": 9.88888888888889, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.4065537452697754, "eval_runtime": 1.6926, "eval_samples_per_second": 42.537, "eval_steps_per_second": 2.954, "step": 89 }, { "epoch": 10.0, "grad_norm": 0.003561729798093438, "learning_rate": 0.0, "loss": 0.0, "step": 90 }, { "epoch": 10.0, "eval_accuracy": 0.7777777777777778, "eval_loss": 1.4016788005828857, "eval_runtime": 1.693, "eval_samples_per_second": 42.528, "eval_steps_per_second": 2.953, "step": 90 }, { "epoch": 10.0, "step": 90, "total_flos": 5016736558481408.0, "train_loss": 0.22732360793484582, "train_runtime": 316.2669, "train_samples_per_second": 8.98, "train_steps_per_second": 0.285 } ], "logging_steps": 1, "max_steps": 90, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 5016736558481408.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }