{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.3854739652870494, "eval_steps": 71, "global_step": 3276, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 11.078738059471418, "learning_rate": 1e-08, "loss": 0.9787, "step": 1 }, { "epoch": 0.0, "eval_loss": 0.9907150864601135, "eval_runtime": 6914.8687, "eval_samples_per_second": 41.995, "eval_steps_per_second": 2.1, "step": 1 }, { "epoch": 0.0, "grad_norm": 11.611292476924348, "learning_rate": 2e-08, "loss": 1.0125, "step": 2 }, { "epoch": 0.0, "grad_norm": 11.680888573298947, "learning_rate": 3.0000000000000004e-08, "loss": 0.9921, "step": 3 }, { "epoch": 0.0, "grad_norm": 11.223600490850279, "learning_rate": 4e-08, "loss": 0.9905, "step": 4 }, { "epoch": 0.0, "grad_norm": 13.16229268115578, "learning_rate": 5.0000000000000004e-08, "loss": 1.0453, "step": 5 }, { "epoch": 0.0, "grad_norm": 11.866725221480236, "learning_rate": 6.000000000000001e-08, "loss": 1.0109, "step": 6 }, { "epoch": 0.0, "grad_norm": 11.384069184647979, "learning_rate": 7e-08, "loss": 0.9907, "step": 7 }, { "epoch": 0.0, "grad_norm": 11.950365392573932, "learning_rate": 8e-08, "loss": 1.0039, "step": 8 }, { "epoch": 0.0, "grad_norm": 11.269911995803245, "learning_rate": 9e-08, "loss": 0.992, "step": 9 }, { "epoch": 0.0, "grad_norm": 11.784110104038653, "learning_rate": 1.0000000000000001e-07, "loss": 0.9909, "step": 10 }, { "epoch": 0.0, "grad_norm": 11.530150291016858, "learning_rate": 1.1e-07, "loss": 0.9717, "step": 11 }, { "epoch": 0.01, "grad_norm": 11.46072146393034, "learning_rate": 1.2000000000000002e-07, "loss": 0.9813, "step": 12 }, { "epoch": 0.01, "grad_norm": 10.983926810290162, "learning_rate": 1.3e-07, "loss": 1.0025, "step": 13 }, { "epoch": 0.01, "grad_norm": 11.258765973186346, "learning_rate": 1.4e-07, "loss": 0.9927, "step": 14 }, { "epoch": 0.01, "grad_norm": 11.213060806605753, "learning_rate": 1.5000000000000002e-07, "loss": 0.9729, "step": 15 }, { "epoch": 0.01, "grad_norm": 11.271084778147063, "learning_rate": 1.6e-07, "loss": 0.9861, "step": 16 }, { "epoch": 0.01, "grad_norm": 11.40785228773617, "learning_rate": 1.7000000000000001e-07, "loss": 1.004, "step": 17 }, { "epoch": 0.01, "grad_norm": 11.071262500540902, "learning_rate": 1.8e-07, "loss": 0.999, "step": 18 }, { "epoch": 0.01, "grad_norm": 10.939577936304328, "learning_rate": 1.9e-07, "loss": 0.9951, "step": 19 }, { "epoch": 0.01, "grad_norm": 10.543379116586797, "learning_rate": 2.0000000000000002e-07, "loss": 0.9573, "step": 20 }, { "epoch": 0.01, "grad_norm": 10.648729634073065, "learning_rate": 2.1000000000000003e-07, "loss": 0.95, "step": 21 }, { "epoch": 0.01, "grad_norm": 9.442791775540536, "learning_rate": 2.2e-07, "loss": 0.9315, "step": 22 }, { "epoch": 0.01, "grad_norm": 9.458344300581993, "learning_rate": 2.3000000000000002e-07, "loss": 0.942, "step": 23 }, { "epoch": 0.01, "grad_norm": 9.249373081558684, "learning_rate": 2.4000000000000003e-07, "loss": 0.9065, "step": 24 }, { "epoch": 0.01, "grad_norm": 9.104568818785538, "learning_rate": 2.5000000000000004e-07, "loss": 0.9251, "step": 25 }, { "epoch": 0.01, "grad_norm": 8.776787693093286, "learning_rate": 2.6e-07, "loss": 0.9381, "step": 26 }, { "epoch": 0.01, "grad_norm": 8.809034639899487, "learning_rate": 2.7e-07, "loss": 0.8958, "step": 27 }, { "epoch": 0.01, "grad_norm": 8.588186330852137, "learning_rate": 2.8e-07, "loss": 0.9345, "step": 28 }, { "epoch": 0.01, "grad_norm": 7.054903161127684, "learning_rate": 2.9000000000000003e-07, "loss": 0.926, "step": 29 }, { "epoch": 0.01, "grad_norm": 5.409541396614469, "learning_rate": 3.0000000000000004e-07, "loss": 0.8518, "step": 30 }, { "epoch": 0.01, "grad_norm": 5.079371789402381, "learning_rate": 3.1000000000000005e-07, "loss": 0.8247, "step": 31 }, { "epoch": 0.01, "grad_norm": 5.285936528164377, "learning_rate": 3.2e-07, "loss": 0.8839, "step": 32 }, { "epoch": 0.01, "grad_norm": 5.0378580616482385, "learning_rate": 3.3e-07, "loss": 0.8482, "step": 33 }, { "epoch": 0.01, "grad_norm": 4.8906753179918745, "learning_rate": 3.4000000000000003e-07, "loss": 0.8848, "step": 34 }, { "epoch": 0.01, "grad_norm": 4.673664611548944, "learning_rate": 3.5000000000000004e-07, "loss": 0.8514, "step": 35 }, { "epoch": 0.02, "grad_norm": 4.672079486287008, "learning_rate": 3.6e-07, "loss": 0.8531, "step": 36 }, { "epoch": 0.02, "grad_norm": 4.534392734183333, "learning_rate": 3.7e-07, "loss": 0.8408, "step": 37 }, { "epoch": 0.02, "grad_norm": 4.426822133821268, "learning_rate": 3.8e-07, "loss": 0.8421, "step": 38 }, { "epoch": 0.02, "grad_norm": 4.050598149897309, "learning_rate": 3.9e-07, "loss": 0.8143, "step": 39 }, { "epoch": 0.02, "grad_norm": 3.3681521726267727, "learning_rate": 4.0000000000000003e-07, "loss": 0.7697, "step": 40 }, { "epoch": 0.02, "grad_norm": 3.168148018142974, "learning_rate": 4.1000000000000004e-07, "loss": 0.7522, "step": 41 }, { "epoch": 0.02, "grad_norm": 2.988199405686652, "learning_rate": 4.2000000000000006e-07, "loss": 0.7721, "step": 42 }, { "epoch": 0.02, "grad_norm": 2.7711442613950577, "learning_rate": 4.3e-07, "loss": 0.7777, "step": 43 }, { "epoch": 0.02, "grad_norm": 2.3134967643199253, "learning_rate": 4.4e-07, "loss": 0.7478, "step": 44 }, { "epoch": 0.02, "grad_norm": 2.102821646941925, "learning_rate": 4.5000000000000003e-07, "loss": 0.7614, "step": 45 }, { "epoch": 0.02, "grad_norm": 2.806705011993839, "learning_rate": 4.6000000000000004e-07, "loss": 0.7344, "step": 46 }, { "epoch": 0.02, "grad_norm": 2.07648392275885, "learning_rate": 4.7000000000000005e-07, "loss": 0.7469, "step": 47 }, { "epoch": 0.02, "grad_norm": 1.9203144322745775, "learning_rate": 4.800000000000001e-07, "loss": 0.7611, "step": 48 }, { "epoch": 0.02, "grad_norm": 1.8850095224554737, "learning_rate": 4.900000000000001e-07, "loss": 0.7892, "step": 49 }, { "epoch": 0.02, "grad_norm": 1.8950995227228287, "learning_rate": 5.000000000000001e-07, "loss": 0.7249, "step": 50 }, { "epoch": 0.02, "grad_norm": 2.220629645536677, "learning_rate": 5.1e-07, "loss": 0.7286, "step": 51 }, { "epoch": 0.02, "grad_norm": 1.6217121963418404, "learning_rate": 5.2e-07, "loss": 0.72, "step": 52 }, { "epoch": 0.02, "grad_norm": 1.7536100967521506, "learning_rate": 5.3e-07, "loss": 0.7303, "step": 53 }, { "epoch": 0.02, "grad_norm": 1.8632645115840507, "learning_rate": 5.4e-07, "loss": 0.7041, "step": 54 }, { "epoch": 0.02, "grad_norm": 1.6438907360560255, "learning_rate": 5.5e-07, "loss": 0.6778, "step": 55 }, { "epoch": 0.02, "grad_norm": 1.4220708453129016, "learning_rate": 5.6e-07, "loss": 0.6928, "step": 56 }, { "epoch": 0.02, "grad_norm": 1.3065263284110715, "learning_rate": 5.7e-07, "loss": 0.6705, "step": 57 }, { "epoch": 0.02, "grad_norm": 1.160277550171688, "learning_rate": 5.800000000000001e-07, "loss": 0.714, "step": 58 }, { "epoch": 0.03, "grad_norm": 1.0619267911744315, "learning_rate": 5.900000000000001e-07, "loss": 0.6578, "step": 59 }, { "epoch": 0.03, "grad_norm": 1.0978405000752092, "learning_rate": 6.000000000000001e-07, "loss": 0.6827, "step": 60 }, { "epoch": 0.03, "grad_norm": 1.0203948283683908, "learning_rate": 6.100000000000001e-07, "loss": 0.6945, "step": 61 }, { "epoch": 0.03, "grad_norm": 1.0376432923241783, "learning_rate": 6.200000000000001e-07, "loss": 0.6785, "step": 62 }, { "epoch": 0.03, "grad_norm": 1.1199898784937596, "learning_rate": 6.3e-07, "loss": 0.6879, "step": 63 }, { "epoch": 0.03, "grad_norm": 0.9323677710694938, "learning_rate": 6.4e-07, "loss": 0.6958, "step": 64 }, { "epoch": 0.03, "grad_norm": 0.9242075211877638, "learning_rate": 6.5e-07, "loss": 0.6632, "step": 65 }, { "epoch": 0.03, "grad_norm": 0.9436047323924038, "learning_rate": 6.6e-07, "loss": 0.679, "step": 66 }, { "epoch": 0.03, "grad_norm": 0.8822704045541839, "learning_rate": 6.7e-07, "loss": 0.6936, "step": 67 }, { "epoch": 0.03, "grad_norm": 0.8869148114540986, "learning_rate": 6.800000000000001e-07, "loss": 0.6598, "step": 68 }, { "epoch": 0.03, "grad_norm": 0.9122382330757471, "learning_rate": 6.900000000000001e-07, "loss": 0.6733, "step": 69 }, { "epoch": 0.03, "grad_norm": 0.9213292233226152, "learning_rate": 7.000000000000001e-07, "loss": 0.6868, "step": 70 }, { "epoch": 0.03, "grad_norm": 0.7942384246431622, "learning_rate": 7.1e-07, "loss": 0.6438, "step": 71 }, { "epoch": 0.03, "eval_loss": 0.6540535092353821, "eval_runtime": 6906.1355, "eval_samples_per_second": 42.048, "eval_steps_per_second": 2.102, "step": 71 }, { "epoch": 0.03, "grad_norm": 0.8205470446647939, "learning_rate": 7.2e-07, "loss": 0.6462, "step": 72 }, { "epoch": 0.03, "grad_norm": 0.7946802028186414, "learning_rate": 7.3e-07, "loss": 0.6806, "step": 73 }, { "epoch": 0.03, "grad_norm": 0.8828845496654094, "learning_rate": 7.4e-07, "loss": 0.6664, "step": 74 }, { "epoch": 0.03, "grad_norm": 0.9383931456659222, "learning_rate": 7.5e-07, "loss": 0.6137, "step": 75 }, { "epoch": 0.03, "grad_norm": 0.8776340800086451, "learning_rate": 7.6e-07, "loss": 0.6453, "step": 76 }, { "epoch": 0.03, "grad_norm": 0.7928098345984148, "learning_rate": 7.7e-07, "loss": 0.6245, "step": 77 }, { "epoch": 0.03, "grad_norm": 0.8548051569752384, "learning_rate": 7.8e-07, "loss": 0.6588, "step": 78 }, { "epoch": 0.03, "grad_norm": 0.7989762163409693, "learning_rate": 7.900000000000001e-07, "loss": 0.6225, "step": 79 }, { "epoch": 0.03, "grad_norm": 0.7487196972831345, "learning_rate": 8.000000000000001e-07, "loss": 0.6306, "step": 80 }, { "epoch": 0.03, "grad_norm": 0.7503963380748643, "learning_rate": 8.100000000000001e-07, "loss": 0.642, "step": 81 }, { "epoch": 0.04, "grad_norm": 0.7715020340302331, "learning_rate": 8.200000000000001e-07, "loss": 0.637, "step": 82 }, { "epoch": 0.04, "grad_norm": 0.7461624365856538, "learning_rate": 8.300000000000001e-07, "loss": 0.6456, "step": 83 }, { "epoch": 0.04, "grad_norm": 0.7873038726512404, "learning_rate": 8.400000000000001e-07, "loss": 0.6301, "step": 84 }, { "epoch": 0.04, "grad_norm": 0.8371533025271144, "learning_rate": 8.500000000000001e-07, "loss": 0.6294, "step": 85 }, { "epoch": 0.04, "grad_norm": 0.7722902390890481, "learning_rate": 8.6e-07, "loss": 0.6285, "step": 86 }, { "epoch": 0.04, "grad_norm": 0.7349010275266745, "learning_rate": 8.7e-07, "loss": 0.656, "step": 87 }, { "epoch": 0.04, "grad_norm": 0.8710625748918476, "learning_rate": 8.8e-07, "loss": 0.6156, "step": 88 }, { "epoch": 0.04, "grad_norm": 0.7833176365407506, "learning_rate": 8.900000000000001e-07, "loss": 0.6329, "step": 89 }, { "epoch": 0.04, "grad_norm": 0.7956237199904019, "learning_rate": 9.000000000000001e-07, "loss": 0.604, "step": 90 }, { "epoch": 0.04, "grad_norm": 0.7663333987942103, "learning_rate": 9.100000000000001e-07, "loss": 0.6132, "step": 91 }, { "epoch": 0.04, "grad_norm": 0.7350037692506446, "learning_rate": 9.200000000000001e-07, "loss": 0.5959, "step": 92 }, { "epoch": 0.04, "grad_norm": 0.7095481904743137, "learning_rate": 9.300000000000001e-07, "loss": 0.581, "step": 93 }, { "epoch": 0.04, "grad_norm": 0.7669373034765433, "learning_rate": 9.400000000000001e-07, "loss": 0.6059, "step": 94 }, { "epoch": 0.04, "grad_norm": 0.7042131948418571, "learning_rate": 9.500000000000001e-07, "loss": 0.6159, "step": 95 }, { "epoch": 0.04, "grad_norm": 0.7415353323840956, "learning_rate": 9.600000000000001e-07, "loss": 0.6046, "step": 96 }, { "epoch": 0.04, "grad_norm": 0.7038043696743754, "learning_rate": 9.7e-07, "loss": 0.6313, "step": 97 }, { "epoch": 0.04, "grad_norm": 0.7144659980129581, "learning_rate": 9.800000000000001e-07, "loss": 0.6091, "step": 98 }, { "epoch": 0.04, "grad_norm": 0.7198651744206456, "learning_rate": 9.9e-07, "loss": 0.6296, "step": 99 }, { "epoch": 0.04, "grad_norm": 0.7518090561897931, "learning_rate": 1.0000000000000002e-06, "loss": 0.6172, "step": 100 }, { "epoch": 0.04, "grad_norm": 0.7176227042870604, "learning_rate": 1.01e-06, "loss": 0.5913, "step": 101 }, { "epoch": 0.04, "grad_norm": 0.7227768155981023, "learning_rate": 1.02e-06, "loss": 0.592, "step": 102 }, { "epoch": 0.04, "grad_norm": 0.7111683414742458, "learning_rate": 1.03e-06, "loss": 0.625, "step": 103 }, { "epoch": 0.04, "grad_norm": 0.6992806259847806, "learning_rate": 1.04e-06, "loss": 0.5955, "step": 104 }, { "epoch": 0.04, "grad_norm": 0.673513907571938, "learning_rate": 1.0500000000000001e-06, "loss": 0.6244, "step": 105 }, { "epoch": 0.05, "grad_norm": 0.6626454057249019, "learning_rate": 1.06e-06, "loss": 0.6246, "step": 106 }, { "epoch": 0.05, "grad_norm": 0.6807547559648992, "learning_rate": 1.0700000000000001e-06, "loss": 0.6302, "step": 107 }, { "epoch": 0.05, "grad_norm": 0.7118070037431555, "learning_rate": 1.08e-06, "loss": 0.612, "step": 108 }, { "epoch": 0.05, "grad_norm": 0.7101111808553712, "learning_rate": 1.0900000000000002e-06, "loss": 0.6299, "step": 109 }, { "epoch": 0.05, "grad_norm": 0.7361716975541328, "learning_rate": 1.1e-06, "loss": 0.5889, "step": 110 }, { "epoch": 0.05, "grad_norm": 0.7507623271913958, "learning_rate": 1.1100000000000002e-06, "loss": 0.5939, "step": 111 }, { "epoch": 0.05, "grad_norm": 0.69232482698427, "learning_rate": 1.12e-06, "loss": 0.5857, "step": 112 }, { "epoch": 0.05, "grad_norm": 0.7560268686291741, "learning_rate": 1.1300000000000002e-06, "loss": 0.5926, "step": 113 }, { "epoch": 0.05, "grad_norm": 0.6909950381406327, "learning_rate": 1.14e-06, "loss": 0.6259, "step": 114 }, { "epoch": 0.05, "grad_norm": 0.705457142068379, "learning_rate": 1.1500000000000002e-06, "loss": 0.594, "step": 115 }, { "epoch": 0.05, "grad_norm": 0.7479733597296523, "learning_rate": 1.1600000000000001e-06, "loss": 0.5941, "step": 116 }, { "epoch": 0.05, "grad_norm": 0.6703707169078938, "learning_rate": 1.1700000000000002e-06, "loss": 0.6002, "step": 117 }, { "epoch": 0.05, "grad_norm": 0.7142454896237415, "learning_rate": 1.1800000000000001e-06, "loss": 0.5721, "step": 118 }, { "epoch": 0.05, "grad_norm": 0.6410921583173411, "learning_rate": 1.19e-06, "loss": 0.5549, "step": 119 }, { "epoch": 0.05, "grad_norm": 0.7371839254744956, "learning_rate": 1.2000000000000002e-06, "loss": 0.6076, "step": 120 }, { "epoch": 0.05, "grad_norm": 0.6642983393421674, "learning_rate": 1.21e-06, "loss": 0.6164, "step": 121 }, { "epoch": 0.05, "grad_norm": 0.6622892175821214, "learning_rate": 1.2200000000000002e-06, "loss": 0.5916, "step": 122 }, { "epoch": 0.05, "grad_norm": 0.6836278867814578, "learning_rate": 1.23e-06, "loss": 0.6055, "step": 123 }, { "epoch": 0.05, "grad_norm": 0.7449640043860738, "learning_rate": 1.2400000000000002e-06, "loss": 0.5706, "step": 124 }, { "epoch": 0.05, "grad_norm": 0.7030914815199217, "learning_rate": 1.25e-06, "loss": 0.5962, "step": 125 }, { "epoch": 0.05, "grad_norm": 0.6636464764371871, "learning_rate": 1.26e-06, "loss": 0.6004, "step": 126 }, { "epoch": 0.05, "grad_norm": 0.7292101825675744, "learning_rate": 1.2700000000000001e-06, "loss": 0.6002, "step": 127 }, { "epoch": 0.05, "grad_norm": 0.7002155035767813, "learning_rate": 1.28e-06, "loss": 0.597, "step": 128 }, { "epoch": 0.06, "grad_norm": 0.697947240938124, "learning_rate": 1.2900000000000001e-06, "loss": 0.5788, "step": 129 }, { "epoch": 0.06, "grad_norm": 0.7106171095112853, "learning_rate": 1.3e-06, "loss": 0.5878, "step": 130 }, { "epoch": 0.06, "grad_norm": 0.7712154049962048, "learning_rate": 1.3100000000000002e-06, "loss": 0.6198, "step": 131 }, { "epoch": 0.06, "grad_norm": 0.9336735438619129, "learning_rate": 1.32e-06, "loss": 0.5883, "step": 132 }, { "epoch": 0.06, "grad_norm": 0.6689366967241089, "learning_rate": 1.3300000000000002e-06, "loss": 0.5881, "step": 133 }, { "epoch": 0.06, "grad_norm": 0.6553033178045365, "learning_rate": 1.34e-06, "loss": 0.564, "step": 134 }, { "epoch": 0.06, "grad_norm": 0.7780421140633135, "learning_rate": 1.3500000000000002e-06, "loss": 0.5939, "step": 135 }, { "epoch": 0.06, "grad_norm": 0.7512017485981449, "learning_rate": 1.3600000000000001e-06, "loss": 0.5967, "step": 136 }, { "epoch": 0.06, "grad_norm": 0.7273946733225628, "learning_rate": 1.3700000000000002e-06, "loss": 0.593, "step": 137 }, { "epoch": 0.06, "grad_norm": 0.7070418593498196, "learning_rate": 1.3800000000000001e-06, "loss": 0.5848, "step": 138 }, { "epoch": 0.06, "grad_norm": 0.7716412837183378, "learning_rate": 1.3900000000000002e-06, "loss": 0.6087, "step": 139 }, { "epoch": 0.06, "grad_norm": 0.7211339598744784, "learning_rate": 1.4000000000000001e-06, "loss": 0.5711, "step": 140 }, { "epoch": 0.06, "grad_norm": 0.6962486858771757, "learning_rate": 1.41e-06, "loss": 0.5791, "step": 141 }, { "epoch": 0.06, "grad_norm": 0.7001118725454738, "learning_rate": 1.42e-06, "loss": 0.5772, "step": 142 }, { "epoch": 0.06, "eval_loss": 0.5818271040916443, "eval_runtime": 6911.9695, "eval_samples_per_second": 42.012, "eval_steps_per_second": 2.101, "step": 142 }, { "epoch": 0.06, "grad_norm": 0.764854682764742, "learning_rate": 1.43e-06, "loss": 0.6009, "step": 143 }, { "epoch": 0.06, "grad_norm": 0.730261637210628, "learning_rate": 1.44e-06, "loss": 0.5785, "step": 144 }, { "epoch": 0.06, "grad_norm": 0.8038474557194998, "learning_rate": 1.45e-06, "loss": 0.607, "step": 145 }, { "epoch": 0.06, "grad_norm": 0.6835086511414872, "learning_rate": 1.46e-06, "loss": 0.5825, "step": 146 }, { "epoch": 0.06, "grad_norm": 0.7505101234005623, "learning_rate": 1.4700000000000001e-06, "loss": 0.6049, "step": 147 }, { "epoch": 0.06, "grad_norm": 0.7840302962113346, "learning_rate": 1.48e-06, "loss": 0.5569, "step": 148 }, { "epoch": 0.06, "grad_norm": 0.6936446529890948, "learning_rate": 1.4900000000000001e-06, "loss": 0.5713, "step": 149 }, { "epoch": 0.06, "grad_norm": 0.7331878475223359, "learning_rate": 1.5e-06, "loss": 0.588, "step": 150 }, { "epoch": 0.06, "grad_norm": 0.6745518982361585, "learning_rate": 1.5100000000000002e-06, "loss": 0.5576, "step": 151 }, { "epoch": 0.06, "grad_norm": 0.6950585712176994, "learning_rate": 1.52e-06, "loss": 0.5853, "step": 152 }, { "epoch": 0.07, "grad_norm": 0.7339007955624004, "learning_rate": 1.5300000000000002e-06, "loss": 0.5887, "step": 153 }, { "epoch": 0.07, "grad_norm": 0.7086482597013368, "learning_rate": 1.54e-06, "loss": 0.5857, "step": 154 }, { "epoch": 0.07, "grad_norm": 0.6929345752180732, "learning_rate": 1.5500000000000002e-06, "loss": 0.5937, "step": 155 }, { "epoch": 0.07, "grad_norm": 0.716359352444948, "learning_rate": 1.56e-06, "loss": 0.5459, "step": 156 }, { "epoch": 0.07, "grad_norm": 0.7432089483211889, "learning_rate": 1.5700000000000002e-06, "loss": 0.5766, "step": 157 }, { "epoch": 0.07, "grad_norm": 0.6876861316239075, "learning_rate": 1.5800000000000001e-06, "loss": 0.5729, "step": 158 }, { "epoch": 0.07, "grad_norm": 0.7652117324379928, "learning_rate": 1.5900000000000002e-06, "loss": 0.5714, "step": 159 }, { "epoch": 0.07, "grad_norm": 0.6874557208270947, "learning_rate": 1.6000000000000001e-06, "loss": 0.5786, "step": 160 }, { "epoch": 0.07, "grad_norm": 0.6873913339602188, "learning_rate": 1.6100000000000003e-06, "loss": 0.588, "step": 161 }, { "epoch": 0.07, "grad_norm": 0.683965861475918, "learning_rate": 1.6200000000000002e-06, "loss": 0.5772, "step": 162 }, { "epoch": 0.07, "grad_norm": 0.774874549937846, "learning_rate": 1.6300000000000003e-06, "loss": 0.6, "step": 163 }, { "epoch": 0.07, "grad_norm": 0.7021871067570723, "learning_rate": 1.6400000000000002e-06, "loss": 0.5689, "step": 164 }, { "epoch": 0.07, "grad_norm": 0.7209423777996005, "learning_rate": 1.6500000000000003e-06, "loss": 0.5607, "step": 165 }, { "epoch": 0.07, "grad_norm": 0.6803679044895155, "learning_rate": 1.6600000000000002e-06, "loss": 0.609, "step": 166 }, { "epoch": 0.07, "grad_norm": 0.7712655814872096, "learning_rate": 1.6700000000000003e-06, "loss": 0.5764, "step": 167 }, { "epoch": 0.07, "grad_norm": 0.7619624904596949, "learning_rate": 1.6800000000000002e-06, "loss": 0.5599, "step": 168 }, { "epoch": 0.07, "grad_norm": 0.669785223369481, "learning_rate": 1.6900000000000003e-06, "loss": 0.5518, "step": 169 }, { "epoch": 0.07, "grad_norm": 0.7392059624798044, "learning_rate": 1.7000000000000002e-06, "loss": 0.595, "step": 170 }, { "epoch": 0.07, "grad_norm": 0.6847168836492608, "learning_rate": 1.7100000000000004e-06, "loss": 0.5554, "step": 171 }, { "epoch": 0.07, "grad_norm": 0.7793064159857329, "learning_rate": 1.72e-06, "loss": 0.5905, "step": 172 }, { "epoch": 0.07, "grad_norm": 0.6760936921129691, "learning_rate": 1.73e-06, "loss": 0.5278, "step": 173 }, { "epoch": 0.07, "grad_norm": 0.7120881185318579, "learning_rate": 1.74e-06, "loss": 0.562, "step": 174 }, { "epoch": 0.07, "grad_norm": 0.6777641027946281, "learning_rate": 1.75e-06, "loss": 0.5967, "step": 175 }, { "epoch": 0.08, "grad_norm": 0.6644124127387283, "learning_rate": 1.76e-06, "loss": 0.5685, "step": 176 }, { "epoch": 0.08, "grad_norm": 0.6633049014407767, "learning_rate": 1.77e-06, "loss": 0.5694, "step": 177 }, { "epoch": 0.08, "grad_norm": 0.6598336805084065, "learning_rate": 1.7800000000000001e-06, "loss": 0.5556, "step": 178 }, { "epoch": 0.08, "grad_norm": 0.7006058904587646, "learning_rate": 1.79e-06, "loss": 0.5769, "step": 179 }, { "epoch": 0.08, "grad_norm": 0.7269153745138233, "learning_rate": 1.8000000000000001e-06, "loss": 0.5725, "step": 180 }, { "epoch": 0.08, "grad_norm": 0.7282646554724388, "learning_rate": 1.81e-06, "loss": 0.5593, "step": 181 }, { "epoch": 0.08, "grad_norm": 0.857099360195062, "learning_rate": 1.8200000000000002e-06, "loss": 0.5669, "step": 182 }, { "epoch": 0.08, "grad_norm": 0.6950980962926347, "learning_rate": 1.83e-06, "loss": 0.5777, "step": 183 }, { "epoch": 0.08, "grad_norm": 0.6604073281679678, "learning_rate": 1.8400000000000002e-06, "loss": 0.5918, "step": 184 }, { "epoch": 0.08, "grad_norm": 0.6690212574223983, "learning_rate": 1.85e-06, "loss": 0.561, "step": 185 }, { "epoch": 0.08, "grad_norm": 0.6959851809345552, "learning_rate": 1.8600000000000002e-06, "loss": 0.5523, "step": 186 }, { "epoch": 0.08, "grad_norm": 3.375144332117638, "learning_rate": 1.87e-06, "loss": 0.5612, "step": 187 }, { "epoch": 0.08, "grad_norm": 0.7710335154558103, "learning_rate": 1.8800000000000002e-06, "loss": 0.5515, "step": 188 }, { "epoch": 0.08, "grad_norm": 0.7387587574522239, "learning_rate": 1.8900000000000001e-06, "loss": 0.548, "step": 189 }, { "epoch": 0.08, "grad_norm": 0.6895693730643745, "learning_rate": 1.9000000000000002e-06, "loss": 0.5525, "step": 190 }, { "epoch": 0.08, "grad_norm": 0.6809790348929371, "learning_rate": 1.9100000000000003e-06, "loss": 0.5539, "step": 191 }, { "epoch": 0.08, "grad_norm": 0.6975774380503847, "learning_rate": 1.9200000000000003e-06, "loss": 0.5576, "step": 192 }, { "epoch": 0.08, "grad_norm": 0.65978668039197, "learning_rate": 1.93e-06, "loss": 0.5546, "step": 193 }, { "epoch": 0.08, "grad_norm": 0.7381481487351611, "learning_rate": 1.94e-06, "loss": 0.5362, "step": 194 }, { "epoch": 0.08, "grad_norm": 0.7590514250142023, "learning_rate": 1.9500000000000004e-06, "loss": 0.5496, "step": 195 }, { "epoch": 0.08, "grad_norm": 0.6814269181772578, "learning_rate": 1.9600000000000003e-06, "loss": 0.595, "step": 196 }, { "epoch": 0.08, "grad_norm": 0.7290336017009336, "learning_rate": 1.97e-06, "loss": 0.5845, "step": 197 }, { "epoch": 0.08, "grad_norm": 0.7087771732688267, "learning_rate": 1.98e-06, "loss": 0.5425, "step": 198 }, { "epoch": 0.09, "grad_norm": 0.6740991887260622, "learning_rate": 1.9900000000000004e-06, "loss": 0.5697, "step": 199 }, { "epoch": 0.09, "grad_norm": 0.7692213485546533, "learning_rate": 2.0000000000000003e-06, "loss": 0.5832, "step": 200 }, { "epoch": 0.09, "grad_norm": 0.7310442342385465, "learning_rate": 2.0100000000000002e-06, "loss": 0.5484, "step": 201 }, { "epoch": 0.09, "grad_norm": 0.6807932721845763, "learning_rate": 2.02e-06, "loss": 0.5476, "step": 202 }, { "epoch": 0.09, "grad_norm": 0.703371626783706, "learning_rate": 2.0300000000000005e-06, "loss": 0.6031, "step": 203 }, { "epoch": 0.09, "grad_norm": 0.6802894338699406, "learning_rate": 2.04e-06, "loss": 0.5828, "step": 204 }, { "epoch": 0.09, "grad_norm": 0.6652596394872644, "learning_rate": 2.05e-06, "loss": 0.5614, "step": 205 }, { "epoch": 0.09, "grad_norm": 0.737938303801843, "learning_rate": 2.06e-06, "loss": 0.5649, "step": 206 }, { "epoch": 0.09, "grad_norm": 0.6851616685774734, "learning_rate": 2.07e-06, "loss": 0.5923, "step": 207 }, { "epoch": 0.09, "grad_norm": 0.7193070222820541, "learning_rate": 2.08e-06, "loss": 0.5344, "step": 208 }, { "epoch": 0.09, "grad_norm": 0.6982256320037129, "learning_rate": 2.09e-06, "loss": 0.524, "step": 209 }, { "epoch": 0.09, "grad_norm": 0.6731186167503178, "learning_rate": 2.1000000000000002e-06, "loss": 0.524, "step": 210 }, { "epoch": 0.09, "grad_norm": 1.1798176224305597, "learning_rate": 2.11e-06, "loss": 0.5603, "step": 211 }, { "epoch": 0.09, "grad_norm": 0.7003894417531635, "learning_rate": 2.12e-06, "loss": 0.5547, "step": 212 }, { "epoch": 0.09, "grad_norm": 0.6678276891220589, "learning_rate": 2.13e-06, "loss": 0.543, "step": 213 }, { "epoch": 0.09, "eval_loss": 0.5565809607505798, "eval_runtime": 6927.1816, "eval_samples_per_second": 41.92, "eval_steps_per_second": 2.096, "step": 213 }, { "epoch": 0.09, "grad_norm": 0.6757438454080638, "learning_rate": 2.1400000000000003e-06, "loss": 0.5468, "step": 214 }, { "epoch": 0.09, "grad_norm": 0.770869092645212, "learning_rate": 2.15e-06, "loss": 0.56, "step": 215 }, { "epoch": 0.09, "grad_norm": 0.7073954416641812, "learning_rate": 2.16e-06, "loss": 0.5481, "step": 216 }, { "epoch": 0.09, "grad_norm": 0.7155585643449607, "learning_rate": 2.17e-06, "loss": 0.5616, "step": 217 }, { "epoch": 0.09, "grad_norm": 0.7290481543044689, "learning_rate": 2.1800000000000003e-06, "loss": 0.5418, "step": 218 }, { "epoch": 0.09, "grad_norm": 0.8118091409501089, "learning_rate": 2.19e-06, "loss": 0.5438, "step": 219 }, { "epoch": 0.09, "grad_norm": 1.8302800073178411, "learning_rate": 2.2e-06, "loss": 0.5487, "step": 220 }, { "epoch": 0.09, "grad_norm": 0.7364426707486922, "learning_rate": 2.21e-06, "loss": 0.5855, "step": 221 }, { "epoch": 0.09, "grad_norm": 1.456340142180152, "learning_rate": 2.2200000000000003e-06, "loss": 0.5836, "step": 222 }, { "epoch": 0.1, "grad_norm": 0.7385740541544212, "learning_rate": 2.2300000000000002e-06, "loss": 0.5697, "step": 223 }, { "epoch": 0.1, "grad_norm": 0.84040439389078, "learning_rate": 2.24e-06, "loss": 0.5688, "step": 224 }, { "epoch": 0.1, "grad_norm": 0.6791205410588949, "learning_rate": 2.25e-06, "loss": 0.5504, "step": 225 }, { "epoch": 0.1, "grad_norm": 0.7249144826404104, "learning_rate": 2.2600000000000004e-06, "loss": 0.5573, "step": 226 }, { "epoch": 0.1, "grad_norm": 0.7512914278711404, "learning_rate": 2.2700000000000003e-06, "loss": 0.5443, "step": 227 }, { "epoch": 0.1, "grad_norm": 0.7283685135498201, "learning_rate": 2.28e-06, "loss": 0.5423, "step": 228 }, { "epoch": 0.1, "grad_norm": 0.745183565987288, "learning_rate": 2.29e-06, "loss": 0.5304, "step": 229 }, { "epoch": 0.1, "grad_norm": 0.738561325322298, "learning_rate": 2.3000000000000004e-06, "loss": 0.5406, "step": 230 }, { "epoch": 0.1, "grad_norm": 0.8285144394816474, "learning_rate": 2.3100000000000003e-06, "loss": 0.5299, "step": 231 }, { "epoch": 0.1, "grad_norm": 0.6778837717707735, "learning_rate": 2.3200000000000002e-06, "loss": 0.5195, "step": 232 }, { "epoch": 0.1, "grad_norm": 0.7719966350372699, "learning_rate": 2.33e-06, "loss": 0.5571, "step": 233 }, { "epoch": 0.1, "grad_norm": 0.76143480039083, "learning_rate": 2.3400000000000005e-06, "loss": 0.5392, "step": 234 }, { "epoch": 0.1, "grad_norm": 0.7799555692877767, "learning_rate": 2.35e-06, "loss": 0.5675, "step": 235 }, { "epoch": 0.1, "grad_norm": 2.172177763095597, "learning_rate": 2.3600000000000003e-06, "loss": 0.5751, "step": 236 }, { "epoch": 0.1, "grad_norm": 0.7382398234329495, "learning_rate": 2.37e-06, "loss": 0.541, "step": 237 }, { "epoch": 0.1, "grad_norm": 0.7869817068341356, "learning_rate": 2.38e-06, "loss": 0.5616, "step": 238 }, { "epoch": 0.1, "grad_norm": 0.747149871498409, "learning_rate": 2.39e-06, "loss": 0.5368, "step": 239 }, { "epoch": 0.1, "grad_norm": 0.709533915967845, "learning_rate": 2.4000000000000003e-06, "loss": 0.5401, "step": 240 }, { "epoch": 0.1, "grad_norm": 0.8009167743191223, "learning_rate": 2.4100000000000002e-06, "loss": 0.5553, "step": 241 }, { "epoch": 0.1, "grad_norm": 0.7824397361977652, "learning_rate": 2.42e-06, "loss": 0.5582, "step": 242 }, { "epoch": 0.1, "grad_norm": 0.7713371690459996, "learning_rate": 2.43e-06, "loss": 0.5567, "step": 243 }, { "epoch": 0.1, "grad_norm": 0.6841472190627085, "learning_rate": 2.4400000000000004e-06, "loss": 0.5255, "step": 244 }, { "epoch": 0.1, "grad_norm": 0.7395314286170461, "learning_rate": 2.4500000000000003e-06, "loss": 0.5474, "step": 245 }, { "epoch": 0.11, "grad_norm": 0.9037560995126637, "learning_rate": 2.46e-06, "loss": 0.5443, "step": 246 }, { "epoch": 0.11, "grad_norm": 0.7394243896953805, "learning_rate": 2.47e-06, "loss": 0.5654, "step": 247 }, { "epoch": 0.11, "grad_norm": 0.7368266522326133, "learning_rate": 2.4800000000000004e-06, "loss": 0.5588, "step": 248 }, { "epoch": 0.11, "grad_norm": 0.7216141071484306, "learning_rate": 2.4900000000000003e-06, "loss": 0.546, "step": 249 }, { "epoch": 0.11, "grad_norm": 0.7381090709238742, "learning_rate": 2.5e-06, "loss": 0.5359, "step": 250 }, { "epoch": 0.11, "grad_norm": 0.81508667979967, "learning_rate": 2.51e-06, "loss": 0.5831, "step": 251 }, { "epoch": 0.11, "grad_norm": 0.7507679622871583, "learning_rate": 2.52e-06, "loss": 0.5536, "step": 252 }, { "epoch": 0.11, "grad_norm": 0.725818735593307, "learning_rate": 2.5300000000000003e-06, "loss": 0.5301, "step": 253 }, { "epoch": 0.11, "grad_norm": 0.7171172879542945, "learning_rate": 2.5400000000000002e-06, "loss": 0.562, "step": 254 }, { "epoch": 0.11, "grad_norm": 0.8845290493858748, "learning_rate": 2.55e-06, "loss": 0.5641, "step": 255 }, { "epoch": 0.11, "grad_norm": 0.7748353959313085, "learning_rate": 2.56e-06, "loss": 0.5617, "step": 256 }, { "epoch": 0.11, "grad_norm": 1.4792826780398232, "learning_rate": 2.5700000000000004e-06, "loss": 0.5218, "step": 257 }, { "epoch": 0.11, "grad_norm": 0.7755584686345505, "learning_rate": 2.5800000000000003e-06, "loss": 0.5549, "step": 258 }, { "epoch": 0.11, "grad_norm": 0.7144098990925435, "learning_rate": 2.59e-06, "loss": 0.5452, "step": 259 }, { "epoch": 0.11, "grad_norm": 0.7016327562812078, "learning_rate": 2.6e-06, "loss": 0.5359, "step": 260 }, { "epoch": 0.11, "grad_norm": 0.7674707640501007, "learning_rate": 2.6100000000000004e-06, "loss": 0.5492, "step": 261 }, { "epoch": 0.11, "grad_norm": 0.6991460195923854, "learning_rate": 2.6200000000000003e-06, "loss": 0.5739, "step": 262 }, { "epoch": 0.11, "grad_norm": 0.7481462435947455, "learning_rate": 2.6300000000000002e-06, "loss": 0.5382, "step": 263 }, { "epoch": 0.11, "grad_norm": 0.6903104793011363, "learning_rate": 2.64e-06, "loss": 0.5774, "step": 264 }, { "epoch": 0.11, "grad_norm": 0.7276434522232335, "learning_rate": 2.6500000000000005e-06, "loss": 0.5395, "step": 265 }, { "epoch": 0.11, "grad_norm": 0.7049348082872935, "learning_rate": 2.6600000000000004e-06, "loss": 0.5077, "step": 266 }, { "epoch": 0.11, "grad_norm": 0.6860853746928447, "learning_rate": 2.6700000000000003e-06, "loss": 0.5365, "step": 267 }, { "epoch": 0.11, "grad_norm": 0.7019598422590565, "learning_rate": 2.68e-06, "loss": 0.5405, "step": 268 }, { "epoch": 0.11, "grad_norm": 0.7452321432305177, "learning_rate": 2.6900000000000005e-06, "loss": 0.5301, "step": 269 }, { "epoch": 0.12, "grad_norm": 0.6869176763831081, "learning_rate": 2.7000000000000004e-06, "loss": 0.5294, "step": 270 }, { "epoch": 0.12, "grad_norm": 0.923474966878648, "learning_rate": 2.7100000000000003e-06, "loss": 0.5341, "step": 271 }, { "epoch": 0.12, "grad_norm": 0.6685065451328585, "learning_rate": 2.7200000000000002e-06, "loss": 0.5364, "step": 272 }, { "epoch": 0.12, "grad_norm": 0.7024497350510225, "learning_rate": 2.7300000000000005e-06, "loss": 0.5411, "step": 273 }, { "epoch": 0.12, "grad_norm": 0.6954294556378668, "learning_rate": 2.7400000000000004e-06, "loss": 0.5599, "step": 274 }, { "epoch": 0.12, "grad_norm": 0.7307559316445182, "learning_rate": 2.7500000000000004e-06, "loss": 0.5546, "step": 275 }, { "epoch": 0.12, "grad_norm": 0.6817035611374646, "learning_rate": 2.7600000000000003e-06, "loss": 0.5252, "step": 276 }, { "epoch": 0.12, "grad_norm": 0.6749522949619997, "learning_rate": 2.7700000000000006e-06, "loss": 0.5329, "step": 277 }, { "epoch": 0.12, "grad_norm": 0.6885720885016431, "learning_rate": 2.7800000000000005e-06, "loss": 0.5437, "step": 278 }, { "epoch": 0.12, "grad_norm": 0.6847069171457737, "learning_rate": 2.7900000000000004e-06, "loss": 0.5298, "step": 279 }, { "epoch": 0.12, "grad_norm": 0.6540194496966271, "learning_rate": 2.8000000000000003e-06, "loss": 0.5129, "step": 280 }, { "epoch": 0.12, "grad_norm": 0.7232645465800646, "learning_rate": 2.8100000000000006e-06, "loss": 0.5727, "step": 281 }, { "epoch": 0.12, "grad_norm": 0.6686647596073149, "learning_rate": 2.82e-06, "loss": 0.5507, "step": 282 }, { "epoch": 0.12, "grad_norm": 0.6600278121030401, "learning_rate": 2.83e-06, "loss": 0.5413, "step": 283 }, { "epoch": 0.12, "grad_norm": 0.7295788704850122, "learning_rate": 2.84e-06, "loss": 0.5644, "step": 284 }, { "epoch": 0.12, "eval_loss": 0.5427850484848022, "eval_runtime": 6910.0669, "eval_samples_per_second": 42.024, "eval_steps_per_second": 2.101, "step": 284 }, { "epoch": 0.12, "grad_norm": 0.7172749652762687, "learning_rate": 2.85e-06, "loss": 0.5586, "step": 285 }, { "epoch": 0.12, "grad_norm": 0.7146040704416872, "learning_rate": 2.86e-06, "loss": 0.5468, "step": 286 }, { "epoch": 0.12, "grad_norm": 0.6773322028612333, "learning_rate": 2.87e-06, "loss": 0.5439, "step": 287 }, { "epoch": 0.12, "grad_norm": 0.6807935236750091, "learning_rate": 2.88e-06, "loss": 0.541, "step": 288 }, { "epoch": 0.12, "grad_norm": 0.7137692746953846, "learning_rate": 2.89e-06, "loss": 0.5299, "step": 289 }, { "epoch": 0.12, "grad_norm": 0.7079425743756034, "learning_rate": 2.9e-06, "loss": 0.5591, "step": 290 }, { "epoch": 0.12, "grad_norm": 0.6971956190090158, "learning_rate": 2.91e-06, "loss": 0.5515, "step": 291 }, { "epoch": 0.12, "grad_norm": 0.7203057497367688, "learning_rate": 2.92e-06, "loss": 0.5335, "step": 292 }, { "epoch": 0.13, "grad_norm": 0.6508573931612972, "learning_rate": 2.93e-06, "loss": 0.5242, "step": 293 }, { "epoch": 0.13, "grad_norm": 0.682319005640009, "learning_rate": 2.9400000000000002e-06, "loss": 0.544, "step": 294 }, { "epoch": 0.13, "grad_norm": 0.6467072142132468, "learning_rate": 2.95e-06, "loss": 0.5236, "step": 295 }, { "epoch": 0.13, "grad_norm": 0.6776154655530738, "learning_rate": 2.96e-06, "loss": 0.5271, "step": 296 }, { "epoch": 0.13, "grad_norm": 0.6938107980975571, "learning_rate": 2.97e-06, "loss": 0.5382, "step": 297 }, { "epoch": 0.13, "grad_norm": 0.721981231896848, "learning_rate": 2.9800000000000003e-06, "loss": 0.5911, "step": 298 }, { "epoch": 0.13, "grad_norm": 0.712390399983667, "learning_rate": 2.99e-06, "loss": 0.5546, "step": 299 }, { "epoch": 0.13, "grad_norm": 0.7104303392210323, "learning_rate": 3e-06, "loss": 0.5519, "step": 300 }, { "epoch": 0.13, "grad_norm": 0.6771404308712767, "learning_rate": 3.01e-06, "loss": 0.535, "step": 301 }, { "epoch": 0.13, "grad_norm": 0.6911636460013854, "learning_rate": 3.0200000000000003e-06, "loss": 0.5293, "step": 302 }, { "epoch": 0.13, "grad_norm": 0.6692540043016396, "learning_rate": 3.0300000000000002e-06, "loss": 0.542, "step": 303 }, { "epoch": 0.13, "grad_norm": 0.6744768013007634, "learning_rate": 3.04e-06, "loss": 0.515, "step": 304 }, { "epoch": 0.13, "grad_norm": 0.670911158940749, "learning_rate": 3.05e-06, "loss": 0.5163, "step": 305 }, { "epoch": 0.13, "grad_norm": 0.663474423719772, "learning_rate": 3.0600000000000003e-06, "loss": 0.5305, "step": 306 }, { "epoch": 0.13, "grad_norm": 0.6925074950155179, "learning_rate": 3.0700000000000003e-06, "loss": 0.5415, "step": 307 }, { "epoch": 0.13, "grad_norm": 0.6934197367843478, "learning_rate": 3.08e-06, "loss": 0.5525, "step": 308 }, { "epoch": 0.13, "grad_norm": 0.6661951817270549, "learning_rate": 3.09e-06, "loss": 0.5608, "step": 309 }, { "epoch": 0.13, "grad_norm": 0.6502761715692771, "learning_rate": 3.1000000000000004e-06, "loss": 0.5597, "step": 310 }, { "epoch": 0.13, "grad_norm": 0.644832829029986, "learning_rate": 3.1100000000000003e-06, "loss": 0.5438, "step": 311 }, { "epoch": 0.13, "grad_norm": 0.6264165348789397, "learning_rate": 3.12e-06, "loss": 0.5292, "step": 312 }, { "epoch": 0.13, "grad_norm": 0.7028440661397689, "learning_rate": 3.13e-06, "loss": 0.5475, "step": 313 }, { "epoch": 0.13, "grad_norm": 0.6530445345568231, "learning_rate": 3.1400000000000004e-06, "loss": 0.5567, "step": 314 }, { "epoch": 0.13, "grad_norm": 0.6365729580283129, "learning_rate": 3.1500000000000003e-06, "loss": 0.5375, "step": 315 }, { "epoch": 0.14, "grad_norm": 0.7033548602718471, "learning_rate": 3.1600000000000002e-06, "loss": 0.5691, "step": 316 }, { "epoch": 0.14, "grad_norm": 0.6757049797519596, "learning_rate": 3.17e-06, "loss": 0.5206, "step": 317 }, { "epoch": 0.14, "grad_norm": 0.6685908127465012, "learning_rate": 3.1800000000000005e-06, "loss": 0.5304, "step": 318 }, { "epoch": 0.14, "grad_norm": 0.6793556517907352, "learning_rate": 3.1900000000000004e-06, "loss": 0.5217, "step": 319 }, { "epoch": 0.14, "grad_norm": 0.6523324979691255, "learning_rate": 3.2000000000000003e-06, "loss": 0.501, "step": 320 }, { "epoch": 0.14, "grad_norm": 0.6443915969872007, "learning_rate": 3.21e-06, "loss": 0.5315, "step": 321 }, { "epoch": 0.14, "grad_norm": 0.6681792930023069, "learning_rate": 3.2200000000000005e-06, "loss": 0.5411, "step": 322 }, { "epoch": 0.14, "grad_norm": 0.6976917063523176, "learning_rate": 3.2300000000000004e-06, "loss": 0.5356, "step": 323 }, { "epoch": 0.14, "grad_norm": 0.6455932268651956, "learning_rate": 3.2400000000000003e-06, "loss": 0.5085, "step": 324 }, { "epoch": 0.14, "grad_norm": 0.7324457281506692, "learning_rate": 3.2500000000000002e-06, "loss": 0.5804, "step": 325 }, { "epoch": 0.14, "grad_norm": 0.6887957564513499, "learning_rate": 3.2600000000000006e-06, "loss": 0.5315, "step": 326 }, { "epoch": 0.14, "grad_norm": 0.6933538113778166, "learning_rate": 3.2700000000000005e-06, "loss": 0.529, "step": 327 }, { "epoch": 0.14, "grad_norm": 0.6968756135404809, "learning_rate": 3.2800000000000004e-06, "loss": 0.509, "step": 328 }, { "epoch": 0.14, "grad_norm": 0.6547487769661117, "learning_rate": 3.2900000000000003e-06, "loss": 0.5319, "step": 329 }, { "epoch": 0.14, "grad_norm": 0.6918958083567368, "learning_rate": 3.3000000000000006e-06, "loss": 0.5369, "step": 330 }, { "epoch": 0.14, "grad_norm": 0.7537331163722618, "learning_rate": 3.3100000000000005e-06, "loss": 0.5559, "step": 331 }, { "epoch": 0.14, "grad_norm": 0.6622519534641422, "learning_rate": 3.3200000000000004e-06, "loss": 0.5399, "step": 332 }, { "epoch": 0.14, "grad_norm": 0.7164771428347083, "learning_rate": 3.3300000000000003e-06, "loss": 0.5565, "step": 333 }, { "epoch": 0.14, "grad_norm": 0.7082376525584928, "learning_rate": 3.3400000000000006e-06, "loss": 0.5212, "step": 334 }, { "epoch": 0.14, "grad_norm": 0.6726486174520817, "learning_rate": 3.3500000000000005e-06, "loss": 0.5214, "step": 335 }, { "epoch": 0.14, "grad_norm": 0.6698599823643026, "learning_rate": 3.3600000000000004e-06, "loss": 0.5392, "step": 336 }, { "epoch": 0.14, "grad_norm": 0.7412977141620687, "learning_rate": 3.3700000000000003e-06, "loss": 0.5786, "step": 337 }, { "epoch": 0.14, "grad_norm": 0.7232716108771607, "learning_rate": 3.3800000000000007e-06, "loss": 0.5315, "step": 338 }, { "epoch": 0.14, "grad_norm": 0.6392581387125058, "learning_rate": 3.3900000000000006e-06, "loss": 0.5443, "step": 339 }, { "epoch": 0.15, "grad_norm": 0.709924249993068, "learning_rate": 3.4000000000000005e-06, "loss": 0.5518, "step": 340 }, { "epoch": 0.15, "grad_norm": 0.6805700381054698, "learning_rate": 3.4100000000000004e-06, "loss": 0.51, "step": 341 }, { "epoch": 0.15, "grad_norm": 0.6404690210430515, "learning_rate": 3.4200000000000007e-06, "loss": 0.5482, "step": 342 }, { "epoch": 0.15, "grad_norm": 0.7350376016254183, "learning_rate": 3.4300000000000006e-06, "loss": 0.4998, "step": 343 }, { "epoch": 0.15, "grad_norm": 0.692941753107107, "learning_rate": 3.44e-06, "loss": 0.5174, "step": 344 }, { "epoch": 0.15, "grad_norm": 0.6917522453335027, "learning_rate": 3.45e-06, "loss": 0.5252, "step": 345 }, { "epoch": 0.15, "grad_norm": 0.7698041778535101, "learning_rate": 3.46e-06, "loss": 0.5116, "step": 346 }, { "epoch": 0.15, "grad_norm": 0.6534415447218267, "learning_rate": 3.4700000000000002e-06, "loss": 0.5324, "step": 347 }, { "epoch": 0.15, "grad_norm": 0.6553279473118478, "learning_rate": 3.48e-06, "loss": 0.4979, "step": 348 }, { "epoch": 0.15, "grad_norm": 0.7260157286864319, "learning_rate": 3.49e-06, "loss": 0.5883, "step": 349 }, { "epoch": 0.15, "grad_norm": 0.6509209097177096, "learning_rate": 3.5e-06, "loss": 0.5532, "step": 350 }, { "epoch": 0.15, "grad_norm": 0.7431590152540184, "learning_rate": 3.5100000000000003e-06, "loss": 0.5651, "step": 351 }, { "epoch": 0.15, "grad_norm": 0.7154996648263839, "learning_rate": 3.52e-06, "loss": 0.5522, "step": 352 }, { "epoch": 0.15, "grad_norm": 0.6949365648624505, "learning_rate": 3.53e-06, "loss": 0.5648, "step": 353 }, { "epoch": 0.15, "grad_norm": 0.6507409922513757, "learning_rate": 3.54e-06, "loss": 0.5278, "step": 354 }, { "epoch": 0.15, "grad_norm": 0.7005428814595557, "learning_rate": 3.5500000000000003e-06, "loss": 0.5555, "step": 355 }, { "epoch": 0.15, "eval_loss": 0.5336291193962097, "eval_runtime": 6911.9871, "eval_samples_per_second": 42.012, "eval_steps_per_second": 2.101, "step": 355 }, { "epoch": 0.15, "grad_norm": 0.6932495434274139, "learning_rate": 3.5600000000000002e-06, "loss": 0.515, "step": 356 }, { "epoch": 0.15, "grad_norm": 0.6592137259570553, "learning_rate": 3.57e-06, "loss": 0.5593, "step": 357 }, { "epoch": 0.15, "grad_norm": 0.6621976521925507, "learning_rate": 3.58e-06, "loss": 0.5314, "step": 358 }, { "epoch": 0.15, "grad_norm": 0.6743898928954292, "learning_rate": 3.5900000000000004e-06, "loss": 0.5458, "step": 359 }, { "epoch": 0.15, "grad_norm": 0.6270336100237296, "learning_rate": 3.6000000000000003e-06, "loss": 0.5281, "step": 360 }, { "epoch": 0.15, "grad_norm": 0.718834892269395, "learning_rate": 3.61e-06, "loss": 0.54, "step": 361 }, { "epoch": 0.15, "grad_norm": 0.6096717034209563, "learning_rate": 3.62e-06, "loss": 0.5023, "step": 362 }, { "epoch": 0.16, "grad_norm": 0.6952055757879536, "learning_rate": 3.6300000000000004e-06, "loss": 0.5268, "step": 363 }, { "epoch": 0.16, "grad_norm": 0.6571216773223735, "learning_rate": 3.6400000000000003e-06, "loss": 0.5314, "step": 364 }, { "epoch": 0.16, "grad_norm": 0.644276415756237, "learning_rate": 3.65e-06, "loss": 0.5194, "step": 365 }, { "epoch": 0.16, "grad_norm": 1.298671233861733, "learning_rate": 3.66e-06, "loss": 0.5117, "step": 366 }, { "epoch": 0.16, "grad_norm": 0.7103634954229385, "learning_rate": 3.6700000000000004e-06, "loss": 0.5385, "step": 367 }, { "epoch": 0.16, "grad_norm": 0.6104474479946076, "learning_rate": 3.6800000000000003e-06, "loss": 0.4998, "step": 368 }, { "epoch": 0.16, "grad_norm": 0.6896768428677847, "learning_rate": 3.6900000000000002e-06, "loss": 0.533, "step": 369 }, { "epoch": 0.16, "grad_norm": 0.6413918813463226, "learning_rate": 3.7e-06, "loss": 0.5312, "step": 370 }, { "epoch": 0.16, "grad_norm": 0.7573960000478444, "learning_rate": 3.7100000000000005e-06, "loss": 0.5118, "step": 371 }, { "epoch": 0.16, "grad_norm": 0.6754673328160921, "learning_rate": 3.7200000000000004e-06, "loss": 0.531, "step": 372 }, { "epoch": 0.16, "grad_norm": 0.7380227752448969, "learning_rate": 3.7300000000000003e-06, "loss": 0.553, "step": 373 }, { "epoch": 0.16, "grad_norm": 0.7101835866153544, "learning_rate": 3.74e-06, "loss": 0.5356, "step": 374 }, { "epoch": 0.16, "grad_norm": 0.6854482179866157, "learning_rate": 3.7500000000000005e-06, "loss": 0.5241, "step": 375 }, { "epoch": 0.16, "grad_norm": 0.6626980381085176, "learning_rate": 3.7600000000000004e-06, "loss": 0.51, "step": 376 }, { "epoch": 0.16, "grad_norm": 0.6510834607877704, "learning_rate": 3.7700000000000003e-06, "loss": 0.5231, "step": 377 }, { "epoch": 0.16, "grad_norm": 0.6659115075290825, "learning_rate": 3.7800000000000002e-06, "loss": 0.5164, "step": 378 }, { "epoch": 0.16, "grad_norm": 0.6977709143095813, "learning_rate": 3.79e-06, "loss": 0.5529, "step": 379 }, { "epoch": 0.16, "grad_norm": 0.6429659300311611, "learning_rate": 3.8000000000000005e-06, "loss": 0.5177, "step": 380 }, { "epoch": 0.16, "grad_norm": 0.6780609817770847, "learning_rate": 3.8100000000000004e-06, "loss": 0.5017, "step": 381 }, { "epoch": 0.16, "grad_norm": 0.6312430732177394, "learning_rate": 3.820000000000001e-06, "loss": 0.5034, "step": 382 }, { "epoch": 0.16, "grad_norm": 0.6557950564147815, "learning_rate": 3.830000000000001e-06, "loss": 0.5297, "step": 383 }, { "epoch": 0.16, "grad_norm": 0.7189637600222241, "learning_rate": 3.8400000000000005e-06, "loss": 0.5396, "step": 384 }, { "epoch": 0.16, "grad_norm": 0.7535141909531238, "learning_rate": 3.85e-06, "loss": 0.5572, "step": 385 }, { "epoch": 0.16, "grad_norm": 0.6848030312310904, "learning_rate": 3.86e-06, "loss": 0.5135, "step": 386 }, { "epoch": 0.17, "grad_norm": 0.6605297036299905, "learning_rate": 3.87e-06, "loss": 0.5576, "step": 387 }, { "epoch": 0.17, "grad_norm": 0.6988172021818264, "learning_rate": 3.88e-06, "loss": 0.5281, "step": 388 }, { "epoch": 0.17, "grad_norm": 0.6511359929973235, "learning_rate": 3.89e-06, "loss": 0.5148, "step": 389 }, { "epoch": 0.17, "grad_norm": 0.74475476203548, "learning_rate": 3.900000000000001e-06, "loss": 0.5313, "step": 390 }, { "epoch": 0.17, "grad_norm": 0.6768085651878416, "learning_rate": 3.910000000000001e-06, "loss": 0.5289, "step": 391 }, { "epoch": 0.17, "grad_norm": 0.6590289960576611, "learning_rate": 3.920000000000001e-06, "loss": 0.526, "step": 392 }, { "epoch": 0.17, "grad_norm": 0.6294447240216576, "learning_rate": 3.9300000000000005e-06, "loss": 0.5066, "step": 393 }, { "epoch": 0.17, "grad_norm": 0.6795503573063048, "learning_rate": 3.94e-06, "loss": 0.5265, "step": 394 }, { "epoch": 0.17, "grad_norm": 0.6327421219484214, "learning_rate": 3.95e-06, "loss": 0.5262, "step": 395 }, { "epoch": 0.17, "grad_norm": 0.6702321778478743, "learning_rate": 3.96e-06, "loss": 0.5521, "step": 396 }, { "epoch": 0.17, "grad_norm": 0.6601601082950667, "learning_rate": 3.97e-06, "loss": 0.5511, "step": 397 }, { "epoch": 0.17, "grad_norm": 0.672312807049255, "learning_rate": 3.980000000000001e-06, "loss": 0.581, "step": 398 }, { "epoch": 0.17, "grad_norm": 0.70887535443515, "learning_rate": 3.990000000000001e-06, "loss": 0.5693, "step": 399 }, { "epoch": 0.17, "grad_norm": 0.674197659542202, "learning_rate": 4.000000000000001e-06, "loss": 0.5304, "step": 400 }, { "epoch": 0.17, "grad_norm": 0.6367460409014842, "learning_rate": 4.0100000000000006e-06, "loss": 0.5081, "step": 401 }, { "epoch": 0.17, "grad_norm": 0.6857813275083907, "learning_rate": 4.0200000000000005e-06, "loss": 0.4973, "step": 402 }, { "epoch": 0.17, "grad_norm": 0.6556472016511119, "learning_rate": 4.03e-06, "loss": 0.5389, "step": 403 }, { "epoch": 0.17, "grad_norm": 0.6985422078815098, "learning_rate": 4.04e-06, "loss": 0.5289, "step": 404 }, { "epoch": 0.17, "grad_norm": 0.6649519288148354, "learning_rate": 4.05e-06, "loss": 0.5379, "step": 405 }, { "epoch": 0.17, "grad_norm": 0.6263707838794045, "learning_rate": 4.060000000000001e-06, "loss": 0.5421, "step": 406 }, { "epoch": 0.17, "grad_norm": 0.7664906236471626, "learning_rate": 4.07e-06, "loss": 0.5535, "step": 407 }, { "epoch": 0.17, "grad_norm": 0.6553639230258015, "learning_rate": 4.08e-06, "loss": 0.5349, "step": 408 }, { "epoch": 0.17, "grad_norm": 0.6750627011135212, "learning_rate": 4.09e-06, "loss": 0.5453, "step": 409 }, { "epoch": 0.18, "grad_norm": 0.6577154298564163, "learning_rate": 4.1e-06, "loss": 0.5082, "step": 410 }, { "epoch": 0.18, "grad_norm": 0.6630263227345423, "learning_rate": 4.1100000000000005e-06, "loss": 0.5856, "step": 411 }, { "epoch": 0.18, "grad_norm": 0.6574426137463745, "learning_rate": 4.12e-06, "loss": 0.5384, "step": 412 }, { "epoch": 0.18, "grad_norm": 0.6988289040575354, "learning_rate": 4.13e-06, "loss": 0.5399, "step": 413 }, { "epoch": 0.18, "grad_norm": 0.6306882029774344, "learning_rate": 4.14e-06, "loss": 0.5336, "step": 414 }, { "epoch": 0.18, "grad_norm": 0.7179639205026864, "learning_rate": 4.15e-06, "loss": 0.518, "step": 415 }, { "epoch": 0.18, "grad_norm": 0.6516513551525797, "learning_rate": 4.16e-06, "loss": 0.5257, "step": 416 }, { "epoch": 0.18, "grad_norm": 0.6601218324564917, "learning_rate": 4.17e-06, "loss": 0.5289, "step": 417 }, { "epoch": 0.18, "grad_norm": 0.7211814488287548, "learning_rate": 4.18e-06, "loss": 0.5276, "step": 418 }, { "epoch": 0.18, "grad_norm": 0.6699099078689619, "learning_rate": 4.1900000000000005e-06, "loss": 0.5244, "step": 419 }, { "epoch": 0.18, "grad_norm": 0.6916838990628401, "learning_rate": 4.2000000000000004e-06, "loss": 0.5181, "step": 420 }, { "epoch": 0.18, "grad_norm": 0.6545058846876347, "learning_rate": 4.21e-06, "loss": 0.5475, "step": 421 }, { "epoch": 0.18, "grad_norm": 0.7236179142807477, "learning_rate": 4.22e-06, "loss": 0.5049, "step": 422 }, { "epoch": 0.18, "grad_norm": 0.6610815551752731, "learning_rate": 4.23e-06, "loss": 0.5463, "step": 423 }, { "epoch": 0.18, "grad_norm": 0.6821422605987203, "learning_rate": 4.24e-06, "loss": 0.5307, "step": 424 }, { "epoch": 0.18, "grad_norm": 0.6431222169441803, "learning_rate": 4.25e-06, "loss": 0.521, "step": 425 }, { "epoch": 0.18, "grad_norm": 0.6952795202176767, "learning_rate": 4.26e-06, "loss": 0.514, "step": 426 }, { "epoch": 0.18, "eval_loss": 0.5265827775001526, "eval_runtime": 6909.3691, "eval_samples_per_second": 42.028, "eval_steps_per_second": 2.101, "step": 426 }, { "epoch": 0.18, "grad_norm": 0.6134347482653176, "learning_rate": 4.270000000000001e-06, "loss": 0.5251, "step": 427 }, { "epoch": 0.18, "grad_norm": 0.7147159318510563, "learning_rate": 4.2800000000000005e-06, "loss": 0.5309, "step": 428 }, { "epoch": 0.18, "grad_norm": 0.7026933031500548, "learning_rate": 4.2900000000000004e-06, "loss": 0.5336, "step": 429 }, { "epoch": 0.18, "grad_norm": 0.7005957676093472, "learning_rate": 4.3e-06, "loss": 0.5518, "step": 430 }, { "epoch": 0.18, "grad_norm": 0.697347284426043, "learning_rate": 4.31e-06, "loss": 0.5222, "step": 431 }, { "epoch": 0.18, "grad_norm": 0.6618459969675489, "learning_rate": 4.32e-06, "loss": 0.5077, "step": 432 }, { "epoch": 0.18, "grad_norm": 0.6436931259589193, "learning_rate": 4.33e-06, "loss": 0.5155, "step": 433 }, { "epoch": 0.19, "grad_norm": 0.6589831621019552, "learning_rate": 4.34e-06, "loss": 0.5293, "step": 434 }, { "epoch": 0.19, "grad_norm": 0.6487932842171147, "learning_rate": 4.350000000000001e-06, "loss": 0.5234, "step": 435 }, { "epoch": 0.19, "grad_norm": 0.6388950573406754, "learning_rate": 4.360000000000001e-06, "loss": 0.5247, "step": 436 }, { "epoch": 0.19, "grad_norm": 0.6484480185597394, "learning_rate": 4.3700000000000005e-06, "loss": 0.5493, "step": 437 }, { "epoch": 0.19, "grad_norm": 0.6656552397893157, "learning_rate": 4.38e-06, "loss": 0.5122, "step": 438 }, { "epoch": 0.19, "grad_norm": 0.6100914831240443, "learning_rate": 4.39e-06, "loss": 0.5223, "step": 439 }, { "epoch": 0.19, "grad_norm": 0.6806948048082133, "learning_rate": 4.4e-06, "loss": 0.5366, "step": 440 }, { "epoch": 0.19, "grad_norm": 0.6792634934940536, "learning_rate": 4.41e-06, "loss": 0.5371, "step": 441 }, { "epoch": 0.19, "grad_norm": 0.6163101784317894, "learning_rate": 4.42e-06, "loss": 0.5001, "step": 442 }, { "epoch": 0.19, "grad_norm": 0.6711982512846896, "learning_rate": 4.430000000000001e-06, "loss": 0.5431, "step": 443 }, { "epoch": 0.19, "grad_norm": 0.6656044261856343, "learning_rate": 4.440000000000001e-06, "loss": 0.4961, "step": 444 }, { "epoch": 0.19, "grad_norm": 0.6090638329534794, "learning_rate": 4.450000000000001e-06, "loss": 0.5124, "step": 445 }, { "epoch": 0.19, "grad_norm": 0.6642264166031999, "learning_rate": 4.4600000000000005e-06, "loss": 0.5401, "step": 446 }, { "epoch": 0.19, "grad_norm": 0.6146557964679851, "learning_rate": 4.47e-06, "loss": 0.521, "step": 447 }, { "epoch": 0.19, "grad_norm": 0.6540634116911361, "learning_rate": 4.48e-06, "loss": 0.5472, "step": 448 }, { "epoch": 0.19, "grad_norm": 0.6394159843807653, "learning_rate": 4.49e-06, "loss": 0.4984, "step": 449 }, { "epoch": 0.19, "grad_norm": 0.6320717472264333, "learning_rate": 4.5e-06, "loss": 0.539, "step": 450 }, { "epoch": 0.19, "grad_norm": 0.6335986050650263, "learning_rate": 4.510000000000001e-06, "loss": 0.5279, "step": 451 }, { "epoch": 0.19, "grad_norm": 0.6238922538054539, "learning_rate": 4.520000000000001e-06, "loss": 0.5632, "step": 452 }, { "epoch": 0.19, "grad_norm": 0.6628688965370182, "learning_rate": 4.530000000000001e-06, "loss": 0.5417, "step": 453 }, { "epoch": 0.19, "grad_norm": 0.6559295875868244, "learning_rate": 4.540000000000001e-06, "loss": 0.5127, "step": 454 }, { "epoch": 0.19, "grad_norm": 0.6429887683654244, "learning_rate": 4.5500000000000005e-06, "loss": 0.5116, "step": 455 }, { "epoch": 0.19, "grad_norm": 0.6925428894756206, "learning_rate": 4.56e-06, "loss": 0.5264, "step": 456 }, { "epoch": 0.2, "grad_norm": 0.6604588640436396, "learning_rate": 4.57e-06, "loss": 0.5284, "step": 457 }, { "epoch": 0.2, "grad_norm": 0.6392985738009899, "learning_rate": 4.58e-06, "loss": 0.5188, "step": 458 }, { "epoch": 0.2, "grad_norm": 0.6765577462503485, "learning_rate": 4.590000000000001e-06, "loss": 0.514, "step": 459 }, { "epoch": 0.2, "grad_norm": 0.8495240761386383, "learning_rate": 4.600000000000001e-06, "loss": 0.5268, "step": 460 }, { "epoch": 0.2, "grad_norm": 0.6836958527754253, "learning_rate": 4.610000000000001e-06, "loss": 0.5305, "step": 461 }, { "epoch": 0.2, "grad_norm": 0.6502080642686838, "learning_rate": 4.620000000000001e-06, "loss": 0.5178, "step": 462 }, { "epoch": 0.2, "grad_norm": 0.6342249164223178, "learning_rate": 4.6300000000000006e-06, "loss": 0.5217, "step": 463 }, { "epoch": 0.2, "grad_norm": 0.6777938665629211, "learning_rate": 4.6400000000000005e-06, "loss": 0.534, "step": 464 }, { "epoch": 0.2, "grad_norm": 0.7062865741492855, "learning_rate": 4.65e-06, "loss": 0.5077, "step": 465 }, { "epoch": 0.2, "grad_norm": 0.6383144907337746, "learning_rate": 4.66e-06, "loss": 0.5134, "step": 466 }, { "epoch": 0.2, "grad_norm": 0.6730064655141996, "learning_rate": 4.670000000000001e-06, "loss": 0.552, "step": 467 }, { "epoch": 0.2, "grad_norm": 0.6394012481326041, "learning_rate": 4.680000000000001e-06, "loss": 0.4908, "step": 468 }, { "epoch": 0.2, "grad_norm": 0.7061191203371422, "learning_rate": 4.69e-06, "loss": 0.5354, "step": 469 }, { "epoch": 0.2, "grad_norm": 0.6289046687203929, "learning_rate": 4.7e-06, "loss": 0.524, "step": 470 }, { "epoch": 0.2, "grad_norm": 0.6880276584214321, "learning_rate": 4.71e-06, "loss": 0.5317, "step": 471 }, { "epoch": 0.2, "grad_norm": 0.6626736935804997, "learning_rate": 4.7200000000000005e-06, "loss": 0.5293, "step": 472 }, { "epoch": 0.2, "grad_norm": 0.6704211606772497, "learning_rate": 4.7300000000000005e-06, "loss": 0.5135, "step": 473 }, { "epoch": 0.2, "grad_norm": 0.6671310077095398, "learning_rate": 4.74e-06, "loss": 0.5256, "step": 474 }, { "epoch": 0.2, "grad_norm": 0.637683155549069, "learning_rate": 4.75e-06, "loss": 0.5113, "step": 475 }, { "epoch": 0.2, "grad_norm": 0.7243606567809928, "learning_rate": 4.76e-06, "loss": 0.5222, "step": 476 }, { "epoch": 0.2, "grad_norm": 0.6898226976279818, "learning_rate": 4.77e-06, "loss": 0.5434, "step": 477 }, { "epoch": 0.2, "grad_norm": 0.7102119430718769, "learning_rate": 4.78e-06, "loss": 0.5584, "step": 478 }, { "epoch": 0.2, "grad_norm": 0.6357099843026002, "learning_rate": 4.79e-06, "loss": 0.5377, "step": 479 }, { "epoch": 0.21, "grad_norm": 0.7078871919110719, "learning_rate": 4.800000000000001e-06, "loss": 0.51, "step": 480 }, { "epoch": 0.21, "grad_norm": 0.7004243288260213, "learning_rate": 4.8100000000000005e-06, "loss": 0.5098, "step": 481 }, { "epoch": 0.21, "grad_norm": 0.7218886636410997, "learning_rate": 4.8200000000000004e-06, "loss": 0.5473, "step": 482 }, { "epoch": 0.21, "grad_norm": 0.6908387236569687, "learning_rate": 4.83e-06, "loss": 0.5155, "step": 483 }, { "epoch": 0.21, "grad_norm": 0.7042724565499465, "learning_rate": 4.84e-06, "loss": 0.5152, "step": 484 }, { "epoch": 0.21, "grad_norm": 0.7088947897764216, "learning_rate": 4.85e-06, "loss": 0.5417, "step": 485 }, { "epoch": 0.21, "grad_norm": 0.6279155418925114, "learning_rate": 4.86e-06, "loss": 0.5349, "step": 486 }, { "epoch": 0.21, "grad_norm": 0.7207841117522383, "learning_rate": 4.87e-06, "loss": 0.5261, "step": 487 }, { "epoch": 0.21, "grad_norm": 0.6652618430741107, "learning_rate": 4.880000000000001e-06, "loss": 0.5355, "step": 488 }, { "epoch": 0.21, "grad_norm": 0.6296330144736813, "learning_rate": 4.890000000000001e-06, "loss": 0.5302, "step": 489 }, { "epoch": 0.21, "grad_norm": 0.6197754222224622, "learning_rate": 4.9000000000000005e-06, "loss": 0.4987, "step": 490 }, { "epoch": 0.21, "grad_norm": 0.642766778231823, "learning_rate": 4.9100000000000004e-06, "loss": 0.5247, "step": 491 }, { "epoch": 0.21, "grad_norm": 0.643792980820932, "learning_rate": 4.92e-06, "loss": 0.5347, "step": 492 }, { "epoch": 0.21, "grad_norm": 0.625452787303321, "learning_rate": 4.93e-06, "loss": 0.499, "step": 493 }, { "epoch": 0.21, "grad_norm": 0.5964088928926613, "learning_rate": 4.94e-06, "loss": 0.5407, "step": 494 }, { "epoch": 0.21, "grad_norm": 0.6500032443873983, "learning_rate": 4.95e-06, "loss": 0.5464, "step": 495 }, { "epoch": 0.21, "grad_norm": 0.6460792843444659, "learning_rate": 4.960000000000001e-06, "loss": 0.4949, "step": 496 }, { "epoch": 0.21, "grad_norm": 0.7395445647719444, "learning_rate": 4.970000000000001e-06, "loss": 0.5291, "step": 497 }, { "epoch": 0.21, "eval_loss": 0.5214883685112, "eval_runtime": 6920.669, "eval_samples_per_second": 41.96, "eval_steps_per_second": 2.098, "step": 497 }, { "epoch": 0.21, "grad_norm": 0.6537595312676089, "learning_rate": 4.980000000000001e-06, "loss": 0.5176, "step": 498 }, { "epoch": 0.21, "grad_norm": 0.6463304730768601, "learning_rate": 4.9900000000000005e-06, "loss": 0.5206, "step": 499 }, { "epoch": 0.21, "grad_norm": 0.6973321215012371, "learning_rate": 5e-06, "loss": 0.5404, "step": 500 }, { "epoch": 0.21, "grad_norm": 0.6779339517089825, "learning_rate": 4.999999293914693e-06, "loss": 0.5073, "step": 501 }, { "epoch": 0.21, "grad_norm": 0.6916291409636109, "learning_rate": 4.99999717565917e-06, "loss": 0.5277, "step": 502 }, { "epoch": 0.21, "grad_norm": 0.6019781233779008, "learning_rate": 4.999993645234629e-06, "loss": 0.488, "step": 503 }, { "epoch": 0.22, "grad_norm": 0.6767553900584968, "learning_rate": 4.999988702643063e-06, "loss": 0.5517, "step": 504 }, { "epoch": 0.22, "grad_norm": 0.6399171255962077, "learning_rate": 4.999982347887264e-06, "loss": 0.5363, "step": 505 }, { "epoch": 0.22, "grad_norm": 0.6957517244632938, "learning_rate": 4.999974580970822e-06, "loss": 0.5425, "step": 506 }, { "epoch": 0.22, "grad_norm": 0.6303242233074258, "learning_rate": 4.999965401898124e-06, "loss": 0.5359, "step": 507 }, { "epoch": 0.22, "grad_norm": 0.6857079786662916, "learning_rate": 4.999954810674355e-06, "loss": 0.4941, "step": 508 }, { "epoch": 0.22, "grad_norm": 0.6875532162149117, "learning_rate": 4.999942807305497e-06, "loss": 0.5345, "step": 509 }, { "epoch": 0.22, "grad_norm": 0.6275865235654622, "learning_rate": 4.9999293917983325e-06, "loss": 0.5099, "step": 510 }, { "epoch": 0.22, "grad_norm": 0.7718521188057177, "learning_rate": 4.999914564160437e-06, "loss": 0.5181, "step": 511 }, { "epoch": 0.22, "grad_norm": 0.6534743581262111, "learning_rate": 4.999898324400187e-06, "loss": 0.5126, "step": 512 }, { "epoch": 0.22, "grad_norm": 0.674911757416486, "learning_rate": 4.999880672526757e-06, "loss": 0.5368, "step": 513 }, { "epoch": 0.22, "grad_norm": 0.764101502643344, "learning_rate": 4.999861608550116e-06, "loss": 0.5105, "step": 514 }, { "epoch": 0.22, "grad_norm": 0.6607294039970933, "learning_rate": 4.999841132481035e-06, "loss": 0.5231, "step": 515 }, { "epoch": 0.22, "grad_norm": 0.7724967744040852, "learning_rate": 4.999819244331078e-06, "loss": 0.5489, "step": 516 }, { "epoch": 0.22, "grad_norm": 0.6730774325770473, "learning_rate": 4.99979594411261e-06, "loss": 0.5326, "step": 517 }, { "epoch": 0.22, "grad_norm": 0.7136063574644014, "learning_rate": 4.999771231838792e-06, "loss": 0.5144, "step": 518 }, { "epoch": 0.22, "grad_norm": 0.6477743762407223, "learning_rate": 4.999745107523583e-06, "loss": 0.5111, "step": 519 }, { "epoch": 0.22, "grad_norm": 0.6916139988086453, "learning_rate": 4.999717571181742e-06, "loss": 0.529, "step": 520 }, { "epoch": 0.22, "grad_norm": 0.6564109960699416, "learning_rate": 4.999688622828821e-06, "loss": 0.5101, "step": 521 }, { "epoch": 0.22, "grad_norm": 0.6610425029900775, "learning_rate": 4.999658262481173e-06, "loss": 0.5095, "step": 522 }, { "epoch": 0.22, "grad_norm": 0.6062090231312307, "learning_rate": 4.999626490155947e-06, "loss": 0.5072, "step": 523 }, { "epoch": 0.22, "grad_norm": 0.629980273760487, "learning_rate": 4.999593305871091e-06, "loss": 0.5382, "step": 524 }, { "epoch": 0.22, "grad_norm": 0.6597246057010635, "learning_rate": 4.999558709645349e-06, "loss": 0.5307, "step": 525 }, { "epoch": 0.22, "grad_norm": 0.636745442438904, "learning_rate": 4.999522701498263e-06, "loss": 0.5302, "step": 526 }, { "epoch": 0.23, "grad_norm": 0.6324462143762214, "learning_rate": 4.999485281450174e-06, "loss": 0.4996, "step": 527 }, { "epoch": 0.23, "grad_norm": 0.6069558558098991, "learning_rate": 4.99944644952222e-06, "loss": 0.5058, "step": 528 }, { "epoch": 0.23, "grad_norm": 0.6006305147544736, "learning_rate": 4.999406205736334e-06, "loss": 0.5218, "step": 529 }, { "epoch": 0.23, "grad_norm": 0.6354602393667919, "learning_rate": 4.9993645501152485e-06, "loss": 0.498, "step": 530 }, { "epoch": 0.23, "grad_norm": 0.6299647790639523, "learning_rate": 4.999321482682495e-06, "loss": 0.5435, "step": 531 }, { "epoch": 0.23, "grad_norm": 0.5702190944475204, "learning_rate": 4.9992770034624e-06, "loss": 0.5259, "step": 532 }, { "epoch": 0.23, "grad_norm": 0.6629005033715545, "learning_rate": 4.999231112480088e-06, "loss": 0.5247, "step": 533 }, { "epoch": 0.23, "grad_norm": 0.6161756194165398, "learning_rate": 4.999183809761481e-06, "loss": 0.5233, "step": 534 }, { "epoch": 0.23, "grad_norm": 0.6143755401649461, "learning_rate": 4.999135095333301e-06, "loss": 0.4983, "step": 535 }, { "epoch": 0.23, "grad_norm": 0.6374604849318725, "learning_rate": 4.999084969223064e-06, "loss": 0.5407, "step": 536 }, { "epoch": 0.23, "grad_norm": 0.6489680492749198, "learning_rate": 4.999033431459084e-06, "loss": 0.5216, "step": 537 }, { "epoch": 0.23, "grad_norm": 0.6128330475540421, "learning_rate": 4.998980482070473e-06, "loss": 0.4956, "step": 538 }, { "epoch": 0.23, "grad_norm": 0.6084501316124987, "learning_rate": 4.998926121087142e-06, "loss": 0.5105, "step": 539 }, { "epoch": 0.23, "grad_norm": 0.6718312705515297, "learning_rate": 4.998870348539797e-06, "loss": 0.4978, "step": 540 }, { "epoch": 0.23, "grad_norm": 0.5917495579968527, "learning_rate": 4.998813164459942e-06, "loss": 0.5005, "step": 541 }, { "epoch": 0.23, "grad_norm": 0.7002888012401992, "learning_rate": 4.9987545688798765e-06, "loss": 0.5158, "step": 542 }, { "epoch": 0.23, "grad_norm": 0.6215562316702827, "learning_rate": 4.998694561832703e-06, "loss": 0.5063, "step": 543 }, { "epoch": 0.23, "grad_norm": 0.6463810547164883, "learning_rate": 4.998633143352315e-06, "loss": 0.5216, "step": 544 }, { "epoch": 0.23, "grad_norm": 0.6607126999162819, "learning_rate": 4.998570313473408e-06, "loss": 0.5176, "step": 545 }, { "epoch": 0.23, "grad_norm": 0.6896680501018091, "learning_rate": 4.998506072231469e-06, "loss": 0.5142, "step": 546 }, { "epoch": 0.23, "grad_norm": 0.64482281681983, "learning_rate": 4.99844041966279e-06, "loss": 0.5223, "step": 547 }, { "epoch": 0.23, "grad_norm": 0.6804316388777557, "learning_rate": 4.998373355804454e-06, "loss": 0.512, "step": 548 }, { "epoch": 0.23, "grad_norm": 0.6799252023969709, "learning_rate": 4.998304880694342e-06, "loss": 0.4962, "step": 549 }, { "epoch": 0.23, "grad_norm": 0.637365085524163, "learning_rate": 4.998234994371135e-06, "loss": 0.5234, "step": 550 }, { "epoch": 0.24, "grad_norm": 0.7036436945946638, "learning_rate": 4.99816369687431e-06, "loss": 0.5141, "step": 551 }, { "epoch": 0.24, "grad_norm": 0.6240264386156906, "learning_rate": 4.99809098824414e-06, "loss": 0.5365, "step": 552 }, { "epoch": 0.24, "grad_norm": 0.7403699319338876, "learning_rate": 4.998016868521695e-06, "loss": 0.5326, "step": 553 }, { "epoch": 0.24, "grad_norm": 0.6543848290779426, "learning_rate": 4.997941337748845e-06, "loss": 0.5146, "step": 554 }, { "epoch": 0.24, "grad_norm": 0.7042287590734285, "learning_rate": 4.997864395968252e-06, "loss": 0.5259, "step": 555 }, { "epoch": 0.24, "grad_norm": 0.6574049565188761, "learning_rate": 4.997786043223381e-06, "loss": 0.5137, "step": 556 }, { "epoch": 0.24, "grad_norm": 0.6165676577046679, "learning_rate": 4.9977062795584895e-06, "loss": 0.5473, "step": 557 }, { "epoch": 0.24, "grad_norm": 0.7044566948417562, "learning_rate": 4.997625105018634e-06, "loss": 0.5067, "step": 558 }, { "epoch": 0.24, "grad_norm": 0.6447891576595195, "learning_rate": 4.9975425196496656e-06, "loss": 0.5117, "step": 559 }, { "epoch": 0.24, "grad_norm": 0.6083915401749171, "learning_rate": 4.997458523498236e-06, "loss": 0.4752, "step": 560 }, { "epoch": 0.24, "grad_norm": 0.6990442708117952, "learning_rate": 4.997373116611792e-06, "loss": 0.5145, "step": 561 }, { "epoch": 0.24, "grad_norm": 0.6792107396650091, "learning_rate": 4.997286299038576e-06, "loss": 0.5143, "step": 562 }, { "epoch": 0.24, "grad_norm": 0.6274185410270217, "learning_rate": 4.997198070827629e-06, "loss": 0.5079, "step": 563 }, { "epoch": 0.24, "grad_norm": 0.7317503385718371, "learning_rate": 4.99710843202879e-06, "loss": 0.5058, "step": 564 }, { "epoch": 0.24, "grad_norm": 0.6602507519356526, "learning_rate": 4.99701738269269e-06, "loss": 0.5269, "step": 565 }, { "epoch": 0.24, "grad_norm": 0.7158785502037818, "learning_rate": 4.9969249228707625e-06, "loss": 0.5449, "step": 566 }, { "epoch": 0.24, "grad_norm": 0.6678036728294128, "learning_rate": 4.996831052615234e-06, "loss": 0.4996, "step": 567 }, { "epoch": 0.24, "grad_norm": 0.6635048447906537, "learning_rate": 4.996735771979129e-06, "loss": 0.5117, "step": 568 }, { "epoch": 0.24, "eval_loss": 0.517206609249115, "eval_runtime": 6917.186, "eval_samples_per_second": 41.981, "eval_steps_per_second": 2.099, "step": 568 }, { "epoch": 0.24, "grad_norm": 0.6631113641261142, "learning_rate": 4.996639081016268e-06, "loss": 0.5103, "step": 569 }, { "epoch": 0.24, "grad_norm": 0.6279444558937123, "learning_rate": 4.996540979781269e-06, "loss": 0.5397, "step": 570 }, { "epoch": 0.24, "grad_norm": 0.6876493931054276, "learning_rate": 4.996441468329547e-06, "loss": 0.521, "step": 571 }, { "epoch": 0.24, "grad_norm": 0.6664423383196236, "learning_rate": 4.996340546717312e-06, "loss": 0.5449, "step": 572 }, { "epoch": 0.24, "grad_norm": 0.6101029466977869, "learning_rate": 4.996238215001571e-06, "loss": 0.5443, "step": 573 }, { "epoch": 0.25, "grad_norm": 0.6622002597747987, "learning_rate": 4.99613447324013e-06, "loss": 0.5245, "step": 574 }, { "epoch": 0.25, "grad_norm": 0.60547382309911, "learning_rate": 4.996029321491587e-06, "loss": 0.5101, "step": 575 }, { "epoch": 0.25, "grad_norm": 0.6685376758007866, "learning_rate": 4.9959227598153395e-06, "loss": 0.5226, "step": 576 }, { "epoch": 0.25, "grad_norm": 0.6325127114162676, "learning_rate": 4.995814788271582e-06, "loss": 0.525, "step": 577 }, { "epoch": 0.25, "grad_norm": 0.6462009207619525, "learning_rate": 4.995705406921303e-06, "loss": 0.5181, "step": 578 }, { "epoch": 0.25, "grad_norm": 0.6514032258227322, "learning_rate": 4.995594615826289e-06, "loss": 0.5225, "step": 579 }, { "epoch": 0.25, "grad_norm": 0.6013786935845273, "learning_rate": 4.995482415049123e-06, "loss": 0.5105, "step": 580 }, { "epoch": 0.25, "grad_norm": 0.6722327524821109, "learning_rate": 4.995368804653182e-06, "loss": 0.5088, "step": 581 }, { "epoch": 0.25, "grad_norm": 0.6486288344146888, "learning_rate": 4.995253784702643e-06, "loss": 0.4924, "step": 582 }, { "epoch": 0.25, "grad_norm": 0.609052561499619, "learning_rate": 4.995137355262475e-06, "loss": 0.5091, "step": 583 }, { "epoch": 0.25, "grad_norm": 0.6149869407753081, "learning_rate": 4.995019516398447e-06, "loss": 0.522, "step": 584 }, { "epoch": 0.25, "grad_norm": 0.649711066334699, "learning_rate": 4.994900268177121e-06, "loss": 0.5067, "step": 585 }, { "epoch": 0.25, "grad_norm": 0.6492055338072015, "learning_rate": 4.994779610665858e-06, "loss": 0.5531, "step": 586 }, { "epoch": 0.25, "grad_norm": 0.6055259963194052, "learning_rate": 4.9946575439328124e-06, "loss": 0.4979, "step": 587 }, { "epoch": 0.25, "grad_norm": 0.5966649511641912, "learning_rate": 4.994534068046936e-06, "loss": 0.511, "step": 588 }, { "epoch": 0.25, "grad_norm": 0.6406292315887588, "learning_rate": 4.994409183077979e-06, "loss": 0.5423, "step": 589 }, { "epoch": 0.25, "grad_norm": 0.6273737605531128, "learning_rate": 4.99428288909648e-06, "loss": 0.5, "step": 590 }, { "epoch": 0.25, "grad_norm": 0.6106470129106254, "learning_rate": 4.994155186173782e-06, "loss": 0.5107, "step": 591 }, { "epoch": 0.25, "grad_norm": 0.6047596492090179, "learning_rate": 4.994026074382019e-06, "loss": 0.5267, "step": 592 }, { "epoch": 0.25, "grad_norm": 0.5986927472135537, "learning_rate": 4.993895553794123e-06, "loss": 0.5025, "step": 593 }, { "epoch": 0.25, "grad_norm": 0.6429583180993461, "learning_rate": 4.993763624483821e-06, "loss": 0.5158, "step": 594 }, { "epoch": 0.25, "grad_norm": 0.5881062972343085, "learning_rate": 4.993630286525634e-06, "loss": 0.5061, "step": 595 }, { "epoch": 0.25, "grad_norm": 0.6368950879373987, "learning_rate": 4.993495539994882e-06, "loss": 0.5203, "step": 596 }, { "epoch": 0.26, "grad_norm": 0.6716881530561206, "learning_rate": 4.99335938496768e-06, "loss": 0.5228, "step": 597 }, { "epoch": 0.26, "grad_norm": 0.6023179095017582, "learning_rate": 4.993221821520935e-06, "loss": 0.5349, "step": 598 }, { "epoch": 0.26, "grad_norm": 0.6118755989606954, "learning_rate": 4.993082849732353e-06, "loss": 0.5071, "step": 599 }, { "epoch": 0.26, "grad_norm": 0.6160351740855943, "learning_rate": 4.992942469680437e-06, "loss": 0.4796, "step": 600 }, { "epoch": 0.26, "grad_norm": 0.6291124034547457, "learning_rate": 4.99280068144448e-06, "loss": 0.5545, "step": 601 }, { "epoch": 0.26, "grad_norm": 0.6534616170585308, "learning_rate": 4.992657485104575e-06, "loss": 0.5358, "step": 602 }, { "epoch": 0.26, "grad_norm": 0.6264541141447443, "learning_rate": 4.99251288074161e-06, "loss": 0.5292, "step": 603 }, { "epoch": 0.26, "grad_norm": 0.6329545583821921, "learning_rate": 4.992366868437266e-06, "loss": 0.5032, "step": 604 }, { "epoch": 0.26, "grad_norm": 0.6262380789197445, "learning_rate": 4.992219448274022e-06, "loss": 0.5178, "step": 605 }, { "epoch": 0.26, "grad_norm": 0.6003543172945967, "learning_rate": 4.9920706203351495e-06, "loss": 0.5037, "step": 606 }, { "epoch": 0.26, "grad_norm": 0.6614794995436638, "learning_rate": 4.9919203847047185e-06, "loss": 0.5391, "step": 607 }, { "epoch": 0.26, "grad_norm": 0.6276911443763706, "learning_rate": 4.99176874146759e-06, "loss": 0.4837, "step": 608 }, { "epoch": 0.26, "grad_norm": 0.6140831194461105, "learning_rate": 4.9916156907094246e-06, "loss": 0.4944, "step": 609 }, { "epoch": 0.26, "grad_norm": 0.6520299553825838, "learning_rate": 4.991461232516675e-06, "loss": 0.4984, "step": 610 }, { "epoch": 0.26, "grad_norm": 0.6785930087303712, "learning_rate": 4.99130536697659e-06, "loss": 0.5289, "step": 611 }, { "epoch": 0.26, "grad_norm": 0.6751246335138484, "learning_rate": 4.991148094177212e-06, "loss": 0.517, "step": 612 }, { "epoch": 0.26, "grad_norm": 0.7063631237307217, "learning_rate": 4.990989414207381e-06, "loss": 0.4747, "step": 613 }, { "epoch": 0.26, "grad_norm": 0.633915838850146, "learning_rate": 4.990829327156729e-06, "loss": 0.5182, "step": 614 }, { "epoch": 0.26, "grad_norm": 0.6346239919506149, "learning_rate": 4.990667833115684e-06, "loss": 0.4991, "step": 615 }, { "epoch": 0.26, "grad_norm": 0.6922170062238953, "learning_rate": 4.990504932175471e-06, "loss": 0.4997, "step": 616 }, { "epoch": 0.26, "grad_norm": 0.6497250780582481, "learning_rate": 4.990340624428105e-06, "loss": 0.5025, "step": 617 }, { "epoch": 0.26, "grad_norm": 0.6480587404624075, "learning_rate": 4.990174909966399e-06, "loss": 0.5164, "step": 618 }, { "epoch": 0.26, "grad_norm": 0.5647887869895122, "learning_rate": 4.9900077888839606e-06, "loss": 0.4974, "step": 619 }, { "epoch": 0.26, "grad_norm": 0.6157405333654503, "learning_rate": 4.989839261275191e-06, "loss": 0.5119, "step": 620 }, { "epoch": 0.27, "grad_norm": 0.6350389596733836, "learning_rate": 4.989669327235285e-06, "loss": 0.5252, "step": 621 }, { "epoch": 0.27, "grad_norm": 0.584442178782563, "learning_rate": 4.989497986860234e-06, "loss": 0.5043, "step": 622 }, { "epoch": 0.27, "grad_norm": 0.6196796070950197, "learning_rate": 4.989325240246823e-06, "loss": 0.5176, "step": 623 }, { "epoch": 0.27, "grad_norm": 0.6015855990105409, "learning_rate": 4.98915108749263e-06, "loss": 0.5065, "step": 624 }, { "epoch": 0.27, "grad_norm": 0.5944351660408316, "learning_rate": 4.988975528696028e-06, "loss": 0.5027, "step": 625 }, { "epoch": 0.27, "grad_norm": 0.5799955435183154, "learning_rate": 4.988798563956186e-06, "loss": 0.4897, "step": 626 }, { "epoch": 0.27, "grad_norm": 0.6252189457639663, "learning_rate": 4.988620193373066e-06, "loss": 0.5185, "step": 627 }, { "epoch": 0.27, "grad_norm": 0.6091789072393431, "learning_rate": 4.988440417047424e-06, "loss": 0.5364, "step": 628 }, { "epoch": 0.27, "grad_norm": 0.5991774516054336, "learning_rate": 4.988259235080807e-06, "loss": 0.5218, "step": 629 }, { "epoch": 0.27, "grad_norm": 0.6004079133450607, "learning_rate": 4.988076647575562e-06, "loss": 0.5346, "step": 630 }, { "epoch": 0.27, "grad_norm": 0.6055525535102732, "learning_rate": 4.987892654634825e-06, "loss": 0.4794, "step": 631 }, { "epoch": 0.27, "grad_norm": 0.6231744592672184, "learning_rate": 4.987707256362529e-06, "loss": 0.5332, "step": 632 }, { "epoch": 0.27, "grad_norm": 0.5943003024735096, "learning_rate": 4.9875204528633995e-06, "loss": 0.4958, "step": 633 }, { "epoch": 0.27, "grad_norm": 0.6429217088606017, "learning_rate": 4.987332244242955e-06, "loss": 0.5049, "step": 634 }, { "epoch": 0.27, "grad_norm": 0.6474023001802436, "learning_rate": 4.98714263060751e-06, "loss": 0.4769, "step": 635 }, { "epoch": 0.27, "grad_norm": 0.6404945389649939, "learning_rate": 4.9869516120641705e-06, "loss": 0.4947, "step": 636 }, { "epoch": 0.27, "grad_norm": 0.6640610934148168, "learning_rate": 4.986759188720836e-06, "loss": 0.5258, "step": 637 }, { "epoch": 0.27, "grad_norm": 0.6232010288109409, "learning_rate": 4.986565360686201e-06, "loss": 0.5189, "step": 638 }, { "epoch": 0.27, "grad_norm": 0.6143458681147979, "learning_rate": 4.9863701280697535e-06, "loss": 0.4986, "step": 639 }, { "epoch": 0.27, "eval_loss": 0.5126649141311646, "eval_runtime": 6910.7711, "eval_samples_per_second": 42.02, "eval_steps_per_second": 2.101, "step": 639 }, { "epoch": 0.27, "grad_norm": 0.6231312288980349, "learning_rate": 4.986173490981773e-06, "loss": 0.5218, "step": 640 }, { "epoch": 0.27, "grad_norm": 0.6521330675888304, "learning_rate": 4.985975449533335e-06, "loss": 0.4984, "step": 641 }, { "epoch": 0.27, "grad_norm": 0.6192181654765082, "learning_rate": 4.9857760038363045e-06, "loss": 0.5257, "step": 642 }, { "epoch": 0.27, "grad_norm": 0.6486948358491045, "learning_rate": 4.9855751540033446e-06, "loss": 0.5147, "step": 643 }, { "epoch": 0.28, "grad_norm": 0.5969342371638264, "learning_rate": 4.985372900147907e-06, "loss": 0.5185, "step": 644 }, { "epoch": 0.28, "grad_norm": 0.6320704370838177, "learning_rate": 4.9851692423842406e-06, "loss": 0.5383, "step": 645 }, { "epoch": 0.28, "grad_norm": 0.5966997098167783, "learning_rate": 4.984964180827383e-06, "loss": 0.5044, "step": 646 }, { "epoch": 0.28, "grad_norm": 0.6109011858617819, "learning_rate": 4.984757715593168e-06, "loss": 0.5091, "step": 647 }, { "epoch": 0.28, "grad_norm": 0.653805713393856, "learning_rate": 4.984549846798221e-06, "loss": 0.5161, "step": 648 }, { "epoch": 0.28, "grad_norm": 0.6199215842335395, "learning_rate": 4.984340574559961e-06, "loss": 0.5152, "step": 649 }, { "epoch": 0.28, "grad_norm": 0.6068098393695217, "learning_rate": 4.984129898996599e-06, "loss": 0.5209, "step": 650 }, { "epoch": 0.28, "grad_norm": 0.5796411094349677, "learning_rate": 4.9839178202271375e-06, "loss": 0.502, "step": 651 }, { "epoch": 0.28, "grad_norm": 0.6053468178000233, "learning_rate": 4.983704338371375e-06, "loss": 0.5283, "step": 652 }, { "epoch": 0.28, "grad_norm": 0.6156634669849913, "learning_rate": 4.983489453549901e-06, "loss": 0.4947, "step": 653 }, { "epoch": 0.28, "grad_norm": 0.6241545798810763, "learning_rate": 4.983273165884096e-06, "loss": 0.5176, "step": 654 }, { "epoch": 0.28, "grad_norm": 0.5895664409761231, "learning_rate": 4.983055475496134e-06, "loss": 0.5105, "step": 655 }, { "epoch": 0.28, "grad_norm": 0.5935426249245771, "learning_rate": 4.982836382508981e-06, "loss": 0.5, "step": 656 }, { "epoch": 0.28, "grad_norm": 0.6233645788446041, "learning_rate": 4.9826158870463955e-06, "loss": 0.5099, "step": 657 }, { "epoch": 0.28, "grad_norm": 0.5991332439052809, "learning_rate": 4.982393989232931e-06, "loss": 0.4924, "step": 658 }, { "epoch": 0.28, "grad_norm": 0.5880708311480832, "learning_rate": 4.982170689193927e-06, "loss": 0.4978, "step": 659 }, { "epoch": 0.28, "grad_norm": 0.627672707017739, "learning_rate": 4.981945987055521e-06, "loss": 0.5077, "step": 660 }, { "epoch": 0.28, "grad_norm": 0.6040524425886902, "learning_rate": 4.981719882944639e-06, "loss": 0.493, "step": 661 }, { "epoch": 0.28, "grad_norm": 0.5875699138537736, "learning_rate": 4.981492376989001e-06, "loss": 0.506, "step": 662 }, { "epoch": 0.28, "grad_norm": 0.6071888999132754, "learning_rate": 4.981263469317116e-06, "loss": 0.4954, "step": 663 }, { "epoch": 0.28, "grad_norm": 0.6343036259474204, "learning_rate": 4.981033160058289e-06, "loss": 0.5302, "step": 664 }, { "epoch": 0.28, "grad_norm": 0.6270477353053345, "learning_rate": 4.9808014493426124e-06, "loss": 0.4676, "step": 665 }, { "epoch": 0.28, "grad_norm": 0.5957359905347063, "learning_rate": 4.9805683373009746e-06, "loss": 0.5108, "step": 666 }, { "epoch": 0.28, "grad_norm": 0.6267707786737462, "learning_rate": 4.980333824065051e-06, "loss": 0.5293, "step": 667 }, { "epoch": 0.29, "grad_norm": 0.577921156213165, "learning_rate": 4.980097909767311e-06, "loss": 0.5115, "step": 668 }, { "epoch": 0.29, "grad_norm": 0.6306888663442499, "learning_rate": 4.9798605945410156e-06, "loss": 0.5123, "step": 669 }, { "epoch": 0.29, "grad_norm": 0.5963346695333823, "learning_rate": 4.979621878520217e-06, "loss": 0.5026, "step": 670 }, { "epoch": 0.29, "grad_norm": 0.590690163604036, "learning_rate": 4.979381761839757e-06, "loss": 0.503, "step": 671 }, { "epoch": 0.29, "grad_norm": 0.6410334373270873, "learning_rate": 4.979140244635271e-06, "loss": 0.5116, "step": 672 }, { "epoch": 0.29, "grad_norm": 0.6105251339526676, "learning_rate": 4.978897327043185e-06, "loss": 0.503, "step": 673 }, { "epoch": 0.29, "grad_norm": 0.611118980970183, "learning_rate": 4.978653009200713e-06, "loss": 0.5005, "step": 674 }, { "epoch": 0.29, "grad_norm": 0.61089260561556, "learning_rate": 4.978407291245866e-06, "loss": 0.4745, "step": 675 }, { "epoch": 0.29, "grad_norm": 0.6296501736035641, "learning_rate": 4.978160173317439e-06, "loss": 0.5413, "step": 676 }, { "epoch": 0.29, "grad_norm": 0.5754436830327142, "learning_rate": 4.977911655555022e-06, "loss": 0.5258, "step": 677 }, { "epoch": 0.29, "grad_norm": 0.6080486185142544, "learning_rate": 4.977661738098996e-06, "loss": 0.5022, "step": 678 }, { "epoch": 0.29, "grad_norm": 0.6095469549060126, "learning_rate": 4.97741042109053e-06, "loss": 0.5236, "step": 679 }, { "epoch": 0.29, "grad_norm": 0.6246514201425122, "learning_rate": 4.977157704671585e-06, "loss": 0.4948, "step": 680 }, { "epoch": 0.29, "grad_norm": 0.6043169735120094, "learning_rate": 4.976903588984913e-06, "loss": 0.5174, "step": 681 }, { "epoch": 0.29, "grad_norm": 0.5782841931005864, "learning_rate": 4.976648074174056e-06, "loss": 0.4935, "step": 682 }, { "epoch": 0.29, "grad_norm": 0.6172869080905491, "learning_rate": 4.976391160383347e-06, "loss": 0.5068, "step": 683 }, { "epoch": 0.29, "grad_norm": 0.5986800192103562, "learning_rate": 4.976132847757906e-06, "loss": 0.5293, "step": 684 }, { "epoch": 0.29, "grad_norm": 0.5997059239072083, "learning_rate": 4.975873136443649e-06, "loss": 0.5311, "step": 685 }, { "epoch": 0.29, "grad_norm": 0.586987259487267, "learning_rate": 4.9756120265872755e-06, "loss": 0.513, "step": 686 }, { "epoch": 0.29, "grad_norm": 0.6278820584072181, "learning_rate": 4.97534951833628e-06, "loss": 0.5278, "step": 687 }, { "epoch": 0.29, "grad_norm": 0.5792346089546118, "learning_rate": 4.975085611838944e-06, "loss": 0.5048, "step": 688 }, { "epoch": 0.29, "grad_norm": 0.5926608888246347, "learning_rate": 4.974820307244341e-06, "loss": 0.4979, "step": 689 }, { "epoch": 0.29, "grad_norm": 0.5905921659974432, "learning_rate": 4.974553604702332e-06, "loss": 0.4977, "step": 690 }, { "epoch": 0.3, "grad_norm": 0.6604840701536976, "learning_rate": 4.974285504363569e-06, "loss": 0.504, "step": 691 }, { "epoch": 0.3, "grad_norm": 0.6243772158616288, "learning_rate": 4.974016006379495e-06, "loss": 0.5288, "step": 692 }, { "epoch": 0.3, "grad_norm": 0.6240076780093031, "learning_rate": 4.973745110902339e-06, "loss": 0.5189, "step": 693 }, { "epoch": 0.3, "grad_norm": 0.6107829102505847, "learning_rate": 4.973472818085122e-06, "loss": 0.5359, "step": 694 }, { "epoch": 0.3, "grad_norm": 0.6345275208943483, "learning_rate": 4.9731991280816534e-06, "loss": 0.5096, "step": 695 }, { "epoch": 0.3, "grad_norm": 0.6136833545759933, "learning_rate": 4.9729240410465315e-06, "loss": 0.5048, "step": 696 }, { "epoch": 0.3, "grad_norm": 0.6405628475705748, "learning_rate": 4.972647557135146e-06, "loss": 0.5234, "step": 697 }, { "epoch": 0.3, "grad_norm": 0.5807229777227344, "learning_rate": 4.972369676503672e-06, "loss": 0.516, "step": 698 }, { "epoch": 0.3, "grad_norm": 0.6808588571301922, "learning_rate": 4.972090399309075e-06, "loss": 0.4935, "step": 699 }, { "epoch": 0.3, "grad_norm": 0.598882478841374, "learning_rate": 4.971809725709112e-06, "loss": 0.5167, "step": 700 }, { "epoch": 0.3, "grad_norm": 0.6597647873322787, "learning_rate": 4.971527655862325e-06, "loss": 0.4986, "step": 701 }, { "epoch": 0.3, "grad_norm": 0.6205471007307612, "learning_rate": 4.9712441899280475e-06, "loss": 0.481, "step": 702 }, { "epoch": 0.3, "grad_norm": 0.6501750674785319, "learning_rate": 4.970959328066399e-06, "loss": 0.5141, "step": 703 }, { "epoch": 0.3, "grad_norm": 0.6389770637630756, "learning_rate": 4.97067307043829e-06, "loss": 0.5045, "step": 704 }, { "epoch": 0.3, "grad_norm": 0.6071448493335176, "learning_rate": 4.970385417205418e-06, "loss": 0.5231, "step": 705 }, { "epoch": 0.3, "grad_norm": 0.6307400799413926, "learning_rate": 4.9700963685302685e-06, "loss": 0.5132, "step": 706 }, { "epoch": 0.3, "grad_norm": 0.6345555266447565, "learning_rate": 4.969805924576116e-06, "loss": 0.5165, "step": 707 }, { "epoch": 0.3, "grad_norm": 0.5992064057317342, "learning_rate": 4.969514085507025e-06, "loss": 0.4933, "step": 708 }, { "epoch": 0.3, "grad_norm": 0.6327334322592149, "learning_rate": 4.9692208514878445e-06, "loss": 0.4959, "step": 709 }, { "epoch": 0.3, "grad_norm": 0.6250597389392478, "learning_rate": 4.968926222684213e-06, "loss": 0.5097, "step": 710 }, { "epoch": 0.3, "eval_loss": 0.5087815523147583, "eval_runtime": 6910.3283, "eval_samples_per_second": 42.022, "eval_steps_per_second": 2.101, "step": 710 }, { "epoch": 0.3, "grad_norm": 0.6069692793366778, "learning_rate": 4.9686301992625575e-06, "loss": 0.4892, "step": 711 }, { "epoch": 0.3, "grad_norm": 0.6516138876780239, "learning_rate": 4.968332781390092e-06, "loss": 0.5303, "step": 712 }, { "epoch": 0.3, "grad_norm": 0.6143846915255134, "learning_rate": 4.968033969234818e-06, "loss": 0.4919, "step": 713 }, { "epoch": 0.31, "grad_norm": 0.5981365954582023, "learning_rate": 4.967733762965526e-06, "loss": 0.5053, "step": 714 }, { "epoch": 0.31, "grad_norm": 0.6237699240689051, "learning_rate": 4.967432162751792e-06, "loss": 0.4727, "step": 715 }, { "epoch": 0.31, "grad_norm": 0.666141553803713, "learning_rate": 4.967129168763981e-06, "loss": 0.4991, "step": 716 }, { "epoch": 0.31, "grad_norm": 0.646564036075848, "learning_rate": 4.966824781173245e-06, "loss": 0.5251, "step": 717 }, { "epoch": 0.31, "grad_norm": 0.6401865563361007, "learning_rate": 4.966519000151522e-06, "loss": 0.4882, "step": 718 }, { "epoch": 0.31, "grad_norm": 0.6532940949381087, "learning_rate": 4.966211825871538e-06, "loss": 0.4859, "step": 719 }, { "epoch": 0.31, "grad_norm": 0.6240667658033411, "learning_rate": 4.965903258506806e-06, "loss": 0.5165, "step": 720 }, { "epoch": 0.31, "grad_norm": 0.6050182788187826, "learning_rate": 4.965593298231627e-06, "loss": 0.4972, "step": 721 }, { "epoch": 0.31, "grad_norm": 0.6394796781675777, "learning_rate": 4.965281945221086e-06, "loss": 0.4863, "step": 722 }, { "epoch": 0.31, "grad_norm": 0.5879647113680488, "learning_rate": 4.964969199651059e-06, "loss": 0.5268, "step": 723 }, { "epoch": 0.31, "grad_norm": 0.6719126547265977, "learning_rate": 4.964655061698204e-06, "loss": 0.5094, "step": 724 }, { "epoch": 0.31, "grad_norm": 0.6247081850307215, "learning_rate": 4.964339531539967e-06, "loss": 0.5042, "step": 725 }, { "epoch": 0.31, "grad_norm": 0.6448631581908335, "learning_rate": 4.964022609354583e-06, "loss": 0.548, "step": 726 }, { "epoch": 0.31, "grad_norm": 0.6155726808462035, "learning_rate": 4.963704295321069e-06, "loss": 0.5079, "step": 727 }, { "epoch": 0.31, "grad_norm": 0.5856540757636703, "learning_rate": 4.963384589619233e-06, "loss": 0.5344, "step": 728 }, { "epoch": 0.31, "grad_norm": 0.6160053187700611, "learning_rate": 4.963063492429665e-06, "loss": 0.5174, "step": 729 }, { "epoch": 0.31, "grad_norm": 0.6092787845104115, "learning_rate": 4.9627410039337426e-06, "loss": 0.5069, "step": 730 }, { "epoch": 0.31, "grad_norm": 0.572582103354353, "learning_rate": 4.96241712431363e-06, "loss": 0.4989, "step": 731 }, { "epoch": 0.31, "grad_norm": 0.5711072343879144, "learning_rate": 4.962091853752276e-06, "loss": 0.5148, "step": 732 }, { "epoch": 0.31, "grad_norm": 0.6061761244724242, "learning_rate": 4.961765192433415e-06, "loss": 0.5287, "step": 733 }, { "epoch": 0.31, "grad_norm": 0.6033994961802631, "learning_rate": 4.961437140541569e-06, "loss": 0.4927, "step": 734 }, { "epoch": 0.31, "grad_norm": 0.6209778806623933, "learning_rate": 4.9611076982620445e-06, "loss": 0.5358, "step": 735 }, { "epoch": 0.31, "grad_norm": 0.6501849583238293, "learning_rate": 4.960776865780931e-06, "loss": 0.5183, "step": 736 }, { "epoch": 0.31, "grad_norm": 0.5740429036937207, "learning_rate": 4.9604446432851064e-06, "loss": 0.481, "step": 737 }, { "epoch": 0.32, "grad_norm": 0.5986038003667088, "learning_rate": 4.960111030962232e-06, "loss": 0.5035, "step": 738 }, { "epoch": 0.32, "grad_norm": 0.6269593371981165, "learning_rate": 4.959776029000756e-06, "loss": 0.5019, "step": 739 }, { "epoch": 0.32, "grad_norm": 0.586820399261423, "learning_rate": 4.959439637589909e-06, "loss": 0.4907, "step": 740 }, { "epoch": 0.32, "grad_norm": 0.591243683238131, "learning_rate": 4.959101856919709e-06, "loss": 0.5128, "step": 741 }, { "epoch": 0.32, "grad_norm": 0.6043813840833816, "learning_rate": 4.9587626871809564e-06, "loss": 0.5254, "step": 742 }, { "epoch": 0.32, "grad_norm": 0.6088008476429794, "learning_rate": 4.958422128565238e-06, "loss": 0.5099, "step": 743 }, { "epoch": 0.32, "grad_norm": 0.5919159624960704, "learning_rate": 4.958080181264926e-06, "loss": 0.4813, "step": 744 }, { "epoch": 0.32, "grad_norm": 0.5767332938769787, "learning_rate": 4.957736845473173e-06, "loss": 0.5118, "step": 745 }, { "epoch": 0.32, "grad_norm": 0.6115434997604602, "learning_rate": 4.957392121383919e-06, "loss": 0.5178, "step": 746 }, { "epoch": 0.32, "grad_norm": 0.606547936666955, "learning_rate": 4.957046009191889e-06, "loss": 0.5125, "step": 747 }, { "epoch": 0.32, "grad_norm": 0.6281682365690769, "learning_rate": 4.956698509092591e-06, "loss": 0.5302, "step": 748 }, { "epoch": 0.32, "grad_norm": 0.6041160141331913, "learning_rate": 4.956349621282315e-06, "loss": 0.4981, "step": 749 }, { "epoch": 0.32, "grad_norm": 0.6082725831189176, "learning_rate": 4.9559993459581375e-06, "loss": 0.5022, "step": 750 }, { "epoch": 0.32, "grad_norm": 0.6308489518539907, "learning_rate": 4.9556476833179185e-06, "loss": 0.5145, "step": 751 }, { "epoch": 0.32, "grad_norm": 0.6122107491320039, "learning_rate": 4.9552946335603006e-06, "loss": 0.4803, "step": 752 }, { "epoch": 0.32, "grad_norm": 0.6247807256313168, "learning_rate": 4.95494019688471e-06, "loss": 0.5033, "step": 753 }, { "epoch": 0.32, "grad_norm": 0.6133850387446359, "learning_rate": 4.954584373491357e-06, "loss": 0.5094, "step": 754 }, { "epoch": 0.32, "grad_norm": 0.6610062403122814, "learning_rate": 4.954227163581234e-06, "loss": 0.5246, "step": 755 }, { "epoch": 0.32, "grad_norm": 0.5719697098479033, "learning_rate": 4.953868567356121e-06, "loss": 0.4986, "step": 756 }, { "epoch": 0.32, "grad_norm": 0.611277655866564, "learning_rate": 4.953508585018573e-06, "loss": 0.5084, "step": 757 }, { "epoch": 0.32, "grad_norm": 0.570613285492476, "learning_rate": 4.953147216771935e-06, "loss": 0.5154, "step": 758 }, { "epoch": 0.32, "grad_norm": 0.6081820557545713, "learning_rate": 4.952784462820333e-06, "loss": 0.5217, "step": 759 }, { "epoch": 0.32, "grad_norm": 0.6233401854792252, "learning_rate": 4.952420323368673e-06, "loss": 0.4905, "step": 760 }, { "epoch": 0.33, "grad_norm": 0.5903715996857434, "learning_rate": 4.952054798622649e-06, "loss": 0.4612, "step": 761 }, { "epoch": 0.33, "grad_norm": 0.5976026591907524, "learning_rate": 4.951687888788731e-06, "loss": 0.5114, "step": 762 }, { "epoch": 0.33, "grad_norm": 0.6424120319126979, "learning_rate": 4.9513195940741764e-06, "loss": 0.4839, "step": 763 }, { "epoch": 0.33, "grad_norm": 0.6168679097191889, "learning_rate": 4.950949914687024e-06, "loss": 0.5134, "step": 764 }, { "epoch": 0.33, "grad_norm": 0.6400860199848029, "learning_rate": 4.950578850836092e-06, "loss": 0.5087, "step": 765 }, { "epoch": 0.33, "grad_norm": 0.6011473106721757, "learning_rate": 4.950206402730984e-06, "loss": 0.526, "step": 766 }, { "epoch": 0.33, "grad_norm": 0.6076752750689335, "learning_rate": 4.949832570582083e-06, "loss": 0.5243, "step": 767 }, { "epoch": 0.33, "grad_norm": 0.611589146611987, "learning_rate": 4.949457354600556e-06, "loss": 0.5602, "step": 768 }, { "epoch": 0.33, "grad_norm": 0.6379045005420435, "learning_rate": 4.94908075499835e-06, "loss": 0.485, "step": 769 }, { "epoch": 0.33, "grad_norm": 0.6111544996406524, "learning_rate": 4.948702771988195e-06, "loss": 0.4961, "step": 770 }, { "epoch": 0.33, "grad_norm": 0.6440163967432894, "learning_rate": 4.9483234057836e-06, "loss": 0.5247, "step": 771 }, { "epoch": 0.33, "grad_norm": 0.6424136239524529, "learning_rate": 4.9479426565988585e-06, "loss": 0.5095, "step": 772 }, { "epoch": 0.33, "grad_norm": 0.5845387618126129, "learning_rate": 4.947560524649043e-06, "loss": 0.4667, "step": 773 }, { "epoch": 0.33, "grad_norm": 0.5581508613420673, "learning_rate": 4.947177010150007e-06, "loss": 0.4915, "step": 774 }, { "epoch": 0.33, "grad_norm": 0.6002089992364806, "learning_rate": 4.9467921133183864e-06, "loss": 0.4957, "step": 775 }, { "epoch": 0.33, "grad_norm": 0.571455007692429, "learning_rate": 4.946405834371598e-06, "loss": 0.4998, "step": 776 }, { "epoch": 0.33, "grad_norm": 0.5996615965227683, "learning_rate": 4.9460181735278365e-06, "loss": 0.5058, "step": 777 }, { "epoch": 0.33, "grad_norm": 0.5933857632393196, "learning_rate": 4.945629131006081e-06, "loss": 0.5165, "step": 778 }, { "epoch": 0.33, "grad_norm": 0.5922936609778912, "learning_rate": 4.945238707026087e-06, "loss": 0.497, "step": 779 }, { "epoch": 0.33, "grad_norm": 0.6468383832092212, "learning_rate": 4.944846901808397e-06, "loss": 0.4988, "step": 780 }, { "epoch": 0.33, "grad_norm": 0.5963282275866615, "learning_rate": 4.9444537155743245e-06, "loss": 0.5146, "step": 781 }, { "epoch": 0.33, "eval_loss": 0.5058240294456482, "eval_runtime": 6911.9149, "eval_samples_per_second": 42.013, "eval_steps_per_second": 2.101, "step": 781 }, { "epoch": 0.33, "grad_norm": 0.616517236291001, "learning_rate": 4.944059148545971e-06, "loss": 0.5123, "step": 782 }, { "epoch": 0.33, "grad_norm": 0.6002931009806305, "learning_rate": 4.943663200946213e-06, "loss": 0.501, "step": 783 }, { "epoch": 0.33, "grad_norm": 0.5875304361967715, "learning_rate": 4.94326587299871e-06, "loss": 0.4847, "step": 784 }, { "epoch": 0.34, "grad_norm": 0.5836573087771422, "learning_rate": 4.942867164927899e-06, "loss": 0.4998, "step": 785 }, { "epoch": 0.34, "grad_norm": 0.5783319062666641, "learning_rate": 4.942467076958999e-06, "loss": 0.4989, "step": 786 }, { "epoch": 0.34, "grad_norm": 0.6295939909126551, "learning_rate": 4.9420656093180056e-06, "loss": 0.4886, "step": 787 }, { "epoch": 0.34, "grad_norm": 0.5808772327348201, "learning_rate": 4.941662762231695e-06, "loss": 0.5036, "step": 788 }, { "epoch": 0.34, "grad_norm": 0.5757656711915483, "learning_rate": 4.9412585359276235e-06, "loss": 0.4892, "step": 789 }, { "epoch": 0.34, "grad_norm": 0.6156971766935416, "learning_rate": 4.940852930634126e-06, "loss": 0.5189, "step": 790 }, { "epoch": 0.34, "grad_norm": 0.5964903100014775, "learning_rate": 4.940445946580315e-06, "loss": 0.4893, "step": 791 }, { "epoch": 0.34, "grad_norm": 0.5930502537178118, "learning_rate": 4.9400375839960826e-06, "loss": 0.4854, "step": 792 }, { "epoch": 0.34, "grad_norm": 0.6216632090251537, "learning_rate": 4.939627843112102e-06, "loss": 0.52, "step": 793 }, { "epoch": 0.34, "grad_norm": 0.5981307705816444, "learning_rate": 4.939216724159821e-06, "loss": 0.4924, "step": 794 }, { "epoch": 0.34, "grad_norm": 0.6171104301125759, "learning_rate": 4.938804227371467e-06, "loss": 0.4949, "step": 795 }, { "epoch": 0.34, "grad_norm": 0.6232653150523578, "learning_rate": 4.938390352980049e-06, "loss": 0.5276, "step": 796 }, { "epoch": 0.34, "grad_norm": 0.6319806108460523, "learning_rate": 4.93797510121935e-06, "loss": 0.4867, "step": 797 }, { "epoch": 0.34, "grad_norm": 0.6373229220396098, "learning_rate": 4.937558472323932e-06, "loss": 0.5175, "step": 798 }, { "epoch": 0.34, "grad_norm": 0.6381263394133697, "learning_rate": 4.937140466529135e-06, "loss": 0.5212, "step": 799 }, { "epoch": 0.34, "grad_norm": 0.6889087852800232, "learning_rate": 4.936721084071079e-06, "loss": 0.5068, "step": 800 }, { "epoch": 0.34, "grad_norm": 0.6217485975923667, "learning_rate": 4.936300325186659e-06, "loss": 0.4926, "step": 801 }, { "epoch": 0.34, "grad_norm": 0.571769849631971, "learning_rate": 4.9358781901135485e-06, "loss": 0.509, "step": 802 }, { "epoch": 0.34, "grad_norm": 0.6780444728173729, "learning_rate": 4.935454679090197e-06, "loss": 0.5165, "step": 803 }, { "epoch": 0.34, "grad_norm": 0.6076278839686122, "learning_rate": 4.935029792355834e-06, "loss": 0.5039, "step": 804 }, { "epoch": 0.34, "grad_norm": 0.6538852692558306, "learning_rate": 4.9346035301504644e-06, "loss": 0.4962, "step": 805 }, { "epoch": 0.34, "grad_norm": 0.6171376333190426, "learning_rate": 4.934175892714869e-06, "loss": 0.5112, "step": 806 }, { "epoch": 0.34, "grad_norm": 0.6558521386937952, "learning_rate": 4.933746880290607e-06, "loss": 0.4807, "step": 807 }, { "epoch": 0.35, "grad_norm": 0.5755989068902357, "learning_rate": 4.933316493120015e-06, "loss": 0.4821, "step": 808 }, { "epoch": 0.35, "grad_norm": 0.6612931100425695, "learning_rate": 4.932884731446204e-06, "loss": 0.517, "step": 809 }, { "epoch": 0.35, "grad_norm": 0.6216763697335129, "learning_rate": 4.932451595513063e-06, "loss": 0.5269, "step": 810 }, { "epoch": 0.35, "grad_norm": 0.6136117850372235, "learning_rate": 4.932017085565256e-06, "loss": 0.5197, "step": 811 }, { "epoch": 0.35, "grad_norm": 0.5783344700139442, "learning_rate": 4.931581201848224e-06, "loss": 0.5167, "step": 812 }, { "epoch": 0.35, "grad_norm": 0.5691685860925245, "learning_rate": 4.931143944608184e-06, "loss": 0.4957, "step": 813 }, { "epoch": 0.35, "grad_norm": 0.6136146632090349, "learning_rate": 4.93070531409213e-06, "loss": 0.4923, "step": 814 }, { "epoch": 0.35, "grad_norm": 0.6146071241365089, "learning_rate": 4.930265310547829e-06, "loss": 0.498, "step": 815 }, { "epoch": 0.35, "grad_norm": 0.6002352821905501, "learning_rate": 4.9298239342238255e-06, "loss": 0.4994, "step": 816 }, { "epoch": 0.35, "grad_norm": 0.5889451517506399, "learning_rate": 4.929381185369438e-06, "loss": 0.5001, "step": 817 }, { "epoch": 0.35, "grad_norm": 0.6082258257150579, "learning_rate": 4.928937064234764e-06, "loss": 0.5521, "step": 818 }, { "epoch": 0.35, "grad_norm": 0.6330482315113839, "learning_rate": 4.928491571070669e-06, "loss": 0.5219, "step": 819 }, { "epoch": 0.35, "grad_norm": 0.6004600333139476, "learning_rate": 4.928044706128803e-06, "loss": 0.5005, "step": 820 }, { "epoch": 0.35, "grad_norm": 0.6189100444908509, "learning_rate": 4.927596469661582e-06, "loss": 0.5176, "step": 821 }, { "epoch": 0.35, "grad_norm": 0.606015342163562, "learning_rate": 4.9271468619222015e-06, "loss": 0.496, "step": 822 }, { "epoch": 0.35, "grad_norm": 0.8556526555027408, "learning_rate": 4.926695883164632e-06, "loss": 0.5257, "step": 823 }, { "epoch": 0.35, "grad_norm": 0.6258732371599479, "learning_rate": 4.926243533643615e-06, "loss": 0.487, "step": 824 }, { "epoch": 0.35, "grad_norm": 0.5994220294037795, "learning_rate": 4.92578981361467e-06, "loss": 0.5072, "step": 825 }, { "epoch": 0.35, "grad_norm": 0.5716588082101919, "learning_rate": 4.925334723334088e-06, "loss": 0.4994, "step": 826 }, { "epoch": 0.35, "grad_norm": 0.585486108238998, "learning_rate": 4.924878263058937e-06, "loss": 0.5226, "step": 827 }, { "epoch": 0.35, "grad_norm": 0.5747575013418114, "learning_rate": 4.924420433047055e-06, "loss": 0.4965, "step": 828 }, { "epoch": 0.35, "grad_norm": 0.5919850393540613, "learning_rate": 4.9239612335570555e-06, "loss": 0.4948, "step": 829 }, { "epoch": 0.35, "grad_norm": 0.5830251031451256, "learning_rate": 4.923500664848327e-06, "loss": 0.487, "step": 830 }, { "epoch": 0.36, "grad_norm": 0.6178725623700653, "learning_rate": 4.923038727181028e-06, "loss": 0.4862, "step": 831 }, { "epoch": 0.36, "grad_norm": 0.5814875278732987, "learning_rate": 4.922575420816095e-06, "loss": 0.5166, "step": 832 }, { "epoch": 0.36, "grad_norm": 0.5986635615376367, "learning_rate": 4.922110746015234e-06, "loss": 0.489, "step": 833 }, { "epoch": 0.36, "grad_norm": 0.5859519526703544, "learning_rate": 4.921644703040925e-06, "loss": 0.4906, "step": 834 }, { "epoch": 0.36, "grad_norm": 0.5743263620222753, "learning_rate": 4.9211772921564205e-06, "loss": 0.4843, "step": 835 }, { "epoch": 0.36, "grad_norm": 0.8492032385838142, "learning_rate": 4.920708513625746e-06, "loss": 0.5099, "step": 836 }, { "epoch": 0.36, "grad_norm": 0.6140314776814894, "learning_rate": 4.9202383677137005e-06, "loss": 0.5006, "step": 837 }, { "epoch": 0.36, "grad_norm": 0.6527049634117856, "learning_rate": 4.919766854685854e-06, "loss": 0.5264, "step": 838 }, { "epoch": 0.36, "grad_norm": 0.6114157924940975, "learning_rate": 4.91929397480855e-06, "loss": 0.4989, "step": 839 }, { "epoch": 0.36, "grad_norm": 0.660553349533273, "learning_rate": 4.918819728348901e-06, "loss": 0.4894, "step": 840 }, { "epoch": 0.36, "grad_norm": 0.6011284115605714, "learning_rate": 4.918344115574797e-06, "loss": 0.5038, "step": 841 }, { "epoch": 0.36, "grad_norm": 0.5690628939882818, "learning_rate": 4.917867136754894e-06, "loss": 0.5419, "step": 842 }, { "epoch": 0.36, "grad_norm": 0.5962796608244493, "learning_rate": 4.917388792158623e-06, "loss": 0.4726, "step": 843 }, { "epoch": 0.36, "grad_norm": 0.6420232226244765, "learning_rate": 4.9169090820561845e-06, "loss": 0.5121, "step": 844 }, { "epoch": 0.36, "grad_norm": 0.5977037225796433, "learning_rate": 4.916428006718555e-06, "loss": 0.5125, "step": 845 }, { "epoch": 0.36, "grad_norm": 0.6028773887667327, "learning_rate": 4.9159455664174756e-06, "loss": 0.4987, "step": 846 }, { "epoch": 0.36, "grad_norm": 0.6366562430011422, "learning_rate": 4.9154617614254616e-06, "loss": 0.482, "step": 847 }, { "epoch": 0.36, "grad_norm": 0.6591094862661601, "learning_rate": 4.914976592015801e-06, "loss": 0.5366, "step": 848 }, { "epoch": 0.36, "grad_norm": 0.5958159768787876, "learning_rate": 4.914490058462549e-06, "loss": 0.4975, "step": 849 }, { "epoch": 0.36, "grad_norm": 0.60494927183394, "learning_rate": 4.9140021610405335e-06, "loss": 0.5224, "step": 850 }, { "epoch": 0.36, "grad_norm": 0.6105732826310788, "learning_rate": 4.913512900025351e-06, "loss": 0.52, "step": 851 }, { "epoch": 0.36, "grad_norm": 0.5967159340815495, "learning_rate": 4.913022275693372e-06, "loss": 0.4667, "step": 852 }, { "epoch": 0.36, "eval_loss": 0.5032065510749817, "eval_runtime": 6906.5208, "eval_samples_per_second": 42.046, "eval_steps_per_second": 2.102, "step": 852 }, { "epoch": 0.36, "grad_norm": 0.6009525199808483, "learning_rate": 4.912530288321733e-06, "loss": 0.4868, "step": 853 }, { "epoch": 0.36, "grad_norm": 0.5830517125297875, "learning_rate": 4.912036938188342e-06, "loss": 0.5266, "step": 854 }, { "epoch": 0.37, "grad_norm": 0.5889940979301209, "learning_rate": 4.911542225571877e-06, "loss": 0.5029, "step": 855 }, { "epoch": 0.37, "grad_norm": 0.6026973021517633, "learning_rate": 4.911046150751786e-06, "loss": 0.517, "step": 856 }, { "epoch": 0.37, "grad_norm": 0.5785861902321248, "learning_rate": 4.910548714008285e-06, "loss": 0.4926, "step": 857 }, { "epoch": 0.37, "grad_norm": 0.6270047301502678, "learning_rate": 4.910049915622361e-06, "loss": 0.5154, "step": 858 }, { "epoch": 0.37, "grad_norm": 0.5882468386280117, "learning_rate": 4.90954975587577e-06, "loss": 0.5096, "step": 859 }, { "epoch": 0.37, "grad_norm": 0.5663550702566633, "learning_rate": 4.909048235051033e-06, "loss": 0.4908, "step": 860 }, { "epoch": 0.37, "grad_norm": 0.6017236125989268, "learning_rate": 4.9085453534314474e-06, "loss": 0.5193, "step": 861 }, { "epoch": 0.37, "grad_norm": 0.6222382484211336, "learning_rate": 4.908041111301074e-06, "loss": 0.5167, "step": 862 }, { "epoch": 0.37, "grad_norm": 0.5963402145379366, "learning_rate": 4.90753550894474e-06, "loss": 0.4786, "step": 863 }, { "epoch": 0.37, "grad_norm": 0.5654270988232517, "learning_rate": 4.907028546648049e-06, "loss": 0.509, "step": 864 }, { "epoch": 0.37, "grad_norm": 0.6108797196445513, "learning_rate": 4.906520224697364e-06, "loss": 0.5055, "step": 865 }, { "epoch": 0.37, "grad_norm": 0.6008153268576729, "learning_rate": 4.906010543379821e-06, "loss": 0.5043, "step": 866 }, { "epoch": 0.37, "grad_norm": 0.6448956256200653, "learning_rate": 4.905499502983325e-06, "loss": 0.5257, "step": 867 }, { "epoch": 0.37, "grad_norm": 0.5595461401241696, "learning_rate": 4.904987103796544e-06, "loss": 0.5017, "step": 868 }, { "epoch": 0.37, "grad_norm": 0.5941241562315258, "learning_rate": 4.904473346108916e-06, "loss": 0.5052, "step": 869 }, { "epoch": 0.37, "grad_norm": 0.6018542340595553, "learning_rate": 4.903958230210647e-06, "loss": 0.4875, "step": 870 }, { "epoch": 0.37, "grad_norm": 0.5681619089604921, "learning_rate": 4.9034417563927105e-06, "loss": 0.4876, "step": 871 }, { "epoch": 0.37, "grad_norm": 0.580332576906727, "learning_rate": 4.902923924946845e-06, "loss": 0.503, "step": 872 }, { "epoch": 0.37, "grad_norm": 0.5991899633835461, "learning_rate": 4.902404736165557e-06, "loss": 0.4792, "step": 873 }, { "epoch": 0.37, "grad_norm": 0.5788544217689144, "learning_rate": 4.901884190342121e-06, "loss": 0.534, "step": 874 }, { "epoch": 0.37, "grad_norm": 0.5873037059704003, "learning_rate": 4.901362287770576e-06, "loss": 0.5138, "step": 875 }, { "epoch": 0.37, "grad_norm": 0.5673844198994867, "learning_rate": 4.900839028745727e-06, "loss": 0.5094, "step": 876 }, { "epoch": 0.37, "grad_norm": 0.5992496148744582, "learning_rate": 4.900314413563149e-06, "loss": 0.5296, "step": 877 }, { "epoch": 0.38, "grad_norm": 0.619714807218819, "learning_rate": 4.899788442519178e-06, "loss": 0.5174, "step": 878 }, { "epoch": 0.38, "grad_norm": 0.5833898092793183, "learning_rate": 4.899261115910919e-06, "loss": 0.4845, "step": 879 }, { "epoch": 0.38, "grad_norm": 0.5731340631423973, "learning_rate": 4.8987324340362445e-06, "loss": 0.494, "step": 880 }, { "epoch": 0.38, "grad_norm": 0.6335983797182833, "learning_rate": 4.898202397193787e-06, "loss": 0.4902, "step": 881 }, { "epoch": 0.38, "grad_norm": 0.6021252115369131, "learning_rate": 4.897671005682948e-06, "loss": 0.5039, "step": 882 }, { "epoch": 0.38, "grad_norm": 0.6136802203792989, "learning_rate": 4.8971382598038945e-06, "loss": 0.5108, "step": 883 }, { "epoch": 0.38, "grad_norm": 0.6131873733015205, "learning_rate": 4.896604159857557e-06, "loss": 0.4997, "step": 884 }, { "epoch": 0.38, "grad_norm": 0.6159506568258571, "learning_rate": 4.896068706145632e-06, "loss": 0.5292, "step": 885 }, { "epoch": 0.38, "grad_norm": 0.6041398474984774, "learning_rate": 4.8955318989705814e-06, "loss": 0.51, "step": 886 }, { "epoch": 0.38, "grad_norm": 0.6148781569224463, "learning_rate": 4.8949937386356284e-06, "loss": 0.5094, "step": 887 }, { "epoch": 0.38, "grad_norm": 0.5976753831281014, "learning_rate": 4.894454225444764e-06, "loss": 0.5026, "step": 888 }, { "epoch": 0.38, "grad_norm": 0.608033075624346, "learning_rate": 4.893913359702742e-06, "loss": 0.5011, "step": 889 }, { "epoch": 0.38, "grad_norm": 0.5610693452613091, "learning_rate": 4.89337114171508e-06, "loss": 0.4903, "step": 890 }, { "epoch": 0.38, "grad_norm": 0.5740287354727569, "learning_rate": 4.89282757178806e-06, "loss": 0.5355, "step": 891 }, { "epoch": 0.38, "grad_norm": 0.5808789786027553, "learning_rate": 4.892282650228728e-06, "loss": 0.4798, "step": 892 }, { "epoch": 0.38, "grad_norm": 0.5636801724275967, "learning_rate": 4.891736377344891e-06, "loss": 0.4929, "step": 893 }, { "epoch": 0.38, "grad_norm": 0.6092124636402675, "learning_rate": 4.891188753445122e-06, "loss": 0.4976, "step": 894 }, { "epoch": 0.38, "grad_norm": 0.5840172519663077, "learning_rate": 4.890639778838757e-06, "loss": 0.4841, "step": 895 }, { "epoch": 0.38, "grad_norm": 0.5819312884051389, "learning_rate": 4.890089453835894e-06, "loss": 0.4869, "step": 896 }, { "epoch": 0.38, "grad_norm": 0.6120829375007303, "learning_rate": 4.889537778747396e-06, "loss": 0.5265, "step": 897 }, { "epoch": 0.38, "grad_norm": 0.5726939025833054, "learning_rate": 4.888984753884882e-06, "loss": 0.4708, "step": 898 }, { "epoch": 0.38, "grad_norm": 0.6080885452408311, "learning_rate": 4.8884303795607424e-06, "loss": 0.501, "step": 899 }, { "epoch": 0.38, "grad_norm": 0.5909973175245595, "learning_rate": 4.887874656088124e-06, "loss": 0.5017, "step": 900 }, { "epoch": 0.38, "grad_norm": 0.591651916865507, "learning_rate": 4.887317583780937e-06, "loss": 0.483, "step": 901 }, { "epoch": 0.39, "grad_norm": 0.5745658546447854, "learning_rate": 4.886759162953856e-06, "loss": 0.4942, "step": 902 }, { "epoch": 0.39, "grad_norm": 0.6021387624200228, "learning_rate": 4.886199393922313e-06, "loss": 0.4906, "step": 903 }, { "epoch": 0.39, "grad_norm": 0.5993298907461448, "learning_rate": 4.885638277002503e-06, "loss": 0.4987, "step": 904 }, { "epoch": 0.39, "grad_norm": 0.6004161472848657, "learning_rate": 4.885075812511386e-06, "loss": 0.5152, "step": 905 }, { "epoch": 0.39, "grad_norm": 0.5799301402705993, "learning_rate": 4.884512000766679e-06, "loss": 0.5045, "step": 906 }, { "epoch": 0.39, "grad_norm": 0.6121599376461038, "learning_rate": 4.883946842086861e-06, "loss": 0.4915, "step": 907 }, { "epoch": 0.39, "grad_norm": 0.5663312512818208, "learning_rate": 4.883380336791172e-06, "loss": 0.4983, "step": 908 }, { "epoch": 0.39, "grad_norm": 0.6095573669970138, "learning_rate": 4.882812485199614e-06, "loss": 0.5285, "step": 909 }, { "epoch": 0.39, "grad_norm": 0.5921811411224858, "learning_rate": 4.882243287632947e-06, "loss": 0.5194, "step": 910 }, { "epoch": 0.39, "grad_norm": 0.6006510756243623, "learning_rate": 4.8816727444126935e-06, "loss": 0.492, "step": 911 }, { "epoch": 0.39, "grad_norm": 0.5649111234747042, "learning_rate": 4.881100855861134e-06, "loss": 0.4916, "step": 912 }, { "epoch": 0.39, "grad_norm": 0.5762188689086207, "learning_rate": 4.880527622301312e-06, "loss": 0.4887, "step": 913 }, { "epoch": 0.39, "grad_norm": 0.6441465256010206, "learning_rate": 4.879953044057028e-06, "loss": 0.511, "step": 914 }, { "epoch": 0.39, "grad_norm": 0.5684710047872453, "learning_rate": 4.879377121452844e-06, "loss": 0.5001, "step": 915 }, { "epoch": 0.39, "grad_norm": 0.5686046518640278, "learning_rate": 4.8787998548140794e-06, "loss": 0.5014, "step": 916 }, { "epoch": 0.39, "grad_norm": 0.5905740541193955, "learning_rate": 4.878221244466813e-06, "loss": 0.486, "step": 917 }, { "epoch": 0.39, "grad_norm": 0.5952876200881407, "learning_rate": 4.8776412907378845e-06, "loss": 0.481, "step": 918 }, { "epoch": 0.39, "grad_norm": 0.5872355311893122, "learning_rate": 4.877059993954891e-06, "loss": 0.5039, "step": 919 }, { "epoch": 0.39, "grad_norm": 0.6074745120236803, "learning_rate": 4.8764773544461895e-06, "loss": 0.5108, "step": 920 }, { "epoch": 0.39, "grad_norm": 0.6130696940827612, "learning_rate": 4.875893372540893e-06, "loss": 0.5048, "step": 921 }, { "epoch": 0.39, "grad_norm": 0.6004208834442807, "learning_rate": 4.875308048568875e-06, "loss": 0.4988, "step": 922 }, { "epoch": 0.39, "grad_norm": 0.6892465313158495, "learning_rate": 4.8747213828607675e-06, "loss": 0.483, "step": 923 }, { "epoch": 0.39, "eval_loss": 0.5001593232154846, "eval_runtime": 6909.7667, "eval_samples_per_second": 42.026, "eval_steps_per_second": 2.101, "step": 923 }, { "epoch": 0.39, "grad_norm": 0.6134636983656643, "learning_rate": 4.874133375747957e-06, "loss": 0.5137, "step": 924 }, { "epoch": 0.4, "grad_norm": 0.6440085903145949, "learning_rate": 4.873544027562593e-06, "loss": 0.5176, "step": 925 }, { "epoch": 0.4, "grad_norm": 0.6899079462308539, "learning_rate": 4.8729533386375775e-06, "loss": 0.5162, "step": 926 }, { "epoch": 0.4, "grad_norm": 0.6146416139724011, "learning_rate": 4.872361309306572e-06, "loss": 0.5113, "step": 927 }, { "epoch": 0.4, "grad_norm": 0.585989400430957, "learning_rate": 4.8717679399039954e-06, "loss": 0.4987, "step": 928 }, { "epoch": 0.4, "grad_norm": 0.6462954923557843, "learning_rate": 4.871173230765024e-06, "loss": 0.501, "step": 929 }, { "epoch": 0.4, "grad_norm": 0.6369012674472616, "learning_rate": 4.8705771822255895e-06, "loss": 0.4859, "step": 930 }, { "epoch": 0.4, "grad_norm": 0.6299056450791893, "learning_rate": 4.8699797946223805e-06, "loss": 0.5061, "step": 931 }, { "epoch": 0.4, "grad_norm": 0.6098008991534455, "learning_rate": 4.869381068292842e-06, "loss": 0.4871, "step": 932 }, { "epoch": 0.4, "grad_norm": 0.63534813466544, "learning_rate": 4.868781003575176e-06, "loss": 0.4946, "step": 933 }, { "epoch": 0.4, "grad_norm": 0.607178762605803, "learning_rate": 4.86817960080834e-06, "loss": 0.4986, "step": 934 }, { "epoch": 0.4, "grad_norm": 0.5983782529586613, "learning_rate": 4.867576860332048e-06, "loss": 0.4795, "step": 935 }, { "epoch": 0.4, "grad_norm": 0.6738363380499427, "learning_rate": 4.8669727824867686e-06, "loss": 0.5166, "step": 936 }, { "epoch": 0.4, "grad_norm": 0.598996652424602, "learning_rate": 4.866367367613725e-06, "loss": 0.5197, "step": 937 }, { "epoch": 0.4, "grad_norm": 0.5815833611063322, "learning_rate": 4.865760616054899e-06, "loss": 0.4844, "step": 938 }, { "epoch": 0.4, "grad_norm": 0.603444821349114, "learning_rate": 4.865152528153022e-06, "loss": 0.5207, "step": 939 }, { "epoch": 0.4, "grad_norm": 0.5867776906541347, "learning_rate": 4.864543104251587e-06, "loss": 0.529, "step": 940 }, { "epoch": 0.4, "grad_norm": 0.5650043009819257, "learning_rate": 4.863932344694837e-06, "loss": 0.4959, "step": 941 }, { "epoch": 0.4, "grad_norm": 0.6917490151688115, "learning_rate": 4.8633202498277695e-06, "loss": 0.5137, "step": 942 }, { "epoch": 0.4, "grad_norm": 0.549493622060356, "learning_rate": 4.862706819996139e-06, "loss": 0.4845, "step": 943 }, { "epoch": 0.4, "grad_norm": 0.5858437778240971, "learning_rate": 4.8620920555464515e-06, "loss": 0.5006, "step": 944 }, { "epoch": 0.4, "grad_norm": 0.5993940522874021, "learning_rate": 4.8614759568259685e-06, "loss": 0.506, "step": 945 }, { "epoch": 0.4, "grad_norm": 0.5948965330805686, "learning_rate": 4.860858524182704e-06, "loss": 0.5192, "step": 946 }, { "epoch": 0.4, "grad_norm": 0.5944946909841916, "learning_rate": 4.860239757965428e-06, "loss": 0.5071, "step": 947 }, { "epoch": 0.41, "grad_norm": 0.5915006708411217, "learning_rate": 4.8596196585236595e-06, "loss": 0.4853, "step": 948 }, { "epoch": 0.41, "grad_norm": 0.5630831967485228, "learning_rate": 4.858998226207674e-06, "loss": 0.484, "step": 949 }, { "epoch": 0.41, "grad_norm": 0.5666674173283087, "learning_rate": 4.858375461368499e-06, "loss": 0.512, "step": 950 }, { "epoch": 0.41, "grad_norm": 0.5836639113649253, "learning_rate": 4.857751364357913e-06, "loss": 0.514, "step": 951 }, { "epoch": 0.41, "grad_norm": 0.5744731439956018, "learning_rate": 4.857125935528451e-06, "loss": 0.5177, "step": 952 }, { "epoch": 0.41, "grad_norm": 0.5926955465496379, "learning_rate": 4.8564991752333975e-06, "loss": 0.4955, "step": 953 }, { "epoch": 0.41, "grad_norm": 0.5628507262484403, "learning_rate": 4.855871083826789e-06, "loss": 0.4995, "step": 954 }, { "epoch": 0.41, "grad_norm": 0.5866653009845907, "learning_rate": 4.855241661663413e-06, "loss": 0.5002, "step": 955 }, { "epoch": 0.41, "grad_norm": 0.5599328850779348, "learning_rate": 4.854610909098813e-06, "loss": 0.4593, "step": 956 }, { "epoch": 0.41, "grad_norm": 0.5973389278804867, "learning_rate": 4.853978826489277e-06, "loss": 0.4913, "step": 957 }, { "epoch": 0.41, "grad_norm": 0.643022571476318, "learning_rate": 4.8533454141918525e-06, "loss": 0.5036, "step": 958 }, { "epoch": 0.41, "grad_norm": 0.5688007484429585, "learning_rate": 4.852710672564332e-06, "loss": 0.5036, "step": 959 }, { "epoch": 0.41, "grad_norm": 0.5969281444464002, "learning_rate": 4.852074601965261e-06, "loss": 0.5344, "step": 960 }, { "epoch": 0.41, "grad_norm": 0.5730247632836935, "learning_rate": 4.851437202753936e-06, "loss": 0.4809, "step": 961 }, { "epoch": 0.41, "grad_norm": 0.5799456612673418, "learning_rate": 4.850798475290403e-06, "loss": 0.4536, "step": 962 }, { "epoch": 0.41, "grad_norm": 0.6205297443588728, "learning_rate": 4.8501584199354604e-06, "loss": 0.5099, "step": 963 }, { "epoch": 0.41, "grad_norm": 0.5681608576563654, "learning_rate": 4.849517037050653e-06, "loss": 0.4903, "step": 964 }, { "epoch": 0.41, "grad_norm": 0.6025610912235543, "learning_rate": 4.848874326998279e-06, "loss": 0.4836, "step": 965 }, { "epoch": 0.41, "grad_norm": 0.5901338922642949, "learning_rate": 4.848230290141383e-06, "loss": 0.4662, "step": 966 }, { "epoch": 0.41, "grad_norm": 0.5688713468398793, "learning_rate": 4.847584926843765e-06, "loss": 0.4884, "step": 967 }, { "epoch": 0.41, "grad_norm": 0.5778466913445185, "learning_rate": 4.846938237469966e-06, "loss": 0.5096, "step": 968 }, { "epoch": 0.41, "grad_norm": 0.603606926382752, "learning_rate": 4.846290222385282e-06, "loss": 0.5415, "step": 969 }, { "epoch": 0.41, "grad_norm": 0.594103598829822, "learning_rate": 4.845640881955757e-06, "loss": 0.5007, "step": 970 }, { "epoch": 0.41, "grad_norm": 0.5983275671847197, "learning_rate": 4.844990216548181e-06, "loss": 0.501, "step": 971 }, { "epoch": 0.42, "grad_norm": 0.6107199353721016, "learning_rate": 4.844338226530095e-06, "loss": 0.5303, "step": 972 }, { "epoch": 0.42, "grad_norm": 0.6032407557209453, "learning_rate": 4.843684912269789e-06, "loss": 0.4946, "step": 973 }, { "epoch": 0.42, "grad_norm": 0.5413313146483812, "learning_rate": 4.843030274136297e-06, "loss": 0.4956, "step": 974 }, { "epoch": 0.42, "grad_norm": 0.6106091128154526, "learning_rate": 4.842374312499405e-06, "loss": 0.5105, "step": 975 }, { "epoch": 0.42, "grad_norm": 0.574289734465813, "learning_rate": 4.841717027729643e-06, "loss": 0.5215, "step": 976 }, { "epoch": 0.42, "grad_norm": 0.6083050534381844, "learning_rate": 4.8410584201982934e-06, "loss": 0.4862, "step": 977 }, { "epoch": 0.42, "grad_norm": 0.5766091053101502, "learning_rate": 4.84039849027738e-06, "loss": 0.5117, "step": 978 }, { "epoch": 0.42, "grad_norm": 0.5711574672193085, "learning_rate": 4.8397372383396765e-06, "loss": 0.4722, "step": 979 }, { "epoch": 0.42, "grad_norm": 0.5774240645659215, "learning_rate": 4.839074664758705e-06, "loss": 0.522, "step": 980 }, { "epoch": 0.42, "grad_norm": 0.5572922524103036, "learning_rate": 4.8384107699087305e-06, "loss": 0.4989, "step": 981 }, { "epoch": 0.42, "grad_norm": 0.6070310913343694, "learning_rate": 4.837745554164766e-06, "loss": 0.4857, "step": 982 }, { "epoch": 0.42, "grad_norm": 0.60261744740847, "learning_rate": 4.8370790179025715e-06, "loss": 0.5182, "step": 983 }, { "epoch": 0.42, "grad_norm": 0.5659965233077204, "learning_rate": 4.836411161498653e-06, "loss": 0.4828, "step": 984 }, { "epoch": 0.42, "grad_norm": 0.5957726040740746, "learning_rate": 4.835741985330259e-06, "loss": 0.4724, "step": 985 }, { "epoch": 0.42, "grad_norm": 0.6024381722019819, "learning_rate": 4.835071489775388e-06, "loss": 0.5038, "step": 986 }, { "epoch": 0.42, "grad_norm": 0.5889077069135094, "learning_rate": 4.834399675212781e-06, "loss": 0.4837, "step": 987 }, { "epoch": 0.42, "grad_norm": 0.6240890961484814, "learning_rate": 4.8337265420219245e-06, "loss": 0.5006, "step": 988 }, { "epoch": 0.42, "grad_norm": 0.5986554935817346, "learning_rate": 4.833052090583052e-06, "loss": 0.4603, "step": 989 }, { "epoch": 0.42, "grad_norm": 0.5632943529640556, "learning_rate": 4.832376321277136e-06, "loss": 0.478, "step": 990 }, { "epoch": 0.42, "grad_norm": 0.5811104630507077, "learning_rate": 4.831699234485899e-06, "loss": 0.503, "step": 991 }, { "epoch": 0.42, "grad_norm": 0.6286701964893617, "learning_rate": 4.831020830591806e-06, "loss": 0.5186, "step": 992 }, { "epoch": 0.42, "grad_norm": 0.5656632345758932, "learning_rate": 4.8303411099780665e-06, "loss": 0.4863, "step": 993 }, { "epoch": 0.42, "grad_norm": 0.5914309687581247, "learning_rate": 4.829660073028631e-06, "loss": 0.5056, "step": 994 }, { "epoch": 0.42, "eval_loss": 0.4973524808883667, "eval_runtime": 6914.8778, "eval_samples_per_second": 41.995, "eval_steps_per_second": 2.1, "step": 994 }, { "epoch": 0.43, "grad_norm": 0.6155119630859821, "learning_rate": 4.828977720128198e-06, "loss": 0.494, "step": 995 }, { "epoch": 0.43, "grad_norm": 0.5711544453505207, "learning_rate": 4.828294051662206e-06, "loss": 0.4958, "step": 996 }, { "epoch": 0.43, "grad_norm": 0.5630820865144905, "learning_rate": 4.827609068016836e-06, "loss": 0.5164, "step": 997 }, { "epoch": 0.43, "grad_norm": 0.5954033434874677, "learning_rate": 4.826922769579017e-06, "loss": 0.5199, "step": 998 }, { "epoch": 0.43, "grad_norm": 0.5740037081231429, "learning_rate": 4.826235156736414e-06, "loss": 0.4798, "step": 999 }, { "epoch": 0.43, "grad_norm": 0.5812019395259428, "learning_rate": 4.825546229877439e-06, "loss": 0.4865, "step": 1000 }, { "epoch": 0.43, "grad_norm": 0.5893943585416656, "learning_rate": 4.824855989391245e-06, "loss": 0.4912, "step": 1001 }, { "epoch": 0.43, "grad_norm": 0.6065006741354068, "learning_rate": 4.824164435667727e-06, "loss": 0.5072, "step": 1002 }, { "epoch": 0.43, "grad_norm": 0.6019073096492751, "learning_rate": 4.823471569097521e-06, "loss": 0.4971, "step": 1003 }, { "epoch": 0.43, "grad_norm": 0.5883223828151468, "learning_rate": 4.822777390072006e-06, "loss": 0.514, "step": 1004 }, { "epoch": 0.43, "grad_norm": 0.6604784109834196, "learning_rate": 4.822081898983302e-06, "loss": 0.5065, "step": 1005 }, { "epoch": 0.43, "grad_norm": 0.5937829112220406, "learning_rate": 4.821385096224268e-06, "loss": 0.4714, "step": 1006 }, { "epoch": 0.43, "grad_norm": 0.5748353294494913, "learning_rate": 4.820686982188508e-06, "loss": 0.5059, "step": 1007 }, { "epoch": 0.43, "grad_norm": 0.6069950364399875, "learning_rate": 4.819987557270364e-06, "loss": 0.5204, "step": 1008 }, { "epoch": 0.43, "grad_norm": 0.6358630633744264, "learning_rate": 4.819286821864917e-06, "loss": 0.4903, "step": 1009 }, { "epoch": 0.43, "grad_norm": 0.6204642750728111, "learning_rate": 4.818584776367992e-06, "loss": 0.5128, "step": 1010 }, { "epoch": 0.43, "grad_norm": 0.6829577526239162, "learning_rate": 4.817881421176153e-06, "loss": 0.5197, "step": 1011 }, { "epoch": 0.43, "grad_norm": 0.5922818255526525, "learning_rate": 4.817176756686701e-06, "loss": 0.4918, "step": 1012 }, { "epoch": 0.43, "grad_norm": 0.5740860408805906, "learning_rate": 4.816470783297679e-06, "loss": 0.4982, "step": 1013 }, { "epoch": 0.43, "grad_norm": 0.5910300282185704, "learning_rate": 4.815763501407869e-06, "loss": 0.485, "step": 1014 }, { "epoch": 0.43, "grad_norm": 0.5715165207150573, "learning_rate": 4.815054911416795e-06, "loss": 0.4849, "step": 1015 }, { "epoch": 0.43, "grad_norm": 0.6024018069444244, "learning_rate": 4.8143450137247116e-06, "loss": 0.4994, "step": 1016 }, { "epoch": 0.43, "grad_norm": 0.6266053738868206, "learning_rate": 4.8136338087326214e-06, "loss": 0.5286, "step": 1017 }, { "epoch": 0.43, "grad_norm": 0.5953155175370475, "learning_rate": 4.812921296842261e-06, "loss": 0.4776, "step": 1018 }, { "epoch": 0.44, "grad_norm": 0.6031211740166059, "learning_rate": 4.812207478456105e-06, "loss": 0.5148, "step": 1019 }, { "epoch": 0.44, "grad_norm": 0.6273922038641672, "learning_rate": 4.811492353977366e-06, "loss": 0.5029, "step": 1020 }, { "epoch": 0.44, "grad_norm": 0.5826143241371425, "learning_rate": 4.810775923809996e-06, "loss": 0.5056, "step": 1021 }, { "epoch": 0.44, "grad_norm": 0.582077622885555, "learning_rate": 4.810058188358685e-06, "loss": 0.487, "step": 1022 }, { "epoch": 0.44, "grad_norm": 0.5767038876088614, "learning_rate": 4.809339148028857e-06, "loss": 0.4805, "step": 1023 }, { "epoch": 0.44, "grad_norm": 0.5850526215638874, "learning_rate": 4.808618803226675e-06, "loss": 0.4686, "step": 1024 }, { "epoch": 0.44, "grad_norm": 0.6178237737641012, "learning_rate": 4.80789715435904e-06, "loss": 0.5005, "step": 1025 }, { "epoch": 0.44, "grad_norm": 0.5579649875995263, "learning_rate": 4.807174201833589e-06, "loss": 0.4848, "step": 1026 }, { "epoch": 0.44, "grad_norm": 0.576234598032971, "learning_rate": 4.8064499460586926e-06, "loss": 0.4824, "step": 1027 }, { "epoch": 0.44, "grad_norm": 0.5826322787338029, "learning_rate": 4.8057243874434625e-06, "loss": 0.4997, "step": 1028 }, { "epoch": 0.44, "grad_norm": 0.5580813074509025, "learning_rate": 4.8049975263977416e-06, "loss": 0.4693, "step": 1029 }, { "epoch": 0.44, "grad_norm": 0.5695474003960879, "learning_rate": 4.804269363332112e-06, "loss": 0.492, "step": 1030 }, { "epoch": 0.44, "grad_norm": 0.5905987109265735, "learning_rate": 4.80353989865789e-06, "loss": 0.4966, "step": 1031 }, { "epoch": 0.44, "grad_norm": 0.5685057462956427, "learning_rate": 4.802809132787125e-06, "loss": 0.5016, "step": 1032 }, { "epoch": 0.44, "grad_norm": 0.5886266001836196, "learning_rate": 4.802077066132607e-06, "loss": 0.4946, "step": 1033 }, { "epoch": 0.44, "grad_norm": 0.6035821042805889, "learning_rate": 4.801343699107854e-06, "loss": 0.4959, "step": 1034 }, { "epoch": 0.44, "grad_norm": 0.566265021004039, "learning_rate": 4.800609032127123e-06, "loss": 0.4972, "step": 1035 }, { "epoch": 0.44, "grad_norm": 0.5614736780544856, "learning_rate": 4.799873065605404e-06, "loss": 0.4807, "step": 1036 }, { "epoch": 0.44, "grad_norm": 0.600684352457, "learning_rate": 4.799135799958421e-06, "loss": 0.496, "step": 1037 }, { "epoch": 0.44, "grad_norm": 0.591873988888275, "learning_rate": 4.798397235602632e-06, "loss": 0.4698, "step": 1038 }, { "epoch": 0.44, "grad_norm": 0.6143917690140963, "learning_rate": 4.797657372955228e-06, "loss": 0.4906, "step": 1039 }, { "epoch": 0.44, "grad_norm": 0.5837819971908614, "learning_rate": 4.7969162124341354e-06, "loss": 0.4711, "step": 1040 }, { "epoch": 0.44, "grad_norm": 0.6109135143710297, "learning_rate": 4.79617375445801e-06, "loss": 0.5127, "step": 1041 }, { "epoch": 0.45, "grad_norm": 0.5763531175283814, "learning_rate": 4.795429999446246e-06, "loss": 0.4889, "step": 1042 }, { "epoch": 0.45, "grad_norm": 0.5818174333398002, "learning_rate": 4.794684947818964e-06, "loss": 0.4956, "step": 1043 }, { "epoch": 0.45, "grad_norm": 0.6195177609111279, "learning_rate": 4.793938599997021e-06, "loss": 0.4926, "step": 1044 }, { "epoch": 0.45, "grad_norm": 0.5686953952488751, "learning_rate": 4.793190956402005e-06, "loss": 0.4815, "step": 1045 }, { "epoch": 0.45, "grad_norm": 0.5648932079137525, "learning_rate": 4.792442017456237e-06, "loss": 0.5092, "step": 1046 }, { "epoch": 0.45, "grad_norm": 0.5851943135122919, "learning_rate": 4.791691783582768e-06, "loss": 0.4765, "step": 1047 }, { "epoch": 0.45, "grad_norm": 0.591274407551804, "learning_rate": 4.790940255205381e-06, "loss": 0.5056, "step": 1048 }, { "epoch": 0.45, "grad_norm": 0.6379099025664374, "learning_rate": 4.790187432748591e-06, "loss": 0.5158, "step": 1049 }, { "epoch": 0.45, "grad_norm": 0.6181747896706304, "learning_rate": 4.789433316637644e-06, "loss": 0.5146, "step": 1050 }, { "epoch": 0.45, "grad_norm": 0.5531345989453061, "learning_rate": 4.788677907298516e-06, "loss": 0.4878, "step": 1051 }, { "epoch": 0.45, "grad_norm": 0.5876103896392866, "learning_rate": 4.7879212051579124e-06, "loss": 0.5136, "step": 1052 }, { "epoch": 0.45, "grad_norm": 3.875613200014087, "learning_rate": 4.787163210643272e-06, "loss": 0.4833, "step": 1053 }, { "epoch": 0.45, "grad_norm": 0.6036074704422942, "learning_rate": 4.786403924182761e-06, "loss": 0.5099, "step": 1054 }, { "epoch": 0.45, "grad_norm": 0.621921611720999, "learning_rate": 4.785643346205277e-06, "loss": 0.5052, "step": 1055 }, { "epoch": 0.45, "grad_norm": 0.6125351583146175, "learning_rate": 4.784881477140445e-06, "loss": 0.5053, "step": 1056 }, { "epoch": 0.45, "grad_norm": 0.5999869964811588, "learning_rate": 4.784118317418621e-06, "loss": 0.5077, "step": 1057 }, { "epoch": 0.45, "grad_norm": 0.5923633626129878, "learning_rate": 4.7833538674708905e-06, "loss": 0.4784, "step": 1058 }, { "epoch": 0.45, "grad_norm": 0.5564286456120893, "learning_rate": 4.782588127729066e-06, "loss": 0.4985, "step": 1059 }, { "epoch": 0.45, "grad_norm": 0.5651692637493795, "learning_rate": 4.781821098625691e-06, "loss": 0.4796, "step": 1060 }, { "epoch": 0.45, "grad_norm": 0.6611434126482291, "learning_rate": 4.7810527805940344e-06, "loss": 0.5125, "step": 1061 }, { "epoch": 0.45, "grad_norm": 0.6079384633788307, "learning_rate": 4.7802831740680955e-06, "loss": 0.4909, "step": 1062 }, { "epoch": 0.45, "grad_norm": 0.5887501058130822, "learning_rate": 4.7795122794826e-06, "loss": 0.4729, "step": 1063 }, { "epoch": 0.45, "grad_norm": 0.8409421704273247, "learning_rate": 4.778740097273003e-06, "loss": 0.5162, "step": 1064 }, { "epoch": 0.46, "grad_norm": 0.6099196443182712, "learning_rate": 4.777966627875484e-06, "loss": 0.5268, "step": 1065 }, { "epoch": 0.46, "eval_loss": 0.49549439549446106, "eval_runtime": 6918.4924, "eval_samples_per_second": 41.973, "eval_steps_per_second": 2.099, "step": 1065 }, { "epoch": 0.46, "grad_norm": 0.63487506837019, "learning_rate": 4.777191871726951e-06, "loss": 0.4854, "step": 1066 }, { "epoch": 0.46, "grad_norm": 0.810054670245462, "learning_rate": 4.776415829265043e-06, "loss": 0.5234, "step": 1067 }, { "epoch": 0.46, "grad_norm": 0.5924971114450747, "learning_rate": 4.775638500928117e-06, "loss": 0.5024, "step": 1068 }, { "epoch": 0.46, "grad_norm": 0.5596644374385136, "learning_rate": 4.774859887155263e-06, "loss": 0.4743, "step": 1069 }, { "epoch": 0.46, "grad_norm": 0.6022292614035627, "learning_rate": 4.7740799883862966e-06, "loss": 0.4998, "step": 1070 }, { "epoch": 0.46, "grad_norm": 0.5685916266892248, "learning_rate": 4.773298805061756e-06, "loss": 0.4877, "step": 1071 }, { "epoch": 0.46, "grad_norm": 0.5962356632555038, "learning_rate": 4.772516337622907e-06, "loss": 0.4977, "step": 1072 }, { "epoch": 0.46, "grad_norm": 0.6113056469088662, "learning_rate": 4.771732586511741e-06, "loss": 0.5129, "step": 1073 }, { "epoch": 0.46, "grad_norm": 0.5729626434260354, "learning_rate": 4.7709475521709745e-06, "loss": 0.5059, "step": 1074 }, { "epoch": 0.46, "grad_norm": 0.5952807746932272, "learning_rate": 4.770161235044047e-06, "loss": 0.4947, "step": 1075 }, { "epoch": 0.46, "grad_norm": 0.583828215474725, "learning_rate": 4.769373635575127e-06, "loss": 0.4919, "step": 1076 }, { "epoch": 0.46, "grad_norm": 0.6213165462594205, "learning_rate": 4.768584754209101e-06, "loss": 0.5161, "step": 1077 }, { "epoch": 0.46, "grad_norm": 0.5695247509557161, "learning_rate": 4.767794591391585e-06, "loss": 0.4995, "step": 1078 }, { "epoch": 0.46, "grad_norm": 0.5741341553365366, "learning_rate": 4.767003147568917e-06, "loss": 0.475, "step": 1079 }, { "epoch": 0.46, "grad_norm": 0.5768424395874503, "learning_rate": 4.766210423188158e-06, "loss": 0.4994, "step": 1080 }, { "epoch": 0.46, "grad_norm": 0.5626374906483355, "learning_rate": 4.765416418697092e-06, "loss": 0.503, "step": 1081 }, { "epoch": 0.46, "grad_norm": 0.5683372940335709, "learning_rate": 4.764621134544229e-06, "loss": 0.5083, "step": 1082 }, { "epoch": 0.46, "grad_norm": 0.9131547521845851, "learning_rate": 4.763824571178798e-06, "loss": 0.4672, "step": 1083 }, { "epoch": 0.46, "grad_norm": 0.5723059333795633, "learning_rate": 4.763026729050752e-06, "loss": 0.483, "step": 1084 }, { "epoch": 0.46, "grad_norm": 0.5906452138705621, "learning_rate": 4.7622276086107685e-06, "loss": 0.5082, "step": 1085 }, { "epoch": 0.46, "grad_norm": 0.5774338186179943, "learning_rate": 4.761427210310244e-06, "loss": 0.4797, "step": 1086 }, { "epoch": 0.46, "grad_norm": 0.5795580232694232, "learning_rate": 4.760625534601299e-06, "loss": 0.4718, "step": 1087 }, { "epoch": 0.46, "grad_norm": 0.5917263527753107, "learning_rate": 4.759822581936773e-06, "loss": 0.4773, "step": 1088 }, { "epoch": 0.47, "grad_norm": 0.5606273835003459, "learning_rate": 4.759018352770229e-06, "loss": 0.4876, "step": 1089 }, { "epoch": 0.47, "grad_norm": 0.5763982202540356, "learning_rate": 4.758212847555953e-06, "loss": 0.476, "step": 1090 }, { "epoch": 0.47, "grad_norm": 0.5761656331674372, "learning_rate": 4.757406066748947e-06, "loss": 0.4526, "step": 1091 }, { "epoch": 0.47, "grad_norm": 0.6337475882937359, "learning_rate": 4.756598010804935e-06, "loss": 0.5034, "step": 1092 }, { "epoch": 0.47, "grad_norm": 0.598332500270868, "learning_rate": 4.755788680180363e-06, "loss": 0.5204, "step": 1093 }, { "epoch": 0.47, "grad_norm": 0.5983336967750388, "learning_rate": 4.754978075332398e-06, "loss": 0.4992, "step": 1094 }, { "epoch": 0.47, "grad_norm": 0.5932701762917508, "learning_rate": 4.7541661967189225e-06, "loss": 0.4924, "step": 1095 }, { "epoch": 0.47, "grad_norm": 0.571498332890339, "learning_rate": 4.7533530447985424e-06, "loss": 0.507, "step": 1096 }, { "epoch": 0.47, "grad_norm": 0.6003239552991582, "learning_rate": 4.752538620030581e-06, "loss": 0.4797, "step": 1097 }, { "epoch": 0.47, "grad_norm": 0.5906168083325332, "learning_rate": 4.7517229228750804e-06, "loss": 0.5193, "step": 1098 }, { "epoch": 0.47, "grad_norm": 0.5931501865913923, "learning_rate": 4.750905953792803e-06, "loss": 0.4812, "step": 1099 }, { "epoch": 0.47, "grad_norm": 0.6290969275442341, "learning_rate": 4.750087713245227e-06, "loss": 0.5057, "step": 1100 }, { "epoch": 0.47, "grad_norm": 0.5604034491261822, "learning_rate": 4.749268201694553e-06, "loss": 0.4678, "step": 1101 }, { "epoch": 0.47, "grad_norm": 0.6014101011761356, "learning_rate": 4.748447419603696e-06, "loss": 0.4999, "step": 1102 }, { "epoch": 0.47, "grad_norm": 0.5926144054570872, "learning_rate": 4.747625367436288e-06, "loss": 0.4803, "step": 1103 }, { "epoch": 0.47, "grad_norm": 0.5820804611550561, "learning_rate": 4.746802045656683e-06, "loss": 0.5047, "step": 1104 }, { "epoch": 0.47, "grad_norm": 0.6087772177843124, "learning_rate": 4.745977454729947e-06, "loss": 0.5042, "step": 1105 }, { "epoch": 0.47, "grad_norm": 0.594424789965164, "learning_rate": 4.7451515951218675e-06, "loss": 0.5102, "step": 1106 }, { "epoch": 0.47, "grad_norm": 0.5731428331636981, "learning_rate": 4.744324467298944e-06, "loss": 0.5191, "step": 1107 }, { "epoch": 0.47, "grad_norm": 0.5657909010178551, "learning_rate": 4.743496071728396e-06, "loss": 0.4829, "step": 1108 }, { "epoch": 0.47, "grad_norm": 0.5670594056702332, "learning_rate": 4.7426664088781585e-06, "loss": 0.4896, "step": 1109 }, { "epoch": 0.47, "grad_norm": 0.6028975438711562, "learning_rate": 4.74183547921688e-06, "loss": 0.4996, "step": 1110 }, { "epoch": 0.47, "grad_norm": 0.6072539145327235, "learning_rate": 4.741003283213928e-06, "loss": 0.5041, "step": 1111 }, { "epoch": 0.48, "grad_norm": 0.5867589851114696, "learning_rate": 4.740169821339381e-06, "loss": 0.4992, "step": 1112 }, { "epoch": 0.48, "grad_norm": 0.586826402405483, "learning_rate": 4.739335094064038e-06, "loss": 0.4616, "step": 1113 }, { "epoch": 0.48, "grad_norm": 0.5943697459318228, "learning_rate": 4.738499101859409e-06, "loss": 0.4942, "step": 1114 }, { "epoch": 0.48, "grad_norm": 0.6118584069615158, "learning_rate": 4.7376618451977195e-06, "loss": 0.4826, "step": 1115 }, { "epoch": 0.48, "grad_norm": 0.5877105797297991, "learning_rate": 4.736823324551909e-06, "loss": 0.4845, "step": 1116 }, { "epoch": 0.48, "grad_norm": 0.6099949628606918, "learning_rate": 4.735983540395631e-06, "loss": 0.4982, "step": 1117 }, { "epoch": 0.48, "grad_norm": 0.5834134958716418, "learning_rate": 4.735142493203253e-06, "loss": 0.4944, "step": 1118 }, { "epoch": 0.48, "grad_norm": 0.5566576275979142, "learning_rate": 4.734300183449856e-06, "loss": 0.5016, "step": 1119 }, { "epoch": 0.48, "grad_norm": 0.6126927082777508, "learning_rate": 4.733456611611233e-06, "loss": 0.5104, "step": 1120 }, { "epoch": 0.48, "grad_norm": 0.5844938513436758, "learning_rate": 4.732611778163894e-06, "loss": 0.5373, "step": 1121 }, { "epoch": 0.48, "grad_norm": 0.5486863613853675, "learning_rate": 4.7317656835850544e-06, "loss": 0.5098, "step": 1122 }, { "epoch": 0.48, "grad_norm": 0.6075051942047663, "learning_rate": 4.73091832835265e-06, "loss": 0.4901, "step": 1123 }, { "epoch": 0.48, "grad_norm": 0.5805346057610045, "learning_rate": 4.730069712945322e-06, "loss": 0.511, "step": 1124 }, { "epoch": 0.48, "grad_norm": 0.5775795364832905, "learning_rate": 4.729219837842427e-06, "loss": 0.4916, "step": 1125 }, { "epoch": 0.48, "grad_norm": 0.5931365026299273, "learning_rate": 4.728368703524034e-06, "loss": 0.4897, "step": 1126 }, { "epoch": 0.48, "grad_norm": 0.5538844072984839, "learning_rate": 4.72751631047092e-06, "loss": 0.4937, "step": 1127 }, { "epoch": 0.48, "grad_norm": 0.585745992804038, "learning_rate": 4.726662659164576e-06, "loss": 0.4791, "step": 1128 }, { "epoch": 0.48, "grad_norm": 0.58228568438699, "learning_rate": 4.725807750087201e-06, "loss": 0.4897, "step": 1129 }, { "epoch": 0.48, "grad_norm": 0.623272001177232, "learning_rate": 4.7249515837217075e-06, "loss": 0.4713, "step": 1130 }, { "epoch": 0.48, "grad_norm": 0.5882528036227618, "learning_rate": 4.724094160551716e-06, "loss": 0.4888, "step": 1131 }, { "epoch": 0.48, "grad_norm": 0.5665571281823603, "learning_rate": 4.7232354810615575e-06, "loss": 0.4634, "step": 1132 }, { "epoch": 0.48, "grad_norm": 0.5547983753857056, "learning_rate": 4.722375545736273e-06, "loss": 0.4908, "step": 1133 }, { "epoch": 0.48, "grad_norm": 0.5637582498617196, "learning_rate": 4.7215143550616124e-06, "loss": 0.4912, "step": 1134 }, { "epoch": 0.48, "grad_norm": 0.5691678446202798, "learning_rate": 4.720651909524037e-06, "loss": 0.5088, "step": 1135 }, { "epoch": 0.49, "grad_norm": 0.6037047316013417, "learning_rate": 4.719788209610711e-06, "loss": 0.5063, "step": 1136 }, { "epoch": 0.49, "eval_loss": 0.49330827593803406, "eval_runtime": 6918.0742, "eval_samples_per_second": 41.975, "eval_steps_per_second": 2.099, "step": 1136 }, { "epoch": 0.49, "grad_norm": 0.7053964822007601, "learning_rate": 4.718923255809514e-06, "loss": 0.4887, "step": 1137 }, { "epoch": 0.49, "grad_norm": 0.6112035128658351, "learning_rate": 4.71805704860903e-06, "loss": 0.515, "step": 1138 }, { "epoch": 0.49, "grad_norm": 0.6144098315190231, "learning_rate": 4.717189588498552e-06, "loss": 0.4933, "step": 1139 }, { "epoch": 0.49, "grad_norm": 0.5899433076087655, "learning_rate": 4.716320875968081e-06, "loss": 0.5082, "step": 1140 }, { "epoch": 0.49, "grad_norm": 0.5827664203359575, "learning_rate": 4.715450911508324e-06, "loss": 0.4962, "step": 1141 }, { "epoch": 0.49, "grad_norm": 0.559390267287684, "learning_rate": 4.714579695610698e-06, "loss": 0.496, "step": 1142 }, { "epoch": 0.49, "grad_norm": 0.5970166193989745, "learning_rate": 4.7137072287673244e-06, "loss": 0.4963, "step": 1143 }, { "epoch": 0.49, "grad_norm": 0.5862371291827102, "learning_rate": 4.712833511471032e-06, "loss": 0.5107, "step": 1144 }, { "epoch": 0.49, "grad_norm": 0.5621562211030774, "learning_rate": 4.711958544215355e-06, "loss": 0.4848, "step": 1145 }, { "epoch": 0.49, "grad_norm": 0.5957285834779159, "learning_rate": 4.711082327494536e-06, "loss": 0.5068, "step": 1146 }, { "epoch": 0.49, "grad_norm": 0.6068577769642044, "learning_rate": 4.710204861803522e-06, "loss": 0.4806, "step": 1147 }, { "epoch": 0.49, "grad_norm": 0.5871604299043601, "learning_rate": 4.709326147637965e-06, "loss": 0.5263, "step": 1148 }, { "epoch": 0.49, "grad_norm": 0.6470536987852363, "learning_rate": 4.708446185494222e-06, "loss": 0.4911, "step": 1149 }, { "epoch": 0.49, "grad_norm": 0.581159959235341, "learning_rate": 4.707564975869357e-06, "loss": 0.5169, "step": 1150 }, { "epoch": 0.49, "grad_norm": 0.5891669395141927, "learning_rate": 4.706682519261137e-06, "loss": 0.4872, "step": 1151 }, { "epoch": 0.49, "grad_norm": 0.6080392901510567, "learning_rate": 4.7057988161680325e-06, "loss": 0.5072, "step": 1152 }, { "epoch": 0.49, "grad_norm": 0.6173464560574663, "learning_rate": 4.704913867089221e-06, "loss": 0.4857, "step": 1153 }, { "epoch": 0.49, "grad_norm": 0.6353962573655269, "learning_rate": 4.704027672524582e-06, "loss": 0.5113, "step": 1154 }, { "epoch": 0.49, "grad_norm": 0.6161269437993665, "learning_rate": 4.703140232974697e-06, "loss": 0.4904, "step": 1155 }, { "epoch": 0.49, "grad_norm": 0.6096752081924692, "learning_rate": 4.7022515489408536e-06, "loss": 0.4925, "step": 1156 }, { "epoch": 0.49, "grad_norm": 0.669481066717579, "learning_rate": 4.701361620925041e-06, "loss": 0.4996, "step": 1157 }, { "epoch": 0.49, "grad_norm": 0.6192786083709042, "learning_rate": 4.700470449429952e-06, "loss": 0.5126, "step": 1158 }, { "epoch": 0.5, "grad_norm": 0.643144004386803, "learning_rate": 4.699578034958981e-06, "loss": 0.4969, "step": 1159 }, { "epoch": 0.5, "grad_norm": 0.5846720216828944, "learning_rate": 4.698684378016223e-06, "loss": 0.4776, "step": 1160 }, { "epoch": 0.5, "grad_norm": 0.5741587597255912, "learning_rate": 4.697789479106479e-06, "loss": 0.4923, "step": 1161 }, { "epoch": 0.5, "grad_norm": 0.5636128681731576, "learning_rate": 4.696893338735246e-06, "loss": 0.4946, "step": 1162 }, { "epoch": 0.5, "grad_norm": 0.5616937977408087, "learning_rate": 4.6959959574087265e-06, "loss": 0.4956, "step": 1163 }, { "epoch": 0.5, "grad_norm": 0.5752353921016059, "learning_rate": 4.695097335633823e-06, "loss": 0.4951, "step": 1164 }, { "epoch": 0.5, "grad_norm": 0.5975316140588034, "learning_rate": 4.694197473918139e-06, "loss": 0.4702, "step": 1165 }, { "epoch": 0.5, "grad_norm": 0.5712072339658532, "learning_rate": 4.693296372769978e-06, "loss": 0.5235, "step": 1166 }, { "epoch": 0.5, "grad_norm": 0.5634240355738215, "learning_rate": 4.692394032698341e-06, "loss": 0.4926, "step": 1167 }, { "epoch": 0.5, "grad_norm": 0.5804932429546033, "learning_rate": 4.691490454212933e-06, "loss": 0.4799, "step": 1168 }, { "epoch": 0.5, "grad_norm": 0.553753907689284, "learning_rate": 4.690585637824158e-06, "loss": 0.4689, "step": 1169 }, { "epoch": 0.5, "grad_norm": 0.5690679915211406, "learning_rate": 4.6896795840431155e-06, "loss": 0.4839, "step": 1170 }, { "epoch": 0.5, "grad_norm": 0.6053719083807559, "learning_rate": 4.688772293381608e-06, "loss": 0.5041, "step": 1171 }, { "epoch": 0.5, "grad_norm": 0.5622187791991081, "learning_rate": 4.687863766352134e-06, "loss": 0.5178, "step": 1172 }, { "epoch": 0.5, "grad_norm": 0.5706828038584673, "learning_rate": 4.686954003467894e-06, "loss": 0.5148, "step": 1173 }, { "epoch": 0.5, "grad_norm": 0.5814193933038267, "learning_rate": 4.686043005242781e-06, "loss": 0.4925, "step": 1174 }, { "epoch": 0.5, "grad_norm": 0.5841581837583776, "learning_rate": 4.685130772191392e-06, "loss": 0.5043, "step": 1175 }, { "epoch": 0.5, "grad_norm": 0.5432146572491381, "learning_rate": 4.684217304829017e-06, "loss": 0.497, "step": 1176 }, { "epoch": 0.5, "grad_norm": 0.6238281748493371, "learning_rate": 4.683302603671644e-06, "loss": 0.5117, "step": 1177 }, { "epoch": 0.5, "grad_norm": 0.6120786023963148, "learning_rate": 4.682386669235959e-06, "loss": 0.5043, "step": 1178 }, { "epoch": 0.5, "grad_norm": 0.6097201971727148, "learning_rate": 4.681469502039345e-06, "loss": 0.4781, "step": 1179 }, { "epoch": 0.5, "grad_norm": 0.6099227512690751, "learning_rate": 4.680551102599881e-06, "loss": 0.484, "step": 1180 }, { "epoch": 0.5, "grad_norm": 0.6118633618285224, "learning_rate": 4.67963147143634e-06, "loss": 0.4973, "step": 1181 }, { "epoch": 0.5, "grad_norm": 0.5963607635102951, "learning_rate": 4.678710609068193e-06, "loss": 0.5189, "step": 1182 }, { "epoch": 0.51, "grad_norm": 0.6095853439513462, "learning_rate": 4.677788516015608e-06, "loss": 0.4954, "step": 1183 }, { "epoch": 0.51, "grad_norm": 0.5621028815997745, "learning_rate": 4.676865192799443e-06, "loss": 0.4872, "step": 1184 }, { "epoch": 0.51, "grad_norm": 0.5945331480044012, "learning_rate": 4.675940639941256e-06, "loss": 0.5155, "step": 1185 }, { "epoch": 0.51, "grad_norm": 0.5753731126662963, "learning_rate": 4.675014857963297e-06, "loss": 0.4699, "step": 1186 }, { "epoch": 0.51, "grad_norm": 0.5747798420504858, "learning_rate": 4.674087847388511e-06, "loss": 0.4952, "step": 1187 }, { "epoch": 0.51, "grad_norm": 0.5749508507302784, "learning_rate": 4.673159608740536e-06, "loss": 0.4896, "step": 1188 }, { "epoch": 0.51, "grad_norm": 0.5809285237560987, "learning_rate": 4.6722301425437056e-06, "loss": 0.4659, "step": 1189 }, { "epoch": 0.51, "grad_norm": 0.6013752084991827, "learning_rate": 4.671299449323045e-06, "loss": 0.5326, "step": 1190 }, { "epoch": 0.51, "grad_norm": 0.6050011396855485, "learning_rate": 4.670367529604274e-06, "loss": 0.5158, "step": 1191 }, { "epoch": 0.51, "grad_norm": 0.5740809770574845, "learning_rate": 4.669434383913803e-06, "loss": 0.4945, "step": 1192 }, { "epoch": 0.51, "grad_norm": 0.6493339789648531, "learning_rate": 4.668500012778738e-06, "loss": 0.5303, "step": 1193 }, { "epoch": 0.51, "grad_norm": 0.5707017426293574, "learning_rate": 4.667564416726875e-06, "loss": 0.5118, "step": 1194 }, { "epoch": 0.51, "grad_norm": 0.5849224346254286, "learning_rate": 4.666627596286702e-06, "loss": 0.489, "step": 1195 }, { "epoch": 0.51, "grad_norm": 0.5747420247370756, "learning_rate": 4.6656895519874e-06, "loss": 0.4815, "step": 1196 }, { "epoch": 0.51, "grad_norm": 0.5840838885906293, "learning_rate": 4.664750284358841e-06, "loss": 0.5038, "step": 1197 }, { "epoch": 0.51, "grad_norm": 0.5865365275558818, "learning_rate": 4.663809793931585e-06, "loss": 0.481, "step": 1198 }, { "epoch": 0.51, "grad_norm": 0.6430958410357724, "learning_rate": 4.662868081236887e-06, "loss": 0.4874, "step": 1199 }, { "epoch": 0.51, "grad_norm": 0.5694523106768161, "learning_rate": 4.66192514680669e-06, "loss": 0.4681, "step": 1200 }, { "epoch": 0.51, "grad_norm": 0.6361578693239596, "learning_rate": 4.660980991173628e-06, "loss": 0.5068, "step": 1201 }, { "epoch": 0.51, "grad_norm": 0.6082855757365931, "learning_rate": 4.660035614871024e-06, "loss": 0.4994, "step": 1202 }, { "epoch": 0.51, "grad_norm": 0.5711249982493317, "learning_rate": 4.659089018432893e-06, "loss": 0.4821, "step": 1203 }, { "epoch": 0.51, "grad_norm": 0.6147166083946165, "learning_rate": 4.658141202393935e-06, "loss": 0.5056, "step": 1204 }, { "epoch": 0.51, "grad_norm": 0.5988711912518265, "learning_rate": 4.657192167289542e-06, "loss": 0.4711, "step": 1205 }, { "epoch": 0.52, "grad_norm": 0.5741231990207563, "learning_rate": 4.6562419136557935e-06, "loss": 0.5082, "step": 1206 }, { "epoch": 0.52, "grad_norm": 0.6046056425511495, "learning_rate": 4.655290442029459e-06, "loss": 0.4681, "step": 1207 }, { "epoch": 0.52, "eval_loss": 0.4913150668144226, "eval_runtime": 6919.579, "eval_samples_per_second": 41.966, "eval_steps_per_second": 2.098, "step": 1207 }, { "epoch": 0.52, "grad_norm": 0.5744167724479108, "learning_rate": 4.654337752947992e-06, "loss": 0.4737, "step": 1208 }, { "epoch": 0.52, "grad_norm": 0.5907262180093524, "learning_rate": 4.653383846949539e-06, "loss": 0.4771, "step": 1209 }, { "epoch": 0.52, "grad_norm": 0.6398591796202622, "learning_rate": 4.652428724572929e-06, "loss": 0.5307, "step": 1210 }, { "epoch": 0.52, "grad_norm": 0.6121432006011495, "learning_rate": 4.6514723863576815e-06, "loss": 0.4952, "step": 1211 }, { "epoch": 0.52, "grad_norm": 0.6039842346032075, "learning_rate": 4.650514832844002e-06, "loss": 0.5106, "step": 1212 }, { "epoch": 0.52, "grad_norm": 0.5945723569093062, "learning_rate": 4.649556064572781e-06, "loss": 0.5187, "step": 1213 }, { "epoch": 0.52, "grad_norm": 0.6127059471922567, "learning_rate": 4.648596082085597e-06, "loss": 0.5023, "step": 1214 }, { "epoch": 0.52, "grad_norm": 0.6148342317689859, "learning_rate": 4.647634885924713e-06, "loss": 0.5041, "step": 1215 }, { "epoch": 0.52, "grad_norm": 0.6171661212182703, "learning_rate": 4.64667247663308e-06, "loss": 0.4656, "step": 1216 }, { "epoch": 0.52, "grad_norm": 0.5811162956002015, "learning_rate": 4.645708854754329e-06, "loss": 0.4793, "step": 1217 }, { "epoch": 0.52, "grad_norm": 0.5897461617987049, "learning_rate": 4.644744020832782e-06, "loss": 0.511, "step": 1218 }, { "epoch": 0.52, "grad_norm": 0.6360033916365583, "learning_rate": 4.6437779754134424e-06, "loss": 0.4959, "step": 1219 }, { "epoch": 0.52, "grad_norm": 0.5920841965277126, "learning_rate": 4.642810719041999e-06, "loss": 0.5078, "step": 1220 }, { "epoch": 0.52, "grad_norm": 0.5764590932419221, "learning_rate": 4.641842252264824e-06, "loss": 0.4816, "step": 1221 }, { "epoch": 0.52, "grad_norm": 0.606347475674508, "learning_rate": 4.640872575628973e-06, "loss": 0.5039, "step": 1222 }, { "epoch": 0.52, "grad_norm": 0.5601452063877889, "learning_rate": 4.639901689682186e-06, "loss": 0.483, "step": 1223 }, { "epoch": 0.52, "grad_norm": 0.60107394353492, "learning_rate": 4.638929594972885e-06, "loss": 0.5017, "step": 1224 }, { "epoch": 0.52, "grad_norm": 0.5832229161651298, "learning_rate": 4.637956292050176e-06, "loss": 0.4881, "step": 1225 }, { "epoch": 0.52, "grad_norm": 0.6230227342266171, "learning_rate": 4.636981781463848e-06, "loss": 0.4771, "step": 1226 }, { "epoch": 0.52, "grad_norm": 0.6267479062825622, "learning_rate": 4.636006063764369e-06, "loss": 0.4898, "step": 1227 }, { "epoch": 0.52, "grad_norm": 0.6267200657967705, "learning_rate": 4.635029139502892e-06, "loss": 0.4901, "step": 1228 }, { "epoch": 0.53, "grad_norm": 0.6484932594616627, "learning_rate": 4.634051009231251e-06, "loss": 0.5206, "step": 1229 }, { "epoch": 0.53, "grad_norm": 0.5931663500547755, "learning_rate": 4.63307167350196e-06, "loss": 0.4702, "step": 1230 }, { "epoch": 0.53, "grad_norm": 0.5860181057294145, "learning_rate": 4.632091132868214e-06, "loss": 0.506, "step": 1231 }, { "epoch": 0.53, "grad_norm": 0.5931231715432055, "learning_rate": 4.631109387883891e-06, "loss": 0.5044, "step": 1232 }, { "epoch": 0.53, "grad_norm": 0.5821409581725687, "learning_rate": 4.630126439103546e-06, "loss": 0.4844, "step": 1233 }, { "epoch": 0.53, "grad_norm": 0.549630228044515, "learning_rate": 4.629142287082416e-06, "loss": 0.4804, "step": 1234 }, { "epoch": 0.53, "grad_norm": 0.5406774405919889, "learning_rate": 4.628156932376419e-06, "loss": 0.4937, "step": 1235 }, { "epoch": 0.53, "grad_norm": 0.5446742036382618, "learning_rate": 4.627170375542147e-06, "loss": 0.4896, "step": 1236 }, { "epoch": 0.53, "grad_norm": 0.5801553090659803, "learning_rate": 4.626182617136877e-06, "loss": 0.5097, "step": 1237 }, { "epoch": 0.53, "grad_norm": 0.547761487753955, "learning_rate": 4.625193657718563e-06, "loss": 0.486, "step": 1238 }, { "epoch": 0.53, "grad_norm": 0.5718577243267876, "learning_rate": 4.624203497845835e-06, "loss": 0.4951, "step": 1239 }, { "epoch": 0.53, "grad_norm": 0.5960930128816431, "learning_rate": 4.623212138078004e-06, "loss": 0.5076, "step": 1240 }, { "epoch": 0.53, "grad_norm": 0.5721738998469788, "learning_rate": 4.622219578975057e-06, "loss": 0.4883, "step": 1241 }, { "epoch": 0.53, "grad_norm": 0.5957161415624224, "learning_rate": 4.62122582109766e-06, "loss": 0.4832, "step": 1242 }, { "epoch": 0.53, "grad_norm": 0.5704351658199357, "learning_rate": 4.620230865007154e-06, "loss": 0.5335, "step": 1243 }, { "epoch": 0.53, "grad_norm": 0.5613480503706471, "learning_rate": 4.619234711265558e-06, "loss": 0.4999, "step": 1244 }, { "epoch": 0.53, "grad_norm": 0.5990237352315096, "learning_rate": 4.61823736043557e-06, "loss": 0.5194, "step": 1245 }, { "epoch": 0.53, "grad_norm": 0.5646085384828234, "learning_rate": 4.617238813080559e-06, "loss": 0.5143, "step": 1246 }, { "epoch": 0.53, "grad_norm": 0.5868684924014452, "learning_rate": 4.616239069764574e-06, "loss": 0.5031, "step": 1247 }, { "epoch": 0.53, "grad_norm": 0.6029024571519792, "learning_rate": 4.615238131052339e-06, "loss": 0.4964, "step": 1248 }, { "epoch": 0.53, "grad_norm": 0.6047661298443577, "learning_rate": 4.614235997509251e-06, "loss": 0.515, "step": 1249 }, { "epoch": 0.53, "grad_norm": 0.5787855934766027, "learning_rate": 4.613232669701384e-06, "loss": 0.5045, "step": 1250 }, { "epoch": 0.53, "grad_norm": 0.5628826075983923, "learning_rate": 4.612228148195486e-06, "loss": 0.4866, "step": 1251 }, { "epoch": 0.53, "grad_norm": 0.5482732141245912, "learning_rate": 4.61122243355898e-06, "loss": 0.4854, "step": 1252 }, { "epoch": 0.54, "grad_norm": 0.5570796537586561, "learning_rate": 4.610215526359961e-06, "loss": 0.4892, "step": 1253 }, { "epoch": 0.54, "grad_norm": 0.5726673029878695, "learning_rate": 4.609207427167201e-06, "loss": 0.4801, "step": 1254 }, { "epoch": 0.54, "grad_norm": 0.6214832894338556, "learning_rate": 4.60819813655014e-06, "loss": 0.5322, "step": 1255 }, { "epoch": 0.54, "grad_norm": 0.5714531549826166, "learning_rate": 4.607187655078896e-06, "loss": 0.4983, "step": 1256 }, { "epoch": 0.54, "grad_norm": 0.5854107865928243, "learning_rate": 4.6061759833242585e-06, "loss": 0.5097, "step": 1257 }, { "epoch": 0.54, "grad_norm": 0.5877571773366539, "learning_rate": 4.605163121857688e-06, "loss": 0.4764, "step": 1258 }, { "epoch": 0.54, "grad_norm": 0.6161387817508228, "learning_rate": 4.604149071251318e-06, "loss": 0.4921, "step": 1259 }, { "epoch": 0.54, "grad_norm": 0.5863778207848662, "learning_rate": 4.603133832077953e-06, "loss": 0.4652, "step": 1260 }, { "epoch": 0.54, "grad_norm": 0.5889518886848664, "learning_rate": 4.602117404911071e-06, "loss": 0.4978, "step": 1261 }, { "epoch": 0.54, "grad_norm": 0.569076904297205, "learning_rate": 4.601099790324817e-06, "loss": 0.5452, "step": 1262 }, { "epoch": 0.54, "grad_norm": 0.5823464063147468, "learning_rate": 4.6000809888940105e-06, "loss": 0.516, "step": 1263 }, { "epoch": 0.54, "grad_norm": 0.5762572387418361, "learning_rate": 4.59906100119414e-06, "loss": 0.4883, "step": 1264 }, { "epoch": 0.54, "grad_norm": 0.5710353785095638, "learning_rate": 4.598039827801364e-06, "loss": 0.5114, "step": 1265 }, { "epoch": 0.54, "grad_norm": 0.5484965886436899, "learning_rate": 4.597017469292511e-06, "loss": 0.4806, "step": 1266 }, { "epoch": 0.54, "grad_norm": 0.5809463207101558, "learning_rate": 4.5959939262450796e-06, "loss": 0.4614, "step": 1267 }, { "epoch": 0.54, "grad_norm": 0.5845778185226627, "learning_rate": 4.594969199237235e-06, "loss": 0.5241, "step": 1268 }, { "epoch": 0.54, "grad_norm": 0.5908229060244614, "learning_rate": 4.593943288847814e-06, "loss": 0.5007, "step": 1269 }, { "epoch": 0.54, "grad_norm": 0.5823679847675488, "learning_rate": 4.592916195656322e-06, "loss": 0.4949, "step": 1270 }, { "epoch": 0.54, "grad_norm": 0.6773394663731939, "learning_rate": 4.591887920242929e-06, "loss": 0.4771, "step": 1271 }, { "epoch": 0.54, "grad_norm": 0.5600458296952812, "learning_rate": 4.590858463188477e-06, "loss": 0.4636, "step": 1272 }, { "epoch": 0.54, "grad_norm": 0.5983807787136752, "learning_rate": 4.589827825074472e-06, "loss": 0.4664, "step": 1273 }, { "epoch": 0.54, "grad_norm": 0.6098172385279027, "learning_rate": 4.58879600648309e-06, "loss": 0.4971, "step": 1274 }, { "epoch": 0.54, "grad_norm": 0.6237066298991583, "learning_rate": 4.587763007997173e-06, "loss": 0.4906, "step": 1275 }, { "epoch": 0.55, "grad_norm": 0.5912938391288759, "learning_rate": 4.586728830200227e-06, "loss": 0.5064, "step": 1276 }, { "epoch": 0.55, "grad_norm": 0.6381599023642128, "learning_rate": 4.585693473676428e-06, "loss": 0.4548, "step": 1277 }, { "epoch": 0.55, "grad_norm": 0.5844695919916449, "learning_rate": 4.584656939010615e-06, "loss": 0.5001, "step": 1278 }, { "epoch": 0.55, "eval_loss": 0.48919913172721863, "eval_runtime": 6918.8774, "eval_samples_per_second": 41.971, "eval_steps_per_second": 2.099, "step": 1278 }, { "epoch": 0.55, "grad_norm": 0.586366005962443, "learning_rate": 4.583619226788294e-06, "loss": 0.5084, "step": 1279 }, { "epoch": 0.55, "grad_norm": 0.5845148658515918, "learning_rate": 4.582580337595636e-06, "loss": 0.4912, "step": 1280 }, { "epoch": 0.55, "grad_norm": 0.6242513167522349, "learning_rate": 4.581540272019476e-06, "loss": 0.519, "step": 1281 }, { "epoch": 0.55, "grad_norm": 0.5978675772328417, "learning_rate": 4.580499030647314e-06, "loss": 0.505, "step": 1282 }, { "epoch": 0.55, "grad_norm": 0.5716562163085582, "learning_rate": 4.579456614067315e-06, "loss": 0.4805, "step": 1283 }, { "epoch": 0.55, "grad_norm": 0.5887843179010744, "learning_rate": 4.578413022868305e-06, "loss": 0.4805, "step": 1284 }, { "epoch": 0.55, "grad_norm": 0.5448511038289022, "learning_rate": 4.577368257639778e-06, "loss": 0.5194, "step": 1285 }, { "epoch": 0.55, "grad_norm": 0.5583269034513156, "learning_rate": 4.576322318971888e-06, "loss": 0.4665, "step": 1286 }, { "epoch": 0.55, "grad_norm": 0.6175638858305248, "learning_rate": 4.575275207455451e-06, "loss": 0.4953, "step": 1287 }, { "epoch": 0.55, "grad_norm": 0.5893120538439389, "learning_rate": 4.5742269236819485e-06, "loss": 0.5078, "step": 1288 }, { "epoch": 0.55, "grad_norm": 0.5759225882638491, "learning_rate": 4.5731774682435225e-06, "loss": 0.4967, "step": 1289 }, { "epoch": 0.55, "grad_norm": 0.6027419371051689, "learning_rate": 4.572126841732977e-06, "loss": 0.4927, "step": 1290 }, { "epoch": 0.55, "grad_norm": 0.5713265952976949, "learning_rate": 4.571075044743778e-06, "loss": 0.4818, "step": 1291 }, { "epoch": 0.55, "grad_norm": 0.6136093959980795, "learning_rate": 4.570022077870051e-06, "loss": 0.4928, "step": 1292 }, { "epoch": 0.55, "grad_norm": 0.5812274200588445, "learning_rate": 4.568967941706584e-06, "loss": 0.461, "step": 1293 }, { "epoch": 0.55, "grad_norm": 0.5705816273671851, "learning_rate": 4.567912636848826e-06, "loss": 0.4902, "step": 1294 }, { "epoch": 0.55, "grad_norm": 0.5683273724782274, "learning_rate": 4.566856163892884e-06, "loss": 0.4751, "step": 1295 }, { "epoch": 0.55, "grad_norm": 0.5783916742928776, "learning_rate": 4.565798523435528e-06, "loss": 0.5014, "step": 1296 }, { "epoch": 0.55, "grad_norm": 0.6008921391069879, "learning_rate": 4.564739716074182e-06, "loss": 0.4751, "step": 1297 }, { "epoch": 0.55, "grad_norm": 0.5977176200664817, "learning_rate": 4.563679742406935e-06, "loss": 0.4861, "step": 1298 }, { "epoch": 0.55, "grad_norm": 0.5819179757150398, "learning_rate": 4.562618603032533e-06, "loss": 0.5001, "step": 1299 }, { "epoch": 0.56, "grad_norm": 0.609985142660628, "learning_rate": 4.561556298550379e-06, "loss": 0.5011, "step": 1300 }, { "epoch": 0.56, "grad_norm": 0.628383300337832, "learning_rate": 4.560492829560535e-06, "loss": 0.4917, "step": 1301 }, { "epoch": 0.56, "grad_norm": 0.556934869126385, "learning_rate": 4.559428196663721e-06, "loss": 0.4939, "step": 1302 }, { "epoch": 0.56, "grad_norm": 0.5753689242601436, "learning_rate": 4.558362400461315e-06, "loss": 0.4798, "step": 1303 }, { "epoch": 0.56, "grad_norm": 0.608958621920625, "learning_rate": 4.55729544155535e-06, "loss": 0.5147, "step": 1304 }, { "epoch": 0.56, "grad_norm": 0.5559662431460121, "learning_rate": 4.556227320548519e-06, "loss": 0.4939, "step": 1305 }, { "epoch": 0.56, "grad_norm": 0.5704357834306742, "learning_rate": 4.555158038044167e-06, "loss": 0.4716, "step": 1306 }, { "epoch": 0.56, "grad_norm": 0.6333282168527011, "learning_rate": 4.5540875946463e-06, "loss": 0.465, "step": 1307 }, { "epoch": 0.56, "grad_norm": 0.6080793831354635, "learning_rate": 4.553015990959577e-06, "loss": 0.4881, "step": 1308 }, { "epoch": 0.56, "grad_norm": 0.5706855311890715, "learning_rate": 4.551943227589314e-06, "loss": 0.4647, "step": 1309 }, { "epoch": 0.56, "grad_norm": 0.5821734570269693, "learning_rate": 4.550869305141478e-06, "loss": 0.5317, "step": 1310 }, { "epoch": 0.56, "grad_norm": 0.5827695023416871, "learning_rate": 4.549794224222697e-06, "loss": 0.5124, "step": 1311 }, { "epoch": 0.56, "grad_norm": 0.5511547366384472, "learning_rate": 4.548717985440247e-06, "loss": 0.4722, "step": 1312 }, { "epoch": 0.56, "grad_norm": 0.6041209279498603, "learning_rate": 4.547640589402063e-06, "loss": 0.5015, "step": 1313 }, { "epoch": 0.56, "grad_norm": 0.5861854523134222, "learning_rate": 4.546562036716732e-06, "loss": 0.4808, "step": 1314 }, { "epoch": 0.56, "grad_norm": 0.5860300360146689, "learning_rate": 4.5454823279934924e-06, "loss": 0.4905, "step": 1315 }, { "epoch": 0.56, "grad_norm": 0.5653121567715198, "learning_rate": 4.5444014638422396e-06, "loss": 0.4877, "step": 1316 }, { "epoch": 0.56, "grad_norm": 0.5828331265940039, "learning_rate": 4.543319444873517e-06, "loss": 0.4966, "step": 1317 }, { "epoch": 0.56, "grad_norm": 0.5524931399027349, "learning_rate": 4.5422362716985255e-06, "loss": 0.4934, "step": 1318 }, { "epoch": 0.56, "grad_norm": 0.5848515428912962, "learning_rate": 4.541151944929114e-06, "loss": 0.4874, "step": 1319 }, { "epoch": 0.56, "grad_norm": 0.5981291916343784, "learning_rate": 4.5400664651777835e-06, "loss": 0.4998, "step": 1320 }, { "epoch": 0.56, "grad_norm": 0.5655813478901666, "learning_rate": 4.538979833057688e-06, "loss": 0.4519, "step": 1321 }, { "epoch": 0.56, "grad_norm": 0.6562430108814516, "learning_rate": 4.537892049182631e-06, "loss": 0.5272, "step": 1322 }, { "epoch": 0.57, "grad_norm": 0.6017818414342826, "learning_rate": 4.536803114167067e-06, "loss": 0.4821, "step": 1323 }, { "epoch": 0.57, "grad_norm": 0.558603956646909, "learning_rate": 4.535713028626101e-06, "loss": 0.4879, "step": 1324 }, { "epoch": 0.57, "grad_norm": 0.5722300913660474, "learning_rate": 4.534621793175488e-06, "loss": 0.4873, "step": 1325 }, { "epoch": 0.57, "grad_norm": 0.5997686300961845, "learning_rate": 4.533529408431632e-06, "loss": 0.5033, "step": 1326 }, { "epoch": 0.57, "grad_norm": 0.5829364793352837, "learning_rate": 4.532435875011586e-06, "loss": 0.5, "step": 1327 }, { "epoch": 0.57, "grad_norm": 0.570165895880547, "learning_rate": 4.531341193533053e-06, "loss": 0.4978, "step": 1328 }, { "epoch": 0.57, "grad_norm": 0.579729697723111, "learning_rate": 4.530245364614384e-06, "loss": 0.4837, "step": 1329 }, { "epoch": 0.57, "grad_norm": 0.5809138960078671, "learning_rate": 4.529148388874577e-06, "loss": 0.5126, "step": 1330 }, { "epoch": 0.57, "grad_norm": 0.5916301742756499, "learning_rate": 4.528050266933279e-06, "loss": 0.4984, "step": 1331 }, { "epoch": 0.57, "grad_norm": 0.5820120794112843, "learning_rate": 4.526950999410785e-06, "loss": 0.4617, "step": 1332 }, { "epoch": 0.57, "grad_norm": 0.5602665039817353, "learning_rate": 4.525850586928036e-06, "loss": 0.4761, "step": 1333 }, { "epoch": 0.57, "grad_norm": 0.569022812704669, "learning_rate": 4.52474903010662e-06, "loss": 0.4734, "step": 1334 }, { "epoch": 0.57, "grad_norm": 0.5942689466703626, "learning_rate": 4.523646329568771e-06, "loss": 0.5025, "step": 1335 }, { "epoch": 0.57, "grad_norm": 0.5652312432297659, "learning_rate": 4.522542485937369e-06, "loss": 0.4848, "step": 1336 }, { "epoch": 0.57, "grad_norm": 0.6230515838830604, "learning_rate": 4.521437499835942e-06, "loss": 0.5154, "step": 1337 }, { "epoch": 0.57, "grad_norm": 0.5789207150950728, "learning_rate": 4.52033137188866e-06, "loss": 0.492, "step": 1338 }, { "epoch": 0.57, "grad_norm": 0.5723737075235371, "learning_rate": 4.519224102720341e-06, "loss": 0.4898, "step": 1339 }, { "epoch": 0.57, "grad_norm": 0.5921182647904031, "learning_rate": 4.518115692956445e-06, "loss": 0.4786, "step": 1340 }, { "epoch": 0.57, "grad_norm": 0.5661075262133396, "learning_rate": 4.517006143223078e-06, "loss": 0.4819, "step": 1341 }, { "epoch": 0.57, "grad_norm": 0.5639183149476357, "learning_rate": 4.515895454146989e-06, "loss": 0.4899, "step": 1342 }, { "epoch": 0.57, "grad_norm": 0.5844775926241916, "learning_rate": 4.514783626355571e-06, "loss": 0.4749, "step": 1343 }, { "epoch": 0.57, "grad_norm": 0.5554143190741964, "learning_rate": 4.513670660476861e-06, "loss": 0.4904, "step": 1344 }, { "epoch": 0.57, "grad_norm": 0.5730633115430818, "learning_rate": 4.512556557139538e-06, "loss": 0.4745, "step": 1345 }, { "epoch": 0.58, "grad_norm": 0.5651701903378715, "learning_rate": 4.5114413169729224e-06, "loss": 0.4944, "step": 1346 }, { "epoch": 0.58, "grad_norm": 0.5926181506414837, "learning_rate": 4.51032494060698e-06, "loss": 0.5002, "step": 1347 }, { "epoch": 0.58, "grad_norm": 0.5603459457356793, "learning_rate": 4.509207428672313e-06, "loss": 0.4831, "step": 1348 }, { "epoch": 0.58, "grad_norm": 0.5864508189913115, "learning_rate": 4.508088781800172e-06, "loss": 0.4759, "step": 1349 }, { "epoch": 0.58, "eval_loss": 0.4873812198638916, "eval_runtime": 6917.3872, "eval_samples_per_second": 41.98, "eval_steps_per_second": 2.099, "step": 1349 }, { "epoch": 0.58, "grad_norm": 0.5924744693021977, "learning_rate": 4.506969000622443e-06, "loss": 0.4916, "step": 1350 }, { "epoch": 0.58, "grad_norm": 0.7833760299298924, "learning_rate": 4.5058480857716554e-06, "loss": 0.4883, "step": 1351 }, { "epoch": 0.58, "grad_norm": 0.6032529206566636, "learning_rate": 4.504726037880978e-06, "loss": 0.4588, "step": 1352 }, { "epoch": 0.58, "grad_norm": 0.5651861924712768, "learning_rate": 4.5036028575842215e-06, "loss": 0.4935, "step": 1353 }, { "epoch": 0.58, "grad_norm": 0.5811213046043772, "learning_rate": 4.502478545515833e-06, "loss": 0.4927, "step": 1354 }, { "epoch": 0.58, "grad_norm": 0.5602131651168747, "learning_rate": 4.501353102310901e-06, "loss": 0.5045, "step": 1355 }, { "epoch": 0.58, "grad_norm": 0.5609065838079279, "learning_rate": 4.500226528605154e-06, "loss": 0.5024, "step": 1356 }, { "epoch": 0.58, "grad_norm": 0.5926162131260848, "learning_rate": 4.499098825034956e-06, "loss": 0.5022, "step": 1357 }, { "epoch": 0.58, "grad_norm": 0.581609311231316, "learning_rate": 4.497969992237312e-06, "loss": 0.5069, "step": 1358 }, { "epoch": 0.58, "grad_norm": 0.5734162550066819, "learning_rate": 4.496840030849864e-06, "loss": 0.4718, "step": 1359 }, { "epoch": 0.58, "grad_norm": 0.5832809240820751, "learning_rate": 4.49570894151089e-06, "loss": 0.4884, "step": 1360 }, { "epoch": 0.58, "grad_norm": 0.5845131849772652, "learning_rate": 4.494576724859307e-06, "loss": 0.5017, "step": 1361 }, { "epoch": 0.58, "grad_norm": 0.5917877188606556, "learning_rate": 4.49344338153467e-06, "loss": 0.5087, "step": 1362 }, { "epoch": 0.58, "grad_norm": 0.5336678295406534, "learning_rate": 4.492308912177166e-06, "loss": 0.5018, "step": 1363 }, { "epoch": 0.58, "grad_norm": 0.5351847391697913, "learning_rate": 4.491173317427622e-06, "loss": 0.4747, "step": 1364 }, { "epoch": 0.58, "grad_norm": 0.598617114856617, "learning_rate": 4.490036597927499e-06, "loss": 0.4894, "step": 1365 }, { "epoch": 0.58, "grad_norm": 0.5814612225515823, "learning_rate": 4.488898754318894e-06, "loss": 0.5018, "step": 1366 }, { "epoch": 0.58, "grad_norm": 0.5887096610888417, "learning_rate": 4.48775978724454e-06, "loss": 0.4913, "step": 1367 }, { "epoch": 0.58, "grad_norm": 0.5831429888356454, "learning_rate": 4.4866196973478e-06, "loss": 0.4789, "step": 1368 }, { "epoch": 0.58, "grad_norm": 0.5920011899513401, "learning_rate": 4.485478485272678e-06, "loss": 0.4933, "step": 1369 }, { "epoch": 0.59, "grad_norm": 0.5646646883743509, "learning_rate": 4.484336151663807e-06, "loss": 0.4657, "step": 1370 }, { "epoch": 0.59, "grad_norm": 0.5498567442056715, "learning_rate": 4.483192697166455e-06, "loss": 0.4762, "step": 1371 }, { "epoch": 0.59, "grad_norm": 0.5802015082280455, "learning_rate": 4.482048122426523e-06, "loss": 0.498, "step": 1372 }, { "epoch": 0.59, "grad_norm": 0.5960641848798551, "learning_rate": 4.480902428090546e-06, "loss": 0.4868, "step": 1373 }, { "epoch": 0.59, "grad_norm": 0.5951553770712328, "learning_rate": 4.4797556148056884e-06, "loss": 0.4796, "step": 1374 }, { "epoch": 0.59, "grad_norm": 0.5704287014934469, "learning_rate": 4.47860768321975e-06, "loss": 0.519, "step": 1375 }, { "epoch": 0.59, "grad_norm": 0.5911943453713712, "learning_rate": 4.477458633981161e-06, "loss": 0.4969, "step": 1376 }, { "epoch": 0.59, "grad_norm": 0.5706191668874873, "learning_rate": 4.476308467738982e-06, "loss": 0.4754, "step": 1377 }, { "epoch": 0.59, "grad_norm": 0.5941395696773826, "learning_rate": 4.4751571851429054e-06, "loss": 0.4993, "step": 1378 }, { "epoch": 0.59, "grad_norm": 0.5947043585408122, "learning_rate": 4.474004786843256e-06, "loss": 0.5004, "step": 1379 }, { "epoch": 0.59, "grad_norm": 0.5573964245620489, "learning_rate": 4.472851273490985e-06, "loss": 0.4914, "step": 1380 }, { "epoch": 0.59, "grad_norm": 0.5600880890049603, "learning_rate": 4.471696645737675e-06, "loss": 0.4713, "step": 1381 }, { "epoch": 0.59, "grad_norm": 0.6082765235407795, "learning_rate": 4.470540904235541e-06, "loss": 0.4619, "step": 1382 }, { "epoch": 0.59, "grad_norm": 0.579769492076148, "learning_rate": 4.469384049637423e-06, "loss": 0.5104, "step": 1383 }, { "epoch": 0.59, "grad_norm": 0.5594418549752033, "learning_rate": 4.468226082596792e-06, "loss": 0.4688, "step": 1384 }, { "epoch": 0.59, "grad_norm": 0.6322690165548234, "learning_rate": 4.467067003767745e-06, "loss": 0.4867, "step": 1385 }, { "epoch": 0.59, "grad_norm": 0.5935187981482064, "learning_rate": 4.465906813805012e-06, "loss": 0.5181, "step": 1386 }, { "epoch": 0.59, "grad_norm": 0.5944767421902712, "learning_rate": 4.464745513363945e-06, "loss": 0.4891, "step": 1387 }, { "epoch": 0.59, "grad_norm": 0.5765100142428179, "learning_rate": 4.463583103100527e-06, "loss": 0.4829, "step": 1388 }, { "epoch": 0.59, "grad_norm": 0.6169183106704456, "learning_rate": 4.462419583671366e-06, "loss": 0.4845, "step": 1389 }, { "epoch": 0.59, "grad_norm": 0.5803513203340209, "learning_rate": 4.4612549557336975e-06, "loss": 0.4658, "step": 1390 }, { "epoch": 0.59, "grad_norm": 0.562253781436203, "learning_rate": 4.460089219945383e-06, "loss": 0.4771, "step": 1391 }, { "epoch": 0.59, "grad_norm": 0.5584896620934483, "learning_rate": 4.458922376964909e-06, "loss": 0.452, "step": 1392 }, { "epoch": 0.6, "grad_norm": 0.5894231534862258, "learning_rate": 4.457754427451389e-06, "loss": 0.4713, "step": 1393 }, { "epoch": 0.6, "grad_norm": 0.5847332266508826, "learning_rate": 4.456585372064559e-06, "loss": 0.4912, "step": 1394 }, { "epoch": 0.6, "grad_norm": 0.5590333023061157, "learning_rate": 4.455415211464783e-06, "loss": 0.4849, "step": 1395 }, { "epoch": 0.6, "grad_norm": 0.5390426978695757, "learning_rate": 4.454243946313047e-06, "loss": 0.5039, "step": 1396 }, { "epoch": 0.6, "grad_norm": 0.5719225582437959, "learning_rate": 4.453071577270961e-06, "loss": 0.4779, "step": 1397 }, { "epoch": 0.6, "grad_norm": 0.5850243105245156, "learning_rate": 4.451898105000759e-06, "loss": 0.4898, "step": 1398 }, { "epoch": 0.6, "grad_norm": 0.5324711572140098, "learning_rate": 4.450723530165299e-06, "loss": 0.4757, "step": 1399 }, { "epoch": 0.6, "grad_norm": 0.5560225385184383, "learning_rate": 4.449547853428061e-06, "loss": 0.4883, "step": 1400 }, { "epoch": 0.6, "grad_norm": 0.5485424264260553, "learning_rate": 4.448371075453147e-06, "loss": 0.4701, "step": 1401 }, { "epoch": 0.6, "grad_norm": 0.5667730723100881, "learning_rate": 4.4471931969052816e-06, "loss": 0.492, "step": 1402 }, { "epoch": 0.6, "grad_norm": 0.6246169641223871, "learning_rate": 4.446014218449811e-06, "loss": 0.4884, "step": 1403 }, { "epoch": 0.6, "grad_norm": 0.5799291856487485, "learning_rate": 4.444834140752702e-06, "loss": 0.4933, "step": 1404 }, { "epoch": 0.6, "grad_norm": 0.5851580198155035, "learning_rate": 4.443652964480544e-06, "loss": 0.4821, "step": 1405 }, { "epoch": 0.6, "grad_norm": 0.6056180941381275, "learning_rate": 4.442470690300546e-06, "loss": 0.4947, "step": 1406 }, { "epoch": 0.6, "grad_norm": 0.5758464475466127, "learning_rate": 4.441287318880537e-06, "loss": 0.5095, "step": 1407 }, { "epoch": 0.6, "grad_norm": 0.6109440245331582, "learning_rate": 4.4401028508889645e-06, "loss": 0.4702, "step": 1408 }, { "epoch": 0.6, "grad_norm": 0.5763086784688934, "learning_rate": 4.4389172869949e-06, "loss": 0.4869, "step": 1409 }, { "epoch": 0.6, "grad_norm": 0.6069906437238171, "learning_rate": 4.437730627868028e-06, "loss": 0.4947, "step": 1410 }, { "epoch": 0.6, "grad_norm": 0.5927086729110405, "learning_rate": 4.4365428741786554e-06, "loss": 0.4606, "step": 1411 }, { "epoch": 0.6, "grad_norm": 0.6090925352724758, "learning_rate": 4.435354026597707e-06, "loss": 0.4914, "step": 1412 }, { "epoch": 0.6, "grad_norm": 0.5746470686497894, "learning_rate": 4.434164085796724e-06, "loss": 0.4891, "step": 1413 }, { "epoch": 0.6, "grad_norm": 0.6111669276029154, "learning_rate": 4.432973052447868e-06, "loss": 0.4752, "step": 1414 }, { "epoch": 0.6, "grad_norm": 0.5736428596336413, "learning_rate": 4.4317809272239145e-06, "loss": 0.4849, "step": 1415 }, { "epoch": 0.6, "grad_norm": 0.5279491616141355, "learning_rate": 4.430587710798257e-06, "loss": 0.4732, "step": 1416 }, { "epoch": 0.61, "grad_norm": 0.5799544850676969, "learning_rate": 4.429393403844906e-06, "loss": 0.5072, "step": 1417 }, { "epoch": 0.61, "grad_norm": 0.5345098644918358, "learning_rate": 4.428198007038489e-06, "loss": 0.4731, "step": 1418 }, { "epoch": 0.61, "grad_norm": 0.5979310441080966, "learning_rate": 4.427001521054245e-06, "loss": 0.4888, "step": 1419 }, { "epoch": 0.61, "grad_norm": 0.580732593585383, "learning_rate": 4.425803946568033e-06, "loss": 0.4849, "step": 1420 }, { "epoch": 0.61, "eval_loss": 0.4855799674987793, "eval_runtime": 6914.623, "eval_samples_per_second": 41.996, "eval_steps_per_second": 2.1, "step": 1420 }, { "epoch": 0.61, "grad_norm": 0.5404083046391809, "learning_rate": 4.424605284256323e-06, "loss": 0.4898, "step": 1421 }, { "epoch": 0.61, "grad_norm": 0.5484706120901545, "learning_rate": 4.423405534796204e-06, "loss": 0.4973, "step": 1422 }, { "epoch": 0.61, "grad_norm": 0.5788605549686905, "learning_rate": 4.422204698865374e-06, "loss": 0.4837, "step": 1423 }, { "epoch": 0.61, "grad_norm": 0.6106952748012678, "learning_rate": 4.421002777142148e-06, "loss": 0.5014, "step": 1424 }, { "epoch": 0.61, "grad_norm": 0.5448876773549647, "learning_rate": 4.419799770305453e-06, "loss": 0.4775, "step": 1425 }, { "epoch": 0.61, "grad_norm": 0.5628932799070395, "learning_rate": 4.41859567903483e-06, "loss": 0.4877, "step": 1426 }, { "epoch": 0.61, "grad_norm": 0.58401703660086, "learning_rate": 4.417390504010432e-06, "loss": 0.5241, "step": 1427 }, { "epoch": 0.61, "grad_norm": 0.5916298435437614, "learning_rate": 4.416184245913022e-06, "loss": 0.4664, "step": 1428 }, { "epoch": 0.61, "grad_norm": 0.5713018784420761, "learning_rate": 4.41497690542398e-06, "loss": 0.5127, "step": 1429 }, { "epoch": 0.61, "grad_norm": 0.5997368182029638, "learning_rate": 4.413768483225292e-06, "loss": 0.4928, "step": 1430 }, { "epoch": 0.61, "grad_norm": 0.5793090886150154, "learning_rate": 4.4125589799995585e-06, "loss": 0.4658, "step": 1431 }, { "epoch": 0.61, "grad_norm": 0.563370967957538, "learning_rate": 4.411348396429989e-06, "loss": 0.4933, "step": 1432 }, { "epoch": 0.61, "grad_norm": 0.5541515539221555, "learning_rate": 4.410136733200404e-06, "loss": 0.4635, "step": 1433 }, { "epoch": 0.61, "grad_norm": 0.6348388776122461, "learning_rate": 4.4089239909952335e-06, "loss": 0.5072, "step": 1434 }, { "epoch": 0.61, "grad_norm": 0.5613188088037165, "learning_rate": 4.407710170499517e-06, "loss": 0.4722, "step": 1435 }, { "epoch": 0.61, "grad_norm": 0.5702782150061315, "learning_rate": 4.406495272398903e-06, "loss": 0.4601, "step": 1436 }, { "epoch": 0.61, "grad_norm": 0.5706119901840532, "learning_rate": 4.405279297379648e-06, "loss": 0.4873, "step": 1437 }, { "epoch": 0.61, "grad_norm": 0.6397976458766784, "learning_rate": 4.404062246128621e-06, "loss": 0.4943, "step": 1438 }, { "epoch": 0.61, "grad_norm": 0.5973771040772421, "learning_rate": 4.4028441193332914e-06, "loss": 0.5085, "step": 1439 }, { "epoch": 0.62, "grad_norm": 0.5914091770327425, "learning_rate": 4.401624917681743e-06, "loss": 0.4922, "step": 1440 }, { "epoch": 0.62, "grad_norm": 0.6055784851510099, "learning_rate": 4.400404641862664e-06, "loss": 0.4745, "step": 1441 }, { "epoch": 0.62, "grad_norm": 0.5764544930402707, "learning_rate": 4.399183292565347e-06, "loss": 0.4857, "step": 1442 }, { "epoch": 0.62, "grad_norm": 0.5996098894523001, "learning_rate": 4.397960870479696e-06, "loss": 0.4978, "step": 1443 }, { "epoch": 0.62, "grad_norm": 0.5521355170405737, "learning_rate": 4.396737376296218e-06, "loss": 0.4587, "step": 1444 }, { "epoch": 0.62, "grad_norm": 0.59045938437032, "learning_rate": 4.395512810706026e-06, "loss": 0.4908, "step": 1445 }, { "epoch": 0.62, "grad_norm": 0.583988820333397, "learning_rate": 4.394287174400838e-06, "loss": 0.4869, "step": 1446 }, { "epoch": 0.62, "grad_norm": 0.6065684858025009, "learning_rate": 4.393060468072976e-06, "loss": 0.4892, "step": 1447 }, { "epoch": 0.62, "grad_norm": 0.5703024949579703, "learning_rate": 4.3918326924153685e-06, "loss": 0.4759, "step": 1448 }, { "epoch": 0.62, "grad_norm": 0.5745917135551377, "learning_rate": 4.390603848121546e-06, "loss": 0.4869, "step": 1449 }, { "epoch": 0.62, "grad_norm": 0.5720705945602651, "learning_rate": 4.3893739358856465e-06, "loss": 0.4663, "step": 1450 }, { "epoch": 0.62, "grad_norm": 0.5748475515933826, "learning_rate": 4.388142956402405e-06, "loss": 0.4981, "step": 1451 }, { "epoch": 0.62, "grad_norm": 1.1756158657989677, "learning_rate": 4.3869109103671645e-06, "loss": 0.5071, "step": 1452 }, { "epoch": 0.62, "grad_norm": 0.5696921062936988, "learning_rate": 4.385677798475868e-06, "loss": 0.4764, "step": 1453 }, { "epoch": 0.62, "grad_norm": 0.557690101161529, "learning_rate": 4.384443621425062e-06, "loss": 0.5103, "step": 1454 }, { "epoch": 0.62, "grad_norm": 0.5542852620725858, "learning_rate": 4.383208379911893e-06, "loss": 0.4835, "step": 1455 }, { "epoch": 0.62, "grad_norm": 0.5402197457616243, "learning_rate": 4.38197207463411e-06, "loss": 0.4963, "step": 1456 }, { "epoch": 0.62, "grad_norm": 0.584415569246153, "learning_rate": 4.380734706290063e-06, "loss": 0.4835, "step": 1457 }, { "epoch": 0.62, "grad_norm": 0.5483678859464396, "learning_rate": 4.379496275578701e-06, "loss": 0.489, "step": 1458 }, { "epoch": 0.62, "grad_norm": 0.6061260112046732, "learning_rate": 4.378256783199575e-06, "loss": 0.507, "step": 1459 }, { "epoch": 0.62, "grad_norm": 0.5542441873465928, "learning_rate": 4.377016229852836e-06, "loss": 0.4915, "step": 1460 }, { "epoch": 0.62, "grad_norm": 0.5545139083535812, "learning_rate": 4.375774616239231e-06, "loss": 0.5185, "step": 1461 }, { "epoch": 0.62, "grad_norm": 0.552239627065928, "learning_rate": 4.374531943060109e-06, "loss": 0.4783, "step": 1462 }, { "epoch": 0.63, "grad_norm": 0.5335369265994653, "learning_rate": 4.373288211017418e-06, "loss": 0.454, "step": 1463 }, { "epoch": 0.63, "grad_norm": 0.5454066478470697, "learning_rate": 4.3720434208137015e-06, "loss": 0.4824, "step": 1464 }, { "epoch": 0.63, "grad_norm": 0.6037061519362923, "learning_rate": 4.370797573152101e-06, "loss": 0.4977, "step": 1465 }, { "epoch": 0.63, "grad_norm": 0.5805121340778033, "learning_rate": 4.369550668736358e-06, "loss": 0.4894, "step": 1466 }, { "epoch": 0.63, "grad_norm": 0.551429386005362, "learning_rate": 4.3683027082708085e-06, "loss": 0.4612, "step": 1467 }, { "epoch": 0.63, "grad_norm": 0.5772151081375165, "learning_rate": 4.3670536924603855e-06, "loss": 0.4893, "step": 1468 }, { "epoch": 0.63, "grad_norm": 0.5600194793378279, "learning_rate": 4.365803622010618e-06, "loss": 0.4956, "step": 1469 }, { "epoch": 0.63, "grad_norm": 0.5530119828515296, "learning_rate": 4.364552497627632e-06, "loss": 0.4612, "step": 1470 }, { "epoch": 0.63, "grad_norm": 0.5805548135765888, "learning_rate": 4.363300320018148e-06, "loss": 0.4659, "step": 1471 }, { "epoch": 0.63, "grad_norm": 0.5501610854660056, "learning_rate": 4.36204708988948e-06, "loss": 0.4842, "step": 1472 }, { "epoch": 0.63, "grad_norm": 0.5997289615402112, "learning_rate": 4.36079280794954e-06, "loss": 0.52, "step": 1473 }, { "epoch": 0.63, "grad_norm": 0.5805877442370376, "learning_rate": 4.359537474906831e-06, "loss": 0.487, "step": 1474 }, { "epoch": 0.63, "grad_norm": 0.558823842875777, "learning_rate": 4.35828109147045e-06, "loss": 0.4664, "step": 1475 }, { "epoch": 0.63, "grad_norm": 0.5889535016987915, "learning_rate": 4.357023658350089e-06, "loss": 0.4989, "step": 1476 }, { "epoch": 0.63, "grad_norm": 0.6277448104948404, "learning_rate": 4.3557651762560316e-06, "loss": 0.5118, "step": 1477 }, { "epoch": 0.63, "grad_norm": 0.5679808700739567, "learning_rate": 4.3545056458991556e-06, "loss": 0.463, "step": 1478 }, { "epoch": 0.63, "grad_norm": 0.5962236218865731, "learning_rate": 4.353245067990928e-06, "loss": 0.4798, "step": 1479 }, { "epoch": 0.63, "grad_norm": 0.5808142134901263, "learning_rate": 4.3519834432434095e-06, "loss": 0.514, "step": 1480 }, { "epoch": 0.63, "grad_norm": 0.5889258035229223, "learning_rate": 4.350720772369252e-06, "loss": 0.5016, "step": 1481 }, { "epoch": 0.63, "grad_norm": 0.5618362131992736, "learning_rate": 4.3494570560817e-06, "loss": 0.4861, "step": 1482 }, { "epoch": 0.63, "grad_norm": 0.5693422718547901, "learning_rate": 4.348192295094585e-06, "loss": 0.4737, "step": 1483 }, { "epoch": 0.63, "grad_norm": 0.5554250264653801, "learning_rate": 4.346926490122329e-06, "loss": 0.4959, "step": 1484 }, { "epoch": 0.63, "grad_norm": 0.5541704411153275, "learning_rate": 4.345659641879948e-06, "loss": 0.489, "step": 1485 }, { "epoch": 0.63, "grad_norm": 0.5749556736212126, "learning_rate": 4.344391751083043e-06, "loss": 0.4919, "step": 1486 }, { "epoch": 0.64, "grad_norm": 0.5815536068592329, "learning_rate": 4.343122818447804e-06, "loss": 0.478, "step": 1487 }, { "epoch": 0.64, "grad_norm": 0.5807918654560094, "learning_rate": 4.341852844691012e-06, "loss": 0.4893, "step": 1488 }, { "epoch": 0.64, "grad_norm": 0.5772854343547457, "learning_rate": 4.340581830530036e-06, "loss": 0.4883, "step": 1489 }, { "epoch": 0.64, "grad_norm": 0.6083000277946388, "learning_rate": 4.33930977668283e-06, "loss": 0.5033, "step": 1490 }, { "epoch": 0.64, "grad_norm": 0.5740926343714129, "learning_rate": 4.338036683867936e-06, "loss": 0.4943, "step": 1491 }, { "epoch": 0.64, "eval_loss": 0.4840275049209595, "eval_runtime": 6927.0119, "eval_samples_per_second": 41.921, "eval_steps_per_second": 2.096, "step": 1491 }, { "epoch": 0.64, "grad_norm": 0.5684026018424353, "learning_rate": 4.336762552804485e-06, "loss": 0.4718, "step": 1492 }, { "epoch": 0.64, "grad_norm": 0.562055744831237, "learning_rate": 4.335487384212194e-06, "loss": 0.5053, "step": 1493 }, { "epoch": 0.64, "grad_norm": 0.5816786952791797, "learning_rate": 4.334211178811364e-06, "loss": 0.4864, "step": 1494 }, { "epoch": 0.64, "grad_norm": 0.543579418092536, "learning_rate": 4.332933937322883e-06, "loss": 0.4937, "step": 1495 }, { "epoch": 0.64, "grad_norm": 0.576270682270156, "learning_rate": 4.331655660468224e-06, "loss": 0.4587, "step": 1496 }, { "epoch": 0.64, "grad_norm": 0.5770731512199584, "learning_rate": 4.330376348969445e-06, "loss": 0.4996, "step": 1497 }, { "epoch": 0.64, "grad_norm": 0.5523724826628789, "learning_rate": 4.329096003549189e-06, "loss": 0.4748, "step": 1498 }, { "epoch": 0.64, "grad_norm": 0.603518080716257, "learning_rate": 4.3278146249306825e-06, "loss": 0.4974, "step": 1499 }, { "epoch": 0.64, "grad_norm": 0.5503858002021573, "learning_rate": 4.326532213837735e-06, "loss": 0.4927, "step": 1500 }, { "epoch": 0.64, "grad_norm": 0.5464005046510432, "learning_rate": 4.325248770994741e-06, "loss": 0.4834, "step": 1501 }, { "epoch": 0.64, "grad_norm": 0.591865173361713, "learning_rate": 4.323964297126675e-06, "loss": 0.4651, "step": 1502 }, { "epoch": 0.64, "grad_norm": 0.5839558098791746, "learning_rate": 4.3226787929590965e-06, "loss": 0.4726, "step": 1503 }, { "epoch": 0.64, "grad_norm": 0.5870418970742052, "learning_rate": 4.3213922592181455e-06, "loss": 0.4996, "step": 1504 }, { "epoch": 0.64, "grad_norm": 0.5949338157726647, "learning_rate": 4.320104696630544e-06, "loss": 0.4882, "step": 1505 }, { "epoch": 0.64, "grad_norm": 0.5822457164207244, "learning_rate": 4.318816105923596e-06, "loss": 0.4999, "step": 1506 }, { "epoch": 0.64, "grad_norm": 0.6042290491386848, "learning_rate": 4.317526487825185e-06, "loss": 0.4743, "step": 1507 }, { "epoch": 0.64, "grad_norm": 0.576454722720242, "learning_rate": 4.316235843063775e-06, "loss": 0.5038, "step": 1508 }, { "epoch": 0.64, "grad_norm": 0.5715912329774304, "learning_rate": 4.314944172368411e-06, "loss": 0.5029, "step": 1509 }, { "epoch": 0.65, "grad_norm": 0.5530105706856633, "learning_rate": 4.3136514764687155e-06, "loss": 0.4861, "step": 1510 }, { "epoch": 0.65, "grad_norm": 0.5500767838459567, "learning_rate": 4.312357756094892e-06, "loss": 0.4858, "step": 1511 }, { "epoch": 0.65, "grad_norm": 0.54693416081482, "learning_rate": 4.311063011977723e-06, "loss": 0.5017, "step": 1512 }, { "epoch": 0.65, "grad_norm": 0.5419499341904167, "learning_rate": 4.309767244848567e-06, "loss": 0.4891, "step": 1513 }, { "epoch": 0.65, "grad_norm": 0.5614950534369257, "learning_rate": 4.308470455439362e-06, "loss": 0.4971, "step": 1514 }, { "epoch": 0.65, "grad_norm": 0.5349156359990223, "learning_rate": 4.3071726444826244e-06, "loss": 0.4777, "step": 1515 }, { "epoch": 0.65, "grad_norm": 0.6003648657933892, "learning_rate": 4.305873812711445e-06, "loss": 0.4883, "step": 1516 }, { "epoch": 0.65, "grad_norm": 0.5561939266607488, "learning_rate": 4.304573960859493e-06, "loss": 0.477, "step": 1517 }, { "epoch": 0.65, "grad_norm": 0.5772579264539893, "learning_rate": 4.303273089661013e-06, "loss": 0.4636, "step": 1518 }, { "epoch": 0.65, "grad_norm": 0.5555633091494279, "learning_rate": 4.301971199850826e-06, "loss": 0.4598, "step": 1519 }, { "epoch": 0.65, "grad_norm": 0.5909063796287866, "learning_rate": 4.300668292164329e-06, "loss": 0.4704, "step": 1520 }, { "epoch": 0.65, "grad_norm": 0.5321519349592307, "learning_rate": 4.299364367337493e-06, "loss": 0.4726, "step": 1521 }, { "epoch": 0.65, "grad_norm": 0.5948749563634228, "learning_rate": 4.298059426106864e-06, "loss": 0.5042, "step": 1522 }, { "epoch": 0.65, "grad_norm": 0.5590529380799898, "learning_rate": 4.29675346920956e-06, "loss": 0.4771, "step": 1523 }, { "epoch": 0.65, "grad_norm": 0.573778432963487, "learning_rate": 4.2954464973832765e-06, "loss": 0.4841, "step": 1524 }, { "epoch": 0.65, "grad_norm": 0.5531613083984294, "learning_rate": 4.29413851136628e-06, "loss": 0.4695, "step": 1525 }, { "epoch": 0.65, "grad_norm": 0.5612844436545967, "learning_rate": 4.292829511897409e-06, "loss": 0.4804, "step": 1526 }, { "epoch": 0.65, "grad_norm": 0.7125770424238018, "learning_rate": 4.2915194997160774e-06, "loss": 0.4788, "step": 1527 }, { "epoch": 0.65, "grad_norm": 0.6254458428657401, "learning_rate": 4.2902084755622685e-06, "loss": 0.4875, "step": 1528 }, { "epoch": 0.65, "grad_norm": 0.5699982607183243, "learning_rate": 4.288896440176539e-06, "loss": 0.5127, "step": 1529 }, { "epoch": 0.65, "grad_norm": 0.5359705267411138, "learning_rate": 4.287583394300016e-06, "loss": 0.4592, "step": 1530 }, { "epoch": 0.65, "grad_norm": 0.573098209766848, "learning_rate": 4.286269338674396e-06, "loss": 0.4872, "step": 1531 }, { "epoch": 0.65, "grad_norm": 0.582145535892815, "learning_rate": 4.284954274041949e-06, "loss": 0.4932, "step": 1532 }, { "epoch": 0.65, "grad_norm": 0.569146953738728, "learning_rate": 4.283638201145512e-06, "loss": 0.4842, "step": 1533 }, { "epoch": 0.66, "grad_norm": 0.6060270899238028, "learning_rate": 4.282321120728493e-06, "loss": 0.5012, "step": 1534 }, { "epoch": 0.66, "grad_norm": 0.5636460053991561, "learning_rate": 4.28100303353487e-06, "loss": 0.4912, "step": 1535 }, { "epoch": 0.66, "grad_norm": 0.5594354604572797, "learning_rate": 4.279683940309187e-06, "loss": 0.4742, "step": 1536 }, { "epoch": 0.66, "grad_norm": 0.6148223579588533, "learning_rate": 4.278363841796559e-06, "loss": 0.4705, "step": 1537 }, { "epoch": 0.66, "grad_norm": 0.5510025520117999, "learning_rate": 4.277042738742668e-06, "loss": 0.4813, "step": 1538 }, { "epoch": 0.66, "grad_norm": 0.6189865762055806, "learning_rate": 4.2757206318937625e-06, "loss": 0.5076, "step": 1539 }, { "epoch": 0.66, "grad_norm": 0.5549678033446015, "learning_rate": 4.274397521996658e-06, "loss": 0.5005, "step": 1540 }, { "epoch": 0.66, "grad_norm": 0.5444972036743465, "learning_rate": 4.27307340979874e-06, "loss": 0.4738, "step": 1541 }, { "epoch": 0.66, "grad_norm": 0.5863310201010252, "learning_rate": 4.271748296047953e-06, "loss": 0.5036, "step": 1542 }, { "epoch": 0.66, "grad_norm": 0.5834850598785314, "learning_rate": 4.270422181492815e-06, "loss": 0.4789, "step": 1543 }, { "epoch": 0.66, "grad_norm": 0.5617538920507555, "learning_rate": 4.269095066882406e-06, "loss": 0.4632, "step": 1544 }, { "epoch": 0.66, "grad_norm": 0.5907455073189339, "learning_rate": 4.267766952966369e-06, "loss": 0.4953, "step": 1545 }, { "epoch": 0.66, "grad_norm": 0.5983165451822997, "learning_rate": 4.266437840494915e-06, "loss": 0.4796, "step": 1546 }, { "epoch": 0.66, "grad_norm": 0.569812624676123, "learning_rate": 4.265107730218817e-06, "loss": 0.4514, "step": 1547 }, { "epoch": 0.66, "grad_norm": 0.5847228353394706, "learning_rate": 4.2637766228894115e-06, "loss": 0.4656, "step": 1548 }, { "epoch": 0.66, "grad_norm": 0.5925087756170236, "learning_rate": 4.2624445192585994e-06, "loss": 0.5031, "step": 1549 }, { "epoch": 0.66, "grad_norm": 0.5703001753744149, "learning_rate": 4.261111420078844e-06, "loss": 0.4648, "step": 1550 }, { "epoch": 0.66, "grad_norm": 0.5777371677357935, "learning_rate": 4.259777326103169e-06, "loss": 0.5158, "step": 1551 }, { "epoch": 0.66, "grad_norm": 0.6186779369597184, "learning_rate": 4.258442238085164e-06, "loss": 0.4643, "step": 1552 }, { "epoch": 0.66, "grad_norm": 0.64721013399992, "learning_rate": 4.2571061567789766e-06, "loss": 0.4947, "step": 1553 }, { "epoch": 0.66, "grad_norm": 0.536482893670931, "learning_rate": 4.255769082939316e-06, "loss": 0.4661, "step": 1554 }, { "epoch": 0.66, "grad_norm": 0.5970665352812153, "learning_rate": 4.2544310173214546e-06, "loss": 0.4676, "step": 1555 }, { "epoch": 0.66, "grad_norm": 0.6564985474844536, "learning_rate": 4.253091960681222e-06, "loss": 0.4875, "step": 1556 }, { "epoch": 0.67, "grad_norm": 0.5742936073076695, "learning_rate": 4.251751913775009e-06, "loss": 0.4685, "step": 1557 }, { "epoch": 0.67, "grad_norm": 0.5791371405621296, "learning_rate": 4.250410877359765e-06, "loss": 0.4816, "step": 1558 }, { "epoch": 0.67, "grad_norm": 0.5879477020083694, "learning_rate": 4.2490688521930005e-06, "loss": 0.4648, "step": 1559 }, { "epoch": 0.67, "grad_norm": 0.5644425455654086, "learning_rate": 4.247725839032781e-06, "loss": 0.4979, "step": 1560 }, { "epoch": 0.67, "grad_norm": 0.5816219717795591, "learning_rate": 4.246381838637733e-06, "loss": 0.4767, "step": 1561 }, { "epoch": 0.67, "grad_norm": 0.6181448897628907, "learning_rate": 4.245036851767039e-06, "loss": 0.5092, "step": 1562 }, { "epoch": 0.67, "eval_loss": 0.4825836420059204, "eval_runtime": 6927.1849, "eval_samples_per_second": 41.92, "eval_steps_per_second": 2.096, "step": 1562 }, { "epoch": 0.67, "grad_norm": 0.6319339756032352, "learning_rate": 4.243690879180441e-06, "loss": 0.4999, "step": 1563 }, { "epoch": 0.67, "grad_norm": 0.5766409130292584, "learning_rate": 4.242343921638235e-06, "loss": 0.4912, "step": 1564 }, { "epoch": 0.67, "grad_norm": 0.6169305231670678, "learning_rate": 4.240995979901273e-06, "loss": 0.5037, "step": 1565 }, { "epoch": 0.67, "grad_norm": 0.6002589034049418, "learning_rate": 4.239647054730966e-06, "loss": 0.506, "step": 1566 }, { "epoch": 0.67, "grad_norm": 0.6023521389889079, "learning_rate": 4.238297146889281e-06, "loss": 0.5257, "step": 1567 }, { "epoch": 0.67, "grad_norm": 0.5158122792594769, "learning_rate": 4.236946257138734e-06, "loss": 0.441, "step": 1568 }, { "epoch": 0.67, "grad_norm": 0.5495762183794795, "learning_rate": 4.235594386242403e-06, "loss": 0.4743, "step": 1569 }, { "epoch": 0.67, "grad_norm": 0.6628519895823336, "learning_rate": 4.234241534963916e-06, "loss": 0.5272, "step": 1570 }, { "epoch": 0.67, "grad_norm": 0.5959806073232053, "learning_rate": 4.232887704067455e-06, "loss": 0.4951, "step": 1571 }, { "epoch": 0.67, "grad_norm": 0.5572508686653875, "learning_rate": 4.231532894317757e-06, "loss": 0.4923, "step": 1572 }, { "epoch": 0.67, "grad_norm": 0.5899131939103285, "learning_rate": 4.23017710648011e-06, "loss": 0.4837, "step": 1573 }, { "epoch": 0.67, "grad_norm": 0.6064850421779246, "learning_rate": 4.228820341320356e-06, "loss": 0.5005, "step": 1574 }, { "epoch": 0.67, "grad_norm": 0.5780537164257747, "learning_rate": 4.227462599604889e-06, "loss": 0.4785, "step": 1575 }, { "epoch": 0.67, "grad_norm": 0.5810376085859065, "learning_rate": 4.226103882100654e-06, "loss": 0.4714, "step": 1576 }, { "epoch": 0.67, "grad_norm": 0.5909147691169829, "learning_rate": 4.224744189575148e-06, "loss": 0.5021, "step": 1577 }, { "epoch": 0.67, "grad_norm": 0.5942835635277367, "learning_rate": 4.2233835227964145e-06, "loss": 0.4924, "step": 1578 }, { "epoch": 0.67, "grad_norm": 0.5618720908082389, "learning_rate": 4.222021882533056e-06, "loss": 0.4724, "step": 1579 }, { "epoch": 0.68, "grad_norm": 0.5522683879852052, "learning_rate": 4.220659269554217e-06, "loss": 0.476, "step": 1580 }, { "epoch": 0.68, "grad_norm": 0.5626316433669417, "learning_rate": 4.219295684629595e-06, "loss": 0.5038, "step": 1581 }, { "epoch": 0.68, "grad_norm": 0.5832875815941423, "learning_rate": 4.217931128529436e-06, "loss": 0.4942, "step": 1582 }, { "epoch": 0.68, "grad_norm": 0.5515392841114919, "learning_rate": 4.216565602024533e-06, "loss": 0.4832, "step": 1583 }, { "epoch": 0.68, "grad_norm": 0.6821325914674553, "learning_rate": 4.2151991058862314e-06, "loss": 0.4696, "step": 1584 }, { "epoch": 0.68, "grad_norm": 0.5918911213299775, "learning_rate": 4.21383164088642e-06, "loss": 0.4981, "step": 1585 }, { "epoch": 0.68, "grad_norm": 0.567039525371673, "learning_rate": 4.212463207797535e-06, "loss": 0.474, "step": 1586 }, { "epoch": 0.68, "grad_norm": 0.5743234762161183, "learning_rate": 4.211093807392562e-06, "loss": 0.4675, "step": 1587 }, { "epoch": 0.68, "grad_norm": 0.5855603678522909, "learning_rate": 4.209723440445032e-06, "loss": 0.4796, "step": 1588 }, { "epoch": 0.68, "grad_norm": 0.5500461666318762, "learning_rate": 4.208352107729021e-06, "loss": 0.4844, "step": 1589 }, { "epoch": 0.68, "grad_norm": 0.5791356596651714, "learning_rate": 4.206979810019153e-06, "loss": 0.4769, "step": 1590 }, { "epoch": 0.68, "grad_norm": 0.5943166707550008, "learning_rate": 4.205606548090593e-06, "loss": 0.4842, "step": 1591 }, { "epoch": 0.68, "grad_norm": 0.5765468748074813, "learning_rate": 4.204232322719055e-06, "loss": 0.4877, "step": 1592 }, { "epoch": 0.68, "grad_norm": 0.5705538264405167, "learning_rate": 4.202857134680795e-06, "loss": 0.4848, "step": 1593 }, { "epoch": 0.68, "grad_norm": 0.550378639192882, "learning_rate": 4.201480984752612e-06, "loss": 0.4709, "step": 1594 }, { "epoch": 0.68, "grad_norm": 0.5843384273029046, "learning_rate": 4.20010387371185e-06, "loss": 0.5116, "step": 1595 }, { "epoch": 0.68, "grad_norm": 0.5430286998603538, "learning_rate": 4.198725802336395e-06, "loss": 0.4755, "step": 1596 }, { "epoch": 0.68, "grad_norm": 0.5613235833509066, "learning_rate": 4.197346771404677e-06, "loss": 0.483, "step": 1597 }, { "epoch": 0.68, "grad_norm": 0.5431464537859271, "learning_rate": 4.1959667816956654e-06, "loss": 0.4954, "step": 1598 }, { "epoch": 0.68, "grad_norm": 0.5816367873246644, "learning_rate": 4.194585833988873e-06, "loss": 0.5123, "step": 1599 }, { "epoch": 0.68, "grad_norm": 0.5842017694622618, "learning_rate": 4.1932039290643534e-06, "loss": 0.4751, "step": 1600 }, { "epoch": 0.68, "grad_norm": 0.5983057704757485, "learning_rate": 4.191821067702701e-06, "loss": 0.476, "step": 1601 }, { "epoch": 0.68, "grad_norm": 0.5785359587175982, "learning_rate": 4.190437250685049e-06, "loss": 0.4882, "step": 1602 }, { "epoch": 0.68, "grad_norm": 0.6048435561330398, "learning_rate": 4.189052478793074e-06, "loss": 0.5045, "step": 1603 }, { "epoch": 0.69, "grad_norm": 0.5886694967565423, "learning_rate": 4.187666752808987e-06, "loss": 0.4809, "step": 1604 }, { "epoch": 0.69, "grad_norm": 0.5662837567043483, "learning_rate": 4.186280073515543e-06, "loss": 0.508, "step": 1605 }, { "epoch": 0.69, "grad_norm": 0.5507517273321046, "learning_rate": 4.184892441696031e-06, "loss": 0.4574, "step": 1606 }, { "epoch": 0.69, "grad_norm": 0.5506339868366799, "learning_rate": 4.183503858134283e-06, "loss": 0.4983, "step": 1607 }, { "epoch": 0.69, "grad_norm": 0.551124570714273, "learning_rate": 4.182114323614662e-06, "loss": 0.4964, "step": 1608 }, { "epoch": 0.69, "grad_norm": 0.5632058519052752, "learning_rate": 4.180723838922076e-06, "loss": 0.5091, "step": 1609 }, { "epoch": 0.69, "grad_norm": 0.5523436692520042, "learning_rate": 4.179332404841963e-06, "loss": 0.484, "step": 1610 }, { "epoch": 0.69, "grad_norm": 0.6137884481921679, "learning_rate": 4.177940022160299e-06, "loss": 0.4796, "step": 1611 }, { "epoch": 0.69, "grad_norm": 0.6400564535674431, "learning_rate": 4.1765466916636e-06, "loss": 0.4813, "step": 1612 }, { "epoch": 0.69, "grad_norm": 0.6137552397160586, "learning_rate": 4.175152414138911e-06, "loss": 0.5128, "step": 1613 }, { "epoch": 0.69, "grad_norm": 0.5878395349717804, "learning_rate": 4.173757190373817e-06, "loss": 0.4921, "step": 1614 }, { "epoch": 0.69, "grad_norm": 0.638177925386608, "learning_rate": 4.172361021156436e-06, "loss": 0.4697, "step": 1615 }, { "epoch": 0.69, "grad_norm": 0.55678238023355, "learning_rate": 4.170963907275418e-06, "loss": 0.456, "step": 1616 }, { "epoch": 0.69, "grad_norm": 0.5856822237716187, "learning_rate": 4.169565849519949e-06, "loss": 0.4762, "step": 1617 }, { "epoch": 0.69, "grad_norm": 0.6465303799069492, "learning_rate": 4.1681668486797475e-06, "loss": 0.4964, "step": 1618 }, { "epoch": 0.69, "grad_norm": 0.6003634481531572, "learning_rate": 4.166766905545064e-06, "loss": 0.4864, "step": 1619 }, { "epoch": 0.69, "grad_norm": 0.5916487532205914, "learning_rate": 4.1653660209066835e-06, "loss": 0.5002, "step": 1620 }, { "epoch": 0.69, "grad_norm": 0.5898166480233019, "learning_rate": 4.1639641955559205e-06, "loss": 0.4827, "step": 1621 }, { "epoch": 0.69, "grad_norm": 0.5866832054250863, "learning_rate": 4.162561430284621e-06, "loss": 0.5073, "step": 1622 }, { "epoch": 0.69, "grad_norm": 0.5553096089138998, "learning_rate": 4.161157725885163e-06, "loss": 0.4507, "step": 1623 }, { "epoch": 0.69, "grad_norm": 0.5797616225893526, "learning_rate": 4.159753083150455e-06, "loss": 0.4781, "step": 1624 }, { "epoch": 0.69, "grad_norm": 0.5782876181988972, "learning_rate": 4.158347502873933e-06, "loss": 0.486, "step": 1625 }, { "epoch": 0.69, "grad_norm": 0.5593458949862828, "learning_rate": 4.156940985849568e-06, "loss": 0.4722, "step": 1626 }, { "epoch": 0.7, "grad_norm": 0.5566363926907538, "learning_rate": 4.155533532871855e-06, "loss": 0.471, "step": 1627 }, { "epoch": 0.7, "grad_norm": 0.5687649356444076, "learning_rate": 4.154125144735819e-06, "loss": 0.4593, "step": 1628 }, { "epoch": 0.7, "grad_norm": 0.5805813995919267, "learning_rate": 4.1527158222370134e-06, "loss": 0.4825, "step": 1629 }, { "epoch": 0.7, "grad_norm": 0.5698382648767448, "learning_rate": 4.151305566171521e-06, "loss": 0.4744, "step": 1630 }, { "epoch": 0.7, "grad_norm": 0.5478296421305997, "learning_rate": 4.149894377335951e-06, "loss": 0.4708, "step": 1631 }, { "epoch": 0.7, "grad_norm": 0.5579153962795045, "learning_rate": 4.148482256527438e-06, "loss": 0.4838, "step": 1632 }, { "epoch": 0.7, "grad_norm": 0.5487271513915988, "learning_rate": 4.147069204543645e-06, "loss": 0.4538, "step": 1633 }, { "epoch": 0.7, "eval_loss": 0.4806346595287323, "eval_runtime": 6927.3711, "eval_samples_per_second": 41.919, "eval_steps_per_second": 2.096, "step": 1633 }, { "epoch": 0.7, "grad_norm": 0.5571550450326609, "learning_rate": 4.14565522218276e-06, "loss": 0.4696, "step": 1634 }, { "epoch": 0.7, "grad_norm": 0.573294746763072, "learning_rate": 4.144240310243496e-06, "loss": 0.476, "step": 1635 }, { "epoch": 0.7, "grad_norm": 0.5571754212391226, "learning_rate": 4.142824469525093e-06, "loss": 0.4823, "step": 1636 }, { "epoch": 0.7, "grad_norm": 0.5921760399908439, "learning_rate": 4.1414077008273134e-06, "loss": 0.4787, "step": 1637 }, { "epoch": 0.7, "grad_norm": 0.5663213930738107, "learning_rate": 4.139990004950446e-06, "loss": 0.4837, "step": 1638 }, { "epoch": 0.7, "grad_norm": 0.6090434172084536, "learning_rate": 4.138571382695301e-06, "loss": 0.472, "step": 1639 }, { "epoch": 0.7, "grad_norm": 0.6277842926973998, "learning_rate": 4.137151834863213e-06, "loss": 0.4627, "step": 1640 }, { "epoch": 0.7, "grad_norm": 0.5518929513736244, "learning_rate": 4.13573136225604e-06, "loss": 0.4815, "step": 1641 }, { "epoch": 0.7, "grad_norm": 0.5750076801420694, "learning_rate": 4.1343099656761635e-06, "loss": 0.4713, "step": 1642 }, { "epoch": 0.7, "grad_norm": 0.5636682836359242, "learning_rate": 4.132887645926482e-06, "loss": 0.4739, "step": 1643 }, { "epoch": 0.7, "grad_norm": 0.5582878941115875, "learning_rate": 4.1314644038104215e-06, "loss": 0.4846, "step": 1644 }, { "epoch": 0.7, "grad_norm": 0.5355410249521172, "learning_rate": 4.130040240131925e-06, "loss": 0.487, "step": 1645 }, { "epoch": 0.7, "grad_norm": 0.6050538286426506, "learning_rate": 4.128615155695458e-06, "loss": 0.4791, "step": 1646 }, { "epoch": 0.7, "grad_norm": 0.551251347571686, "learning_rate": 4.127189151306004e-06, "loss": 0.4636, "step": 1647 }, { "epoch": 0.7, "grad_norm": 0.5586211992866543, "learning_rate": 4.12576222776907e-06, "loss": 0.5068, "step": 1648 }, { "epoch": 0.7, "grad_norm": 0.5541371767356188, "learning_rate": 4.124334385890678e-06, "loss": 0.4666, "step": 1649 }, { "epoch": 0.7, "grad_norm": 0.597101794217058, "learning_rate": 4.122905626477371e-06, "loss": 0.4905, "step": 1650 }, { "epoch": 0.71, "grad_norm": 0.5675341113402599, "learning_rate": 4.121475950336209e-06, "loss": 0.5052, "step": 1651 }, { "epoch": 0.71, "grad_norm": 0.5863054446966842, "learning_rate": 4.120045358274772e-06, "loss": 0.4974, "step": 1652 }, { "epoch": 0.71, "grad_norm": 0.553734529174061, "learning_rate": 4.118613851101156e-06, "loss": 0.4674, "step": 1653 }, { "epoch": 0.71, "grad_norm": 0.5949725607990067, "learning_rate": 4.117181429623973e-06, "loss": 0.4659, "step": 1654 }, { "epoch": 0.71, "grad_norm": 0.5834567071239104, "learning_rate": 4.115748094652352e-06, "loss": 0.4743, "step": 1655 }, { "epoch": 0.71, "grad_norm": 0.5535876065946561, "learning_rate": 4.114313846995941e-06, "loss": 0.5171, "step": 1656 }, { "epoch": 0.71, "grad_norm": 0.5838621642562685, "learning_rate": 4.112878687464898e-06, "loss": 0.5032, "step": 1657 }, { "epoch": 0.71, "grad_norm": 0.5823106011343094, "learning_rate": 4.111442616869901e-06, "loss": 0.4829, "step": 1658 }, { "epoch": 0.71, "grad_norm": 0.5559063219280944, "learning_rate": 4.110005636022138e-06, "loss": 0.4636, "step": 1659 }, { "epoch": 0.71, "grad_norm": 0.5736354627247767, "learning_rate": 4.108567745733318e-06, "loss": 0.4933, "step": 1660 }, { "epoch": 0.71, "grad_norm": 0.5690533891379516, "learning_rate": 4.107128946815657e-06, "loss": 0.4771, "step": 1661 }, { "epoch": 0.71, "grad_norm": 0.5816163405766323, "learning_rate": 4.105689240081886e-06, "loss": 0.4599, "step": 1662 }, { "epoch": 0.71, "grad_norm": 0.5686008337838981, "learning_rate": 4.104248626345252e-06, "loss": 0.4588, "step": 1663 }, { "epoch": 0.71, "grad_norm": 0.5713712745582588, "learning_rate": 4.102807106419511e-06, "loss": 0.4707, "step": 1664 }, { "epoch": 0.71, "grad_norm": 0.5568768914986018, "learning_rate": 4.10136468111893e-06, "loss": 0.491, "step": 1665 }, { "epoch": 0.71, "grad_norm": 0.5680603398114878, "learning_rate": 4.099921351258292e-06, "loss": 0.4887, "step": 1666 }, { "epoch": 0.71, "grad_norm": 0.555747852519542, "learning_rate": 4.098477117652887e-06, "loss": 0.4678, "step": 1667 }, { "epoch": 0.71, "grad_norm": 0.5732419584091223, "learning_rate": 4.097031981118516e-06, "loss": 0.5059, "step": 1668 }, { "epoch": 0.71, "grad_norm": 0.5978764739542107, "learning_rate": 4.095585942471492e-06, "loss": 0.4862, "step": 1669 }, { "epoch": 0.71, "grad_norm": 0.5641825612908864, "learning_rate": 4.094139002528635e-06, "loss": 0.4892, "step": 1670 }, { "epoch": 0.71, "grad_norm": 0.59291044888431, "learning_rate": 4.092691162107277e-06, "loss": 0.4689, "step": 1671 }, { "epoch": 0.71, "grad_norm": 0.5633228261402596, "learning_rate": 4.091242422025256e-06, "loss": 0.4867, "step": 1672 }, { "epoch": 0.71, "grad_norm": 0.5446421814689791, "learning_rate": 4.08979278310092e-06, "loss": 0.4963, "step": 1673 }, { "epoch": 0.72, "grad_norm": 0.5900103216285727, "learning_rate": 4.088342246153123e-06, "loss": 0.4784, "step": 1674 }, { "epoch": 0.72, "grad_norm": 0.5662846183673187, "learning_rate": 4.086890812001228e-06, "loss": 0.4728, "step": 1675 }, { "epoch": 0.72, "grad_norm": 0.5481824376820944, "learning_rate": 4.085438481465104e-06, "loss": 0.4925, "step": 1676 }, { "epoch": 0.72, "grad_norm": 0.603893688943784, "learning_rate": 4.083985255365127e-06, "loss": 0.4775, "step": 1677 }, { "epoch": 0.72, "grad_norm": 0.5616169558946099, "learning_rate": 4.082531134522176e-06, "loss": 0.4836, "step": 1678 }, { "epoch": 0.72, "grad_norm": 0.5771671536691606, "learning_rate": 4.0810761197576405e-06, "loss": 0.4729, "step": 1679 }, { "epoch": 0.72, "grad_norm": 0.5356849896576423, "learning_rate": 4.07962021189341e-06, "loss": 0.4831, "step": 1680 }, { "epoch": 0.72, "grad_norm": 0.5705452601718124, "learning_rate": 4.078163411751882e-06, "loss": 0.5036, "step": 1681 }, { "epoch": 0.72, "grad_norm": 0.5853521037301916, "learning_rate": 4.076705720155956e-06, "loss": 0.4663, "step": 1682 }, { "epoch": 0.72, "grad_norm": 0.5767304367108156, "learning_rate": 4.075247137929036e-06, "loss": 0.4696, "step": 1683 }, { "epoch": 0.72, "grad_norm": 0.5469609445931426, "learning_rate": 4.073787665895029e-06, "loss": 0.4861, "step": 1684 }, { "epoch": 0.72, "grad_norm": 0.5647896369896342, "learning_rate": 4.0723273048783426e-06, "loss": 0.4751, "step": 1685 }, { "epoch": 0.72, "grad_norm": 0.5471961307604523, "learning_rate": 4.070866055703892e-06, "loss": 0.4755, "step": 1686 }, { "epoch": 0.72, "grad_norm": 0.5409469094446197, "learning_rate": 4.069403919197087e-06, "loss": 0.4485, "step": 1687 }, { "epoch": 0.72, "grad_norm": 0.5493683437426159, "learning_rate": 4.067940896183843e-06, "loss": 0.466, "step": 1688 }, { "epoch": 0.72, "grad_norm": 0.6184085336074597, "learning_rate": 4.0664769874905765e-06, "loss": 0.4941, "step": 1689 }, { "epoch": 0.72, "grad_norm": 0.5897131879671903, "learning_rate": 4.065012193944201e-06, "loss": 0.4949, "step": 1690 }, { "epoch": 0.72, "grad_norm": 0.5476514705576049, "learning_rate": 4.063546516372134e-06, "loss": 0.4627, "step": 1691 }, { "epoch": 0.72, "grad_norm": 0.5639046127555836, "learning_rate": 4.0620799556022886e-06, "loss": 0.507, "step": 1692 }, { "epoch": 0.72, "grad_norm": 0.569493917934072, "learning_rate": 4.060612512463079e-06, "loss": 0.4689, "step": 1693 }, { "epoch": 0.72, "grad_norm": 0.5576467520547685, "learning_rate": 4.059144187783417e-06, "loss": 0.4533, "step": 1694 }, { "epoch": 0.72, "grad_norm": 0.5785819029326567, "learning_rate": 4.057674982392713e-06, "loss": 0.484, "step": 1695 }, { "epoch": 0.72, "grad_norm": 0.5800649358100449, "learning_rate": 4.056204897120875e-06, "loss": 0.475, "step": 1696 }, { "epoch": 0.73, "grad_norm": 0.5547737396991846, "learning_rate": 4.054733932798306e-06, "loss": 0.4617, "step": 1697 }, { "epoch": 0.73, "grad_norm": 0.55033684788177, "learning_rate": 4.053262090255908e-06, "loss": 0.4748, "step": 1698 }, { "epoch": 0.73, "grad_norm": 0.5624353739448029, "learning_rate": 4.051789370325079e-06, "loss": 0.4869, "step": 1699 }, { "epoch": 0.73, "grad_norm": 0.5452044799501949, "learning_rate": 4.050315773837708e-06, "loss": 0.4463, "step": 1700 }, { "epoch": 0.73, "grad_norm": 0.5802412977647277, "learning_rate": 4.048841301626188e-06, "loss": 0.4821, "step": 1701 }, { "epoch": 0.73, "grad_norm": 0.5909786502202287, "learning_rate": 4.047365954523398e-06, "loss": 0.4666, "step": 1702 }, { "epoch": 0.73, "grad_norm": 0.5825689334745631, "learning_rate": 4.045889733362717e-06, "loss": 0.4833, "step": 1703 }, { "epoch": 0.73, "grad_norm": 0.5429276122164189, "learning_rate": 4.044412638978012e-06, "loss": 0.4792, "step": 1704 }, { "epoch": 0.73, "eval_loss": 0.4791695773601532, "eval_runtime": 6928.6024, "eval_samples_per_second": 41.912, "eval_steps_per_second": 2.096, "step": 1704 }, { "epoch": 0.73, "grad_norm": 0.5506114214338211, "learning_rate": 4.042934672203651e-06, "loss": 0.4495, "step": 1705 }, { "epoch": 0.73, "grad_norm": 0.5578625634328879, "learning_rate": 4.041455833874488e-06, "loss": 0.4522, "step": 1706 }, { "epoch": 0.73, "grad_norm": 0.5795772583330789, "learning_rate": 4.039976124825872e-06, "loss": 0.5001, "step": 1707 }, { "epoch": 0.73, "grad_norm": 0.5536178873684257, "learning_rate": 4.038495545893643e-06, "loss": 0.4803, "step": 1708 }, { "epoch": 0.73, "grad_norm": 0.5392842977706797, "learning_rate": 4.037014097914135e-06, "loss": 0.4682, "step": 1709 }, { "epoch": 0.73, "grad_norm": 0.5708019402077392, "learning_rate": 4.0355317817241705e-06, "loss": 0.4785, "step": 1710 }, { "epoch": 0.73, "grad_norm": 0.6113657504825251, "learning_rate": 4.034048598161061e-06, "loss": 0.4797, "step": 1711 }, { "epoch": 0.73, "grad_norm": 0.5620870459786824, "learning_rate": 4.032564548062612e-06, "loss": 0.472, "step": 1712 }, { "epoch": 0.73, "grad_norm": 0.5660234184012428, "learning_rate": 4.0310796322671144e-06, "loss": 0.4867, "step": 1713 }, { "epoch": 0.73, "grad_norm": 0.5428002908374318, "learning_rate": 4.029593851613351e-06, "loss": 0.4772, "step": 1714 }, { "epoch": 0.73, "grad_norm": 0.5576743567400213, "learning_rate": 4.028107206940592e-06, "loss": 0.484, "step": 1715 }, { "epoch": 0.73, "grad_norm": 0.5472128158032424, "learning_rate": 4.0266196990885955e-06, "loss": 0.5003, "step": 1716 }, { "epoch": 0.73, "grad_norm": 0.5194449059148619, "learning_rate": 4.025131328897608e-06, "loss": 0.4718, "step": 1717 }, { "epoch": 0.73, "grad_norm": 0.5611014049931291, "learning_rate": 4.023642097208362e-06, "loss": 0.4812, "step": 1718 }, { "epoch": 0.73, "grad_norm": 0.6211118572373506, "learning_rate": 4.022152004862079e-06, "loss": 0.4877, "step": 1719 }, { "epoch": 0.73, "grad_norm": 0.541074320013028, "learning_rate": 4.020661052700462e-06, "loss": 0.4996, "step": 1720 }, { "epoch": 0.74, "grad_norm": 0.5552304848575638, "learning_rate": 4.019169241565704e-06, "loss": 0.5153, "step": 1721 }, { "epoch": 0.74, "grad_norm": 0.553005883252194, "learning_rate": 4.0176765723004805e-06, "loss": 0.4583, "step": 1722 }, { "epoch": 0.74, "grad_norm": 0.5609121544313215, "learning_rate": 4.0161830457479555e-06, "loss": 0.4524, "step": 1723 }, { "epoch": 0.74, "grad_norm": 0.5641653969492663, "learning_rate": 4.014688662751773e-06, "loss": 0.4543, "step": 1724 }, { "epoch": 0.74, "grad_norm": 0.5328244993549478, "learning_rate": 4.013193424156062e-06, "loss": 0.4417, "step": 1725 }, { "epoch": 0.74, "grad_norm": 0.5561692361326591, "learning_rate": 4.011697330805436e-06, "loss": 0.488, "step": 1726 }, { "epoch": 0.74, "grad_norm": 0.5700577799667438, "learning_rate": 4.010200383544992e-06, "loss": 0.4964, "step": 1727 }, { "epoch": 0.74, "grad_norm": 0.5451521766339095, "learning_rate": 4.0087025832203065e-06, "loss": 0.4556, "step": 1728 }, { "epoch": 0.74, "grad_norm": 0.5817456153792706, "learning_rate": 4.007203930677438e-06, "loss": 0.4983, "step": 1729 }, { "epoch": 0.74, "grad_norm": 0.6380188720129754, "learning_rate": 4.00570442676293e-06, "loss": 0.4805, "step": 1730 }, { "epoch": 0.74, "grad_norm": 0.5205063698001692, "learning_rate": 4.0042040723238055e-06, "loss": 0.4659, "step": 1731 }, { "epoch": 0.74, "grad_norm": 0.5768671155441834, "learning_rate": 4.002702868207563e-06, "loss": 0.4873, "step": 1732 }, { "epoch": 0.74, "grad_norm": 0.5494137373354541, "learning_rate": 4.001200815262188e-06, "loss": 0.4723, "step": 1733 }, { "epoch": 0.74, "grad_norm": 0.5585419846591574, "learning_rate": 3.999697914336143e-06, "loss": 0.4873, "step": 1734 }, { "epoch": 0.74, "grad_norm": 0.606023064302735, "learning_rate": 3.9981941662783675e-06, "loss": 0.5029, "step": 1735 }, { "epoch": 0.74, "grad_norm": 0.5623066425478451, "learning_rate": 3.996689571938282e-06, "loss": 0.463, "step": 1736 }, { "epoch": 0.74, "grad_norm": 0.5971959876411371, "learning_rate": 3.995184132165783e-06, "loss": 0.4753, "step": 1737 }, { "epoch": 0.74, "grad_norm": 0.54900424681578, "learning_rate": 3.993677847811247e-06, "loss": 0.4872, "step": 1738 }, { "epoch": 0.74, "grad_norm": 0.574046829299625, "learning_rate": 3.992170719725524e-06, "loss": 0.4602, "step": 1739 }, { "epoch": 0.74, "grad_norm": 0.588055937602866, "learning_rate": 3.990662748759946e-06, "loss": 0.4915, "step": 1740 }, { "epoch": 0.74, "grad_norm": 0.5684082620173647, "learning_rate": 3.989153935766314e-06, "loss": 0.4765, "step": 1741 }, { "epoch": 0.74, "grad_norm": 0.5755931683742708, "learning_rate": 3.987644281596913e-06, "loss": 0.4909, "step": 1742 }, { "epoch": 0.74, "grad_norm": 0.553216450579215, "learning_rate": 3.986133787104496e-06, "loss": 0.4723, "step": 1743 }, { "epoch": 0.75, "grad_norm": 0.5366461565908377, "learning_rate": 3.984622453142293e-06, "loss": 0.4624, "step": 1744 }, { "epoch": 0.75, "grad_norm": 0.5610507340686801, "learning_rate": 3.983110280564009e-06, "loss": 0.5002, "step": 1745 }, { "epoch": 0.75, "grad_norm": 0.5676547948280201, "learning_rate": 3.981597270223822e-06, "loss": 0.4897, "step": 1746 }, { "epoch": 0.75, "grad_norm": 0.5702435461535129, "learning_rate": 3.980083422976386e-06, "loss": 0.4706, "step": 1747 }, { "epoch": 0.75, "grad_norm": 0.5682513989307584, "learning_rate": 3.978568739676822e-06, "loss": 0.4631, "step": 1748 }, { "epoch": 0.75, "grad_norm": 0.5668309362429137, "learning_rate": 3.977053221180729e-06, "loss": 0.4878, "step": 1749 }, { "epoch": 0.75, "grad_norm": 0.5630149523518507, "learning_rate": 3.975536868344174e-06, "loss": 0.4971, "step": 1750 }, { "epoch": 0.75, "grad_norm": 0.5525662343852639, "learning_rate": 3.974019682023695e-06, "loss": 0.4765, "step": 1751 }, { "epoch": 0.75, "grad_norm": 0.5456185795288315, "learning_rate": 3.972501663076306e-06, "loss": 0.4695, "step": 1752 }, { "epoch": 0.75, "grad_norm": 0.543284167845247, "learning_rate": 3.9709828123594855e-06, "loss": 0.4546, "step": 1753 }, { "epoch": 0.75, "grad_norm": 0.5783138022249001, "learning_rate": 3.969463130731183e-06, "loss": 0.5026, "step": 1754 }, { "epoch": 0.75, "grad_norm": 0.5821103906157934, "learning_rate": 3.96794261904982e-06, "loss": 0.4685, "step": 1755 }, { "epoch": 0.75, "grad_norm": 0.5776445776955914, "learning_rate": 3.9664212781742865e-06, "loss": 0.4696, "step": 1756 }, { "epoch": 0.75, "grad_norm": 0.5444444652574508, "learning_rate": 3.964899108963937e-06, "loss": 0.4715, "step": 1757 }, { "epoch": 0.75, "grad_norm": 0.5791101056295087, "learning_rate": 3.963376112278597e-06, "loss": 0.4892, "step": 1758 }, { "epoch": 0.75, "grad_norm": 0.5409942798323824, "learning_rate": 3.96185228897856e-06, "loss": 0.4679, "step": 1759 }, { "epoch": 0.75, "grad_norm": 0.5313906519312775, "learning_rate": 3.9603276399245864e-06, "loss": 0.4364, "step": 1760 }, { "epoch": 0.75, "grad_norm": 0.5590372661816775, "learning_rate": 3.9588021659779e-06, "loss": 0.4478, "step": 1761 }, { "epoch": 0.75, "grad_norm": 0.5535499771886758, "learning_rate": 3.957275868000192e-06, "loss": 0.4771, "step": 1762 }, { "epoch": 0.75, "grad_norm": 0.5343930689867736, "learning_rate": 3.9557487468536225e-06, "loss": 0.4735, "step": 1763 }, { "epoch": 0.75, "grad_norm": 0.5610690419155818, "learning_rate": 3.954220803400811e-06, "loss": 0.4762, "step": 1764 }, { "epoch": 0.75, "grad_norm": 0.5305051159735634, "learning_rate": 3.9526920385048465e-06, "loss": 0.4619, "step": 1765 }, { "epoch": 0.75, "grad_norm": 0.5436426867711756, "learning_rate": 3.951162453029278e-06, "loss": 0.4793, "step": 1766 }, { "epoch": 0.75, "grad_norm": 0.5326612510661454, "learning_rate": 3.94963204783812e-06, "loss": 0.4849, "step": 1767 }, { "epoch": 0.76, "grad_norm": 0.5515320255957604, "learning_rate": 3.948100823795851e-06, "loss": 0.503, "step": 1768 }, { "epoch": 0.76, "grad_norm": 0.5694215728858405, "learning_rate": 3.946568781767409e-06, "loss": 0.4945, "step": 1769 }, { "epoch": 0.76, "grad_norm": 0.5641781422463842, "learning_rate": 3.945035922618198e-06, "loss": 0.478, "step": 1770 }, { "epoch": 0.76, "grad_norm": 0.5588846997248246, "learning_rate": 3.94350224721408e-06, "loss": 0.4798, "step": 1771 }, { "epoch": 0.76, "grad_norm": 0.5569535296640705, "learning_rate": 3.9419677564213795e-06, "loss": 0.4866, "step": 1772 }, { "epoch": 0.76, "grad_norm": 0.6058189007310192, "learning_rate": 3.9404324511068825e-06, "loss": 0.4777, "step": 1773 }, { "epoch": 0.76, "grad_norm": 0.5488550968313441, "learning_rate": 3.938896332137834e-06, "loss": 0.4885, "step": 1774 }, { "epoch": 0.76, "grad_norm": 0.5717371621896641, "learning_rate": 3.937359400381938e-06, "loss": 0.481, "step": 1775 }, { "epoch": 0.76, "eval_loss": 0.47748449444770813, "eval_runtime": 6921.6137, "eval_samples_per_second": 41.954, "eval_steps_per_second": 2.098, "step": 1775 }, { "epoch": 0.76, "grad_norm": 0.5658570817407027, "learning_rate": 3.935821656707359e-06, "loss": 0.4739, "step": 1776 }, { "epoch": 0.76, "grad_norm": 0.6083719336101632, "learning_rate": 3.93428310198272e-06, "loss": 0.5087, "step": 1777 }, { "epoch": 0.76, "grad_norm": 0.5358828780708986, "learning_rate": 3.932743737077101e-06, "loss": 0.4849, "step": 1778 }, { "epoch": 0.76, "grad_norm": 0.550854913111367, "learning_rate": 3.931203562860042e-06, "loss": 0.5071, "step": 1779 }, { "epoch": 0.76, "grad_norm": 0.5347707663990211, "learning_rate": 3.929662580201536e-06, "loss": 0.4794, "step": 1780 }, { "epoch": 0.76, "grad_norm": 0.5551468944100317, "learning_rate": 3.928120789972036e-06, "loss": 0.4689, "step": 1781 }, { "epoch": 0.76, "grad_norm": 0.5727517444264358, "learning_rate": 3.926578193042451e-06, "loss": 0.4952, "step": 1782 }, { "epoch": 0.76, "grad_norm": 0.5623029835503726, "learning_rate": 3.9250347902841456e-06, "loss": 0.4723, "step": 1783 }, { "epoch": 0.76, "grad_norm": 0.5452388086485114, "learning_rate": 3.923490582568937e-06, "loss": 0.4746, "step": 1784 }, { "epoch": 0.76, "grad_norm": 0.525242855196112, "learning_rate": 3.9219455707691004e-06, "loss": 0.4802, "step": 1785 }, { "epoch": 0.76, "grad_norm": 0.6002892698931297, "learning_rate": 3.920399755757365e-06, "loss": 0.4951, "step": 1786 }, { "epoch": 0.76, "grad_norm": 0.5602495779757628, "learning_rate": 3.9188531384069095e-06, "loss": 0.4791, "step": 1787 }, { "epoch": 0.76, "grad_norm": 0.5483232695317782, "learning_rate": 3.917305719591372e-06, "loss": 0.4699, "step": 1788 }, { "epoch": 0.76, "grad_norm": 0.5742707937304822, "learning_rate": 3.915757500184838e-06, "loss": 0.4729, "step": 1789 }, { "epoch": 0.76, "grad_norm": 0.5424718840301662, "learning_rate": 3.91420848106185e-06, "loss": 0.4983, "step": 1790 }, { "epoch": 0.77, "grad_norm": 0.5297073714321522, "learning_rate": 3.912658663097396e-06, "loss": 0.4725, "step": 1791 }, { "epoch": 0.77, "grad_norm": 0.5391456048094847, "learning_rate": 3.911108047166924e-06, "loss": 0.4588, "step": 1792 }, { "epoch": 0.77, "grad_norm": 0.5422617623881943, "learning_rate": 3.909556634146323e-06, "loss": 0.461, "step": 1793 }, { "epoch": 0.77, "grad_norm": 0.5559883587716514, "learning_rate": 3.908004424911939e-06, "loss": 0.4981, "step": 1794 }, { "epoch": 0.77, "grad_norm": 0.5355343530496602, "learning_rate": 3.906451420340566e-06, "loss": 0.4845, "step": 1795 }, { "epoch": 0.77, "grad_norm": 0.6731733861479123, "learning_rate": 3.904897621309446e-06, "loss": 0.4936, "step": 1796 }, { "epoch": 0.77, "grad_norm": 0.5495562245224338, "learning_rate": 3.9033430286962714e-06, "loss": 0.4577, "step": 1797 }, { "epoch": 0.77, "grad_norm": 0.5574105971150268, "learning_rate": 3.901787643379183e-06, "loss": 0.4693, "step": 1798 }, { "epoch": 0.77, "grad_norm": 0.5755389775504788, "learning_rate": 3.900231466236766e-06, "loss": 0.4844, "step": 1799 }, { "epoch": 0.77, "grad_norm": 0.6078323318223101, "learning_rate": 3.898674498148058e-06, "loss": 0.4728, "step": 1800 }, { "epoch": 0.77, "grad_norm": 0.5907606344024237, "learning_rate": 3.897116739992539e-06, "loss": 0.485, "step": 1801 }, { "epoch": 0.77, "grad_norm": 0.5509448118458101, "learning_rate": 3.89555819265014e-06, "loss": 0.4868, "step": 1802 }, { "epoch": 0.77, "grad_norm": 0.5247691225819013, "learning_rate": 3.893998857001231e-06, "loss": 0.4919, "step": 1803 }, { "epoch": 0.77, "grad_norm": 0.5708932504567875, "learning_rate": 3.892438733926634e-06, "loss": 0.4607, "step": 1804 }, { "epoch": 0.77, "grad_norm": 0.8363516955156233, "learning_rate": 3.890877824307611e-06, "loss": 0.4721, "step": 1805 }, { "epoch": 0.77, "grad_norm": 0.5591473348688127, "learning_rate": 3.889316129025873e-06, "loss": 0.4905, "step": 1806 }, { "epoch": 0.77, "grad_norm": 0.5918209400494773, "learning_rate": 3.887753648963569e-06, "loss": 0.473, "step": 1807 }, { "epoch": 0.77, "grad_norm": 0.5897196038146727, "learning_rate": 3.886190385003297e-06, "loss": 0.4528, "step": 1808 }, { "epoch": 0.77, "grad_norm": 0.6070139742560858, "learning_rate": 3.884626338028093e-06, "loss": 0.4734, "step": 1809 }, { "epoch": 0.77, "grad_norm": 0.6106551950471045, "learning_rate": 3.883061508921439e-06, "loss": 0.5053, "step": 1810 }, { "epoch": 0.77, "grad_norm": 0.5919165573743996, "learning_rate": 3.8814958985672564e-06, "loss": 0.4792, "step": 1811 }, { "epoch": 0.77, "grad_norm": 0.5683775432095641, "learning_rate": 3.87992950784991e-06, "loss": 0.4869, "step": 1812 }, { "epoch": 0.77, "grad_norm": 0.5693276502530639, "learning_rate": 3.878362337654203e-06, "loss": 0.4919, "step": 1813 }, { "epoch": 0.78, "grad_norm": 0.6456814342581492, "learning_rate": 3.87679438886538e-06, "loss": 0.5012, "step": 1814 }, { "epoch": 0.78, "grad_norm": 0.6047567254595965, "learning_rate": 3.875225662369125e-06, "loss": 0.489, "step": 1815 }, { "epoch": 0.78, "grad_norm": 0.6019100134305163, "learning_rate": 3.8736561590515646e-06, "loss": 0.4998, "step": 1816 }, { "epoch": 0.78, "grad_norm": 0.5455946748258141, "learning_rate": 3.872085879799258e-06, "loss": 0.4617, "step": 1817 }, { "epoch": 0.78, "grad_norm": 0.593910191752196, "learning_rate": 3.870514825499208e-06, "loss": 0.4898, "step": 1818 }, { "epoch": 0.78, "grad_norm": 0.6125446125342663, "learning_rate": 3.868942997038853e-06, "loss": 0.4781, "step": 1819 }, { "epoch": 0.78, "grad_norm": 0.5571000922345232, "learning_rate": 3.8673703953060685e-06, "loss": 0.5001, "step": 1820 }, { "epoch": 0.78, "grad_norm": 0.6338578928227893, "learning_rate": 3.865797021189167e-06, "loss": 0.5032, "step": 1821 }, { "epoch": 0.78, "grad_norm": 0.5693059830561665, "learning_rate": 3.864222875576898e-06, "loss": 0.4717, "step": 1822 }, { "epoch": 0.78, "grad_norm": 0.5713524469999574, "learning_rate": 3.862647959358447e-06, "loss": 0.469, "step": 1823 }, { "epoch": 0.78, "grad_norm": 0.5469501972304562, "learning_rate": 3.861072273423434e-06, "loss": 0.5001, "step": 1824 }, { "epoch": 0.78, "grad_norm": 0.5643553209363702, "learning_rate": 3.859495818661914e-06, "loss": 0.4752, "step": 1825 }, { "epoch": 0.78, "grad_norm": 0.6079497862523476, "learning_rate": 3.857918595964375e-06, "loss": 0.4657, "step": 1826 }, { "epoch": 0.78, "grad_norm": 0.5940401601609584, "learning_rate": 3.8563406062217405e-06, "loss": 0.4794, "step": 1827 }, { "epoch": 0.78, "grad_norm": 0.5224134371483142, "learning_rate": 3.8547618503253685e-06, "loss": 0.4596, "step": 1828 }, { "epoch": 0.78, "grad_norm": 0.5691746499773465, "learning_rate": 3.8531823291670455e-06, "loss": 0.4705, "step": 1829 }, { "epoch": 0.78, "grad_norm": 0.5709823691070093, "learning_rate": 3.8516020436389945e-06, "loss": 0.4802, "step": 1830 }, { "epoch": 0.78, "grad_norm": 0.5674686057693449, "learning_rate": 3.850020994633869e-06, "loss": 0.5046, "step": 1831 }, { "epoch": 0.78, "grad_norm": 0.5942932229729947, "learning_rate": 3.848439183044751e-06, "loss": 0.465, "step": 1832 }, { "epoch": 0.78, "grad_norm": 0.6007440298375468, "learning_rate": 3.846856609765158e-06, "loss": 0.485, "step": 1833 }, { "epoch": 0.78, "grad_norm": 0.5905695400663254, "learning_rate": 3.845273275689035e-06, "loss": 0.4898, "step": 1834 }, { "epoch": 0.78, "grad_norm": 0.5526340123188758, "learning_rate": 3.843689181710756e-06, "loss": 0.4895, "step": 1835 }, { "epoch": 0.78, "grad_norm": 0.585057913171662, "learning_rate": 3.842104328725127e-06, "loss": 0.457, "step": 1836 }, { "epoch": 0.78, "grad_norm": 0.5917091841007728, "learning_rate": 3.8405187176273794e-06, "loss": 0.4745, "step": 1837 }, { "epoch": 0.79, "grad_norm": 0.5831734022479653, "learning_rate": 3.838932349313176e-06, "loss": 0.5003, "step": 1838 }, { "epoch": 0.79, "grad_norm": 0.5475814395504766, "learning_rate": 3.837345224678605e-06, "loss": 0.4875, "step": 1839 }, { "epoch": 0.79, "grad_norm": 0.5752248721709146, "learning_rate": 3.835757344620183e-06, "loss": 0.4636, "step": 1840 }, { "epoch": 0.79, "grad_norm": 0.5901567903691399, "learning_rate": 3.8341687100348536e-06, "loss": 0.4924, "step": 1841 }, { "epoch": 0.79, "grad_norm": 0.5434020555712842, "learning_rate": 3.832579321819985e-06, "loss": 0.4634, "step": 1842 }, { "epoch": 0.79, "grad_norm": 0.5492021117737101, "learning_rate": 3.830989180873373e-06, "loss": 0.4777, "step": 1843 }, { "epoch": 0.79, "grad_norm": 0.5306994578054051, "learning_rate": 3.829398288093237e-06, "loss": 0.472, "step": 1844 }, { "epoch": 0.79, "grad_norm": 0.5921460420396056, "learning_rate": 3.827806644378221e-06, "loss": 0.4951, "step": 1845 }, { "epoch": 0.79, "grad_norm": 0.5211955272058552, "learning_rate": 3.826214250627397e-06, "loss": 0.4513, "step": 1846 }, { "epoch": 0.79, "eval_loss": 0.47622737288475037, "eval_runtime": 6919.5773, "eval_samples_per_second": 41.966, "eval_steps_per_second": 2.098, "step": 1846 }, { "epoch": 0.79, "grad_norm": 0.572627752021322, "learning_rate": 3.824621107740255e-06, "loss": 0.5055, "step": 1847 }, { "epoch": 0.79, "grad_norm": 0.5855256130142836, "learning_rate": 3.823027216616711e-06, "loss": 0.4712, "step": 1848 }, { "epoch": 0.79, "grad_norm": 0.5330067035136337, "learning_rate": 3.821432578157105e-06, "loss": 0.461, "step": 1849 }, { "epoch": 0.79, "grad_norm": 0.5379073734644638, "learning_rate": 3.819837193262197e-06, "loss": 0.4646, "step": 1850 }, { "epoch": 0.79, "grad_norm": 0.5550409399367109, "learning_rate": 3.818241062833168e-06, "loss": 0.5092, "step": 1851 }, { "epoch": 0.79, "grad_norm": 0.5475883585070609, "learning_rate": 3.816644187771624e-06, "loss": 0.4845, "step": 1852 }, { "epoch": 0.79, "grad_norm": 0.5304094349485199, "learning_rate": 3.815046568979585e-06, "loss": 0.5045, "step": 1853 }, { "epoch": 0.79, "grad_norm": 0.5902094724693763, "learning_rate": 3.8134482073594997e-06, "loss": 0.4703, "step": 1854 }, { "epoch": 0.79, "grad_norm": 0.5373657261604207, "learning_rate": 3.811849103814229e-06, "loss": 0.4438, "step": 1855 }, { "epoch": 0.79, "grad_norm": 0.5500441237475201, "learning_rate": 3.8102492592470562e-06, "loss": 0.4808, "step": 1856 }, { "epoch": 0.79, "grad_norm": 0.587067338502479, "learning_rate": 3.808648674561683e-06, "loss": 0.484, "step": 1857 }, { "epoch": 0.79, "grad_norm": 0.5762212544281782, "learning_rate": 3.8070473506622283e-06, "loss": 0.4765, "step": 1858 }, { "epoch": 0.79, "grad_norm": 0.5750217699914456, "learning_rate": 3.80544528845323e-06, "loss": 0.4695, "step": 1859 }, { "epoch": 0.79, "grad_norm": 0.5928903124224453, "learning_rate": 3.803842488839642e-06, "loss": 0.4931, "step": 1860 }, { "epoch": 0.8, "grad_norm": 0.5211947228388228, "learning_rate": 3.8022389527268344e-06, "loss": 0.4671, "step": 1861 }, { "epoch": 0.8, "grad_norm": 0.5624138113341162, "learning_rate": 3.8006346810205935e-06, "loss": 0.462, "step": 1862 }, { "epoch": 0.8, "grad_norm": 0.6055264031899258, "learning_rate": 3.7990296746271227e-06, "loss": 0.4951, "step": 1863 }, { "epoch": 0.8, "grad_norm": 0.575871000906671, "learning_rate": 3.797423934453038e-06, "loss": 0.4613, "step": 1864 }, { "epoch": 0.8, "grad_norm": 0.5436757648363332, "learning_rate": 3.795817461405372e-06, "loss": 0.495, "step": 1865 }, { "epoch": 0.8, "grad_norm": 0.5489759393271353, "learning_rate": 3.7942102563915693e-06, "loss": 0.4761, "step": 1866 }, { "epoch": 0.8, "grad_norm": 0.5799706124416135, "learning_rate": 3.79260232031949e-06, "loss": 0.4992, "step": 1867 }, { "epoch": 0.8, "grad_norm": 0.6021574593638155, "learning_rate": 3.7909936540974052e-06, "loss": 0.4573, "step": 1868 }, { "epoch": 0.8, "grad_norm": 0.5528129676449581, "learning_rate": 3.7893842586340003e-06, "loss": 0.515, "step": 1869 }, { "epoch": 0.8, "grad_norm": 0.5695311954254395, "learning_rate": 3.7877741348383703e-06, "loss": 0.4728, "step": 1870 }, { "epoch": 0.8, "grad_norm": 0.5592177344188098, "learning_rate": 3.7861632836200245e-06, "loss": 0.4673, "step": 1871 }, { "epoch": 0.8, "grad_norm": 0.5611705084883242, "learning_rate": 3.784551705888881e-06, "loss": 0.4584, "step": 1872 }, { "epoch": 0.8, "grad_norm": 0.5539834069231695, "learning_rate": 3.7829394025552684e-06, "loss": 0.4644, "step": 1873 }, { "epoch": 0.8, "grad_norm": 0.537446301860318, "learning_rate": 3.7813263745299257e-06, "loss": 0.4927, "step": 1874 }, { "epoch": 0.8, "grad_norm": 0.5596544860638828, "learning_rate": 3.779712622724003e-06, "loss": 0.4647, "step": 1875 }, { "epoch": 0.8, "grad_norm": 0.544652883431319, "learning_rate": 3.7780981480490554e-06, "loss": 0.4955, "step": 1876 }, { "epoch": 0.8, "grad_norm": 0.5512313969653201, "learning_rate": 3.776482951417049e-06, "loss": 0.4644, "step": 1877 }, { "epoch": 0.8, "grad_norm": 0.5497506195632861, "learning_rate": 3.774867033740357e-06, "loss": 0.4789, "step": 1878 }, { "epoch": 0.8, "grad_norm": 0.5742672691362793, "learning_rate": 3.77325039593176e-06, "loss": 0.4729, "step": 1879 }, { "epoch": 0.8, "grad_norm": 0.5629948393220319, "learning_rate": 3.7716330389044463e-06, "loss": 0.4723, "step": 1880 }, { "epoch": 0.8, "grad_norm": 0.5888605808332452, "learning_rate": 3.7700149635720086e-06, "loss": 0.4716, "step": 1881 }, { "epoch": 0.8, "grad_norm": 0.5578428602453922, "learning_rate": 3.768396170848445e-06, "loss": 0.4958, "step": 1882 }, { "epoch": 0.8, "grad_norm": 0.5466856701572237, "learning_rate": 3.766776661648163e-06, "loss": 0.4712, "step": 1883 }, { "epoch": 0.8, "grad_norm": 0.5650833519106695, "learning_rate": 3.76515643688597e-06, "loss": 0.4812, "step": 1884 }, { "epoch": 0.81, "grad_norm": 0.5662580501911987, "learning_rate": 3.76353549747708e-06, "loss": 0.477, "step": 1885 }, { "epoch": 0.81, "grad_norm": 0.5515090242427748, "learning_rate": 3.76191384433711e-06, "loss": 0.4784, "step": 1886 }, { "epoch": 0.81, "grad_norm": 0.5651933942612933, "learning_rate": 3.76029147838208e-06, "loss": 0.4493, "step": 1887 }, { "epoch": 0.81, "grad_norm": 0.5534626182885516, "learning_rate": 3.7586684005284146e-06, "loss": 0.4823, "step": 1888 }, { "epoch": 0.81, "grad_norm": 0.5308600240405152, "learning_rate": 3.7570446116929372e-06, "loss": 0.448, "step": 1889 }, { "epoch": 0.81, "grad_norm": 0.5654195108385544, "learning_rate": 3.7554201127928747e-06, "loss": 0.4679, "step": 1890 }, { "epoch": 0.81, "grad_norm": 0.5815086921921774, "learning_rate": 3.7537949047458567e-06, "loss": 0.4637, "step": 1891 }, { "epoch": 0.81, "grad_norm": 0.5603546362158033, "learning_rate": 3.7521689884699093e-06, "loss": 0.4805, "step": 1892 }, { "epoch": 0.81, "grad_norm": 0.5835928980888774, "learning_rate": 3.750542364883462e-06, "loss": 0.4665, "step": 1893 }, { "epoch": 0.81, "grad_norm": 0.5706169202769774, "learning_rate": 3.748915034905344e-06, "loss": 0.4889, "step": 1894 }, { "epoch": 0.81, "grad_norm": 0.5269442750245277, "learning_rate": 3.74728699945478e-06, "loss": 0.465, "step": 1895 }, { "epoch": 0.81, "grad_norm": 0.5547808175518558, "learning_rate": 3.745658259451397e-06, "loss": 0.4639, "step": 1896 }, { "epoch": 0.81, "grad_norm": 0.6267511707684223, "learning_rate": 3.744028815815219e-06, "loss": 0.4865, "step": 1897 }, { "epoch": 0.81, "grad_norm": 0.5643243359785219, "learning_rate": 3.742398669466665e-06, "loss": 0.4897, "step": 1898 }, { "epoch": 0.81, "grad_norm": 0.5600481323676141, "learning_rate": 3.740767821326555e-06, "loss": 0.4708, "step": 1899 }, { "epoch": 0.81, "grad_norm": 0.5421854980261044, "learning_rate": 3.739136272316102e-06, "loss": 0.4797, "step": 1900 }, { "epoch": 0.81, "grad_norm": 0.5504823302075539, "learning_rate": 3.737504023356916e-06, "loss": 0.4477, "step": 1901 }, { "epoch": 0.81, "grad_norm": 0.5596996727298843, "learning_rate": 3.735871075371004e-06, "loss": 0.4751, "step": 1902 }, { "epoch": 0.81, "grad_norm": 0.5490200023377907, "learning_rate": 3.734237429280766e-06, "loss": 0.4667, "step": 1903 }, { "epoch": 0.81, "grad_norm": 0.6262743935742706, "learning_rate": 3.7326030860089955e-06, "loss": 0.4838, "step": 1904 }, { "epoch": 0.81, "grad_norm": 0.5824379568686556, "learning_rate": 3.7309680464788835e-06, "loss": 0.4905, "step": 1905 }, { "epoch": 0.81, "grad_norm": 0.53774534973763, "learning_rate": 3.72933231161401e-06, "loss": 0.4701, "step": 1906 }, { "epoch": 0.81, "grad_norm": 0.5754065009216981, "learning_rate": 3.72769588233835e-06, "loss": 0.5086, "step": 1907 }, { "epoch": 0.82, "grad_norm": 0.5438398948047919, "learning_rate": 3.726058759576271e-06, "loss": 0.5028, "step": 1908 }, { "epoch": 0.82, "grad_norm": 0.5545263081214099, "learning_rate": 3.724420944252531e-06, "loss": 0.4742, "step": 1909 }, { "epoch": 0.82, "grad_norm": 0.5598750002548621, "learning_rate": 3.72278243729228e-06, "loss": 0.4568, "step": 1910 }, { "epoch": 0.82, "grad_norm": 0.5657656081137544, "learning_rate": 3.7211432396210595e-06, "loss": 0.4478, "step": 1911 }, { "epoch": 0.82, "grad_norm": 0.5621097014773546, "learning_rate": 3.7195033521647987e-06, "loss": 0.4779, "step": 1912 }, { "epoch": 0.82, "grad_norm": 0.5412943721977179, "learning_rate": 3.7178627758498194e-06, "loss": 0.4553, "step": 1913 }, { "epoch": 0.82, "grad_norm": 0.577522286854261, "learning_rate": 3.71622151160283e-06, "loss": 0.4829, "step": 1914 }, { "epoch": 0.82, "grad_norm": 0.543384848094331, "learning_rate": 3.7145795603509282e-06, "loss": 0.4582, "step": 1915 }, { "epoch": 0.82, "grad_norm": 0.5778457930279592, "learning_rate": 3.712936923021602e-06, "loss": 0.4713, "step": 1916 }, { "epoch": 0.82, "grad_norm": 0.5766320078694117, "learning_rate": 3.7112936005427237e-06, "loss": 0.4683, "step": 1917 }, { "epoch": 0.82, "eval_loss": 0.4745747447013855, "eval_runtime": 6921.2455, "eval_samples_per_second": 41.956, "eval_steps_per_second": 2.098, "step": 1917 }, { "epoch": 0.82, "grad_norm": 0.5432751258589548, "learning_rate": 3.7096495938425537e-06, "loss": 0.4674, "step": 1918 }, { "epoch": 0.82, "grad_norm": 0.569125410720449, "learning_rate": 3.7080049038497405e-06, "loss": 0.4793, "step": 1919 }, { "epoch": 0.82, "grad_norm": 0.5681975559829856, "learning_rate": 3.706359531493316e-06, "loss": 0.4942, "step": 1920 }, { "epoch": 0.82, "grad_norm": 0.5598180203564832, "learning_rate": 3.704713477702699e-06, "loss": 0.4869, "step": 1921 }, { "epoch": 0.82, "grad_norm": 0.5358569958499817, "learning_rate": 3.703066743407694e-06, "loss": 0.493, "step": 1922 }, { "epoch": 0.82, "grad_norm": 0.5694771092869512, "learning_rate": 3.701419329538487e-06, "loss": 0.4798, "step": 1923 }, { "epoch": 0.82, "grad_norm": 0.5515301941280839, "learning_rate": 3.699771237025652e-06, "loss": 0.4732, "step": 1924 }, { "epoch": 0.82, "grad_norm": 0.5836180114241711, "learning_rate": 3.6981224668001427e-06, "loss": 0.4868, "step": 1925 }, { "epoch": 0.82, "grad_norm": 0.5724166590716755, "learning_rate": 3.696473019793297e-06, "loss": 0.4631, "step": 1926 }, { "epoch": 0.82, "grad_norm": 0.5794404466148781, "learning_rate": 3.694822896936836e-06, "loss": 0.5093, "step": 1927 }, { "epoch": 0.82, "grad_norm": 0.5788548411305892, "learning_rate": 3.6931720991628613e-06, "loss": 0.4849, "step": 1928 }, { "epoch": 0.82, "grad_norm": 0.5388008666947813, "learning_rate": 3.691520627403856e-06, "loss": 0.4653, "step": 1929 }, { "epoch": 0.82, "grad_norm": 0.548773537353004, "learning_rate": 3.6898684825926845e-06, "loss": 0.4823, "step": 1930 }, { "epoch": 0.82, "grad_norm": 0.5588102780031893, "learning_rate": 3.68821566566259e-06, "loss": 0.4644, "step": 1931 }, { "epoch": 0.83, "grad_norm": 0.5614220676467329, "learning_rate": 3.686562177547197e-06, "loss": 0.4392, "step": 1932 }, { "epoch": 0.83, "grad_norm": 0.57266859860218, "learning_rate": 3.6849080191805087e-06, "loss": 0.4784, "step": 1933 }, { "epoch": 0.83, "grad_norm": 0.5752230473014237, "learning_rate": 3.683253191496906e-06, "loss": 0.4782, "step": 1934 }, { "epoch": 0.83, "grad_norm": 0.5692017163618623, "learning_rate": 3.681597695431149e-06, "loss": 0.4845, "step": 1935 }, { "epoch": 0.83, "grad_norm": 0.5999017797504378, "learning_rate": 3.6799415319183753e-06, "loss": 0.5121, "step": 1936 }, { "epoch": 0.83, "grad_norm": 0.5542057486345733, "learning_rate": 3.678284701894097e-06, "loss": 0.4545, "step": 1937 }, { "epoch": 0.83, "grad_norm": 0.5744953689894631, "learning_rate": 3.6766272062942066e-06, "loss": 0.4673, "step": 1938 }, { "epoch": 0.83, "grad_norm": 0.5519474373087149, "learning_rate": 3.6749690460549704e-06, "loss": 0.4768, "step": 1939 }, { "epoch": 0.83, "grad_norm": 0.5308709424185206, "learning_rate": 3.6733102221130303e-06, "loss": 0.4675, "step": 1940 }, { "epoch": 0.83, "grad_norm": 0.5543162922631799, "learning_rate": 3.6716507354054044e-06, "loss": 0.4884, "step": 1941 }, { "epoch": 0.83, "grad_norm": 0.5525721347222098, "learning_rate": 3.669990586869482e-06, "loss": 0.4633, "step": 1942 }, { "epoch": 0.83, "grad_norm": 0.5676565038664707, "learning_rate": 3.6683297774430287e-06, "loss": 0.4915, "step": 1943 }, { "epoch": 0.83, "grad_norm": 0.5354820116105071, "learning_rate": 3.6666683080641846e-06, "loss": 0.471, "step": 1944 }, { "epoch": 0.83, "grad_norm": 0.5521553904563508, "learning_rate": 3.6650061796714597e-06, "loss": 0.4683, "step": 1945 }, { "epoch": 0.83, "grad_norm": 0.5853163676085599, "learning_rate": 3.6633433932037376e-06, "loss": 0.5075, "step": 1946 }, { "epoch": 0.83, "grad_norm": 0.5751517974852843, "learning_rate": 3.661679949600275e-06, "loss": 0.4778, "step": 1947 }, { "epoch": 0.83, "grad_norm": 0.5440950912602847, "learning_rate": 3.6600158498006955e-06, "loss": 0.4706, "step": 1948 }, { "epoch": 0.83, "grad_norm": 0.5476897010825572, "learning_rate": 3.6583510947449983e-06, "loss": 0.4612, "step": 1949 }, { "epoch": 0.83, "grad_norm": 0.5562096216167524, "learning_rate": 3.656685685373552e-06, "loss": 0.4692, "step": 1950 }, { "epoch": 0.83, "grad_norm": 0.551268497700442, "learning_rate": 3.6550196226270894e-06, "loss": 0.4598, "step": 1951 }, { "epoch": 0.83, "grad_norm": 0.5391818232642948, "learning_rate": 3.65335290744672e-06, "loss": 0.4695, "step": 1952 }, { "epoch": 0.83, "grad_norm": 0.5338996664057097, "learning_rate": 3.6516855407739164e-06, "loss": 0.4665, "step": 1953 }, { "epoch": 0.83, "grad_norm": 0.5579648908867901, "learning_rate": 3.6500175235505226e-06, "loss": 0.4883, "step": 1954 }, { "epoch": 0.84, "grad_norm": 0.5331291112438312, "learning_rate": 3.6483488567187473e-06, "loss": 0.4836, "step": 1955 }, { "epoch": 0.84, "grad_norm": 0.556976603909284, "learning_rate": 3.646679541221168e-06, "loss": 0.4893, "step": 1956 }, { "epoch": 0.84, "grad_norm": 0.5623081423655151, "learning_rate": 3.6450095780007277e-06, "loss": 0.4966, "step": 1957 }, { "epoch": 0.84, "grad_norm": 0.5490180827544557, "learning_rate": 3.643338968000736e-06, "loss": 0.4592, "step": 1958 }, { "epoch": 0.84, "grad_norm": 0.5673115477683868, "learning_rate": 3.641667712164867e-06, "loss": 0.488, "step": 1959 }, { "epoch": 0.84, "grad_norm": 0.5550807543029536, "learning_rate": 3.6399958114371597e-06, "loss": 0.477, "step": 1960 }, { "epoch": 0.84, "grad_norm": 0.5857922488282928, "learning_rate": 3.6383232667620195e-06, "loss": 0.4715, "step": 1961 }, { "epoch": 0.84, "grad_norm": 1.31408407563173, "learning_rate": 3.6366500790842113e-06, "loss": 0.4798, "step": 1962 }, { "epoch": 0.84, "grad_norm": 0.5616626137169414, "learning_rate": 3.634976249348867e-06, "loss": 0.4779, "step": 1963 }, { "epoch": 0.84, "grad_norm": 0.5426687056510846, "learning_rate": 3.633301778501481e-06, "loss": 0.478, "step": 1964 }, { "epoch": 0.84, "grad_norm": 0.5819270419939396, "learning_rate": 3.631626667487906e-06, "loss": 0.4819, "step": 1965 }, { "epoch": 0.84, "grad_norm": 0.5530881380000352, "learning_rate": 3.6299509172543616e-06, "loss": 0.4939, "step": 1966 }, { "epoch": 0.84, "grad_norm": 0.5705862965767243, "learning_rate": 3.628274528747424e-06, "loss": 0.4805, "step": 1967 }, { "epoch": 0.84, "grad_norm": 0.6039400830717704, "learning_rate": 3.6265975029140334e-06, "loss": 0.4728, "step": 1968 }, { "epoch": 0.84, "grad_norm": 0.5576346901828154, "learning_rate": 3.624919840701488e-06, "loss": 0.4876, "step": 1969 }, { "epoch": 0.84, "grad_norm": 0.58645769749298, "learning_rate": 3.623241543057445e-06, "loss": 0.4852, "step": 1970 }, { "epoch": 0.84, "grad_norm": 0.5694595348460465, "learning_rate": 3.6215626109299218e-06, "loss": 0.4943, "step": 1971 }, { "epoch": 0.84, "grad_norm": 0.5505430973110726, "learning_rate": 3.6198830452672944e-06, "loss": 0.4823, "step": 1972 }, { "epoch": 0.84, "grad_norm": 0.5666099153148945, "learning_rate": 3.618202847018296e-06, "loss": 0.4794, "step": 1973 }, { "epoch": 0.84, "grad_norm": 0.5649501462357687, "learning_rate": 3.616522017132017e-06, "loss": 0.4729, "step": 1974 }, { "epoch": 0.84, "grad_norm": 0.6007742923118407, "learning_rate": 3.614840556557905e-06, "loss": 0.4757, "step": 1975 }, { "epoch": 0.84, "grad_norm": 0.5766917548962726, "learning_rate": 3.613158466245763e-06, "loss": 0.4645, "step": 1976 }, { "epoch": 0.84, "grad_norm": 0.5540384171742403, "learning_rate": 3.6114757471457514e-06, "loss": 0.4525, "step": 1977 }, { "epoch": 0.85, "grad_norm": 0.5488107248965611, "learning_rate": 3.6097924002083838e-06, "loss": 0.4632, "step": 1978 }, { "epoch": 0.85, "grad_norm": 0.556985216041662, "learning_rate": 3.60810842638453e-06, "loss": 0.4871, "step": 1979 }, { "epoch": 0.85, "grad_norm": 0.5302372510269003, "learning_rate": 3.606423826625414e-06, "loss": 0.4819, "step": 1980 }, { "epoch": 0.85, "grad_norm": 0.5650443172227675, "learning_rate": 3.604738601882612e-06, "loss": 0.4668, "step": 1981 }, { "epoch": 0.85, "grad_norm": 0.5687893746146396, "learning_rate": 3.6030527531080533e-06, "loss": 0.4668, "step": 1982 }, { "epoch": 0.85, "grad_norm": 0.5483149007131946, "learning_rate": 3.6013662812540217e-06, "loss": 0.4633, "step": 1983 }, { "epoch": 0.85, "grad_norm": 0.5666223715356314, "learning_rate": 3.5996791872731508e-06, "loss": 0.4996, "step": 1984 }, { "epoch": 0.85, "grad_norm": 0.5610496348612013, "learning_rate": 3.5979914721184263e-06, "loss": 0.4638, "step": 1985 }, { "epoch": 0.85, "grad_norm": 0.556813398196998, "learning_rate": 3.5963031367431856e-06, "loss": 0.489, "step": 1986 }, { "epoch": 0.85, "grad_norm": 0.555985238033488, "learning_rate": 3.594614182101115e-06, "loss": 0.4644, "step": 1987 }, { "epoch": 0.85, "grad_norm": 1.2268257169954715, "learning_rate": 3.592924609146251e-06, "loss": 0.4715, "step": 1988 }, { "epoch": 0.85, "eval_loss": 0.4732716977596283, "eval_runtime": 6918.9919, "eval_samples_per_second": 41.97, "eval_steps_per_second": 2.099, "step": 1988 }, { "epoch": 0.85, "grad_norm": 0.5830835777523167, "learning_rate": 3.5912344188329812e-06, "loss": 0.4622, "step": 1989 }, { "epoch": 0.85, "grad_norm": 0.5532809255407818, "learning_rate": 3.5895436121160388e-06, "loss": 0.5057, "step": 1990 }, { "epoch": 0.85, "grad_norm": 0.5548636273482329, "learning_rate": 3.5878521899505083e-06, "loss": 0.4717, "step": 1991 }, { "epoch": 0.85, "grad_norm": 0.5756081970951604, "learning_rate": 3.5861601532918188e-06, "loss": 0.4828, "step": 1992 }, { "epoch": 0.85, "grad_norm": 0.561307751747662, "learning_rate": 3.58446750309575e-06, "loss": 0.471, "step": 1993 }, { "epoch": 0.85, "grad_norm": 0.5716452195722197, "learning_rate": 3.5827742403184246e-06, "loss": 0.4799, "step": 1994 }, { "epoch": 0.85, "grad_norm": 0.6374115187137489, "learning_rate": 3.5810803659163136e-06, "loss": 0.4685, "step": 1995 }, { "epoch": 0.85, "grad_norm": 0.5584668047661877, "learning_rate": 3.579385880846232e-06, "loss": 0.4733, "step": 1996 }, { "epoch": 0.85, "grad_norm": 0.5582439256078539, "learning_rate": 3.577690786065343e-06, "loss": 0.4657, "step": 1997 }, { "epoch": 0.85, "grad_norm": 0.5551491606524519, "learning_rate": 3.5759950825311497e-06, "loss": 0.499, "step": 1998 }, { "epoch": 0.85, "grad_norm": 0.6015108198668244, "learning_rate": 3.5742987712015016e-06, "loss": 0.4971, "step": 1999 }, { "epoch": 0.85, "grad_norm": 0.5552048556458112, "learning_rate": 3.5726018530345913e-06, "loss": 0.4661, "step": 2000 }, { "epoch": 0.85, "grad_norm": 0.5346633218936818, "learning_rate": 3.5709043289889538e-06, "loss": 0.4608, "step": 2001 }, { "epoch": 0.86, "grad_norm": 0.5373071208933639, "learning_rate": 3.5692062000234663e-06, "loss": 0.4647, "step": 2002 }, { "epoch": 0.86, "grad_norm": 0.5416519357021035, "learning_rate": 3.5675074670973485e-06, "loss": 0.4738, "step": 2003 }, { "epoch": 0.86, "grad_norm": 0.569652388343622, "learning_rate": 3.565808131170161e-06, "loss": 0.4749, "step": 2004 }, { "epoch": 0.86, "grad_norm": 0.540670625527253, "learning_rate": 3.564108193201804e-06, "loss": 0.4843, "step": 2005 }, { "epoch": 0.86, "grad_norm": 0.5637489546305884, "learning_rate": 3.562407654152518e-06, "loss": 0.4718, "step": 2006 }, { "epoch": 0.86, "grad_norm": 0.5563549147978012, "learning_rate": 3.5607065149828845e-06, "loss": 0.4862, "step": 2007 }, { "epoch": 0.86, "grad_norm": 0.5371061058438552, "learning_rate": 3.559004776653823e-06, "loss": 0.4974, "step": 2008 }, { "epoch": 0.86, "grad_norm": 0.544463201871563, "learning_rate": 3.557302440126591e-06, "loss": 0.4674, "step": 2009 }, { "epoch": 0.86, "grad_norm": 0.5563948529476817, "learning_rate": 3.5555995063627842e-06, "loss": 0.4703, "step": 2010 }, { "epoch": 0.86, "grad_norm": 0.5433738422335685, "learning_rate": 3.5538959763243363e-06, "loss": 0.4801, "step": 2011 }, { "epoch": 0.86, "grad_norm": 0.5616100198019316, "learning_rate": 3.552191850973517e-06, "loss": 0.496, "step": 2012 }, { "epoch": 0.86, "grad_norm": 0.5891584297064337, "learning_rate": 3.550487131272933e-06, "loss": 0.4766, "step": 2013 }, { "epoch": 0.86, "grad_norm": 0.5684784781174675, "learning_rate": 3.5487818181855253e-06, "loss": 0.4633, "step": 2014 }, { "epoch": 0.86, "grad_norm": 0.544328038939811, "learning_rate": 3.5470759126745726e-06, "loss": 0.4694, "step": 2015 }, { "epoch": 0.86, "grad_norm": 0.5505644001026635, "learning_rate": 3.545369415703685e-06, "loss": 0.4834, "step": 2016 }, { "epoch": 0.86, "grad_norm": 0.605202877335372, "learning_rate": 3.54366232823681e-06, "loss": 0.4879, "step": 2017 }, { "epoch": 0.86, "grad_norm": 0.5543782398842882, "learning_rate": 3.5419546512382264e-06, "loss": 0.461, "step": 2018 }, { "epoch": 0.86, "grad_norm": 0.564246989631432, "learning_rate": 3.540246385672547e-06, "loss": 0.4655, "step": 2019 }, { "epoch": 0.86, "grad_norm": 0.5286400658391471, "learning_rate": 3.5385375325047167e-06, "loss": 0.4352, "step": 2020 }, { "epoch": 0.86, "grad_norm": 0.5577835291298394, "learning_rate": 3.536828092700012e-06, "loss": 0.4591, "step": 2021 }, { "epoch": 0.86, "grad_norm": 0.5649431678973527, "learning_rate": 3.5351180672240413e-06, "loss": 0.4604, "step": 2022 }, { "epoch": 0.86, "grad_norm": 0.5672528493306068, "learning_rate": 3.5334074570427444e-06, "loss": 0.5038, "step": 2023 }, { "epoch": 0.86, "grad_norm": 0.5562216889012358, "learning_rate": 3.5316962631223896e-06, "loss": 0.4651, "step": 2024 }, { "epoch": 0.87, "grad_norm": 0.5639239221337747, "learning_rate": 3.5299844864295773e-06, "loss": 0.438, "step": 2025 }, { "epoch": 0.87, "grad_norm": 0.5501230990120523, "learning_rate": 3.5282721279312343e-06, "loss": 0.4518, "step": 2026 }, { "epoch": 0.87, "grad_norm": 0.565754118751582, "learning_rate": 3.5265591885946184e-06, "loss": 0.5105, "step": 2027 }, { "epoch": 0.87, "grad_norm": 0.5488876736412479, "learning_rate": 3.5248456693873152e-06, "loss": 0.4705, "step": 2028 }, { "epoch": 0.87, "grad_norm": 0.567410215517672, "learning_rate": 3.523131571277235e-06, "loss": 0.4649, "step": 2029 }, { "epoch": 0.87, "grad_norm": 0.56236312948077, "learning_rate": 3.5214168952326205e-06, "loss": 0.4625, "step": 2030 }, { "epoch": 0.87, "grad_norm": 0.5713132266773925, "learning_rate": 3.519701642222036e-06, "loss": 0.4613, "step": 2031 }, { "epoch": 0.87, "grad_norm": 0.5586906230581218, "learning_rate": 3.5179858132143727e-06, "loss": 0.4749, "step": 2032 }, { "epoch": 0.87, "grad_norm": 0.5452698201212837, "learning_rate": 3.5162694091788506e-06, "loss": 0.4704, "step": 2033 }, { "epoch": 0.87, "grad_norm": 0.5472112238723226, "learning_rate": 3.5145524310850088e-06, "loss": 0.4723, "step": 2034 }, { "epoch": 0.87, "grad_norm": 0.5239386822886098, "learning_rate": 3.5128348799027157e-06, "loss": 0.4832, "step": 2035 }, { "epoch": 0.87, "grad_norm": 0.5582354151966005, "learning_rate": 3.5111167566021607e-06, "loss": 0.4825, "step": 2036 }, { "epoch": 0.87, "grad_norm": 0.5661871633973496, "learning_rate": 3.509398062153857e-06, "loss": 0.4808, "step": 2037 }, { "epoch": 0.87, "grad_norm": 0.5450889485025441, "learning_rate": 3.507678797528641e-06, "loss": 0.4908, "step": 2038 }, { "epoch": 0.87, "grad_norm": 0.5299540248896896, "learning_rate": 3.5059589636976704e-06, "loss": 0.4392, "step": 2039 }, { "epoch": 0.87, "grad_norm": 0.5347817988173922, "learning_rate": 3.5042385616324243e-06, "loss": 0.4673, "step": 2040 }, { "epoch": 0.87, "grad_norm": 0.545451063247587, "learning_rate": 3.5025175923047034e-06, "loss": 0.4531, "step": 2041 }, { "epoch": 0.87, "grad_norm": 0.657050095176287, "learning_rate": 3.5007960566866296e-06, "loss": 0.4599, "step": 2042 }, { "epoch": 0.87, "grad_norm": 0.5948751733409405, "learning_rate": 3.499073955750642e-06, "loss": 0.4796, "step": 2043 }, { "epoch": 0.87, "grad_norm": 0.5467310616780172, "learning_rate": 3.497351290469503e-06, "loss": 0.5012, "step": 2044 }, { "epoch": 0.87, "grad_norm": 0.538398730069935, "learning_rate": 3.4956280618162887e-06, "loss": 0.4799, "step": 2045 }, { "epoch": 0.87, "grad_norm": 0.5563044950019395, "learning_rate": 3.4939042707643983e-06, "loss": 0.4456, "step": 2046 }, { "epoch": 0.87, "grad_norm": 0.5552862407440945, "learning_rate": 3.492179918287547e-06, "loss": 0.4715, "step": 2047 }, { "epoch": 0.87, "grad_norm": 0.5443648119446477, "learning_rate": 3.4904550053597646e-06, "loss": 0.4737, "step": 2048 }, { "epoch": 0.88, "grad_norm": 0.533397172596125, "learning_rate": 3.488729532955401e-06, "loss": 0.4327, "step": 2049 }, { "epoch": 0.88, "grad_norm": 0.5290432865271623, "learning_rate": 3.4870035020491216e-06, "loss": 0.4568, "step": 2050 }, { "epoch": 0.88, "grad_norm": 0.5653745834920318, "learning_rate": 3.4852769136159047e-06, "loss": 0.4719, "step": 2051 }, { "epoch": 0.88, "grad_norm": 3.260354916091318, "learning_rate": 3.4835497686310458e-06, "loss": 0.4502, "step": 2052 }, { "epoch": 0.88, "grad_norm": 0.5407916301520294, "learning_rate": 3.4818220680701554e-06, "loss": 0.493, "step": 2053 }, { "epoch": 0.88, "grad_norm": 0.5530804605769145, "learning_rate": 3.480093812909155e-06, "loss": 0.4591, "step": 2054 }, { "epoch": 0.88, "grad_norm": 0.5560882191153973, "learning_rate": 3.4783650041242823e-06, "loss": 0.4814, "step": 2055 }, { "epoch": 0.88, "grad_norm": 0.5491339314657214, "learning_rate": 3.4766356426920854e-06, "loss": 0.459, "step": 2056 }, { "epoch": 0.88, "grad_norm": 0.5446665246830417, "learning_rate": 3.474905729589427e-06, "loss": 0.4721, "step": 2057 }, { "epoch": 0.88, "grad_norm": 0.5597955365837155, "learning_rate": 3.4731752657934793e-06, "loss": 0.4657, "step": 2058 }, { "epoch": 0.88, "grad_norm": 0.5257446537011154, "learning_rate": 3.471444252281726e-06, "loss": 0.4791, "step": 2059 }, { "epoch": 0.88, "eval_loss": 0.47191861271858215, "eval_runtime": 6926.5022, "eval_samples_per_second": 41.924, "eval_steps_per_second": 2.096, "step": 2059 }, { "epoch": 0.88, "grad_norm": 0.5550797881194796, "learning_rate": 3.469712690031962e-06, "loss": 0.4586, "step": 2060 }, { "epoch": 0.88, "grad_norm": 0.5435199223357358, "learning_rate": 3.467980580022293e-06, "loss": 0.4572, "step": 2061 }, { "epoch": 0.88, "grad_norm": 0.5445376220038961, "learning_rate": 3.466247923231131e-06, "loss": 0.4535, "step": 2062 }, { "epoch": 0.88, "grad_norm": 0.5438364103684316, "learning_rate": 3.4645147206371997e-06, "loss": 0.469, "step": 2063 }, { "epoch": 0.88, "grad_norm": 0.5622544652606651, "learning_rate": 3.4627809732195306e-06, "loss": 0.5053, "step": 2064 }, { "epoch": 0.88, "grad_norm": 0.5787220304673627, "learning_rate": 3.4610466819574617e-06, "loss": 0.4916, "step": 2065 }, { "epoch": 0.88, "grad_norm": 0.5665331755731509, "learning_rate": 3.45931184783064e-06, "loss": 0.461, "step": 2066 }, { "epoch": 0.88, "grad_norm": 0.5465798109205211, "learning_rate": 3.4575764718190174e-06, "loss": 0.4681, "step": 2067 }, { "epoch": 0.88, "grad_norm": 0.5519620649686449, "learning_rate": 3.455840554902853e-06, "loss": 0.4842, "step": 2068 }, { "epoch": 0.88, "grad_norm": 0.5679253676438992, "learning_rate": 3.4541040980627117e-06, "loss": 0.4714, "step": 2069 }, { "epoch": 0.88, "grad_norm": 0.5537751246266448, "learning_rate": 3.4523671022794612e-06, "loss": 0.4746, "step": 2070 }, { "epoch": 0.88, "grad_norm": 0.5895813898958205, "learning_rate": 3.450629568534277e-06, "loss": 0.483, "step": 2071 }, { "epoch": 0.89, "grad_norm": 0.5820282697400692, "learning_rate": 3.448891497808636e-06, "loss": 0.4948, "step": 2072 }, { "epoch": 0.89, "grad_norm": 0.5849936796434134, "learning_rate": 3.4471528910843193e-06, "loss": 0.4604, "step": 2073 }, { "epoch": 0.89, "grad_norm": 0.5994615032661528, "learning_rate": 3.4454137493434107e-06, "loss": 0.4893, "step": 2074 }, { "epoch": 0.89, "grad_norm": 0.5648890671471385, "learning_rate": 3.443674073568296e-06, "loss": 0.4761, "step": 2075 }, { "epoch": 0.89, "grad_norm": 0.5800587534232966, "learning_rate": 3.441933864741663e-06, "loss": 0.4747, "step": 2076 }, { "epoch": 0.89, "grad_norm": 0.5515407287133164, "learning_rate": 3.4401931238464996e-06, "loss": 0.4339, "step": 2077 }, { "epoch": 0.89, "grad_norm": 0.5814192658954972, "learning_rate": 3.438451851866097e-06, "loss": 0.4769, "step": 2078 }, { "epoch": 0.89, "grad_norm": 0.5393565470119203, "learning_rate": 3.4367100497840416e-06, "loss": 0.4843, "step": 2079 }, { "epoch": 0.89, "grad_norm": 0.5789119658212182, "learning_rate": 3.4349677185842246e-06, "loss": 0.4767, "step": 2080 }, { "epoch": 0.89, "grad_norm": 0.5605931723414512, "learning_rate": 3.433224859250832e-06, "loss": 0.4735, "step": 2081 }, { "epoch": 0.89, "grad_norm": 0.6040063691004713, "learning_rate": 3.4314814727683506e-06, "loss": 0.4968, "step": 2082 }, { "epoch": 0.89, "grad_norm": 0.5148532844857273, "learning_rate": 3.429737560121564e-06, "loss": 0.4516, "step": 2083 }, { "epoch": 0.89, "grad_norm": 0.5936934662433498, "learning_rate": 3.427993122295552e-06, "loss": 0.4827, "step": 2084 }, { "epoch": 0.89, "grad_norm": 0.5666076166587783, "learning_rate": 3.4262481602756937e-06, "loss": 0.4535, "step": 2085 }, { "epoch": 0.89, "grad_norm": 0.542895247264907, "learning_rate": 3.4245026750476618e-06, "loss": 0.4531, "step": 2086 }, { "epoch": 0.89, "grad_norm": 0.541516478103584, "learning_rate": 3.4227566675974256e-06, "loss": 0.4892, "step": 2087 }, { "epoch": 0.89, "grad_norm": 0.5605571680518421, "learning_rate": 3.421010138911249e-06, "loss": 0.4508, "step": 2088 }, { "epoch": 0.89, "grad_norm": 0.5849102411377991, "learning_rate": 3.4192630899756924e-06, "loss": 0.4528, "step": 2089 }, { "epoch": 0.89, "grad_norm": 0.5336283093255381, "learning_rate": 3.4175155217776057e-06, "loss": 0.4516, "step": 2090 }, { "epoch": 0.89, "grad_norm": 0.5558368518531489, "learning_rate": 3.4157674353041358e-06, "loss": 0.4703, "step": 2091 }, { "epoch": 0.89, "grad_norm": 0.5552349223460866, "learning_rate": 3.4140188315427216e-06, "loss": 0.497, "step": 2092 }, { "epoch": 0.89, "grad_norm": 0.5511369795033969, "learning_rate": 3.4122697114810934e-06, "loss": 0.5119, "step": 2093 }, { "epoch": 0.89, "grad_norm": 0.5583288164174555, "learning_rate": 3.410520076107273e-06, "loss": 0.4827, "step": 2094 }, { "epoch": 0.9, "grad_norm": 0.567817131338669, "learning_rate": 3.4087699264095746e-06, "loss": 0.4634, "step": 2095 }, { "epoch": 0.9, "grad_norm": 0.5815719817752771, "learning_rate": 3.4070192633766025e-06, "loss": 0.4886, "step": 2096 }, { "epoch": 0.9, "grad_norm": 0.5479218943057999, "learning_rate": 3.405268087997251e-06, "loss": 0.4886, "step": 2097 }, { "epoch": 0.9, "grad_norm": 0.5538188239242408, "learning_rate": 3.4035164012607013e-06, "loss": 0.4459, "step": 2098 }, { "epoch": 0.9, "grad_norm": 0.54601055265423, "learning_rate": 3.401764204156428e-06, "loss": 0.4682, "step": 2099 }, { "epoch": 0.9, "grad_norm": 0.5752200802837831, "learning_rate": 3.4000114976741905e-06, "loss": 0.4858, "step": 2100 }, { "epoch": 0.9, "grad_norm": 0.5329434779873036, "learning_rate": 3.3982582828040373e-06, "loss": 0.4508, "step": 2101 }, { "epoch": 0.9, "grad_norm": 0.5626679379467617, "learning_rate": 3.3965045605363036e-06, "loss": 0.4974, "step": 2102 }, { "epoch": 0.9, "grad_norm": 0.5227078861625193, "learning_rate": 3.3947503318616117e-06, "loss": 0.447, "step": 2103 }, { "epoch": 0.9, "grad_norm": 0.5318507506941723, "learning_rate": 3.3929955977708686e-06, "loss": 0.4611, "step": 2104 }, { "epoch": 0.9, "grad_norm": 0.5652337604907314, "learning_rate": 3.391240359255269e-06, "loss": 0.4825, "step": 2105 }, { "epoch": 0.9, "grad_norm": 0.5661117477966576, "learning_rate": 3.3894846173062917e-06, "loss": 0.4569, "step": 2106 }, { "epoch": 0.9, "grad_norm": 0.5225946246169721, "learning_rate": 3.3877283729156983e-06, "loss": 0.4646, "step": 2107 }, { "epoch": 0.9, "grad_norm": 0.522801955720487, "learning_rate": 3.385971627075537e-06, "loss": 0.4846, "step": 2108 }, { "epoch": 0.9, "grad_norm": 0.5654497625396676, "learning_rate": 3.3842143807781363e-06, "loss": 0.4641, "step": 2109 }, { "epoch": 0.9, "grad_norm": 0.553968034176698, "learning_rate": 3.38245663501611e-06, "loss": 0.4771, "step": 2110 }, { "epoch": 0.9, "grad_norm": 0.5512083756758241, "learning_rate": 3.3806983907823526e-06, "loss": 0.4711, "step": 2111 }, { "epoch": 0.9, "grad_norm": 0.5612387241279982, "learning_rate": 3.378939649070039e-06, "loss": 0.4929, "step": 2112 }, { "epoch": 0.9, "grad_norm": 0.5634378274470825, "learning_rate": 3.3771804108726294e-06, "loss": 0.4668, "step": 2113 }, { "epoch": 0.9, "grad_norm": 0.5691963273905253, "learning_rate": 3.375420677183859e-06, "loss": 0.4857, "step": 2114 }, { "epoch": 0.9, "grad_norm": 0.556084686207028, "learning_rate": 3.3736604489977465e-06, "loss": 0.4756, "step": 2115 }, { "epoch": 0.9, "grad_norm": 0.5453441170478283, "learning_rate": 3.3718997273085883e-06, "loss": 0.4633, "step": 2116 }, { "epoch": 0.9, "grad_norm": 0.5336519493865814, "learning_rate": 3.3701385131109617e-06, "loss": 0.4587, "step": 2117 }, { "epoch": 0.9, "grad_norm": 0.54183517888334, "learning_rate": 3.368376807399719e-06, "loss": 0.4897, "step": 2118 }, { "epoch": 0.91, "grad_norm": 0.537940362868582, "learning_rate": 3.3666146111699926e-06, "loss": 0.4613, "step": 2119 }, { "epoch": 0.91, "grad_norm": 0.5229756265690545, "learning_rate": 3.3648519254171906e-06, "loss": 0.4689, "step": 2120 }, { "epoch": 0.91, "grad_norm": 0.6066336811051063, "learning_rate": 3.363088751136999e-06, "loss": 0.5001, "step": 2121 }, { "epoch": 0.91, "grad_norm": 0.5518196727992455, "learning_rate": 3.3613250893253794e-06, "loss": 0.5078, "step": 2122 }, { "epoch": 0.91, "grad_norm": 0.5602072507870511, "learning_rate": 3.3595609409785668e-06, "loss": 0.479, "step": 2123 }, { "epoch": 0.91, "grad_norm": 0.5609819914853433, "learning_rate": 3.357796307093074e-06, "loss": 0.4798, "step": 2124 }, { "epoch": 0.91, "grad_norm": 0.540124268427881, "learning_rate": 3.3560311886656855e-06, "loss": 0.4707, "step": 2125 }, { "epoch": 0.91, "grad_norm": 0.5273549257492, "learning_rate": 3.3542655866934613e-06, "loss": 0.451, "step": 2126 }, { "epoch": 0.91, "grad_norm": 0.5257257097173317, "learning_rate": 3.352499502173734e-06, "loss": 0.4422, "step": 2127 }, { "epoch": 0.91, "grad_norm": 0.5242099348991289, "learning_rate": 3.350732936104108e-06, "loss": 0.4553, "step": 2128 }, { "epoch": 0.91, "grad_norm": 0.5871633066769245, "learning_rate": 3.3489658894824614e-06, "loss": 0.4706, "step": 2129 }, { "epoch": 0.91, "grad_norm": 0.5558589637343108, "learning_rate": 3.3471983633069414e-06, "loss": 0.4506, "step": 2130 }, { "epoch": 0.91, "eval_loss": 0.4703803062438965, "eval_runtime": 6929.62, "eval_samples_per_second": 41.905, "eval_steps_per_second": 2.095, "step": 2130 }, { "epoch": 0.91, "grad_norm": 0.538163796065571, "learning_rate": 3.3454303585759684e-06, "loss": 0.4616, "step": 2131 }, { "epoch": 0.91, "grad_norm": 0.6059394097926508, "learning_rate": 3.3436618762882322e-06, "loss": 0.4803, "step": 2132 }, { "epoch": 0.91, "grad_norm": 1.0796137145313163, "learning_rate": 3.3418929174426918e-06, "loss": 0.4908, "step": 2133 }, { "epoch": 0.91, "grad_norm": 0.5577675000716914, "learning_rate": 3.3401234830385753e-06, "loss": 0.457, "step": 2134 }, { "epoch": 0.91, "grad_norm": 0.5830147544071842, "learning_rate": 3.3383535740753813e-06, "loss": 0.4715, "step": 2135 }, { "epoch": 0.91, "grad_norm": 0.5860738051813335, "learning_rate": 3.336583191552876e-06, "loss": 0.4749, "step": 2136 }, { "epoch": 0.91, "grad_norm": 0.5718370942498533, "learning_rate": 3.334812336471089e-06, "loss": 0.462, "step": 2137 }, { "epoch": 0.91, "grad_norm": 0.5677796507381937, "learning_rate": 3.3330410098303224e-06, "loss": 0.4658, "step": 2138 }, { "epoch": 0.91, "grad_norm": 0.5660363318045506, "learning_rate": 3.3312692126311424e-06, "loss": 0.4654, "step": 2139 }, { "epoch": 0.91, "grad_norm": 0.5529225478270423, "learning_rate": 3.32949694587438e-06, "loss": 0.4853, "step": 2140 }, { "epoch": 0.91, "grad_norm": 0.558224467702825, "learning_rate": 3.3277242105611334e-06, "loss": 0.4635, "step": 2141 }, { "epoch": 0.92, "grad_norm": 0.5895844984799891, "learning_rate": 3.3259510076927644e-06, "loss": 0.4831, "step": 2142 }, { "epoch": 0.92, "grad_norm": 0.5474505234053266, "learning_rate": 3.324177338270898e-06, "loss": 0.4723, "step": 2143 }, { "epoch": 0.92, "grad_norm": 0.5565011263756835, "learning_rate": 3.322403203297424e-06, "loss": 0.4524, "step": 2144 }, { "epoch": 0.92, "grad_norm": 0.653742264632734, "learning_rate": 3.320628603774496e-06, "loss": 0.4539, "step": 2145 }, { "epoch": 0.92, "grad_norm": 0.5762670304100043, "learning_rate": 3.3188535407045274e-06, "loss": 0.4779, "step": 2146 }, { "epoch": 0.92, "grad_norm": 0.549175111157888, "learning_rate": 3.317078015090197e-06, "loss": 0.5013, "step": 2147 }, { "epoch": 0.92, "grad_norm": 0.5736471852199507, "learning_rate": 3.315302027934441e-06, "loss": 0.4816, "step": 2148 }, { "epoch": 0.92, "grad_norm": 0.557613887379806, "learning_rate": 3.313525580240459e-06, "loss": 0.4756, "step": 2149 }, { "epoch": 0.92, "grad_norm": 0.5468948120560473, "learning_rate": 3.3117486730117092e-06, "loss": 0.4753, "step": 2150 }, { "epoch": 0.92, "grad_norm": 0.5267251356037557, "learning_rate": 3.309971307251911e-06, "loss": 0.4672, "step": 2151 }, { "epoch": 0.92, "grad_norm": 0.5661351158225999, "learning_rate": 3.3081934839650404e-06, "loss": 0.4607, "step": 2152 }, { "epoch": 0.92, "grad_norm": 0.5777629397975975, "learning_rate": 3.3064152041553356e-06, "loss": 0.4451, "step": 2153 }, { "epoch": 0.92, "grad_norm": 0.5700643158197273, "learning_rate": 3.304636468827288e-06, "loss": 0.4695, "step": 2154 }, { "epoch": 0.92, "grad_norm": 0.5615418059663319, "learning_rate": 3.3028572789856507e-06, "loss": 0.469, "step": 2155 }, { "epoch": 0.92, "grad_norm": 0.5871509589578386, "learning_rate": 3.30107763563543e-06, "loss": 0.4827, "step": 2156 }, { "epoch": 0.92, "grad_norm": 0.5558568799307586, "learning_rate": 3.299297539781891e-06, "loss": 0.4716, "step": 2157 }, { "epoch": 0.92, "grad_norm": 0.5545006329981151, "learning_rate": 3.2975169924305524e-06, "loss": 0.4463, "step": 2158 }, { "epoch": 0.92, "grad_norm": 3.548575566071066, "learning_rate": 3.29573599458719e-06, "loss": 0.4889, "step": 2159 }, { "epoch": 0.92, "grad_norm": 0.5213212611415827, "learning_rate": 3.2939545472578314e-06, "loss": 0.4867, "step": 2160 }, { "epoch": 0.92, "grad_norm": 0.5843877680577335, "learning_rate": 3.292172651448761e-06, "loss": 0.4658, "step": 2161 }, { "epoch": 0.92, "grad_norm": 0.5710485476550279, "learning_rate": 3.290390308166515e-06, "loss": 0.4722, "step": 2162 }, { "epoch": 0.92, "grad_norm": 0.5539562494433544, "learning_rate": 3.2886075184178817e-06, "loss": 0.4885, "step": 2163 }, { "epoch": 0.92, "grad_norm": 0.6485074098343331, "learning_rate": 3.2868242832099034e-06, "loss": 0.471, "step": 2164 }, { "epoch": 0.92, "grad_norm": 0.6332549066175103, "learning_rate": 3.285040603549872e-06, "loss": 0.4821, "step": 2165 }, { "epoch": 0.93, "grad_norm": 0.5894833989878483, "learning_rate": 3.2832564804453327e-06, "loss": 0.4571, "step": 2166 }, { "epoch": 0.93, "grad_norm": 0.5558289841136125, "learning_rate": 3.281471914904079e-06, "loss": 0.4744, "step": 2167 }, { "epoch": 0.93, "grad_norm": 0.5809032359736926, "learning_rate": 3.2796869079341555e-06, "loss": 0.4868, "step": 2168 }, { "epoch": 0.93, "grad_norm": 0.5635833248240615, "learning_rate": 3.2779014605438563e-06, "loss": 0.4736, "step": 2169 }, { "epoch": 0.93, "grad_norm": 0.5668615337342484, "learning_rate": 3.276115573741724e-06, "loss": 0.4737, "step": 2170 }, { "epoch": 0.93, "grad_norm": 0.5702039104950186, "learning_rate": 3.274329248536548e-06, "loss": 0.459, "step": 2171 }, { "epoch": 0.93, "grad_norm": 0.5220961360234252, "learning_rate": 3.272542485937369e-06, "loss": 0.4656, "step": 2172 }, { "epoch": 0.93, "grad_norm": 0.5594033494520262, "learning_rate": 3.270755286953471e-06, "loss": 0.4814, "step": 2173 }, { "epoch": 0.93, "grad_norm": 0.5741756929068141, "learning_rate": 3.2689676525943854e-06, "loss": 0.4815, "step": 2174 }, { "epoch": 0.93, "grad_norm": 0.5464966733679589, "learning_rate": 3.267179583869892e-06, "loss": 0.4593, "step": 2175 }, { "epoch": 0.93, "grad_norm": 0.5721831411264321, "learning_rate": 3.265391081790012e-06, "loss": 0.4761, "step": 2176 }, { "epoch": 0.93, "grad_norm": 0.5456090495324294, "learning_rate": 3.2636021473650143e-06, "loss": 0.4875, "step": 2177 }, { "epoch": 0.93, "grad_norm": 0.5463724345795404, "learning_rate": 3.2618127816054117e-06, "loss": 0.4726, "step": 2178 }, { "epoch": 0.93, "grad_norm": 0.5367573084821134, "learning_rate": 3.2600229855219595e-06, "loss": 0.473, "step": 2179 }, { "epoch": 0.93, "grad_norm": 0.5855822548291622, "learning_rate": 3.2582327601256567e-06, "loss": 0.4782, "step": 2180 }, { "epoch": 0.93, "grad_norm": 0.5565384228650471, "learning_rate": 3.256442106427745e-06, "loss": 0.457, "step": 2181 }, { "epoch": 0.93, "grad_norm": 0.5386204130175131, "learning_rate": 3.254651025439707e-06, "loss": 0.4684, "step": 2182 }, { "epoch": 0.93, "grad_norm": 0.5334182218422864, "learning_rate": 3.252859518173269e-06, "loss": 0.4645, "step": 2183 }, { "epoch": 0.93, "grad_norm": 0.5715830265673014, "learning_rate": 3.251067585640395e-06, "loss": 0.4772, "step": 2184 }, { "epoch": 0.93, "grad_norm": 0.5741015269134933, "learning_rate": 3.249275228853292e-06, "loss": 0.4627, "step": 2185 }, { "epoch": 0.93, "grad_norm": 0.5672462313761777, "learning_rate": 3.247482448824405e-06, "loss": 0.4903, "step": 2186 }, { "epoch": 0.93, "grad_norm": 0.5373081330698016, "learning_rate": 3.245689246566418e-06, "loss": 0.4816, "step": 2187 }, { "epoch": 0.93, "grad_norm": 0.5476617186268243, "learning_rate": 3.243895623092254e-06, "loss": 0.4886, "step": 2188 }, { "epoch": 0.94, "grad_norm": 0.5356939267254818, "learning_rate": 3.2421015794150755e-06, "loss": 0.4765, "step": 2189 }, { "epoch": 0.94, "grad_norm": 0.5494604506543436, "learning_rate": 3.240307116548279e-06, "loss": 0.5035, "step": 2190 }, { "epoch": 0.94, "grad_norm": 0.5930523473140368, "learning_rate": 3.2385122355055004e-06, "loss": 0.4962, "step": 2191 }, { "epoch": 0.94, "grad_norm": 0.602008763063395, "learning_rate": 3.2367169373006114e-06, "loss": 0.4892, "step": 2192 }, { "epoch": 0.94, "grad_norm": 0.5651243722247239, "learning_rate": 3.234921222947718e-06, "loss": 0.4829, "step": 2193 }, { "epoch": 0.94, "grad_norm": 0.5385560583261362, "learning_rate": 3.2331250934611623e-06, "loss": 0.482, "step": 2194 }, { "epoch": 0.94, "grad_norm": 0.5547490494472022, "learning_rate": 3.231328549855522e-06, "loss": 0.4612, "step": 2195 }, { "epoch": 0.94, "grad_norm": 0.5470853885746786, "learning_rate": 3.2295315931456057e-06, "loss": 0.4593, "step": 2196 }, { "epoch": 0.94, "grad_norm": 0.5786280065830184, "learning_rate": 3.227734224346458e-06, "loss": 0.4769, "step": 2197 }, { "epoch": 0.94, "grad_norm": 0.5344368582425154, "learning_rate": 3.2259364444733567e-06, "loss": 0.466, "step": 2198 }, { "epoch": 0.94, "grad_norm": 0.5555074280873257, "learning_rate": 3.2241382545418087e-06, "loss": 0.4648, "step": 2199 }, { "epoch": 0.94, "grad_norm": 0.5482269284810969, "learning_rate": 3.222339655567556e-06, "loss": 0.4769, "step": 2200 }, { "epoch": 0.94, "grad_norm": 0.5667532379426644, "learning_rate": 3.2205406485665693e-06, "loss": 0.4695, "step": 2201 }, { "epoch": 0.94, "eval_loss": 0.46910035610198975, "eval_runtime": 6929.3666, "eval_samples_per_second": 41.907, "eval_steps_per_second": 2.095, "step": 2201 }, { "epoch": 0.94, "grad_norm": 0.5516775609671802, "learning_rate": 3.2187412345550493e-06, "loss": 0.4626, "step": 2202 }, { "epoch": 0.94, "grad_norm": 0.5274995673676252, "learning_rate": 3.2169414145494306e-06, "loss": 0.4542, "step": 2203 }, { "epoch": 0.94, "grad_norm": 0.5594394034476171, "learning_rate": 3.2151411895663713e-06, "loss": 0.5077, "step": 2204 }, { "epoch": 0.94, "grad_norm": 0.5592366706538346, "learning_rate": 3.2133405606227636e-06, "loss": 0.4836, "step": 2205 }, { "epoch": 0.94, "grad_norm": 0.5598764473379652, "learning_rate": 3.2115395287357247e-06, "loss": 0.4808, "step": 2206 }, { "epoch": 0.94, "grad_norm": 0.5593396084233587, "learning_rate": 3.2097380949226004e-06, "loss": 0.4692, "step": 2207 }, { "epoch": 0.94, "grad_norm": 0.5338314791656622, "learning_rate": 3.2079362602009633e-06, "loss": 0.4823, "step": 2208 }, { "epoch": 0.94, "grad_norm": 0.530750479189892, "learning_rate": 3.2061340255886135e-06, "loss": 0.4787, "step": 2209 }, { "epoch": 0.94, "grad_norm": 0.5710624896170967, "learning_rate": 3.2043313921035747e-06, "loss": 0.4536, "step": 2210 }, { "epoch": 0.94, "grad_norm": 0.5731148940480776, "learning_rate": 3.2025283607640985e-06, "loss": 0.4813, "step": 2211 }, { "epoch": 0.95, "grad_norm": 0.5946357271072944, "learning_rate": 3.200724932588659e-06, "loss": 0.4819, "step": 2212 }, { "epoch": 0.95, "grad_norm": 0.5728672004460359, "learning_rate": 3.1989211085959558e-06, "loss": 0.4919, "step": 2213 }, { "epoch": 0.95, "grad_norm": 0.5568909639263977, "learning_rate": 3.197116889804913e-06, "loss": 0.4864, "step": 2214 }, { "epoch": 0.95, "grad_norm": 0.5378921515155873, "learning_rate": 3.1953122772346757e-06, "loss": 0.4708, "step": 2215 }, { "epoch": 0.95, "grad_norm": 0.5540326517638553, "learning_rate": 3.193507271904612e-06, "loss": 0.4961, "step": 2216 }, { "epoch": 0.95, "grad_norm": 0.5264231160255753, "learning_rate": 3.191701874834312e-06, "loss": 0.4335, "step": 2217 }, { "epoch": 0.95, "grad_norm": 0.5506768071233391, "learning_rate": 3.1898960870435875e-06, "loss": 0.4793, "step": 2218 }, { "epoch": 0.95, "grad_norm": 0.5399042085543564, "learning_rate": 3.1880899095524698e-06, "loss": 0.4538, "step": 2219 }, { "epoch": 0.95, "grad_norm": 0.5470743327622264, "learning_rate": 3.1862833433812137e-06, "loss": 0.4624, "step": 2220 }, { "epoch": 0.95, "grad_norm": 0.5366708239822617, "learning_rate": 3.1844763895502876e-06, "loss": 0.4735, "step": 2221 }, { "epoch": 0.95, "grad_norm": 0.5593381531413127, "learning_rate": 3.1826690490803846e-06, "loss": 0.4577, "step": 2222 }, { "epoch": 0.95, "grad_norm": 0.5380013310652985, "learning_rate": 3.180861322992414e-06, "loss": 0.4526, "step": 2223 }, { "epoch": 0.95, "grad_norm": 0.5415307669206325, "learning_rate": 3.179053212307502e-06, "loss": 0.484, "step": 2224 }, { "epoch": 0.95, "grad_norm": 0.5610218584434514, "learning_rate": 3.1772447180469934e-06, "loss": 0.4998, "step": 2225 }, { "epoch": 0.95, "grad_norm": 0.5407354208309232, "learning_rate": 3.1754358412324483e-06, "loss": 0.4603, "step": 2226 }, { "epoch": 0.95, "grad_norm": 0.557457231879274, "learning_rate": 3.173626582885645e-06, "loss": 0.4657, "step": 2227 }, { "epoch": 0.95, "grad_norm": 0.5712441602450918, "learning_rate": 3.1718169440285763e-06, "loss": 0.4893, "step": 2228 }, { "epoch": 0.95, "grad_norm": 0.5433914547191268, "learning_rate": 3.1700069256834478e-06, "loss": 0.503, "step": 2229 }, { "epoch": 0.95, "grad_norm": 0.529494115478655, "learning_rate": 3.1681965288726825e-06, "loss": 0.4474, "step": 2230 }, { "epoch": 0.95, "grad_norm": 0.5572186853202381, "learning_rate": 3.166385754618917e-06, "loss": 0.4572, "step": 2231 }, { "epoch": 0.95, "grad_norm": 0.5258044718918907, "learning_rate": 3.1645746039449987e-06, "loss": 0.4461, "step": 2232 }, { "epoch": 0.95, "grad_norm": 0.5420554964039433, "learning_rate": 3.16276307787399e-06, "loss": 0.4553, "step": 2233 }, { "epoch": 0.95, "grad_norm": 0.5405232645917257, "learning_rate": 3.1609511774291646e-06, "loss": 0.4557, "step": 2234 }, { "epoch": 0.95, "grad_norm": 0.5847047282231195, "learning_rate": 3.1591389036340064e-06, "loss": 0.4527, "step": 2235 }, { "epoch": 0.96, "grad_norm": 0.5423932667911106, "learning_rate": 3.157326257512212e-06, "loss": 0.457, "step": 2236 }, { "epoch": 0.96, "grad_norm": 0.5692656034737968, "learning_rate": 3.1555132400876877e-06, "loss": 0.4943, "step": 2237 }, { "epoch": 0.96, "grad_norm": 0.5413038156860533, "learning_rate": 3.15369985238455e-06, "loss": 0.4516, "step": 2238 }, { "epoch": 0.96, "grad_norm": 0.5381862150816308, "learning_rate": 3.151886095427123e-06, "loss": 0.4753, "step": 2239 }, { "epoch": 0.96, "grad_norm": 0.5727293376987612, "learning_rate": 3.1500719702399406e-06, "loss": 0.5059, "step": 2240 }, { "epoch": 0.96, "grad_norm": 0.5805006268410494, "learning_rate": 3.1482574778477447e-06, "loss": 0.4524, "step": 2241 }, { "epoch": 0.96, "grad_norm": 0.528343362595312, "learning_rate": 3.146442619275486e-06, "loss": 0.471, "step": 2242 }, { "epoch": 0.96, "grad_norm": 0.5677192912627839, "learning_rate": 3.1446273955483173e-06, "loss": 0.4462, "step": 2243 }, { "epoch": 0.96, "grad_norm": 0.569474282068865, "learning_rate": 3.142811807691603e-06, "loss": 0.4663, "step": 2244 }, { "epoch": 0.96, "grad_norm": 0.5669828732246334, "learning_rate": 3.1409958567309114e-06, "loss": 0.4694, "step": 2245 }, { "epoch": 0.96, "grad_norm": 0.529668789550494, "learning_rate": 3.1391795436920136e-06, "loss": 0.4679, "step": 2246 }, { "epoch": 0.96, "grad_norm": 0.5397155007947799, "learning_rate": 3.1373628696008883e-06, "loss": 0.4856, "step": 2247 }, { "epoch": 0.96, "grad_norm": 0.5495612328617239, "learning_rate": 3.1355458354837183e-06, "loss": 0.4909, "step": 2248 }, { "epoch": 0.96, "grad_norm": 0.5196306467160406, "learning_rate": 3.133728442366885e-06, "loss": 0.4525, "step": 2249 }, { "epoch": 0.96, "grad_norm": 0.5278171503733031, "learning_rate": 3.1319106912769797e-06, "loss": 0.4458, "step": 2250 }, { "epoch": 0.96, "grad_norm": 0.5621344379861554, "learning_rate": 3.13009258324079e-06, "loss": 0.4288, "step": 2251 }, { "epoch": 0.96, "grad_norm": 0.5340342660151414, "learning_rate": 3.128274119285309e-06, "loss": 0.449, "step": 2252 }, { "epoch": 0.96, "grad_norm": 0.5771753795948189, "learning_rate": 3.1264553004377285e-06, "loss": 0.493, "step": 2253 }, { "epoch": 0.96, "grad_norm": 0.6109163801950218, "learning_rate": 3.1246361277254405e-06, "loss": 0.5092, "step": 2254 }, { "epoch": 0.96, "grad_norm": 0.5464493547347894, "learning_rate": 3.122816602176039e-06, "loss": 0.4831, "step": 2255 }, { "epoch": 0.96, "grad_norm": 0.5388061503869606, "learning_rate": 3.1209967248173167e-06, "loss": 0.474, "step": 2256 }, { "epoch": 0.96, "grad_norm": 0.5671598890046339, "learning_rate": 3.119176496677263e-06, "loss": 0.4911, "step": 2257 }, { "epoch": 0.96, "grad_norm": 0.5456085685153534, "learning_rate": 3.1173559187840683e-06, "loss": 0.4614, "step": 2258 }, { "epoch": 0.97, "grad_norm": 0.54513522335301, "learning_rate": 3.115534992166119e-06, "loss": 0.4692, "step": 2259 }, { "epoch": 0.97, "grad_norm": 0.5262225869939646, "learning_rate": 3.1137137178519983e-06, "loss": 0.4568, "step": 2260 }, { "epoch": 0.97, "grad_norm": 0.5473252038967218, "learning_rate": 3.111892096870487e-06, "loss": 0.4611, "step": 2261 }, { "epoch": 0.97, "grad_norm": 0.5599766804442933, "learning_rate": 3.1100701302505586e-06, "loss": 0.4753, "step": 2262 }, { "epoch": 0.97, "grad_norm": 0.5756313356809942, "learning_rate": 3.1082478190213872e-06, "loss": 0.4845, "step": 2263 }, { "epoch": 0.97, "grad_norm": 0.5480923324133978, "learning_rate": 3.106425164212338e-06, "loss": 0.487, "step": 2264 }, { "epoch": 0.97, "grad_norm": 0.5315541819290027, "learning_rate": 3.1046021668529684e-06, "loss": 0.4702, "step": 2265 }, { "epoch": 0.97, "grad_norm": 0.5476930314697923, "learning_rate": 3.1027788279730343e-06, "loss": 0.4817, "step": 2266 }, { "epoch": 0.97, "grad_norm": 0.544515736247213, "learning_rate": 3.1009551486024814e-06, "loss": 0.4587, "step": 2267 }, { "epoch": 0.97, "grad_norm": 0.5703546973497986, "learning_rate": 3.099131129771448e-06, "loss": 0.4617, "step": 2268 }, { "epoch": 0.97, "grad_norm": 0.5424534272905377, "learning_rate": 3.0973067725102636e-06, "loss": 0.4484, "step": 2269 }, { "epoch": 0.97, "grad_norm": 0.5733254545058311, "learning_rate": 3.0954820778494516e-06, "loss": 0.49, "step": 2270 }, { "epoch": 0.97, "grad_norm": 0.5844361084620427, "learning_rate": 3.093657046819722e-06, "loss": 0.4734, "step": 2271 }, { "epoch": 0.97, "grad_norm": 0.5574279308944012, "learning_rate": 3.0918316804519784e-06, "loss": 0.4574, "step": 2272 }, { "epoch": 0.97, "eval_loss": 0.467759370803833, "eval_runtime": 6932.3606, "eval_samples_per_second": 41.889, "eval_steps_per_second": 2.095, "step": 2272 }, { "epoch": 0.97, "grad_norm": 0.546522412221111, "learning_rate": 3.0900059797773114e-06, "loss": 0.4886, "step": 2273 }, { "epoch": 0.97, "grad_norm": 0.553571296978123, "learning_rate": 3.0881799458270005e-06, "loss": 0.4543, "step": 2274 }, { "epoch": 0.97, "grad_norm": 0.5237411938748142, "learning_rate": 3.0863535796325173e-06, "loss": 0.4556, "step": 2275 }, { "epoch": 0.97, "grad_norm": 0.544211874410188, "learning_rate": 3.0845268822255155e-06, "loss": 0.4775, "step": 2276 }, { "epoch": 0.97, "grad_norm": 0.5661742324216515, "learning_rate": 3.0826998546378385e-06, "loss": 0.4602, "step": 2277 }, { "epoch": 0.97, "grad_norm": 0.5241341056853029, "learning_rate": 3.080872497901518e-06, "loss": 0.4673, "step": 2278 }, { "epoch": 0.97, "grad_norm": 0.5483140951807146, "learning_rate": 3.079044813048768e-06, "loss": 0.4986, "step": 2279 }, { "epoch": 0.97, "grad_norm": 0.5515056093607098, "learning_rate": 3.0772168011119894e-06, "loss": 0.4472, "step": 2280 }, { "epoch": 0.97, "grad_norm": 0.5594300479805302, "learning_rate": 3.0753884631237706e-06, "loss": 0.4762, "step": 2281 }, { "epoch": 0.97, "grad_norm": 0.5326582325370033, "learning_rate": 3.073559800116879e-06, "loss": 0.4652, "step": 2282 }, { "epoch": 0.98, "grad_norm": 0.5354171305776954, "learning_rate": 3.0717308131242695e-06, "loss": 0.4585, "step": 2283 }, { "epoch": 0.98, "grad_norm": 0.5414502214146208, "learning_rate": 3.069901503179079e-06, "loss": 0.4721, "step": 2284 }, { "epoch": 0.98, "grad_norm": 0.5586551837722931, "learning_rate": 3.068071871314626e-06, "loss": 0.4283, "step": 2285 }, { "epoch": 0.98, "grad_norm": 0.5627424065729489, "learning_rate": 3.0662419185644117e-06, "loss": 0.4811, "step": 2286 }, { "epoch": 0.98, "grad_norm": 0.5553189229312547, "learning_rate": 3.0644116459621177e-06, "loss": 0.4749, "step": 2287 }, { "epoch": 0.98, "grad_norm": 0.5758016314417642, "learning_rate": 3.0625810545416066e-06, "loss": 0.4696, "step": 2288 }, { "epoch": 0.98, "grad_norm": 0.5625491392976549, "learning_rate": 3.060750145336924e-06, "loss": 0.4641, "step": 2289 }, { "epoch": 0.98, "grad_norm": 0.5325354662459477, "learning_rate": 3.0589189193822894e-06, "loss": 0.4501, "step": 2290 }, { "epoch": 0.98, "grad_norm": 0.5498616147688943, "learning_rate": 3.057087377712106e-06, "loss": 0.4697, "step": 2291 }, { "epoch": 0.98, "grad_norm": 0.5402471923474919, "learning_rate": 3.0552555213609526e-06, "loss": 0.4746, "step": 2292 }, { "epoch": 0.98, "grad_norm": 0.5742344531078118, "learning_rate": 3.0534233513635863e-06, "loss": 0.4597, "step": 2293 }, { "epoch": 0.98, "grad_norm": 0.5660060095582498, "learning_rate": 3.0515908687549427e-06, "loss": 0.4712, "step": 2294 }, { "epoch": 0.98, "grad_norm": 0.5449862197515469, "learning_rate": 3.0497580745701334e-06, "loss": 0.4658, "step": 2295 }, { "epoch": 0.98, "grad_norm": 0.5637723063866732, "learning_rate": 3.047924969844444e-06, "loss": 0.4863, "step": 2296 }, { "epoch": 0.98, "grad_norm": 0.5495824769869212, "learning_rate": 3.046091555613339e-06, "loss": 0.4703, "step": 2297 }, { "epoch": 0.98, "grad_norm": 0.5382416875131586, "learning_rate": 3.0442578329124545e-06, "loss": 0.4584, "step": 2298 }, { "epoch": 0.98, "grad_norm": 0.5463125942933166, "learning_rate": 3.042423802777602e-06, "loss": 0.4772, "step": 2299 }, { "epoch": 0.98, "grad_norm": 0.5597328954873347, "learning_rate": 3.0405894662447682e-06, "loss": 0.4639, "step": 2300 }, { "epoch": 0.98, "grad_norm": 0.5559561549207039, "learning_rate": 3.038754824350111e-06, "loss": 0.4847, "step": 2301 }, { "epoch": 0.98, "grad_norm": 0.548445944370762, "learning_rate": 3.0369198781299615e-06, "loss": 0.47, "step": 2302 }, { "epoch": 0.98, "grad_norm": 0.5292092956017875, "learning_rate": 3.0350846286208223e-06, "loss": 0.4515, "step": 2303 }, { "epoch": 0.98, "grad_norm": 0.5674894268853128, "learning_rate": 3.0332490768593676e-06, "loss": 0.4685, "step": 2304 }, { "epoch": 0.98, "grad_norm": 0.5336634810588987, "learning_rate": 3.0314132238824416e-06, "loss": 0.4581, "step": 2305 }, { "epoch": 0.99, "grad_norm": 0.5533286565854473, "learning_rate": 3.029577070727061e-06, "loss": 0.4477, "step": 2306 }, { "epoch": 0.99, "grad_norm": 0.5680828232808299, "learning_rate": 3.027740618430409e-06, "loss": 0.4584, "step": 2307 }, { "epoch": 0.99, "grad_norm": 0.5469367387429259, "learning_rate": 3.0259038680298403e-06, "loss": 0.4566, "step": 2308 }, { "epoch": 0.99, "grad_norm": 0.575161918224578, "learning_rate": 3.0240668205628757e-06, "loss": 0.4918, "step": 2309 }, { "epoch": 0.99, "grad_norm": 0.5576124636437852, "learning_rate": 3.0222294770672054e-06, "loss": 0.4358, "step": 2310 }, { "epoch": 0.99, "grad_norm": 0.5802410529155669, "learning_rate": 3.0203918385806874e-06, "loss": 0.4712, "step": 2311 }, { "epoch": 0.99, "grad_norm": 0.5305957990981474, "learning_rate": 3.018553906141343e-06, "loss": 0.4656, "step": 2312 }, { "epoch": 0.99, "grad_norm": 0.5503984759313564, "learning_rate": 3.0167156807873637e-06, "loss": 0.4711, "step": 2313 }, { "epoch": 0.99, "grad_norm": 0.5629590646395772, "learning_rate": 3.014877163557105e-06, "loss": 0.4604, "step": 2314 }, { "epoch": 0.99, "grad_norm": 0.5731070115431295, "learning_rate": 3.013038355489086e-06, "loss": 0.4543, "step": 2315 }, { "epoch": 0.99, "grad_norm": 0.5625771430390166, "learning_rate": 3.0111992576219905e-06, "loss": 0.4564, "step": 2316 }, { "epoch": 0.99, "grad_norm": 0.5811573131231536, "learning_rate": 3.009359870994668e-06, "loss": 0.4792, "step": 2317 }, { "epoch": 0.99, "grad_norm": 0.5768867352155639, "learning_rate": 3.0075201966461286e-06, "loss": 0.4802, "step": 2318 }, { "epoch": 0.99, "grad_norm": 0.5833346195049011, "learning_rate": 3.0056802356155455e-06, "loss": 0.4669, "step": 2319 }, { "epoch": 0.99, "grad_norm": 0.5506865485168906, "learning_rate": 3.0038399889422553e-06, "loss": 0.4483, "step": 2320 }, { "epoch": 0.99, "grad_norm": 0.549893343788274, "learning_rate": 3.001999457665754e-06, "loss": 0.4608, "step": 2321 }, { "epoch": 0.99, "grad_norm": 0.5742059606229357, "learning_rate": 3.0001586428257006e-06, "loss": 0.4632, "step": 2322 }, { "epoch": 0.99, "grad_norm": 0.5710353252614344, "learning_rate": 2.9983175454619114e-06, "loss": 0.4906, "step": 2323 }, { "epoch": 0.99, "grad_norm": 0.5661140686401654, "learning_rate": 2.9964761666143638e-06, "loss": 0.4691, "step": 2324 }, { "epoch": 0.99, "grad_norm": 0.5832412124730281, "learning_rate": 2.9946345073231964e-06, "loss": 0.4715, "step": 2325 }, { "epoch": 0.99, "grad_norm": 0.5322313638153089, "learning_rate": 2.9927925686287006e-06, "loss": 0.4397, "step": 2326 }, { "epoch": 0.99, "grad_norm": 0.5532684507212673, "learning_rate": 2.9909503515713324e-06, "loss": 0.4526, "step": 2327 }, { "epoch": 0.99, "grad_norm": 0.5380218677936013, "learning_rate": 2.9891078571917004e-06, "loss": 0.4627, "step": 2328 }, { "epoch": 1.0, "grad_norm": 0.575331323008352, "learning_rate": 2.987265086530571e-06, "loss": 0.4582, "step": 2329 }, { "epoch": 1.0, "grad_norm": 0.5720361448465936, "learning_rate": 2.985422040628867e-06, "loss": 0.4663, "step": 2330 }, { "epoch": 1.0, "grad_norm": 0.54998684465839, "learning_rate": 2.983578720527667e-06, "loss": 0.455, "step": 2331 }, { "epoch": 1.0, "grad_norm": 0.5552738272333081, "learning_rate": 2.981735127268202e-06, "loss": 0.4705, "step": 2332 }, { "epoch": 1.0, "grad_norm": 0.552744557893692, "learning_rate": 2.9798912618918617e-06, "loss": 0.4633, "step": 2333 }, { "epoch": 1.0, "grad_norm": 0.5445751179840383, "learning_rate": 2.9780471254401868e-06, "loss": 0.4736, "step": 2334 }, { "epoch": 1.0, "grad_norm": 0.5333392734486967, "learning_rate": 2.976202718954869e-06, "loss": 0.4777, "step": 2335 }, { "epoch": 1.0, "grad_norm": 0.572472547878964, "learning_rate": 2.9743580434777586e-06, "loss": 0.487, "step": 2336 }, { "epoch": 1.0, "grad_norm": 0.5344393742783472, "learning_rate": 2.972513100050851e-06, "loss": 0.4707, "step": 2337 }, { "epoch": 1.0, "grad_norm": 0.5630355400902859, "learning_rate": 2.970667889716298e-06, "loss": 0.4366, "step": 2338 }, { "epoch": 1.0, "grad_norm": 0.5643967515499763, "learning_rate": 2.9688224135164e-06, "loss": 0.466, "step": 2339 }, { "epoch": 1.0, "grad_norm": 0.5624350508625989, "learning_rate": 2.9669766724936074e-06, "loss": 0.4352, "step": 2340 }, { "epoch": 1.0, "grad_norm": 0.5863321023005329, "learning_rate": 2.9651306676905213e-06, "loss": 0.4963, "step": 2341 }, { "epoch": 1.0, "grad_norm": 0.5263776129880875, "learning_rate": 2.9632844001498908e-06, "loss": 0.4565, "step": 2342 }, { "epoch": 1.0, "grad_norm": 0.5610974245603717, "learning_rate": 2.9614378709146136e-06, "loss": 0.4661, "step": 2343 }, { "epoch": 1.0, "eval_loss": 0.4664944112300873, "eval_runtime": 6923.3541, "eval_samples_per_second": 41.943, "eval_steps_per_second": 2.097, "step": 2343 }, { "epoch": 1.0, "grad_norm": 0.5460765503187325, "learning_rate": 2.9595910810277367e-06, "loss": 0.4629, "step": 2344 }, { "epoch": 1.0, "grad_norm": 0.5468673482394827, "learning_rate": 2.957744031532451e-06, "loss": 0.4583, "step": 2345 }, { "epoch": 1.0, "grad_norm": 0.5470321763441149, "learning_rate": 2.9558967234720976e-06, "loss": 0.5088, "step": 2346 }, { "epoch": 1.0, "grad_norm": 0.5599348363396319, "learning_rate": 2.9540491578901625e-06, "loss": 0.4596, "step": 2347 }, { "epoch": 1.0, "grad_norm": 0.554159234404833, "learning_rate": 2.9522013358302754e-06, "loss": 0.4577, "step": 2348 }, { "epoch": 1.0, "grad_norm": 0.6005976463431216, "learning_rate": 2.9503532583362126e-06, "loss": 0.468, "step": 2349 }, { "epoch": 1.0, "grad_norm": 0.5407554418974608, "learning_rate": 2.948504926451896e-06, "loss": 0.4715, "step": 2350 }, { "epoch": 1.0, "grad_norm": 0.5744854947285752, "learning_rate": 2.9466563412213873e-06, "loss": 0.4941, "step": 2351 }, { "epoch": 1.0, "grad_norm": 0.5637411990253361, "learning_rate": 2.9448075036888944e-06, "loss": 0.4573, "step": 2352 }, { "epoch": 1.01, "grad_norm": 0.5656040322788783, "learning_rate": 2.942958414898768e-06, "loss": 0.4586, "step": 2353 }, { "epoch": 1.01, "grad_norm": 0.5735314101382244, "learning_rate": 2.941109075895499e-06, "loss": 0.4493, "step": 2354 }, { "epoch": 1.01, "grad_norm": 0.5467877193091774, "learning_rate": 2.9392594877237194e-06, "loss": 0.5006, "step": 2355 }, { "epoch": 1.01, "grad_norm": 0.5362633073872797, "learning_rate": 2.937409651428205e-06, "loss": 0.4874, "step": 2356 }, { "epoch": 1.01, "grad_norm": 0.5325968068801722, "learning_rate": 2.935559568053867e-06, "loss": 0.4914, "step": 2357 }, { "epoch": 1.01, "grad_norm": 0.5378549381823456, "learning_rate": 2.93370923864576e-06, "loss": 0.4607, "step": 2358 }, { "epoch": 1.01, "grad_norm": 0.5271154873814673, "learning_rate": 2.9318586642490766e-06, "loss": 0.44, "step": 2359 }, { "epoch": 1.01, "grad_norm": 0.5924972443494668, "learning_rate": 2.930007845909146e-06, "loss": 0.5007, "step": 2360 }, { "epoch": 1.01, "grad_norm": 0.5759884987421106, "learning_rate": 2.9281567846714393e-06, "loss": 0.4847, "step": 2361 }, { "epoch": 1.01, "grad_norm": 0.5452440313358291, "learning_rate": 2.92630548158156e-06, "loss": 0.4728, "step": 2362 }, { "epoch": 1.01, "grad_norm": 0.5491210743756382, "learning_rate": 2.924453937685251e-06, "loss": 0.4814, "step": 2363 }, { "epoch": 1.01, "grad_norm": 0.5497417413168237, "learning_rate": 2.9226021540283914e-06, "loss": 0.479, "step": 2364 }, { "epoch": 1.01, "grad_norm": 0.5440302176322239, "learning_rate": 2.9207501316569936e-06, "loss": 0.469, "step": 2365 }, { "epoch": 1.01, "grad_norm": 0.5307306470354942, "learning_rate": 2.918897871617207e-06, "loss": 0.4494, "step": 2366 }, { "epoch": 1.01, "grad_norm": 0.5533694018060741, "learning_rate": 2.9170453749553158e-06, "loss": 0.4528, "step": 2367 }, { "epoch": 1.01, "grad_norm": 0.5821093073073446, "learning_rate": 2.9151926427177345e-06, "loss": 0.4847, "step": 2368 }, { "epoch": 1.01, "grad_norm": 0.5810561155735557, "learning_rate": 2.913339675951014e-06, "loss": 0.4577, "step": 2369 }, { "epoch": 1.01, "grad_norm": 0.5637656659230893, "learning_rate": 2.911486475701835e-06, "loss": 0.4705, "step": 2370 }, { "epoch": 1.01, "grad_norm": 0.5646507829525883, "learning_rate": 2.909633043017013e-06, "loss": 0.4655, "step": 2371 }, { "epoch": 1.01, "grad_norm": 0.5560085235238604, "learning_rate": 2.9077793789434925e-06, "loss": 0.4645, "step": 2372 }, { "epoch": 1.01, "grad_norm": 0.5804591657223265, "learning_rate": 2.905925484528349e-06, "loss": 0.4473, "step": 2373 }, { "epoch": 1.0, "grad_norm": 0.5183741960616122, "learning_rate": 2.9040713608187896e-06, "loss": 0.4561, "step": 2374 }, { "epoch": 1.0, "grad_norm": 0.6496654700785708, "learning_rate": 2.9022170088621497e-06, "loss": 0.4481, "step": 2375 }, { "epoch": 1.0, "grad_norm": 0.6258547954302262, "learning_rate": 2.900362429705893e-06, "loss": 0.4322, "step": 2376 }, { "epoch": 1.0, "grad_norm": 0.7675149067928225, "learning_rate": 2.8985076243976133e-06, "loss": 0.3944, "step": 2377 }, { "epoch": 1.0, "grad_norm": 0.6600899912780784, "learning_rate": 2.896652593985031e-06, "loss": 0.4245, "step": 2378 }, { "epoch": 1.0, "grad_norm": 0.7306824483448988, "learning_rate": 2.8947973395159934e-06, "loss": 0.4426, "step": 2379 }, { "epoch": 1.0, "grad_norm": 0.6516484002884269, "learning_rate": 2.892941862038475e-06, "loss": 0.4297, "step": 2380 }, { "epoch": 1.0, "grad_norm": 0.5912395370707995, "learning_rate": 2.8910861626005774e-06, "loss": 0.4273, "step": 2381 }, { "epoch": 1.0, "grad_norm": 0.6132721088402558, "learning_rate": 2.889230242250525e-06, "loss": 0.3981, "step": 2382 }, { "epoch": 1.0, "grad_norm": 0.6491777150140556, "learning_rate": 2.887374102036668e-06, "loss": 0.4167, "step": 2383 }, { "epoch": 1.0, "grad_norm": 0.6143918069720361, "learning_rate": 2.8855177430074817e-06, "loss": 0.4215, "step": 2384 }, { "epoch": 1.0, "grad_norm": 0.5796986965059269, "learning_rate": 2.883661166211564e-06, "loss": 0.4328, "step": 2385 }, { "epoch": 1.01, "grad_norm": 0.6385596001339221, "learning_rate": 2.881804372697637e-06, "loss": 0.4189, "step": 2386 }, { "epoch": 1.01, "grad_norm": 0.6774158174574826, "learning_rate": 2.879947363514543e-06, "loss": 0.4257, "step": 2387 }, { "epoch": 1.01, "grad_norm": 0.6439324568387629, "learning_rate": 2.878090139711249e-06, "loss": 0.4203, "step": 2388 }, { "epoch": 1.01, "grad_norm": 0.5928966275845924, "learning_rate": 2.8762327023368408e-06, "loss": 0.4118, "step": 2389 }, { "epoch": 1.01, "grad_norm": 0.6017304771309222, "learning_rate": 2.8743750524405254e-06, "loss": 0.4698, "step": 2390 }, { "epoch": 1.01, "grad_norm": 0.5709073058681067, "learning_rate": 2.872517191071631e-06, "loss": 0.4357, "step": 2391 }, { "epoch": 1.01, "grad_norm": 0.5680125069824045, "learning_rate": 2.870659119279605e-06, "loss": 0.4331, "step": 2392 }, { "epoch": 1.01, "grad_norm": 0.5694888697631014, "learning_rate": 2.8688008381140126e-06, "loss": 0.4225, "step": 2393 }, { "epoch": 1.01, "grad_norm": 0.533441546679153, "learning_rate": 2.866942348624538e-06, "loss": 0.4106, "step": 2394 }, { "epoch": 1.01, "grad_norm": 0.6115603135836928, "learning_rate": 2.8650836518609814e-06, "loss": 0.4167, "step": 2395 }, { "epoch": 1.01, "grad_norm": 0.6206156844939555, "learning_rate": 2.863224748873264e-06, "loss": 0.4153, "step": 2396 }, { "epoch": 1.01, "grad_norm": 0.5748859460500882, "learning_rate": 2.8613656407114197e-06, "loss": 0.4134, "step": 2397 }, { "epoch": 1.01, "grad_norm": 0.5420087579510725, "learning_rate": 2.8595063284255997e-06, "loss": 0.42, "step": 2398 }, { "epoch": 1.01, "grad_norm": 0.6393458020851205, "learning_rate": 2.85764681306607e-06, "loss": 0.4448, "step": 2399 }, { "epoch": 1.01, "grad_norm": 0.5570280986330635, "learning_rate": 2.8557870956832135e-06, "loss": 0.4192, "step": 2400 }, { "epoch": 1.01, "grad_norm": 0.5742816886506813, "learning_rate": 2.853927177327524e-06, "loss": 0.4063, "step": 2401 }, { "epoch": 1.01, "grad_norm": 0.5631462441989407, "learning_rate": 2.85206705904961e-06, "loss": 0.4187, "step": 2402 }, { "epoch": 1.01, "grad_norm": 0.5759070659432811, "learning_rate": 2.850206741900195e-06, "loss": 0.4294, "step": 2403 }, { "epoch": 1.01, "grad_norm": 0.5954714993206741, "learning_rate": 2.8483462269301117e-06, "loss": 0.4414, "step": 2404 }, { "epoch": 1.01, "grad_norm": 0.5847923954668218, "learning_rate": 2.8464855151903065e-06, "loss": 0.4153, "step": 2405 }, { "epoch": 1.01, "grad_norm": 0.6128854329193607, "learning_rate": 2.844624607731836e-06, "loss": 0.4528, "step": 2406 }, { "epoch": 1.01, "grad_norm": 0.5703811176973029, "learning_rate": 2.842763505605867e-06, "loss": 0.4222, "step": 2407 }, { "epoch": 1.01, "grad_norm": 0.5552066352368414, "learning_rate": 2.8409022098636797e-06, "loss": 0.436, "step": 2408 }, { "epoch": 1.02, "grad_norm": 0.6157208158352602, "learning_rate": 2.839040721556658e-06, "loss": 0.4302, "step": 2409 }, { "epoch": 1.02, "grad_norm": 0.5647862801761142, "learning_rate": 2.837179041736299e-06, "loss": 0.4083, "step": 2410 }, { "epoch": 1.02, "grad_norm": 0.5547355674329157, "learning_rate": 2.835317171454206e-06, "loss": 0.4162, "step": 2411 }, { "epoch": 1.02, "grad_norm": 0.5766014610774604, "learning_rate": 2.8334551117620908e-06, "loss": 0.4226, "step": 2412 }, { "epoch": 1.02, "grad_norm": 0.6015597806488204, "learning_rate": 2.8315928637117713e-06, "loss": 0.4291, "step": 2413 }, { "epoch": 1.02, "grad_norm": 0.5788471843088463, "learning_rate": 2.829730428355173e-06, "loss": 0.4251, "step": 2414 }, { "epoch": 1.02, "eval_loss": 0.4677947163581848, "eval_runtime": 6939.1205, "eval_samples_per_second": 41.848, "eval_steps_per_second": 2.092, "step": 2414 }, { "epoch": 1.02, "grad_norm": 0.586414114042501, "learning_rate": 2.8278678067443255e-06, "loss": 0.4145, "step": 2415 }, { "epoch": 1.02, "grad_norm": 0.5691685054792884, "learning_rate": 2.826004999931365e-06, "loss": 0.4376, "step": 2416 }, { "epoch": 1.02, "grad_norm": 0.5972775147112437, "learning_rate": 2.8241420089685327e-06, "loss": 0.4306, "step": 2417 }, { "epoch": 1.02, "grad_norm": 0.5620380464963803, "learning_rate": 2.8222788349081724e-06, "loss": 0.4096, "step": 2418 }, { "epoch": 1.02, "grad_norm": 0.5784135735472834, "learning_rate": 2.820415478802733e-06, "loss": 0.4322, "step": 2419 }, { "epoch": 1.02, "grad_norm": 0.5885392171954246, "learning_rate": 2.8185519417047624e-06, "loss": 0.4039, "step": 2420 }, { "epoch": 1.02, "grad_norm": 0.5536074618340685, "learning_rate": 2.8166882246669158e-06, "loss": 0.4291, "step": 2421 }, { "epoch": 1.02, "grad_norm": 0.581909431594722, "learning_rate": 2.814824328741948e-06, "loss": 0.4144, "step": 2422 }, { "epoch": 1.02, "grad_norm": 0.5675861206607421, "learning_rate": 2.8129602549827133e-06, "loss": 0.419, "step": 2423 }, { "epoch": 1.02, "grad_norm": 0.5632432133699884, "learning_rate": 2.811096004442168e-06, "loss": 0.4252, "step": 2424 }, { "epoch": 1.02, "grad_norm": 0.5650569070218456, "learning_rate": 2.80923157817337e-06, "loss": 0.4136, "step": 2425 }, { "epoch": 1.02, "grad_norm": 0.572161063598354, "learning_rate": 2.8073669772294714e-06, "loss": 0.4208, "step": 2426 }, { "epoch": 1.02, "grad_norm": 0.5719901179312831, "learning_rate": 2.805502202663728e-06, "loss": 0.4375, "step": 2427 }, { "epoch": 1.02, "grad_norm": 0.5612496619770833, "learning_rate": 2.8036372555294916e-06, "loss": 0.4047, "step": 2428 }, { "epoch": 1.02, "grad_norm": 0.5900526811179282, "learning_rate": 2.8017721368802105e-06, "loss": 0.4295, "step": 2429 }, { "epoch": 1.02, "grad_norm": 0.5789988982356299, "learning_rate": 2.799906847769433e-06, "loss": 0.4138, "step": 2430 }, { "epoch": 1.02, "grad_norm": 0.5438179046584847, "learning_rate": 2.7980413892507995e-06, "loss": 0.408, "step": 2431 }, { "epoch": 1.02, "grad_norm": 0.5857502120912527, "learning_rate": 2.79617576237805e-06, "loss": 0.406, "step": 2432 }, { "epoch": 1.03, "grad_norm": 0.5798751246080152, "learning_rate": 2.7943099682050174e-06, "loss": 0.4173, "step": 2433 }, { "epoch": 1.03, "grad_norm": 0.5830921819980437, "learning_rate": 2.7924440077856284e-06, "loss": 0.4353, "step": 2434 }, { "epoch": 1.03, "grad_norm": 0.5550145268591252, "learning_rate": 2.790577882173906e-06, "loss": 0.4033, "step": 2435 }, { "epoch": 1.03, "grad_norm": 0.5665020917284002, "learning_rate": 2.788711592423966e-06, "loss": 0.4282, "step": 2436 }, { "epoch": 1.03, "grad_norm": 0.5619717205524801, "learning_rate": 2.786845139590014e-06, "loss": 0.4161, "step": 2437 }, { "epoch": 1.03, "grad_norm": 0.5647215964187846, "learning_rate": 2.7849785247263515e-06, "loss": 0.419, "step": 2438 }, { "epoch": 1.03, "grad_norm": 0.6047710310723298, "learning_rate": 2.7831117488873703e-06, "loss": 0.4489, "step": 2439 }, { "epoch": 1.03, "grad_norm": 0.5729169302378637, "learning_rate": 2.781244813127552e-06, "loss": 0.4258, "step": 2440 }, { "epoch": 1.03, "grad_norm": 0.5726895367017322, "learning_rate": 2.779377718501469e-06, "loss": 0.4163, "step": 2441 }, { "epoch": 1.03, "grad_norm": 0.5898478621812457, "learning_rate": 2.7775104660637847e-06, "loss": 0.4383, "step": 2442 }, { "epoch": 1.03, "grad_norm": 0.5776286383338888, "learning_rate": 2.77564305686925e-06, "loss": 0.4172, "step": 2443 }, { "epoch": 1.03, "grad_norm": 0.5708128845728648, "learning_rate": 2.7737754919727057e-06, "loss": 0.433, "step": 2444 }, { "epoch": 1.03, "grad_norm": 0.5838647193488192, "learning_rate": 2.7719077724290793e-06, "loss": 0.4319, "step": 2445 }, { "epoch": 1.03, "grad_norm": 0.5839908538746356, "learning_rate": 2.7700398992933865e-06, "loss": 0.3981, "step": 2446 }, { "epoch": 1.03, "grad_norm": 0.5793472385138957, "learning_rate": 2.76817187362073e-06, "loss": 0.4075, "step": 2447 }, { "epoch": 1.03, "grad_norm": 0.5727590490167658, "learning_rate": 2.7663036964662967e-06, "loss": 0.4312, "step": 2448 }, { "epoch": 1.03, "grad_norm": 0.5920736043473027, "learning_rate": 2.764435368885362e-06, "loss": 0.4138, "step": 2449 }, { "epoch": 1.03, "grad_norm": 0.5723166701159088, "learning_rate": 2.762566891933285e-06, "loss": 0.4073, "step": 2450 }, { "epoch": 1.03, "grad_norm": 0.5624668781788753, "learning_rate": 2.7606982666655074e-06, "loss": 0.429, "step": 2451 }, { "epoch": 1.03, "grad_norm": 0.5704524501724009, "learning_rate": 2.758829494137557e-06, "loss": 0.4196, "step": 2452 }, { "epoch": 1.03, "grad_norm": 0.5700116449229535, "learning_rate": 2.7569605754050455e-06, "loss": 0.4317, "step": 2453 }, { "epoch": 1.03, "grad_norm": 0.599236712281217, "learning_rate": 2.7550915115236636e-06, "loss": 0.4087, "step": 2454 }, { "epoch": 1.03, "grad_norm": 0.5731708478648513, "learning_rate": 2.7532223035491877e-06, "loss": 0.4149, "step": 2455 }, { "epoch": 1.04, "grad_norm": 0.5748213937213151, "learning_rate": 2.7513529525374725e-06, "loss": 0.4041, "step": 2456 }, { "epoch": 1.04, "grad_norm": 0.5736837321137316, "learning_rate": 2.749483459544457e-06, "loss": 0.4282, "step": 2457 }, { "epoch": 1.04, "grad_norm": 0.5637414256487769, "learning_rate": 2.7476138256261575e-06, "loss": 0.3905, "step": 2458 }, { "epoch": 1.04, "grad_norm": 0.6013990347615658, "learning_rate": 2.74574405183867e-06, "loss": 0.4183, "step": 2459 }, { "epoch": 1.04, "grad_norm": 0.5581978111233568, "learning_rate": 2.743874139238171e-06, "loss": 0.4325, "step": 2460 }, { "epoch": 1.04, "grad_norm": 0.5658199630731812, "learning_rate": 2.7420040888809153e-06, "loss": 0.4035, "step": 2461 }, { "epoch": 1.04, "grad_norm": 0.5691729761752431, "learning_rate": 2.740133901823234e-06, "loss": 0.4171, "step": 2462 }, { "epoch": 1.04, "grad_norm": 0.5571943454995789, "learning_rate": 2.7382635791215368e-06, "loss": 0.4092, "step": 2463 }, { "epoch": 1.04, "grad_norm": 0.5584570649456075, "learning_rate": 2.7363931218323103e-06, "loss": 0.4192, "step": 2464 }, { "epoch": 1.04, "grad_norm": 0.579128238271553, "learning_rate": 2.7345225310121155e-06, "loss": 0.4404, "step": 2465 }, { "epoch": 1.04, "grad_norm": 0.5929460640663197, "learning_rate": 2.7326518077175897e-06, "loss": 0.4197, "step": 2466 }, { "epoch": 1.04, "grad_norm": 0.5689782317669538, "learning_rate": 2.7307809530054456e-06, "loss": 0.4316, "step": 2467 }, { "epoch": 1.04, "grad_norm": 0.5854607172774103, "learning_rate": 2.7289099679324686e-06, "loss": 0.4431, "step": 2468 }, { "epoch": 1.04, "grad_norm": 0.5694064345867531, "learning_rate": 2.7270388535555207e-06, "loss": 0.4229, "step": 2469 }, { "epoch": 1.04, "grad_norm": 0.5694897278258253, "learning_rate": 2.725167610931534e-06, "loss": 0.4122, "step": 2470 }, { "epoch": 1.04, "grad_norm": 0.5799667414945328, "learning_rate": 2.7232962411175128e-06, "loss": 0.4227, "step": 2471 }, { "epoch": 1.04, "grad_norm": 0.5880421037268969, "learning_rate": 2.721424745170537e-06, "loss": 0.4239, "step": 2472 }, { "epoch": 1.04, "grad_norm": 0.5645195680029945, "learning_rate": 2.719553124147753e-06, "loss": 0.4131, "step": 2473 }, { "epoch": 1.04, "grad_norm": 0.5549310632780857, "learning_rate": 2.717681379106381e-06, "loss": 0.4221, "step": 2474 }, { "epoch": 1.04, "grad_norm": 0.5724363209257333, "learning_rate": 2.715809511103711e-06, "loss": 0.4331, "step": 2475 }, { "epoch": 1.04, "grad_norm": 0.5646087777604074, "learning_rate": 2.7139375211971e-06, "loss": 0.4285, "step": 2476 }, { "epoch": 1.04, "grad_norm": 0.5589589680565078, "learning_rate": 2.712065410443977e-06, "loss": 0.4225, "step": 2477 }, { "epoch": 1.04, "grad_norm": 0.5975042188527351, "learning_rate": 2.710193179901838e-06, "loss": 0.4475, "step": 2478 }, { "epoch": 1.04, "grad_norm": 0.5632223523751451, "learning_rate": 2.7083208306282455e-06, "loss": 0.4195, "step": 2479 }, { "epoch": 1.05, "grad_norm": 0.6271794519728584, "learning_rate": 2.7064483636808314e-06, "loss": 0.4383, "step": 2480 }, { "epoch": 1.05, "grad_norm": 0.6193844438411981, "learning_rate": 2.7045757801172918e-06, "loss": 0.4405, "step": 2481 }, { "epoch": 1.05, "grad_norm": 0.5762357913780729, "learning_rate": 2.70270308099539e-06, "loss": 0.4256, "step": 2482 }, { "epoch": 1.05, "grad_norm": 0.5686615359961465, "learning_rate": 2.7008302673729556e-06, "loss": 0.4228, "step": 2483 }, { "epoch": 1.05, "grad_norm": 0.5846035613411025, "learning_rate": 2.6989573403078793e-06, "loss": 0.4424, "step": 2484 }, { "epoch": 1.05, "grad_norm": 0.57106132811842, "learning_rate": 2.69708430085812e-06, "loss": 0.4249, "step": 2485 }, { "epoch": 1.05, "eval_loss": 0.4666215181350708, "eval_runtime": 6941.4114, "eval_samples_per_second": 41.834, "eval_steps_per_second": 2.092, "step": 2485 }, { "epoch": 1.05, "grad_norm": 0.5599313713923679, "learning_rate": 2.6952111500816972e-06, "loss": 0.4247, "step": 2486 }, { "epoch": 1.05, "grad_norm": 0.5633818405700421, "learning_rate": 2.6933378890366945e-06, "loss": 0.4292, "step": 2487 }, { "epoch": 1.05, "grad_norm": 0.5687785418378354, "learning_rate": 2.6914645187812573e-06, "loss": 0.4238, "step": 2488 }, { "epoch": 1.05, "grad_norm": 0.5882193651407125, "learning_rate": 2.6895910403735938e-06, "loss": 0.4105, "step": 2489 }, { "epoch": 1.05, "grad_norm": 0.5745904096533959, "learning_rate": 2.687717454871971e-06, "loss": 0.4307, "step": 2490 }, { "epoch": 1.05, "grad_norm": 0.5600088359405079, "learning_rate": 2.6858437633347197e-06, "loss": 0.4149, "step": 2491 }, { "epoch": 1.05, "grad_norm": 0.5900737321936005, "learning_rate": 2.6839699668202275e-06, "loss": 0.4426, "step": 2492 }, { "epoch": 1.05, "grad_norm": 0.5993093534443086, "learning_rate": 2.682096066386943e-06, "loss": 0.4079, "step": 2493 }, { "epoch": 1.05, "grad_norm": 0.573184519185299, "learning_rate": 2.680222063093372e-06, "loss": 0.4052, "step": 2494 }, { "epoch": 1.05, "grad_norm": 0.5618782823660212, "learning_rate": 2.678347957998081e-06, "loss": 0.4364, "step": 2495 }, { "epoch": 1.05, "grad_norm": 0.584859311676964, "learning_rate": 2.6764737521596917e-06, "loss": 0.4052, "step": 2496 }, { "epoch": 1.05, "grad_norm": 0.5477234902368038, "learning_rate": 2.6745994466368846e-06, "loss": 0.4302, "step": 2497 }, { "epoch": 1.05, "grad_norm": 0.6063996852367219, "learning_rate": 2.672725042488393e-06, "loss": 0.4181, "step": 2498 }, { "epoch": 1.05, "grad_norm": 0.5760512448701784, "learning_rate": 2.6708505407730106e-06, "loss": 0.4134, "step": 2499 }, { "epoch": 1.05, "grad_norm": 0.5519722703829206, "learning_rate": 2.6689759425495833e-06, "loss": 0.4222, "step": 2500 }, { "epoch": 1.05, "grad_norm": 0.5683672052752755, "learning_rate": 2.6671012488770104e-06, "loss": 0.4268, "step": 2501 }, { "epoch": 1.05, "grad_norm": 0.5877420100279226, "learning_rate": 2.6652264608142487e-06, "loss": 0.4178, "step": 2502 }, { "epoch": 1.06, "grad_norm": 0.607133857685082, "learning_rate": 2.663351579420307e-06, "loss": 0.417, "step": 2503 }, { "epoch": 1.06, "grad_norm": 0.5772639249289018, "learning_rate": 2.661476605754244e-06, "loss": 0.4377, "step": 2504 }, { "epoch": 1.06, "grad_norm": 0.6163851599426933, "learning_rate": 2.659601540875174e-06, "loss": 0.4059, "step": 2505 }, { "epoch": 1.06, "grad_norm": 0.6150547368429744, "learning_rate": 2.6577263858422623e-06, "loss": 0.4473, "step": 2506 }, { "epoch": 1.06, "grad_norm": 0.6277681834624624, "learning_rate": 2.6558511417147225e-06, "loss": 0.4312, "step": 2507 }, { "epoch": 1.06, "grad_norm": 0.5576110114818135, "learning_rate": 2.653975809551823e-06, "loss": 0.415, "step": 2508 }, { "epoch": 1.06, "grad_norm": 0.5603219178351719, "learning_rate": 2.6521003904128772e-06, "loss": 0.4021, "step": 2509 }, { "epoch": 1.06, "grad_norm": 0.5842308669890593, "learning_rate": 2.650224885357251e-06, "loss": 0.4377, "step": 2510 }, { "epoch": 1.06, "grad_norm": 0.6027928815424282, "learning_rate": 2.648349295444358e-06, "loss": 0.4326, "step": 2511 }, { "epoch": 1.06, "grad_norm": 0.6056130362871082, "learning_rate": 2.646473621733658e-06, "loss": 0.4253, "step": 2512 }, { "epoch": 1.06, "grad_norm": 0.6243077090025687, "learning_rate": 2.6445978652846605e-06, "loss": 0.4478, "step": 2513 }, { "epoch": 1.06, "grad_norm": 0.6116645505267284, "learning_rate": 2.6427220271569206e-06, "loss": 0.4576, "step": 2514 }, { "epoch": 1.06, "grad_norm": 0.5491346480936342, "learning_rate": 2.640846108410039e-06, "loss": 0.3972, "step": 2515 }, { "epoch": 1.06, "grad_norm": 0.5681342786794281, "learning_rate": 2.6389701101036635e-06, "loss": 0.4351, "step": 2516 }, { "epoch": 1.06, "grad_norm": 0.5866245904284294, "learning_rate": 2.6370940332974864e-06, "loss": 0.4201, "step": 2517 }, { "epoch": 1.06, "grad_norm": 0.5501785457503201, "learning_rate": 2.6352178790512425e-06, "loss": 0.3985, "step": 2518 }, { "epoch": 1.06, "grad_norm": 0.5783704626903385, "learning_rate": 2.6333416484247126e-06, "loss": 0.4582, "step": 2519 }, { "epoch": 1.06, "grad_norm": 0.5601581473403204, "learning_rate": 2.6314653424777194e-06, "loss": 0.4502, "step": 2520 }, { "epoch": 1.06, "grad_norm": 0.5776661184654673, "learning_rate": 2.6295889622701287e-06, "loss": 0.3946, "step": 2521 }, { "epoch": 1.06, "grad_norm": 0.6070561840129438, "learning_rate": 2.6277125088618496e-06, "loss": 0.4253, "step": 2522 }, { "epoch": 1.06, "grad_norm": 0.5860572342161754, "learning_rate": 2.6258359833128284e-06, "loss": 0.4134, "step": 2523 }, { "epoch": 1.06, "grad_norm": 0.5814356096505094, "learning_rate": 2.623959386683056e-06, "loss": 0.4144, "step": 2524 }, { "epoch": 1.06, "grad_norm": 0.5645239452943359, "learning_rate": 2.6220827200325628e-06, "loss": 0.4072, "step": 2525 }, { "epoch": 1.07, "grad_norm": 0.5555072889719695, "learning_rate": 2.620205984421418e-06, "loss": 0.4115, "step": 2526 }, { "epoch": 1.07, "grad_norm": 0.5857812167489317, "learning_rate": 2.618329180909728e-06, "loss": 0.3889, "step": 2527 }, { "epoch": 1.07, "grad_norm": 0.5540712238808715, "learning_rate": 2.6164523105576436e-06, "loss": 0.4143, "step": 2528 }, { "epoch": 1.07, "grad_norm": 0.5726575598093935, "learning_rate": 2.614575374425345e-06, "loss": 0.4062, "step": 2529 }, { "epoch": 1.07, "grad_norm": 0.580473478944349, "learning_rate": 2.612698373573056e-06, "loss": 0.4155, "step": 2530 }, { "epoch": 1.07, "grad_norm": 0.5921217133427389, "learning_rate": 2.6108213090610352e-06, "loss": 0.4462, "step": 2531 }, { "epoch": 1.07, "grad_norm": 0.5801087342197929, "learning_rate": 2.608944181949575e-06, "loss": 0.4099, "step": 2532 }, { "epoch": 1.07, "grad_norm": 0.5749901636296175, "learning_rate": 2.607066993299007e-06, "loss": 0.4093, "step": 2533 }, { "epoch": 1.07, "grad_norm": 0.6164831721085331, "learning_rate": 2.6051897441696926e-06, "loss": 0.4283, "step": 2534 }, { "epoch": 1.07, "grad_norm": 0.5874464536341129, "learning_rate": 2.603312435622033e-06, "loss": 0.4446, "step": 2535 }, { "epoch": 1.07, "grad_norm": 0.5662147896859149, "learning_rate": 2.6014350687164598e-06, "loss": 0.4194, "step": 2536 }, { "epoch": 1.07, "grad_norm": 0.5495971494931703, "learning_rate": 2.5995576445134364e-06, "loss": 0.4057, "step": 2537 }, { "epoch": 1.07, "grad_norm": 0.5722070713750189, "learning_rate": 2.5976801640734604e-06, "loss": 0.4355, "step": 2538 }, { "epoch": 1.07, "grad_norm": 0.5594258522910831, "learning_rate": 2.595802628457063e-06, "loss": 0.4238, "step": 2539 }, { "epoch": 1.07, "grad_norm": 0.5700766032916892, "learning_rate": 2.593925038724802e-06, "loss": 0.4286, "step": 2540 }, { "epoch": 1.07, "grad_norm": 0.5778449462446359, "learning_rate": 2.5920473959372695e-06, "loss": 0.4337, "step": 2541 }, { "epoch": 1.07, "grad_norm": 0.584060965567766, "learning_rate": 2.5901697011550857e-06, "loss": 0.4259, "step": 2542 }, { "epoch": 1.07, "grad_norm": 0.563752954121171, "learning_rate": 2.5882919554389007e-06, "loss": 0.4309, "step": 2543 }, { "epoch": 1.07, "grad_norm": 0.5899104452933682, "learning_rate": 2.586414159849394e-06, "loss": 0.4461, "step": 2544 }, { "epoch": 1.07, "grad_norm": 0.5820330641642761, "learning_rate": 2.5845363154472725e-06, "loss": 0.4301, "step": 2545 }, { "epoch": 1.07, "grad_norm": 0.5446163156937063, "learning_rate": 2.5826584232932707e-06, "loss": 0.3957, "step": 2546 }, { "epoch": 1.07, "grad_norm": 0.5945376021090512, "learning_rate": 2.5807804844481506e-06, "loss": 0.4091, "step": 2547 }, { "epoch": 1.07, "grad_norm": 0.5638002168999431, "learning_rate": 2.5789024999727e-06, "loss": 0.414, "step": 2548 }, { "epoch": 1.07, "grad_norm": 0.5951090126233547, "learning_rate": 2.577024470927732e-06, "loss": 0.4237, "step": 2549 }, { "epoch": 1.08, "grad_norm": 0.5684255566290534, "learning_rate": 2.575146398374087e-06, "loss": 0.4431, "step": 2550 }, { "epoch": 1.08, "grad_norm": 0.5595853495309726, "learning_rate": 2.5732682833726274e-06, "loss": 0.4221, "step": 2551 }, { "epoch": 1.08, "grad_norm": 0.5742831043384634, "learning_rate": 2.5713901269842405e-06, "loss": 0.4285, "step": 2552 }, { "epoch": 1.08, "grad_norm": 0.5640089992024232, "learning_rate": 2.569511930269839e-06, "loss": 0.4196, "step": 2553 }, { "epoch": 1.08, "grad_norm": 0.5572383872005084, "learning_rate": 2.5676336942903547e-06, "loss": 0.4015, "step": 2554 }, { "epoch": 1.08, "grad_norm": 0.6180725350224047, "learning_rate": 2.565755420106744e-06, "loss": 0.4346, "step": 2555 }, { "epoch": 1.08, "grad_norm": 0.586931012696472, "learning_rate": 2.5638771087799857e-06, "loss": 0.4377, "step": 2556 }, { "epoch": 1.08, "eval_loss": 0.46661555767059326, "eval_runtime": 6940.8186, "eval_samples_per_second": 41.838, "eval_steps_per_second": 2.092, "step": 2556 }, { "epoch": 1.08, "grad_norm": 0.60890234512916, "learning_rate": 2.5619987613710757e-06, "loss": 0.4203, "step": 2557 }, { "epoch": 1.08, "grad_norm": 0.6130809794076856, "learning_rate": 2.5601203789410344e-06, "loss": 0.3926, "step": 2558 }, { "epoch": 1.08, "grad_norm": 0.5758560877978278, "learning_rate": 2.5582419625509004e-06, "loss": 0.4193, "step": 2559 }, { "epoch": 1.08, "grad_norm": 0.5611835821629346, "learning_rate": 2.5563635132617305e-06, "loss": 0.4106, "step": 2560 }, { "epoch": 1.08, "grad_norm": 0.5500032992402379, "learning_rate": 2.5544850321346026e-06, "loss": 0.4103, "step": 2561 }, { "epoch": 1.08, "grad_norm": 0.5991303836828948, "learning_rate": 2.55260652023061e-06, "loss": 0.4176, "step": 2562 }, { "epoch": 1.08, "grad_norm": 0.5495267980558447, "learning_rate": 2.550727978610864e-06, "loss": 0.412, "step": 2563 }, { "epoch": 1.08, "grad_norm": 0.5903181299884372, "learning_rate": 2.5488494083364946e-06, "loss": 0.4186, "step": 2564 }, { "epoch": 1.08, "grad_norm": 0.5775725024291222, "learning_rate": 2.5469708104686452e-06, "loss": 0.4017, "step": 2565 }, { "epoch": 1.08, "grad_norm": 0.5794156104604055, "learning_rate": 2.5450921860684765e-06, "loss": 0.3933, "step": 2566 }, { "epoch": 1.08, "grad_norm": 0.5929127352748987, "learning_rate": 2.543213536197164e-06, "loss": 0.4474, "step": 2567 }, { "epoch": 1.08, "grad_norm": 0.6307527970701059, "learning_rate": 2.5413348619158966e-06, "loss": 0.4367, "step": 2568 }, { "epoch": 1.08, "grad_norm": 0.5792027225417186, "learning_rate": 2.5394561642858785e-06, "loss": 0.4371, "step": 2569 }, { "epoch": 1.08, "grad_norm": 0.5781330553234431, "learning_rate": 2.5375774443683263e-06, "loss": 0.416, "step": 2570 }, { "epoch": 1.08, "grad_norm": 0.574771297321989, "learning_rate": 2.5356987032244686e-06, "loss": 0.4075, "step": 2571 }, { "epoch": 1.08, "grad_norm": 0.5558908214262773, "learning_rate": 2.5338199419155473e-06, "loss": 0.41, "step": 2572 }, { "epoch": 1.09, "grad_norm": 0.570022965299663, "learning_rate": 2.5319411615028144e-06, "loss": 0.4197, "step": 2573 }, { "epoch": 1.09, "grad_norm": 0.5953309626927854, "learning_rate": 2.530062363047534e-06, "loss": 0.4489, "step": 2574 }, { "epoch": 1.09, "grad_norm": 0.5678767865025237, "learning_rate": 2.5281835476109796e-06, "loss": 0.4328, "step": 2575 }, { "epoch": 1.09, "grad_norm": 0.5830901884956002, "learning_rate": 2.5263047162544335e-06, "loss": 0.4117, "step": 2576 }, { "epoch": 1.09, "grad_norm": 0.565926311345139, "learning_rate": 2.5244258700391888e-06, "loss": 0.4232, "step": 2577 }, { "epoch": 1.09, "grad_norm": 0.5813850996625575, "learning_rate": 2.522547010026546e-06, "loss": 0.4351, "step": 2578 }, { "epoch": 1.09, "grad_norm": 0.5766636305766427, "learning_rate": 2.5206681372778126e-06, "loss": 0.4281, "step": 2579 }, { "epoch": 1.09, "grad_norm": 0.547136463994455, "learning_rate": 2.518789252854305e-06, "loss": 0.399, "step": 2580 }, { "epoch": 1.09, "grad_norm": 0.5673733087045142, "learning_rate": 2.5169103578173455e-06, "loss": 0.4429, "step": 2581 }, { "epoch": 1.09, "grad_norm": 0.577885776445999, "learning_rate": 2.5150314532282615e-06, "loss": 0.4373, "step": 2582 }, { "epoch": 1.09, "grad_norm": 0.5681163890434137, "learning_rate": 2.5131525401483863e-06, "loss": 0.4314, "step": 2583 }, { "epoch": 1.09, "grad_norm": 0.5914855836747088, "learning_rate": 2.51127361963906e-06, "loss": 0.4156, "step": 2584 }, { "epoch": 1.09, "grad_norm": 0.5695378199879212, "learning_rate": 2.5093946927616227e-06, "loss": 0.4137, "step": 2585 }, { "epoch": 1.09, "grad_norm": 0.615520565415579, "learning_rate": 2.507515760577423e-06, "loss": 0.4202, "step": 2586 }, { "epoch": 1.09, "grad_norm": 0.5962868782142557, "learning_rate": 2.505636824147808e-06, "loss": 0.4433, "step": 2587 }, { "epoch": 1.09, "grad_norm": 0.5642173652784211, "learning_rate": 2.50375788453413e-06, "loss": 0.4296, "step": 2588 }, { "epoch": 1.09, "grad_norm": 0.5643198205173939, "learning_rate": 2.501878942797743e-06, "loss": 0.4163, "step": 2589 }, { "epoch": 1.09, "grad_norm": 0.5730693457148456, "learning_rate": 2.5e-06, "loss": 0.4184, "step": 2590 }, { "epoch": 1.09, "grad_norm": 0.5648030559058376, "learning_rate": 2.498121057202258e-06, "loss": 0.4286, "step": 2591 }, { "epoch": 1.09, "grad_norm": 0.7310458154947486, "learning_rate": 2.4962421154658706e-06, "loss": 0.4552, "step": 2592 }, { "epoch": 1.09, "grad_norm": 0.8642590895577826, "learning_rate": 2.4943631758521924e-06, "loss": 0.43, "step": 2593 }, { "epoch": 1.09, "grad_norm": 0.5957297069878567, "learning_rate": 2.492484239422578e-06, "loss": 0.4174, "step": 2594 }, { "epoch": 1.09, "grad_norm": 0.5288913445348521, "learning_rate": 2.4906053072383773e-06, "loss": 0.4197, "step": 2595 }, { "epoch": 1.09, "grad_norm": 0.5669908964180154, "learning_rate": 2.4887263803609415e-06, "loss": 0.4228, "step": 2596 }, { "epoch": 1.1, "grad_norm": 0.5619854120471871, "learning_rate": 2.486847459851614e-06, "loss": 0.4204, "step": 2597 }, { "epoch": 1.1, "grad_norm": 0.573591559908782, "learning_rate": 2.4849685467717397e-06, "loss": 0.4043, "step": 2598 }, { "epoch": 1.1, "grad_norm": 0.5660399643338767, "learning_rate": 2.4830896421826554e-06, "loss": 0.4151, "step": 2599 }, { "epoch": 1.1, "grad_norm": 0.6061575332078436, "learning_rate": 2.4812107471456958e-06, "loss": 0.4261, "step": 2600 }, { "epoch": 1.1, "grad_norm": 0.558833085401998, "learning_rate": 2.479331862722188e-06, "loss": 0.4253, "step": 2601 }, { "epoch": 1.1, "grad_norm": 0.5661726057886762, "learning_rate": 2.477452989973455e-06, "loss": 0.4282, "step": 2602 }, { "epoch": 1.1, "grad_norm": 0.5765250370061781, "learning_rate": 2.475574129960812e-06, "loss": 0.3895, "step": 2603 }, { "epoch": 1.1, "grad_norm": 0.6094913200037948, "learning_rate": 2.4736952837455665e-06, "loss": 0.4438, "step": 2604 }, { "epoch": 1.1, "grad_norm": 0.5958403528694752, "learning_rate": 2.4718164523890212e-06, "loss": 0.4276, "step": 2605 }, { "epoch": 1.1, "grad_norm": 0.5711855859912395, "learning_rate": 2.4699376369524665e-06, "loss": 0.422, "step": 2606 }, { "epoch": 1.1, "grad_norm": 0.5827746728273364, "learning_rate": 2.4680588384971864e-06, "loss": 0.4094, "step": 2607 }, { "epoch": 1.1, "grad_norm": 0.5652093041093819, "learning_rate": 2.4661800580844535e-06, "loss": 0.4404, "step": 2608 }, { "epoch": 1.1, "grad_norm": 0.5815716319807065, "learning_rate": 2.4643012967755327e-06, "loss": 0.4188, "step": 2609 }, { "epoch": 1.1, "grad_norm": 0.5667592036448388, "learning_rate": 2.4624225556316745e-06, "loss": 0.4103, "step": 2610 }, { "epoch": 1.1, "grad_norm": 0.6003054597720718, "learning_rate": 2.4605438357141223e-06, "loss": 0.4422, "step": 2611 }, { "epoch": 1.1, "grad_norm": 0.5718504015079162, "learning_rate": 2.458665138084104e-06, "loss": 0.4057, "step": 2612 }, { "epoch": 1.1, "grad_norm": 0.5894624642862655, "learning_rate": 2.4567864638028374e-06, "loss": 0.4164, "step": 2613 }, { "epoch": 1.1, "grad_norm": 0.6181729940652014, "learning_rate": 2.4549078139315243e-06, "loss": 0.4354, "step": 2614 }, { "epoch": 1.1, "grad_norm": 0.5831842560896243, "learning_rate": 2.453029189531356e-06, "loss": 0.4059, "step": 2615 }, { "epoch": 1.1, "grad_norm": 0.5711907705253331, "learning_rate": 2.451150591663506e-06, "loss": 0.4161, "step": 2616 }, { "epoch": 1.1, "grad_norm": 0.5840971955664861, "learning_rate": 2.449272021389136e-06, "loss": 0.4266, "step": 2617 }, { "epoch": 1.1, "grad_norm": 0.6151549174106447, "learning_rate": 2.4473934797693908e-06, "loss": 0.4451, "step": 2618 }, { "epoch": 1.1, "grad_norm": 0.5912926857180598, "learning_rate": 2.4455149678653982e-06, "loss": 0.424, "step": 2619 }, { "epoch": 1.11, "grad_norm": 0.5690720352177238, "learning_rate": 2.44363648673827e-06, "loss": 0.4094, "step": 2620 }, { "epoch": 1.11, "grad_norm": 0.5880654368148057, "learning_rate": 2.4417580374491e-06, "loss": 0.4374, "step": 2621 }, { "epoch": 1.11, "grad_norm": 0.592818300351086, "learning_rate": 2.439879621058966e-06, "loss": 0.4392, "step": 2622 }, { "epoch": 1.11, "grad_norm": 0.5904613946521675, "learning_rate": 2.438001238628925e-06, "loss": 0.421, "step": 2623 }, { "epoch": 1.11, "grad_norm": 0.5780768348236618, "learning_rate": 2.436122891220016e-06, "loss": 0.412, "step": 2624 }, { "epoch": 1.11, "grad_norm": 0.6165111418892802, "learning_rate": 2.4342445798932563e-06, "loss": 0.4225, "step": 2625 }, { "epoch": 1.11, "grad_norm": 0.5864395429245397, "learning_rate": 2.4323663057096466e-06, "loss": 0.4158, "step": 2626 }, { "epoch": 1.11, "grad_norm": 0.5790868452034496, "learning_rate": 2.4304880697301615e-06, "loss": 0.4018, "step": 2627 }, { "epoch": 1.11, "eval_loss": 0.46582120656967163, "eval_runtime": 6934.7187, "eval_samples_per_second": 41.875, "eval_steps_per_second": 2.094, "step": 2627 }, { "epoch": 1.11, "grad_norm": 0.5854060000762511, "learning_rate": 2.42860987301576e-06, "loss": 0.429, "step": 2628 }, { "epoch": 1.11, "grad_norm": 0.5793144094485164, "learning_rate": 2.4267317166273734e-06, "loss": 0.4008, "step": 2629 }, { "epoch": 1.11, "grad_norm": 0.557725397610062, "learning_rate": 2.4248536016259137e-06, "loss": 0.4339, "step": 2630 }, { "epoch": 1.11, "grad_norm": 0.5707416944794899, "learning_rate": 2.422975529072269e-06, "loss": 0.4144, "step": 2631 }, { "epoch": 1.11, "grad_norm": 0.5781786267589856, "learning_rate": 2.4210975000273005e-06, "loss": 0.4475, "step": 2632 }, { "epoch": 1.11, "grad_norm": 0.5688305465034205, "learning_rate": 2.41921951555185e-06, "loss": 0.4145, "step": 2633 }, { "epoch": 1.11, "grad_norm": 0.583143118932043, "learning_rate": 2.4173415767067297e-06, "loss": 0.4286, "step": 2634 }, { "epoch": 1.11, "grad_norm": 0.5507249017882659, "learning_rate": 2.4154636845527284e-06, "loss": 0.4137, "step": 2635 }, { "epoch": 1.11, "grad_norm": 0.6128066967756481, "learning_rate": 2.4135858401506066e-06, "loss": 0.423, "step": 2636 }, { "epoch": 1.11, "grad_norm": 0.5984198722869004, "learning_rate": 2.4117080445611e-06, "loss": 0.4315, "step": 2637 }, { "epoch": 1.11, "grad_norm": 0.566032787321914, "learning_rate": 2.4098302988449147e-06, "loss": 0.4614, "step": 2638 }, { "epoch": 1.11, "grad_norm": 0.5507111424965526, "learning_rate": 2.4079526040627318e-06, "loss": 0.4359, "step": 2639 }, { "epoch": 1.11, "grad_norm": 0.5688047320342015, "learning_rate": 2.4060749612751987e-06, "loss": 0.4466, "step": 2640 }, { "epoch": 1.11, "grad_norm": 0.580032240572495, "learning_rate": 2.404197371542938e-06, "loss": 0.4032, "step": 2641 }, { "epoch": 1.11, "grad_norm": 0.5820043004603282, "learning_rate": 2.40231983592654e-06, "loss": 0.42, "step": 2642 }, { "epoch": 1.12, "grad_norm": 0.5788132559135092, "learning_rate": 2.400442355486564e-06, "loss": 0.4183, "step": 2643 }, { "epoch": 1.12, "grad_norm": 0.5859683700448465, "learning_rate": 2.398564931283541e-06, "loss": 0.4123, "step": 2644 }, { "epoch": 1.12, "grad_norm": 0.5862017007386782, "learning_rate": 2.396687564377967e-06, "loss": 0.4285, "step": 2645 }, { "epoch": 1.12, "grad_norm": 0.5925607615927038, "learning_rate": 2.394810255830308e-06, "loss": 0.4283, "step": 2646 }, { "epoch": 1.12, "grad_norm": 0.5703213873210229, "learning_rate": 2.3929330067009944e-06, "loss": 0.4331, "step": 2647 }, { "epoch": 1.12, "grad_norm": 0.5429662869911331, "learning_rate": 2.391055818050426e-06, "loss": 0.4009, "step": 2648 }, { "epoch": 1.12, "grad_norm": 0.5805429721405676, "learning_rate": 2.389178690938965e-06, "loss": 0.4191, "step": 2649 }, { "epoch": 1.12, "grad_norm": 0.5669011883051497, "learning_rate": 2.3873016264269446e-06, "loss": 0.4112, "step": 2650 }, { "epoch": 1.12, "grad_norm": 0.567460249220951, "learning_rate": 2.3854246255746555e-06, "loss": 0.4315, "step": 2651 }, { "epoch": 1.12, "grad_norm": 0.5623656342926924, "learning_rate": 2.3835476894423577e-06, "loss": 0.4059, "step": 2652 }, { "epoch": 1.12, "grad_norm": 0.5887115656599653, "learning_rate": 2.3816708190902722e-06, "loss": 0.4292, "step": 2653 }, { "epoch": 1.12, "grad_norm": 0.5801851574567665, "learning_rate": 2.3797940155785837e-06, "loss": 0.4377, "step": 2654 }, { "epoch": 1.12, "grad_norm": 0.5957884506649995, "learning_rate": 2.3779172799674377e-06, "loss": 0.4291, "step": 2655 }, { "epoch": 1.12, "grad_norm": 0.5922308503207093, "learning_rate": 2.376040613316944e-06, "loss": 0.4219, "step": 2656 }, { "epoch": 1.12, "grad_norm": 0.5657227192474519, "learning_rate": 2.374164016687173e-06, "loss": 0.4338, "step": 2657 }, { "epoch": 1.12, "grad_norm": 0.6033671454063749, "learning_rate": 2.3722874911381517e-06, "loss": 0.4554, "step": 2658 }, { "epoch": 1.12, "grad_norm": 0.5691097763031018, "learning_rate": 2.3704110377298717e-06, "loss": 0.4105, "step": 2659 }, { "epoch": 1.12, "grad_norm": 0.5617846566458164, "learning_rate": 2.368534657522281e-06, "loss": 0.4439, "step": 2660 }, { "epoch": 1.12, "grad_norm": 0.5551937150400084, "learning_rate": 2.366658351575288e-06, "loss": 0.4266, "step": 2661 }, { "epoch": 1.12, "grad_norm": 0.5726467432147245, "learning_rate": 2.364782120948758e-06, "loss": 0.417, "step": 2662 }, { "epoch": 1.12, "grad_norm": 0.6092370186148129, "learning_rate": 2.362905966702515e-06, "loss": 0.4201, "step": 2663 }, { "epoch": 1.12, "grad_norm": 0.5647441929287327, "learning_rate": 2.3610298898963373e-06, "loss": 0.4024, "step": 2664 }, { "epoch": 1.12, "grad_norm": 0.5612482266868575, "learning_rate": 2.359153891589962e-06, "loss": 0.4272, "step": 2665 }, { "epoch": 1.12, "grad_norm": 0.5404969509411939, "learning_rate": 2.35727797284308e-06, "loss": 0.4097, "step": 2666 }, { "epoch": 1.13, "grad_norm": 0.5847499166914444, "learning_rate": 2.3554021347153403e-06, "loss": 0.4522, "step": 2667 }, { "epoch": 1.13, "grad_norm": 0.5671574895969574, "learning_rate": 2.3535263782663425e-06, "loss": 0.4028, "step": 2668 }, { "epoch": 1.13, "grad_norm": 0.552780149490217, "learning_rate": 2.351650704555643e-06, "loss": 0.4086, "step": 2669 }, { "epoch": 1.13, "grad_norm": 0.5803614826769898, "learning_rate": 2.3497751146427494e-06, "loss": 0.4155, "step": 2670 }, { "epoch": 1.13, "grad_norm": 0.5652376067469718, "learning_rate": 2.3478996095871228e-06, "loss": 0.4084, "step": 2671 }, { "epoch": 1.13, "grad_norm": 0.5845008843251347, "learning_rate": 2.3460241904481778e-06, "loss": 0.4027, "step": 2672 }, { "epoch": 1.13, "grad_norm": 0.5799385879129909, "learning_rate": 2.3441488582852774e-06, "loss": 0.4268, "step": 2673 }, { "epoch": 1.13, "grad_norm": 0.5738670567740277, "learning_rate": 2.342273614157739e-06, "loss": 0.4464, "step": 2674 }, { "epoch": 1.13, "grad_norm": 0.544552930903809, "learning_rate": 2.3403984591248265e-06, "loss": 0.3964, "step": 2675 }, { "epoch": 1.13, "grad_norm": 0.6124679370289692, "learning_rate": 2.3385233942457574e-06, "loss": 0.4202, "step": 2676 }, { "epoch": 1.13, "grad_norm": 0.5610056639464951, "learning_rate": 2.336648420579694e-06, "loss": 0.4094, "step": 2677 }, { "epoch": 1.13, "grad_norm": 0.5615215146670192, "learning_rate": 2.3347735391857517e-06, "loss": 0.3789, "step": 2678 }, { "epoch": 1.13, "grad_norm": 0.5472440549240616, "learning_rate": 2.33289875112299e-06, "loss": 0.3896, "step": 2679 }, { "epoch": 1.13, "grad_norm": 0.5874793154794317, "learning_rate": 2.3310240574504184e-06, "loss": 0.4205, "step": 2680 }, { "epoch": 1.13, "grad_norm": 0.5738763492928816, "learning_rate": 2.3291494592269902e-06, "loss": 0.4152, "step": 2681 }, { "epoch": 1.13, "grad_norm": 0.5752705880264377, "learning_rate": 2.327274957511607e-06, "loss": 0.4032, "step": 2682 }, { "epoch": 1.13, "grad_norm": 0.6094503057397006, "learning_rate": 2.3254005533631162e-06, "loss": 0.4271, "step": 2683 }, { "epoch": 1.13, "grad_norm": 0.6066369045019371, "learning_rate": 2.3235262478403082e-06, "loss": 0.4241, "step": 2684 }, { "epoch": 1.13, "grad_norm": 0.5939355951481937, "learning_rate": 2.3216520420019194e-06, "loss": 0.3942, "step": 2685 }, { "epoch": 1.13, "grad_norm": 0.5616286358779028, "learning_rate": 2.3197779369066287e-06, "loss": 0.4052, "step": 2686 }, { "epoch": 1.13, "grad_norm": 0.5714073547567622, "learning_rate": 2.3179039336130588e-06, "loss": 0.4156, "step": 2687 }, { "epoch": 1.13, "grad_norm": 0.5870213755012056, "learning_rate": 2.3160300331797734e-06, "loss": 0.419, "step": 2688 }, { "epoch": 1.13, "grad_norm": 0.5872084599723584, "learning_rate": 2.3141562366652816e-06, "loss": 0.4094, "step": 2689 }, { "epoch": 1.14, "grad_norm": 0.5712310144109413, "learning_rate": 2.3122825451280294e-06, "loss": 0.4185, "step": 2690 }, { "epoch": 1.14, "grad_norm": 0.5974668557328553, "learning_rate": 2.3104089596264075e-06, "loss": 0.4271, "step": 2691 }, { "epoch": 1.14, "grad_norm": 0.5925710616458487, "learning_rate": 2.3085354812187436e-06, "loss": 0.4445, "step": 2692 }, { "epoch": 1.14, "grad_norm": 0.574277995467648, "learning_rate": 2.306662110963307e-06, "loss": 0.4042, "step": 2693 }, { "epoch": 1.14, "grad_norm": 0.5763082849196323, "learning_rate": 2.3047888499183036e-06, "loss": 0.417, "step": 2694 }, { "epoch": 1.14, "grad_norm": 0.5593667983969396, "learning_rate": 2.30291569914188e-06, "loss": 0.412, "step": 2695 }, { "epoch": 1.14, "grad_norm": 0.5768991359362865, "learning_rate": 2.301042659692121e-06, "loss": 0.4041, "step": 2696 }, { "epoch": 1.14, "grad_norm": 0.592377033641057, "learning_rate": 2.299169732627045e-06, "loss": 0.3905, "step": 2697 }, { "epoch": 1.14, "grad_norm": 0.5978581423236226, "learning_rate": 2.2972969190046104e-06, "loss": 0.3998, "step": 2698 }, { "epoch": 1.14, "eval_loss": 0.4642565846443176, "eval_runtime": 6929.6676, "eval_samples_per_second": 41.905, "eval_steps_per_second": 2.095, "step": 2698 }, { "epoch": 1.14, "grad_norm": 0.5739162693130145, "learning_rate": 2.2954242198827082e-06, "loss": 0.4196, "step": 2699 }, { "epoch": 1.14, "grad_norm": 0.5844044219912439, "learning_rate": 2.2935516363191695e-06, "loss": 0.433, "step": 2700 }, { "epoch": 1.14, "grad_norm": 0.5576878153425602, "learning_rate": 2.2916791693717553e-06, "loss": 0.4116, "step": 2701 }, { "epoch": 1.14, "grad_norm": 0.5975344250607371, "learning_rate": 2.2898068200981633e-06, "loss": 0.4239, "step": 2702 }, { "epoch": 1.14, "grad_norm": 0.5486064916195111, "learning_rate": 2.287934589556024e-06, "loss": 0.3899, "step": 2703 }, { "epoch": 1.14, "grad_norm": 0.5954206100798385, "learning_rate": 2.2860624788029013e-06, "loss": 0.3948, "step": 2704 }, { "epoch": 1.14, "grad_norm": 0.582033730964023, "learning_rate": 2.2841904888962903e-06, "loss": 0.4024, "step": 2705 }, { "epoch": 1.14, "grad_norm": 0.61366156641062, "learning_rate": 2.2823186208936205e-06, "loss": 0.4492, "step": 2706 }, { "epoch": 1.14, "grad_norm": 0.5837374648931264, "learning_rate": 2.280446875852248e-06, "loss": 0.4394, "step": 2707 }, { "epoch": 1.14, "grad_norm": 0.6159725968971995, "learning_rate": 2.2785752548294637e-06, "loss": 0.4345, "step": 2708 }, { "epoch": 1.14, "grad_norm": 0.561818864484124, "learning_rate": 2.2767037588824877e-06, "loss": 0.4053, "step": 2709 }, { "epoch": 1.14, "grad_norm": 0.5693511275506383, "learning_rate": 2.2748323890684664e-06, "loss": 0.4167, "step": 2710 }, { "epoch": 1.14, "grad_norm": 0.5676875307416003, "learning_rate": 2.2729611464444797e-06, "loss": 0.4087, "step": 2711 }, { "epoch": 1.14, "grad_norm": 0.5865113412123945, "learning_rate": 2.2710900320675314e-06, "loss": 0.4044, "step": 2712 }, { "epoch": 1.14, "grad_norm": 0.5877911286647893, "learning_rate": 2.2692190469945557e-06, "loss": 0.4416, "step": 2713 }, { "epoch": 1.15, "grad_norm": 0.5405283913293546, "learning_rate": 2.267348192282411e-06, "loss": 0.3994, "step": 2714 }, { "epoch": 1.15, "grad_norm": 0.5721166898641502, "learning_rate": 2.2654774689878862e-06, "loss": 0.4269, "step": 2715 }, { "epoch": 1.15, "grad_norm": 0.5664221347108425, "learning_rate": 2.2636068781676905e-06, "loss": 0.4172, "step": 2716 }, { "epoch": 1.15, "grad_norm": 0.5609241860387546, "learning_rate": 2.261736420878464e-06, "loss": 0.4323, "step": 2717 }, { "epoch": 1.15, "grad_norm": 0.5846699602141291, "learning_rate": 2.2598660981767667e-06, "loss": 0.4173, "step": 2718 }, { "epoch": 1.15, "grad_norm": 0.5487147727790713, "learning_rate": 2.257995911119086e-06, "loss": 0.4091, "step": 2719 }, { "epoch": 1.15, "grad_norm": 0.5642527354564277, "learning_rate": 2.2561258607618296e-06, "loss": 0.4208, "step": 2720 }, { "epoch": 1.15, "grad_norm": 0.5596940402216087, "learning_rate": 2.25425594816133e-06, "loss": 0.4177, "step": 2721 }, { "epoch": 1.15, "grad_norm": 0.5737740293350555, "learning_rate": 2.2523861743738433e-06, "loss": 0.4079, "step": 2722 }, { "epoch": 1.15, "grad_norm": 0.5542429347678349, "learning_rate": 2.2505165404555434e-06, "loss": 0.4177, "step": 2723 }, { "epoch": 1.15, "grad_norm": 0.58808254423625, "learning_rate": 2.248647047462528e-06, "loss": 0.4558, "step": 2724 }, { "epoch": 1.15, "grad_norm": 0.6083740436546075, "learning_rate": 2.246777696450813e-06, "loss": 0.4367, "step": 2725 }, { "epoch": 1.15, "grad_norm": 0.577052191765705, "learning_rate": 2.244908488476337e-06, "loss": 0.4133, "step": 2726 }, { "epoch": 1.15, "grad_norm": 0.59097397240932, "learning_rate": 2.2430394245949553e-06, "loss": 0.4327, "step": 2727 }, { "epoch": 1.15, "grad_norm": 0.5524670352667417, "learning_rate": 2.2411705058624437e-06, "loss": 0.4364, "step": 2728 }, { "epoch": 1.15, "grad_norm": 0.5889869749199854, "learning_rate": 2.2393017333344935e-06, "loss": 0.4389, "step": 2729 }, { "epoch": 1.15, "grad_norm": 0.5781616268517972, "learning_rate": 2.2374331080667168e-06, "loss": 0.4146, "step": 2730 }, { "epoch": 1.15, "grad_norm": 0.5516338673454813, "learning_rate": 2.235564631114639e-06, "loss": 0.42, "step": 2731 }, { "epoch": 1.15, "grad_norm": 0.5737980509227198, "learning_rate": 2.2336963035337037e-06, "loss": 0.4049, "step": 2732 }, { "epoch": 1.15, "grad_norm": 0.6099792863000669, "learning_rate": 2.2318281263792714e-06, "loss": 0.4329, "step": 2733 }, { "epoch": 1.15, "grad_norm": 0.6215436378061963, "learning_rate": 2.229960100706614e-06, "loss": 0.4064, "step": 2734 }, { "epoch": 1.15, "grad_norm": 0.6070304940895107, "learning_rate": 2.2280922275709216e-06, "loss": 0.4172, "step": 2735 }, { "epoch": 1.15, "grad_norm": 0.6137903976399841, "learning_rate": 2.2262245080272947e-06, "loss": 0.4191, "step": 2736 }, { "epoch": 1.16, "grad_norm": 0.589617423215821, "learning_rate": 2.2243569431307506e-06, "loss": 0.4236, "step": 2737 }, { "epoch": 1.16, "grad_norm": 0.5714692395337472, "learning_rate": 2.2224895339362153e-06, "loss": 0.4174, "step": 2738 }, { "epoch": 1.16, "grad_norm": 0.566616099477279, "learning_rate": 2.2206222814985316e-06, "loss": 0.4172, "step": 2739 }, { "epoch": 1.16, "grad_norm": 0.5918370633556324, "learning_rate": 2.2187551868724487e-06, "loss": 0.4022, "step": 2740 }, { "epoch": 1.16, "grad_norm": 0.5621764463135852, "learning_rate": 2.2168882511126306e-06, "loss": 0.4033, "step": 2741 }, { "epoch": 1.16, "grad_norm": 0.5602317292982957, "learning_rate": 2.215021475273649e-06, "loss": 0.3948, "step": 2742 }, { "epoch": 1.16, "grad_norm": 0.5821498763067597, "learning_rate": 2.213154860409987e-06, "loss": 0.4342, "step": 2743 }, { "epoch": 1.16, "grad_norm": 0.56289957110064, "learning_rate": 2.211288407576035e-06, "loss": 0.4224, "step": 2744 }, { "epoch": 1.16, "grad_norm": 0.5891254592215949, "learning_rate": 2.209422117826094e-06, "loss": 0.4217, "step": 2745 }, { "epoch": 1.16, "grad_norm": 0.5729813741965974, "learning_rate": 2.207555992214372e-06, "loss": 0.4369, "step": 2746 }, { "epoch": 1.16, "grad_norm": 0.5673765067747608, "learning_rate": 2.2056900317949835e-06, "loss": 0.4242, "step": 2747 }, { "epoch": 1.16, "grad_norm": 0.5690722337898687, "learning_rate": 2.203824237621951e-06, "loss": 0.4284, "step": 2748 }, { "epoch": 1.16, "grad_norm": 0.5690375147524728, "learning_rate": 2.2019586107492005e-06, "loss": 0.4238, "step": 2749 }, { "epoch": 1.16, "grad_norm": 0.5507462792620564, "learning_rate": 2.200093152230568e-06, "loss": 0.4135, "step": 2750 }, { "epoch": 1.16, "grad_norm": 0.5719332339424014, "learning_rate": 2.1982278631197895e-06, "loss": 0.4174, "step": 2751 }, { "epoch": 1.16, "grad_norm": 0.5874278219304234, "learning_rate": 2.1963627444705097e-06, "loss": 0.4053, "step": 2752 }, { "epoch": 1.16, "grad_norm": 0.568889174310937, "learning_rate": 2.1944977973362728e-06, "loss": 0.4123, "step": 2753 }, { "epoch": 1.16, "grad_norm": 0.6038767993949933, "learning_rate": 2.19263302277053e-06, "loss": 0.4349, "step": 2754 }, { "epoch": 1.16, "grad_norm": 0.5872897799798638, "learning_rate": 2.190768421826631e-06, "loss": 0.4083, "step": 2755 }, { "epoch": 1.16, "grad_norm": 0.5905095099111151, "learning_rate": 2.1889039955578327e-06, "loss": 0.4146, "step": 2756 }, { "epoch": 1.16, "grad_norm": 0.5527103292721002, "learning_rate": 2.1870397450172876e-06, "loss": 0.4191, "step": 2757 }, { "epoch": 1.16, "grad_norm": 0.5818092591622248, "learning_rate": 2.1851756712580526e-06, "loss": 0.3905, "step": 2758 }, { "epoch": 1.16, "grad_norm": 0.5560986512545047, "learning_rate": 2.1833117753330847e-06, "loss": 0.4212, "step": 2759 }, { "epoch": 1.17, "grad_norm": 0.583827443059487, "learning_rate": 2.1814480582952376e-06, "loss": 0.439, "step": 2760 }, { "epoch": 1.17, "grad_norm": 0.6047213755513012, "learning_rate": 2.1795845211972684e-06, "loss": 0.4098, "step": 2761 }, { "epoch": 1.17, "grad_norm": 0.5719149813855363, "learning_rate": 2.1777211650918276e-06, "loss": 0.4436, "step": 2762 }, { "epoch": 1.17, "grad_norm": 0.5636987195875225, "learning_rate": 2.1758579910314677e-06, "loss": 0.4212, "step": 2763 }, { "epoch": 1.17, "grad_norm": 0.5821057683244233, "learning_rate": 2.1739950000686354e-06, "loss": 0.4157, "step": 2764 }, { "epoch": 1.17, "grad_norm": 0.5763014032864897, "learning_rate": 2.1721321932556753e-06, "loss": 0.4167, "step": 2765 }, { "epoch": 1.17, "grad_norm": 0.5741607793695896, "learning_rate": 2.1702695716448276e-06, "loss": 0.4118, "step": 2766 }, { "epoch": 1.17, "grad_norm": 0.583977923198581, "learning_rate": 2.16840713628823e-06, "loss": 0.4362, "step": 2767 }, { "epoch": 1.17, "grad_norm": 0.567095712657335, "learning_rate": 2.16654488823791e-06, "loss": 0.3994, "step": 2768 }, { "epoch": 1.17, "grad_norm": 0.5882945712372565, "learning_rate": 2.164682828545795e-06, "loss": 0.415, "step": 2769 }, { "epoch": 1.17, "eval_loss": 0.4636525809764862, "eval_runtime": 6931.4457, "eval_samples_per_second": 41.894, "eval_steps_per_second": 2.095, "step": 2769 }, { "epoch": 1.17, "grad_norm": 0.5800810184022894, "learning_rate": 2.1628209582637024e-06, "loss": 0.423, "step": 2770 }, { "epoch": 1.17, "grad_norm": 0.5877561785586146, "learning_rate": 2.160959278443342e-06, "loss": 0.4191, "step": 2771 }, { "epoch": 1.17, "grad_norm": 0.5450021019526348, "learning_rate": 2.1590977901363215e-06, "loss": 0.4121, "step": 2772 }, { "epoch": 1.17, "grad_norm": 0.586036868971513, "learning_rate": 2.157236494394133e-06, "loss": 0.4231, "step": 2773 }, { "epoch": 1.17, "grad_norm": 0.6082884207803223, "learning_rate": 2.155375392268165e-06, "loss": 0.4347, "step": 2774 }, { "epoch": 1.17, "grad_norm": 0.5818956219426625, "learning_rate": 2.1535144848096943e-06, "loss": 0.4267, "step": 2775 }, { "epoch": 1.17, "grad_norm": 0.5882181261304475, "learning_rate": 2.1516537730698895e-06, "loss": 0.4122, "step": 2776 }, { "epoch": 1.17, "grad_norm": 0.5855723087856304, "learning_rate": 2.1497932580998055e-06, "loss": 0.4196, "step": 2777 }, { "epoch": 1.17, "grad_norm": 0.5621462555991732, "learning_rate": 2.147932940950391e-06, "loss": 0.4313, "step": 2778 }, { "epoch": 1.17, "grad_norm": 0.5619630139053288, "learning_rate": 2.1460728226724768e-06, "loss": 0.4171, "step": 2779 }, { "epoch": 1.17, "grad_norm": 0.5803428278169429, "learning_rate": 2.1442129043167877e-06, "loss": 0.4137, "step": 2780 }, { "epoch": 1.17, "grad_norm": 0.6003743965203625, "learning_rate": 2.1423531869339307e-06, "loss": 0.4094, "step": 2781 }, { "epoch": 1.17, "grad_norm": 0.5838530366206433, "learning_rate": 2.140493671574402e-06, "loss": 0.4256, "step": 2782 }, { "epoch": 1.17, "grad_norm": 0.5750052865571543, "learning_rate": 2.138634359288581e-06, "loss": 0.4182, "step": 2783 }, { "epoch": 1.18, "grad_norm": 0.555105523968193, "learning_rate": 2.1367752511267366e-06, "loss": 0.4174, "step": 2784 }, { "epoch": 1.18, "grad_norm": 0.569042426766486, "learning_rate": 2.134916348139019e-06, "loss": 0.4137, "step": 2785 }, { "epoch": 1.18, "grad_norm": 0.5907654743125312, "learning_rate": 2.133057651375463e-06, "loss": 0.4148, "step": 2786 }, { "epoch": 1.18, "grad_norm": 0.5717943994371465, "learning_rate": 2.1311991618859883e-06, "loss": 0.4307, "step": 2787 }, { "epoch": 1.18, "grad_norm": 0.5562054796755491, "learning_rate": 2.129340880720395e-06, "loss": 0.412, "step": 2788 }, { "epoch": 1.18, "grad_norm": 0.5745208858282651, "learning_rate": 2.1274828089283696e-06, "loss": 0.4315, "step": 2789 }, { "epoch": 1.18, "grad_norm": 0.6261475696552913, "learning_rate": 2.125624947559475e-06, "loss": 0.4141, "step": 2790 }, { "epoch": 1.18, "grad_norm": 0.587239695175737, "learning_rate": 2.123767297663161e-06, "loss": 0.4384, "step": 2791 }, { "epoch": 1.18, "grad_norm": 0.5583403908079667, "learning_rate": 2.1219098602887524e-06, "loss": 0.4215, "step": 2792 }, { "epoch": 1.18, "grad_norm": 0.5697359070123751, "learning_rate": 2.1200526364854583e-06, "loss": 0.4195, "step": 2793 }, { "epoch": 1.18, "grad_norm": 0.5746434077589057, "learning_rate": 2.118195627302364e-06, "loss": 0.4397, "step": 2794 }, { "epoch": 1.18, "grad_norm": 0.5770878978662568, "learning_rate": 2.116338833788437e-06, "loss": 0.4142, "step": 2795 }, { "epoch": 1.18, "grad_norm": 0.615557087883441, "learning_rate": 2.114482256992519e-06, "loss": 0.4124, "step": 2796 }, { "epoch": 1.18, "grad_norm": 0.5665043322437245, "learning_rate": 2.112625897963333e-06, "loss": 0.423, "step": 2797 }, { "epoch": 1.18, "grad_norm": 0.5598727728785423, "learning_rate": 2.1107697577494764e-06, "loss": 0.4014, "step": 2798 }, { "epoch": 1.18, "grad_norm": 0.567119767667523, "learning_rate": 2.1089138373994226e-06, "loss": 0.4072, "step": 2799 }, { "epoch": 1.18, "grad_norm": 0.5672415415140308, "learning_rate": 2.1070581379615253e-06, "loss": 0.4336, "step": 2800 }, { "epoch": 1.18, "grad_norm": 0.5503253498723211, "learning_rate": 2.1052026604840066e-06, "loss": 0.4181, "step": 2801 }, { "epoch": 1.18, "grad_norm": 0.5837370512357433, "learning_rate": 2.10334740601497e-06, "loss": 0.4014, "step": 2802 }, { "epoch": 1.18, "grad_norm": 0.5485492339607526, "learning_rate": 2.101492375602387e-06, "loss": 0.3878, "step": 2803 }, { "epoch": 1.18, "grad_norm": 0.5901125098812506, "learning_rate": 2.099637570294108e-06, "loss": 0.4315, "step": 2804 }, { "epoch": 1.18, "grad_norm": 0.5790878839442004, "learning_rate": 2.0977829911378507e-06, "loss": 0.4082, "step": 2805 }, { "epoch": 1.18, "grad_norm": 0.5486030886397184, "learning_rate": 2.0959286391812116e-06, "loss": 0.4131, "step": 2806 }, { "epoch": 1.19, "grad_norm": 0.5731467050022245, "learning_rate": 2.0940745154716516e-06, "loss": 0.4213, "step": 2807 }, { "epoch": 1.19, "grad_norm": 0.563249940690925, "learning_rate": 2.0922206210565088e-06, "loss": 0.4275, "step": 2808 }, { "epoch": 1.19, "grad_norm": 0.5588267746939021, "learning_rate": 2.090366956982988e-06, "loss": 0.4053, "step": 2809 }, { "epoch": 1.19, "grad_norm": 0.6276548597898437, "learning_rate": 2.088513524298165e-06, "loss": 0.4044, "step": 2810 }, { "epoch": 1.19, "grad_norm": 0.5654177414506385, "learning_rate": 2.086660324048987e-06, "loss": 0.3936, "step": 2811 }, { "epoch": 1.19, "grad_norm": 0.5912299181232848, "learning_rate": 2.084807357282266e-06, "loss": 0.4449, "step": 2812 }, { "epoch": 1.19, "grad_norm": 0.5934179796428065, "learning_rate": 2.0829546250446846e-06, "loss": 0.4316, "step": 2813 }, { "epoch": 1.19, "grad_norm": 0.660477044543163, "learning_rate": 2.0811021283827928e-06, "loss": 0.4134, "step": 2814 }, { "epoch": 1.19, "grad_norm": 0.5785577080236974, "learning_rate": 2.0792498683430072e-06, "loss": 0.4289, "step": 2815 }, { "epoch": 1.19, "grad_norm": 0.5359940863049542, "learning_rate": 2.077397845971609e-06, "loss": 0.3887, "step": 2816 }, { "epoch": 1.19, "grad_norm": 0.5579305558402582, "learning_rate": 2.07554606231475e-06, "loss": 0.4046, "step": 2817 }, { "epoch": 1.19, "grad_norm": 0.5768033221733195, "learning_rate": 2.0736945184184406e-06, "loss": 0.4306, "step": 2818 }, { "epoch": 1.19, "grad_norm": 0.5574558414654748, "learning_rate": 2.0718432153285615e-06, "loss": 0.4266, "step": 2819 }, { "epoch": 1.19, "grad_norm": 0.5523510875719263, "learning_rate": 2.0699921540908542e-06, "loss": 0.4209, "step": 2820 }, { "epoch": 1.19, "grad_norm": 0.586454598962645, "learning_rate": 2.068141335750925e-06, "loss": 0.4038, "step": 2821 }, { "epoch": 1.19, "grad_norm": 0.5767355936333549, "learning_rate": 2.0662907613542405e-06, "loss": 0.4219, "step": 2822 }, { "epoch": 1.19, "grad_norm": 0.5748539680981339, "learning_rate": 2.064440431946133e-06, "loss": 0.382, "step": 2823 }, { "epoch": 1.19, "grad_norm": 0.5642219467594125, "learning_rate": 2.062590348571796e-06, "loss": 0.4051, "step": 2824 }, { "epoch": 1.19, "grad_norm": 0.5825964555184913, "learning_rate": 2.0607405122762806e-06, "loss": 0.4096, "step": 2825 }, { "epoch": 1.19, "grad_norm": 0.5629400713760981, "learning_rate": 2.058890924104502e-06, "loss": 0.4, "step": 2826 }, { "epoch": 1.19, "grad_norm": 0.5463214353524036, "learning_rate": 2.057041585101232e-06, "loss": 0.3901, "step": 2827 }, { "epoch": 1.19, "grad_norm": 0.5728170268667269, "learning_rate": 2.0551924963111064e-06, "loss": 0.4252, "step": 2828 }, { "epoch": 1.19, "grad_norm": 0.5543782845949665, "learning_rate": 2.053343658778613e-06, "loss": 0.4214, "step": 2829 }, { "epoch": 1.19, "grad_norm": 0.58082955878978, "learning_rate": 2.0514950735481053e-06, "loss": 0.4137, "step": 2830 }, { "epoch": 1.2, "grad_norm": 0.5531229406229621, "learning_rate": 2.049646741663788e-06, "loss": 0.4131, "step": 2831 }, { "epoch": 1.2, "grad_norm": 0.5541337155553405, "learning_rate": 2.0477986641697263e-06, "loss": 0.4068, "step": 2832 }, { "epoch": 1.2, "grad_norm": 0.5512293318719139, "learning_rate": 2.0459508421098383e-06, "loss": 0.4193, "step": 2833 }, { "epoch": 1.2, "grad_norm": 0.6481077588847556, "learning_rate": 2.0441032765279036e-06, "loss": 0.4195, "step": 2834 }, { "epoch": 1.2, "grad_norm": 0.5553462347610719, "learning_rate": 2.0422559684675498e-06, "loss": 0.4131, "step": 2835 }, { "epoch": 1.2, "grad_norm": 0.5854214572008192, "learning_rate": 2.040408918972264e-06, "loss": 0.4223, "step": 2836 }, { "epoch": 1.2, "grad_norm": 0.5524192472655216, "learning_rate": 2.038562129085387e-06, "loss": 0.3845, "step": 2837 }, { "epoch": 1.2, "grad_norm": 0.56829946589284, "learning_rate": 2.0367155998501092e-06, "loss": 0.4013, "step": 2838 }, { "epoch": 1.2, "grad_norm": 0.5878475779332522, "learning_rate": 2.03486933230948e-06, "loss": 0.4206, "step": 2839 }, { "epoch": 1.2, "grad_norm": 0.5637907040986664, "learning_rate": 2.033023327506393e-06, "loss": 0.4052, "step": 2840 }, { "epoch": 1.2, "eval_loss": 0.4627860486507416, "eval_runtime": 6936.5234, "eval_samples_per_second": 41.864, "eval_steps_per_second": 2.093, "step": 2840 }, { "epoch": 1.2, "grad_norm": 0.5782227933222694, "learning_rate": 2.0311775864836007e-06, "loss": 0.4184, "step": 2841 }, { "epoch": 1.2, "grad_norm": 0.5726678150106956, "learning_rate": 2.0293321102837023e-06, "loss": 0.4379, "step": 2842 }, { "epoch": 1.2, "grad_norm": 0.5653275322857352, "learning_rate": 2.0274868999491496e-06, "loss": 0.411, "step": 2843 }, { "epoch": 1.2, "grad_norm": 0.6143518059743222, "learning_rate": 2.0256419565222423e-06, "loss": 0.4202, "step": 2844 }, { "epoch": 1.2, "grad_norm": 0.5533020572133284, "learning_rate": 2.023797281045132e-06, "loss": 0.4015, "step": 2845 }, { "epoch": 1.2, "grad_norm": 0.5551626428007986, "learning_rate": 2.0219528745598145e-06, "loss": 0.4148, "step": 2846 }, { "epoch": 1.2, "grad_norm": 0.5889239778826716, "learning_rate": 2.020108738108139e-06, "loss": 0.4052, "step": 2847 }, { "epoch": 1.2, "grad_norm": 0.6024562268514936, "learning_rate": 2.0182648727317986e-06, "loss": 0.4287, "step": 2848 }, { "epoch": 1.2, "grad_norm": 0.5746780507322024, "learning_rate": 2.0164212794723336e-06, "loss": 0.4165, "step": 2849 }, { "epoch": 1.2, "grad_norm": 0.5908540121904199, "learning_rate": 2.014577959371134e-06, "loss": 0.4391, "step": 2850 }, { "epoch": 1.2, "grad_norm": 0.5820354589271028, "learning_rate": 2.012734913469429e-06, "loss": 0.4151, "step": 2851 }, { "epoch": 1.2, "grad_norm": 0.5473344454525257, "learning_rate": 2.0108921428083e-06, "loss": 0.4279, "step": 2852 }, { "epoch": 1.2, "grad_norm": 0.5832936596852468, "learning_rate": 2.009049648428668e-06, "loss": 0.4288, "step": 2853 }, { "epoch": 1.21, "grad_norm": 0.6084388529390307, "learning_rate": 2.0072074313713e-06, "loss": 0.4461, "step": 2854 }, { "epoch": 1.21, "grad_norm": 0.5740745870244656, "learning_rate": 2.0053654926768044e-06, "loss": 0.4021, "step": 2855 }, { "epoch": 1.21, "grad_norm": 0.5515506740065009, "learning_rate": 2.003523833385637e-06, "loss": 0.3928, "step": 2856 }, { "epoch": 1.21, "grad_norm": 0.5624075725007869, "learning_rate": 2.0016824545380895e-06, "loss": 0.4191, "step": 2857 }, { "epoch": 1.21, "grad_norm": 0.5863935111984939, "learning_rate": 1.9998413571743006e-06, "loss": 0.4224, "step": 2858 }, { "epoch": 1.21, "grad_norm": 0.5721820837476251, "learning_rate": 1.9980005423342462e-06, "loss": 0.4364, "step": 2859 }, { "epoch": 1.21, "grad_norm": 0.5729299550286993, "learning_rate": 1.996160011057746e-06, "loss": 0.4112, "step": 2860 }, { "epoch": 1.21, "grad_norm": 0.5933829400685822, "learning_rate": 1.9943197643844554e-06, "loss": 0.399, "step": 2861 }, { "epoch": 1.21, "grad_norm": 0.601305825995989, "learning_rate": 1.992479803353872e-06, "loss": 0.4265, "step": 2862 }, { "epoch": 1.21, "grad_norm": 0.5692294662214427, "learning_rate": 1.9906401290053323e-06, "loss": 0.4071, "step": 2863 }, { "epoch": 1.21, "grad_norm": 0.570531202757177, "learning_rate": 1.9888007423780095e-06, "loss": 0.4161, "step": 2864 }, { "epoch": 1.21, "grad_norm": 0.5765127200524025, "learning_rate": 1.9869616445109146e-06, "loss": 0.42, "step": 2865 }, { "epoch": 1.21, "grad_norm": 0.5652493468336516, "learning_rate": 1.985122836442895e-06, "loss": 0.4447, "step": 2866 }, { "epoch": 1.21, "grad_norm": 0.5345921455849048, "learning_rate": 1.9832843192126367e-06, "loss": 0.4114, "step": 2867 }, { "epoch": 1.21, "grad_norm": 0.553833707765255, "learning_rate": 1.9814460938586572e-06, "loss": 0.4192, "step": 2868 }, { "epoch": 1.21, "grad_norm": 0.5659115457713205, "learning_rate": 1.9796081614193143e-06, "loss": 0.4195, "step": 2869 }, { "epoch": 1.21, "grad_norm": 0.5667385460753466, "learning_rate": 1.9777705229327954e-06, "loss": 0.4177, "step": 2870 }, { "epoch": 1.21, "grad_norm": 0.5807688119962365, "learning_rate": 1.9759331794371255e-06, "loss": 0.4015, "step": 2871 }, { "epoch": 1.21, "grad_norm": 0.5637812729284788, "learning_rate": 1.97409613197016e-06, "loss": 0.4288, "step": 2872 }, { "epoch": 1.21, "grad_norm": 0.5695007837691306, "learning_rate": 1.972259381569592e-06, "loss": 0.4029, "step": 2873 }, { "epoch": 1.21, "grad_norm": 0.5977467234875765, "learning_rate": 1.9704229292729393e-06, "loss": 0.4189, "step": 2874 }, { "epoch": 1.21, "grad_norm": 0.569978022020393, "learning_rate": 1.9685867761175584e-06, "loss": 0.3981, "step": 2875 }, { "epoch": 1.21, "grad_norm": 0.5622730363649805, "learning_rate": 1.9667509231406332e-06, "loss": 0.4072, "step": 2876 }, { "epoch": 1.22, "grad_norm": 0.5772397599288341, "learning_rate": 1.964915371379178e-06, "loss": 0.4224, "step": 2877 }, { "epoch": 1.22, "grad_norm": 0.5550624772060395, "learning_rate": 1.9630801218700397e-06, "loss": 0.4168, "step": 2878 }, { "epoch": 1.22, "grad_norm": 0.593384835711817, "learning_rate": 1.961245175649889e-06, "loss": 0.4082, "step": 2879 }, { "epoch": 1.22, "grad_norm": 0.5648138827206016, "learning_rate": 1.959410533755232e-06, "loss": 0.4386, "step": 2880 }, { "epoch": 1.22, "grad_norm": 0.590345590782358, "learning_rate": 1.9575761972223983e-06, "loss": 0.4318, "step": 2881 }, { "epoch": 1.22, "grad_norm": 0.5814485952827325, "learning_rate": 1.955742167087547e-06, "loss": 0.4288, "step": 2882 }, { "epoch": 1.22, "grad_norm": 0.5833916546222742, "learning_rate": 1.953908444386662e-06, "loss": 0.4093, "step": 2883 }, { "epoch": 1.22, "grad_norm": 0.599723766340312, "learning_rate": 1.9520750301555574e-06, "loss": 0.4177, "step": 2884 }, { "epoch": 1.22, "grad_norm": 0.6655917345901515, "learning_rate": 1.9502419254298674e-06, "loss": 0.4105, "step": 2885 }, { "epoch": 1.22, "grad_norm": 0.5717968178433537, "learning_rate": 1.9484091312450577e-06, "loss": 0.4214, "step": 2886 }, { "epoch": 1.22, "grad_norm": 0.5844960834468691, "learning_rate": 1.9465766486364145e-06, "loss": 0.4249, "step": 2887 }, { "epoch": 1.22, "grad_norm": 0.5832680992228242, "learning_rate": 1.944744478639048e-06, "loss": 0.4095, "step": 2888 }, { "epoch": 1.22, "grad_norm": 0.5814549183663302, "learning_rate": 1.9429126222878954e-06, "loss": 0.4303, "step": 2889 }, { "epoch": 1.22, "grad_norm": 0.5589845040306947, "learning_rate": 1.9410810806177105e-06, "loss": 0.4008, "step": 2890 }, { "epoch": 1.22, "grad_norm": 0.5823353970607823, "learning_rate": 1.9392498546630767e-06, "loss": 0.4206, "step": 2891 }, { "epoch": 1.22, "grad_norm": 0.5773798607020071, "learning_rate": 1.937418945458393e-06, "loss": 0.4107, "step": 2892 }, { "epoch": 1.22, "grad_norm": 0.5758173233706595, "learning_rate": 1.935588354037883e-06, "loss": 0.4299, "step": 2893 }, { "epoch": 1.22, "grad_norm": 0.5823417001041828, "learning_rate": 1.9337580814355887e-06, "loss": 0.4109, "step": 2894 }, { "epoch": 1.22, "grad_norm": 0.5969616099880778, "learning_rate": 1.931928128685375e-06, "loss": 0.4005, "step": 2895 }, { "epoch": 1.22, "grad_norm": 0.5732522814857013, "learning_rate": 1.9300984968209215e-06, "loss": 0.4251, "step": 2896 }, { "epoch": 1.22, "grad_norm": 0.6361313903565526, "learning_rate": 1.928269186875731e-06, "loss": 0.4294, "step": 2897 }, { "epoch": 1.22, "grad_norm": 0.5936983346727575, "learning_rate": 1.9264401998831213e-06, "loss": 0.4503, "step": 2898 }, { "epoch": 1.22, "grad_norm": 0.566199487669491, "learning_rate": 1.9246115368762307e-06, "loss": 0.4267, "step": 2899 }, { "epoch": 1.22, "grad_norm": 0.5658880403316346, "learning_rate": 1.922783198888011e-06, "loss": 0.4211, "step": 2900 }, { "epoch": 1.23, "grad_norm": 0.5861694408434711, "learning_rate": 1.9209551869512326e-06, "loss": 0.4142, "step": 2901 }, { "epoch": 1.23, "grad_norm": 0.5861893981468904, "learning_rate": 1.919127502098483e-06, "loss": 0.4285, "step": 2902 }, { "epoch": 1.23, "grad_norm": 0.6057636807953072, "learning_rate": 1.9173001453621615e-06, "loss": 0.4334, "step": 2903 }, { "epoch": 1.23, "grad_norm": 0.5825991271880557, "learning_rate": 1.9154731177744858e-06, "loss": 0.4135, "step": 2904 }, { "epoch": 1.23, "grad_norm": 0.5685728541574715, "learning_rate": 1.913646420367483e-06, "loss": 0.4239, "step": 2905 }, { "epoch": 1.23, "grad_norm": 0.5435808297310615, "learning_rate": 1.911820054173e-06, "loss": 0.4092, "step": 2906 }, { "epoch": 1.23, "grad_norm": 0.5768278588163496, "learning_rate": 1.9099940202226895e-06, "loss": 0.4192, "step": 2907 }, { "epoch": 1.23, "grad_norm": 0.5528574586286196, "learning_rate": 1.908168319548023e-06, "loss": 0.4395, "step": 2908 }, { "epoch": 1.23, "grad_norm": 0.5460290788495442, "learning_rate": 1.9063429531802788e-06, "loss": 0.4089, "step": 2909 }, { "epoch": 1.23, "grad_norm": 0.5836425311067438, "learning_rate": 1.9045179221505497e-06, "loss": 0.4096, "step": 2910 }, { "epoch": 1.23, "grad_norm": 0.5368200883277836, "learning_rate": 1.902693227489737e-06, "loss": 0.4098, "step": 2911 }, { "epoch": 1.23, "eval_loss": 0.46161049604415894, "eval_runtime": 6937.863, "eval_samples_per_second": 41.856, "eval_steps_per_second": 2.093, "step": 2911 }, { "epoch": 1.23, "grad_norm": 0.5737532658966875, "learning_rate": 1.9008688702285532e-06, "loss": 0.4076, "step": 2912 }, { "epoch": 1.23, "grad_norm": 0.5929933734472119, "learning_rate": 1.899044851397519e-06, "loss": 0.386, "step": 2913 }, { "epoch": 1.23, "grad_norm": 0.5904060193816585, "learning_rate": 1.8972211720269657e-06, "loss": 0.4006, "step": 2914 }, { "epoch": 1.23, "grad_norm": 0.5744303339968841, "learning_rate": 1.8953978331470322e-06, "loss": 0.3955, "step": 2915 }, { "epoch": 1.23, "grad_norm": 0.5805107950321803, "learning_rate": 1.8935748357876626e-06, "loss": 0.3978, "step": 2916 }, { "epoch": 1.23, "grad_norm": 0.6035250400166989, "learning_rate": 1.8917521809786136e-06, "loss": 0.4243, "step": 2917 }, { "epoch": 1.23, "grad_norm": 0.5889359436095863, "learning_rate": 1.8899298697494413e-06, "loss": 0.4332, "step": 2918 }, { "epoch": 1.23, "grad_norm": 0.638150416673843, "learning_rate": 1.8881079031295147e-06, "loss": 0.4289, "step": 2919 }, { "epoch": 1.23, "grad_norm": 0.5711856241860922, "learning_rate": 1.8862862821480023e-06, "loss": 0.4198, "step": 2920 }, { "epoch": 1.23, "grad_norm": 0.5695997739109594, "learning_rate": 1.8844650078338818e-06, "loss": 0.4008, "step": 2921 }, { "epoch": 1.23, "grad_norm": 0.5593680674464713, "learning_rate": 1.8826440812159321e-06, "loss": 0.3921, "step": 2922 }, { "epoch": 1.23, "grad_norm": 0.5684210443299574, "learning_rate": 1.8808235033227378e-06, "loss": 0.4109, "step": 2923 }, { "epoch": 1.24, "grad_norm": 1.791352852330744, "learning_rate": 1.8790032751826839e-06, "loss": 0.4426, "step": 2924 }, { "epoch": 1.24, "grad_norm": 0.590981142715526, "learning_rate": 1.8771833978239615e-06, "loss": 0.4044, "step": 2925 }, { "epoch": 1.24, "grad_norm": 0.5929104028257183, "learning_rate": 1.8753638722745601e-06, "loss": 0.4251, "step": 2926 }, { "epoch": 1.24, "grad_norm": 0.6088394356034341, "learning_rate": 1.8735446995622719e-06, "loss": 0.422, "step": 2927 }, { "epoch": 1.24, "grad_norm": 0.5812034504925151, "learning_rate": 1.8717258807146918e-06, "loss": 0.4417, "step": 2928 }, { "epoch": 1.24, "grad_norm": 0.5634954353609875, "learning_rate": 1.8699074167592097e-06, "loss": 0.4221, "step": 2929 }, { "epoch": 1.24, "grad_norm": 0.5805870682157763, "learning_rate": 1.8680893087230207e-06, "loss": 0.4148, "step": 2930 }, { "epoch": 1.24, "grad_norm": 0.5978992039621629, "learning_rate": 1.866271557633115e-06, "loss": 0.427, "step": 2931 }, { "epoch": 1.24, "grad_norm": 0.5433676698516975, "learning_rate": 1.8644541645162834e-06, "loss": 0.4378, "step": 2932 }, { "epoch": 1.24, "grad_norm": 0.5654081584633399, "learning_rate": 1.862637130399112e-06, "loss": 0.4255, "step": 2933 }, { "epoch": 1.24, "grad_norm": 0.5745394540754084, "learning_rate": 1.8608204563079874e-06, "loss": 0.4138, "step": 2934 }, { "epoch": 1.24, "grad_norm": 0.5651328725524883, "learning_rate": 1.8590041432690895e-06, "loss": 0.4289, "step": 2935 }, { "epoch": 1.24, "grad_norm": 0.5543604900455075, "learning_rate": 1.8571881923083976e-06, "loss": 0.4098, "step": 2936 }, { "epoch": 1.24, "grad_norm": 0.5541075458786751, "learning_rate": 1.8553726044516835e-06, "loss": 0.405, "step": 2937 }, { "epoch": 1.24, "grad_norm": 0.5662395052454143, "learning_rate": 1.8535573807245155e-06, "loss": 0.398, "step": 2938 }, { "epoch": 1.24, "grad_norm": 0.5415458339050451, "learning_rate": 1.8517425221522555e-06, "loss": 0.3944, "step": 2939 }, { "epoch": 1.24, "grad_norm": 0.5975828808599232, "learning_rate": 1.8499280297600594e-06, "loss": 0.4102, "step": 2940 }, { "epoch": 1.24, "grad_norm": 0.5586543614587621, "learning_rate": 1.848113904572878e-06, "loss": 0.4071, "step": 2941 }, { "epoch": 1.24, "grad_norm": 0.5462110270977989, "learning_rate": 1.8463001476154508e-06, "loss": 0.4151, "step": 2942 }, { "epoch": 1.24, "grad_norm": 0.5750571044180495, "learning_rate": 1.844486759912313e-06, "loss": 0.4111, "step": 2943 }, { "epoch": 1.24, "grad_norm": 0.5670618407119518, "learning_rate": 1.8426737424877883e-06, "loss": 0.4188, "step": 2944 }, { "epoch": 1.24, "grad_norm": 0.5916977571948979, "learning_rate": 1.840861096365995e-06, "loss": 0.4323, "step": 2945 }, { "epoch": 1.24, "grad_norm": 0.5723973722291331, "learning_rate": 1.8390488225708364e-06, "loss": 0.4216, "step": 2946 }, { "epoch": 1.24, "grad_norm": 0.5788397104566307, "learning_rate": 1.8372369221260106e-06, "loss": 0.4185, "step": 2947 }, { "epoch": 1.25, "grad_norm": 0.605618630484289, "learning_rate": 1.8354253960550017e-06, "loss": 0.4073, "step": 2948 }, { "epoch": 1.25, "grad_norm": 0.5986751285211731, "learning_rate": 1.833614245381084e-06, "loss": 0.4254, "step": 2949 }, { "epoch": 1.25, "grad_norm": 0.5555679218850047, "learning_rate": 1.8318034711273181e-06, "loss": 0.4029, "step": 2950 }, { "epoch": 1.25, "grad_norm": 0.5541685467504466, "learning_rate": 1.8299930743165537e-06, "loss": 0.3933, "step": 2951 }, { "epoch": 1.25, "grad_norm": 0.5797747508458493, "learning_rate": 1.8281830559714248e-06, "loss": 0.4167, "step": 2952 }, { "epoch": 1.25, "grad_norm": 0.5724155369928727, "learning_rate": 1.8263734171143552e-06, "loss": 0.4065, "step": 2953 }, { "epoch": 1.25, "grad_norm": 0.5842672183423349, "learning_rate": 1.8245641587675523e-06, "loss": 0.4193, "step": 2954 }, { "epoch": 1.25, "grad_norm": 0.5670445430376003, "learning_rate": 1.822755281953007e-06, "loss": 0.4362, "step": 2955 }, { "epoch": 1.25, "grad_norm": 0.5785669095113156, "learning_rate": 1.8209467876924992e-06, "loss": 0.4115, "step": 2956 }, { "epoch": 1.25, "grad_norm": 0.5942980368738244, "learning_rate": 1.8191386770075863e-06, "loss": 0.4273, "step": 2957 }, { "epoch": 1.25, "grad_norm": 0.5928392121690472, "learning_rate": 1.8173309509196158e-06, "loss": 0.4224, "step": 2958 }, { "epoch": 1.25, "grad_norm": 0.5922420607132926, "learning_rate": 1.8155236104497128e-06, "loss": 0.4394, "step": 2959 }, { "epoch": 1.25, "grad_norm": 0.6223806379306809, "learning_rate": 1.813716656618788e-06, "loss": 0.434, "step": 2960 }, { "epoch": 1.25, "grad_norm": 0.5845357998232491, "learning_rate": 1.8119100904475306e-06, "loss": 0.441, "step": 2961 }, { "epoch": 1.25, "grad_norm": 0.5576654968046494, "learning_rate": 1.8101039129564142e-06, "loss": 0.4018, "step": 2962 }, { "epoch": 1.25, "grad_norm": 0.5734931825622845, "learning_rate": 1.8082981251656887e-06, "loss": 0.4122, "step": 2963 }, { "epoch": 1.25, "grad_norm": 0.5992264311549484, "learning_rate": 1.8064927280953893e-06, "loss": 0.4426, "step": 2964 }, { "epoch": 1.25, "grad_norm": 0.5768107642343978, "learning_rate": 1.8046877227653248e-06, "loss": 0.388, "step": 2965 }, { "epoch": 1.25, "grad_norm": 0.5821336521989418, "learning_rate": 1.8028831101950866e-06, "loss": 0.4245, "step": 2966 }, { "epoch": 1.25, "grad_norm": 0.5390570291656819, "learning_rate": 1.8010788914040444e-06, "loss": 0.3867, "step": 2967 }, { "epoch": 1.25, "grad_norm": 0.5704847089539564, "learning_rate": 1.7992750674113414e-06, "loss": 0.4278, "step": 2968 }, { "epoch": 1.25, "grad_norm": 0.6111690436052009, "learning_rate": 1.7974716392359026e-06, "loss": 0.4479, "step": 2969 }, { "epoch": 1.25, "grad_norm": 0.5802046152902072, "learning_rate": 1.7956686078964257e-06, "loss": 0.4298, "step": 2970 }, { "epoch": 1.26, "grad_norm": 0.5455052953532722, "learning_rate": 1.793865974411388e-06, "loss": 0.4181, "step": 2971 }, { "epoch": 1.26, "grad_norm": 0.5686949830786789, "learning_rate": 1.7920637397990373e-06, "loss": 0.4203, "step": 2972 }, { "epoch": 1.26, "grad_norm": 0.5957176939693254, "learning_rate": 1.7902619050774006e-06, "loss": 0.4026, "step": 2973 }, { "epoch": 1.26, "grad_norm": 0.6022161318571446, "learning_rate": 1.788460471264276e-06, "loss": 0.3995, "step": 2974 }, { "epoch": 1.26, "grad_norm": 0.5831579158285504, "learning_rate": 1.7866594393772375e-06, "loss": 0.4035, "step": 2975 }, { "epoch": 1.26, "grad_norm": 0.5801852393556498, "learning_rate": 1.7848588104336293e-06, "loss": 0.4123, "step": 2976 }, { "epoch": 1.26, "grad_norm": 0.5961588266575963, "learning_rate": 1.783058585450571e-06, "loss": 0.4317, "step": 2977 }, { "epoch": 1.26, "grad_norm": 0.573944779319003, "learning_rate": 1.781258765444951e-06, "loss": 0.4344, "step": 2978 }, { "epoch": 1.26, "grad_norm": 0.5879109677820142, "learning_rate": 1.7794593514334313e-06, "loss": 0.4182, "step": 2979 }, { "epoch": 1.26, "grad_norm": 0.5888697412742766, "learning_rate": 1.7776603444324445e-06, "loss": 0.4119, "step": 2980 }, { "epoch": 1.26, "grad_norm": 0.570679582164104, "learning_rate": 1.775861745458191e-06, "loss": 0.4411, "step": 2981 }, { "epoch": 1.26, "grad_norm": 0.5775235217718448, "learning_rate": 1.774063555526644e-06, "loss": 0.4003, "step": 2982 }, { "epoch": 1.26, "eval_loss": 0.4607752859592438, "eval_runtime": 6938.4518, "eval_samples_per_second": 41.852, "eval_steps_per_second": 2.093, "step": 2982 }, { "epoch": 1.26, "grad_norm": 0.5772173427303933, "learning_rate": 1.7722657756535422e-06, "loss": 0.4292, "step": 2983 }, { "epoch": 1.26, "grad_norm": 0.5620226835290322, "learning_rate": 1.7704684068543953e-06, "loss": 0.396, "step": 2984 }, { "epoch": 1.26, "grad_norm": 0.6017908973225675, "learning_rate": 1.7686714501444791e-06, "loss": 0.4228, "step": 2985 }, { "epoch": 1.26, "grad_norm": 0.5842773295597439, "learning_rate": 1.7668749065388385e-06, "loss": 0.4235, "step": 2986 }, { "epoch": 1.26, "grad_norm": 0.5881893778638159, "learning_rate": 1.7650787770522831e-06, "loss": 0.3885, "step": 2987 }, { "epoch": 1.26, "grad_norm": 0.5921874088208304, "learning_rate": 1.76328306269939e-06, "loss": 0.4118, "step": 2988 }, { "epoch": 1.26, "grad_norm": 0.5860478876629458, "learning_rate": 1.7614877644945002e-06, "loss": 0.4259, "step": 2989 }, { "epoch": 1.26, "grad_norm": 0.5566234951598134, "learning_rate": 1.759692883451721e-06, "loss": 0.4062, "step": 2990 }, { "epoch": 1.26, "grad_norm": 0.5886058120181612, "learning_rate": 1.757898420584925e-06, "loss": 0.4157, "step": 2991 }, { "epoch": 1.26, "grad_norm": 0.5835204891594994, "learning_rate": 1.756104376907746e-06, "loss": 0.4287, "step": 2992 }, { "epoch": 1.26, "grad_norm": 0.5820795493170282, "learning_rate": 1.7543107534335828e-06, "loss": 0.4224, "step": 2993 }, { "epoch": 1.26, "grad_norm": 0.6013433453081255, "learning_rate": 1.752517551175596e-06, "loss": 0.4354, "step": 2994 }, { "epoch": 1.27, "grad_norm": 0.6033893634449149, "learning_rate": 1.750724771146709e-06, "loss": 0.4286, "step": 2995 }, { "epoch": 1.27, "grad_norm": 0.54509644376269, "learning_rate": 1.748932414359605e-06, "loss": 0.4082, "step": 2996 }, { "epoch": 1.27, "grad_norm": 0.5826151129215978, "learning_rate": 1.7471404818267319e-06, "loss": 0.4051, "step": 2997 }, { "epoch": 1.27, "grad_norm": 0.5880061063869555, "learning_rate": 1.745348974560293e-06, "loss": 0.4386, "step": 2998 }, { "epoch": 1.27, "grad_norm": 0.5656468304808548, "learning_rate": 1.743557893572256e-06, "loss": 0.4146, "step": 2999 }, { "epoch": 1.27, "grad_norm": 0.5689192538618583, "learning_rate": 1.741767239874344e-06, "loss": 0.4103, "step": 3000 }, { "epoch": 1.27, "grad_norm": 0.5840118133437147, "learning_rate": 1.7399770144780414e-06, "loss": 0.4199, "step": 3001 }, { "epoch": 1.27, "grad_norm": 0.5550874833815063, "learning_rate": 1.7381872183945885e-06, "loss": 0.3935, "step": 3002 }, { "epoch": 1.27, "grad_norm": 0.5731500465902329, "learning_rate": 1.7363978526349857e-06, "loss": 0.4307, "step": 3003 }, { "epoch": 1.27, "grad_norm": 0.5579055837980196, "learning_rate": 1.734608918209989e-06, "loss": 0.4152, "step": 3004 }, { "epoch": 1.27, "grad_norm": 0.5619015288206511, "learning_rate": 1.7328204161301084e-06, "loss": 0.4257, "step": 3005 }, { "epoch": 1.27, "grad_norm": 0.5685518518480854, "learning_rate": 1.7310323474056154e-06, "loss": 0.4292, "step": 3006 }, { "epoch": 1.27, "grad_norm": 0.572223039710208, "learning_rate": 1.7292447130465296e-06, "loss": 0.4027, "step": 3007 }, { "epoch": 1.27, "grad_norm": 0.5923422187875147, "learning_rate": 1.7274575140626318e-06, "loss": 0.4185, "step": 3008 }, { "epoch": 1.27, "grad_norm": 0.578248418530715, "learning_rate": 1.7256707514634521e-06, "loss": 0.4225, "step": 3009 }, { "epoch": 1.27, "grad_norm": 0.5738266276609935, "learning_rate": 1.723884426258277e-06, "loss": 0.4073, "step": 3010 }, { "epoch": 1.27, "grad_norm": 0.5804211498201226, "learning_rate": 1.7220985394561445e-06, "loss": 0.4253, "step": 3011 }, { "epoch": 1.27, "grad_norm": 0.5703132294176713, "learning_rate": 1.7203130920658457e-06, "loss": 0.4127, "step": 3012 }, { "epoch": 1.27, "grad_norm": 0.580926877910347, "learning_rate": 1.7185280850959215e-06, "loss": 0.4152, "step": 3013 }, { "epoch": 1.27, "grad_norm": 0.5807574727618313, "learning_rate": 1.7167435195546683e-06, "loss": 0.4133, "step": 3014 }, { "epoch": 1.27, "grad_norm": 0.5834413664173587, "learning_rate": 1.7149593964501285e-06, "loss": 0.4132, "step": 3015 }, { "epoch": 1.27, "grad_norm": 0.592960565681535, "learning_rate": 1.7131757167900966e-06, "loss": 0.4159, "step": 3016 }, { "epoch": 1.27, "grad_norm": 0.5734356639268737, "learning_rate": 1.711392481582119e-06, "loss": 0.4037, "step": 3017 }, { "epoch": 1.28, "grad_norm": 0.608289566001257, "learning_rate": 1.7096096918334853e-06, "loss": 0.4225, "step": 3018 }, { "epoch": 1.28, "grad_norm": 0.5553937161005909, "learning_rate": 1.7078273485512392e-06, "loss": 0.428, "step": 3019 }, { "epoch": 1.28, "grad_norm": 0.5800080754367256, "learning_rate": 1.7060454527421688e-06, "loss": 0.4497, "step": 3020 }, { "epoch": 1.28, "grad_norm": 0.5956520471145508, "learning_rate": 1.7042640054128112e-06, "loss": 0.4023, "step": 3021 }, { "epoch": 1.28, "grad_norm": 0.5513802042755411, "learning_rate": 1.7024830075694483e-06, "loss": 0.4057, "step": 3022 }, { "epoch": 1.28, "grad_norm": 0.5663784002610377, "learning_rate": 1.7007024602181105e-06, "loss": 0.4072, "step": 3023 }, { "epoch": 1.28, "grad_norm": 0.5728768465657827, "learning_rate": 1.6989223643645706e-06, "loss": 0.4027, "step": 3024 }, { "epoch": 1.28, "grad_norm": 0.5540154660402091, "learning_rate": 1.6971427210143503e-06, "loss": 0.417, "step": 3025 }, { "epoch": 1.28, "grad_norm": 0.580580249925815, "learning_rate": 1.6953635311727126e-06, "loss": 0.4203, "step": 3026 }, { "epoch": 1.28, "grad_norm": 0.6204069176031851, "learning_rate": 1.6935847958446657e-06, "loss": 0.438, "step": 3027 }, { "epoch": 1.28, "grad_norm": 0.5536941783725637, "learning_rate": 1.6918065160349604e-06, "loss": 0.4255, "step": 3028 }, { "epoch": 1.28, "grad_norm": 0.569556045523807, "learning_rate": 1.6900286927480898e-06, "loss": 0.4122, "step": 3029 }, { "epoch": 1.28, "grad_norm": 0.5546113410095393, "learning_rate": 1.6882513269882916e-06, "loss": 0.4194, "step": 3030 }, { "epoch": 1.28, "grad_norm": 0.5749490985614878, "learning_rate": 1.6864744197595418e-06, "loss": 0.4213, "step": 3031 }, { "epoch": 1.28, "grad_norm": 0.5849700553588103, "learning_rate": 1.68469797206556e-06, "loss": 0.414, "step": 3032 }, { "epoch": 1.28, "grad_norm": 0.5642184593848872, "learning_rate": 1.6829219849098035e-06, "loss": 0.4217, "step": 3033 }, { "epoch": 1.28, "grad_norm": 0.5928018316436391, "learning_rate": 1.681146459295473e-06, "loss": 0.4244, "step": 3034 }, { "epoch": 1.28, "grad_norm": 0.591820965440071, "learning_rate": 1.6793713962255043e-06, "loss": 0.4609, "step": 3035 }, { "epoch": 1.28, "grad_norm": 0.5886651969708854, "learning_rate": 1.6775967967025764e-06, "loss": 0.418, "step": 3036 }, { "epoch": 1.28, "grad_norm": 0.563924203947021, "learning_rate": 1.675822661729103e-06, "loss": 0.3846, "step": 3037 }, { "epoch": 1.28, "grad_norm": 0.5882580889291531, "learning_rate": 1.674048992307237e-06, "loss": 0.4225, "step": 3038 }, { "epoch": 1.28, "grad_norm": 0.5609824212530277, "learning_rate": 1.6722757894388675e-06, "loss": 0.3968, "step": 3039 }, { "epoch": 1.28, "grad_norm": 0.5783821889365726, "learning_rate": 1.6705030541256211e-06, "loss": 0.4044, "step": 3040 }, { "epoch": 1.29, "grad_norm": 0.5904279554578172, "learning_rate": 1.6687307873688583e-06, "loss": 0.4334, "step": 3041 }, { "epoch": 1.29, "grad_norm": 0.6379074710732456, "learning_rate": 1.6669589901696778e-06, "loss": 0.4264, "step": 3042 }, { "epoch": 1.29, "grad_norm": 0.5522942159204164, "learning_rate": 1.665187663528912e-06, "loss": 0.4256, "step": 3043 }, { "epoch": 1.29, "grad_norm": 0.5665275799582181, "learning_rate": 1.6634168084471252e-06, "loss": 0.4099, "step": 3044 }, { "epoch": 1.29, "grad_norm": 0.5800540818159535, "learning_rate": 1.661646425924619e-06, "loss": 0.4272, "step": 3045 }, { "epoch": 1.29, "grad_norm": 0.59834302679002, "learning_rate": 1.6598765169614245e-06, "loss": 0.417, "step": 3046 }, { "epoch": 1.29, "grad_norm": 0.5425819631570858, "learning_rate": 1.6581070825573093e-06, "loss": 0.4123, "step": 3047 }, { "epoch": 1.29, "grad_norm": 0.549947011538962, "learning_rate": 1.6563381237117688e-06, "loss": 0.4141, "step": 3048 }, { "epoch": 1.29, "grad_norm": 0.5708925242028148, "learning_rate": 1.6545696414240326e-06, "loss": 0.4306, "step": 3049 }, { "epoch": 1.29, "grad_norm": 0.5742846219284804, "learning_rate": 1.6528016366930594e-06, "loss": 0.4147, "step": 3050 }, { "epoch": 1.29, "grad_norm": 0.6145663624908198, "learning_rate": 1.6510341105175401e-06, "loss": 0.4262, "step": 3051 }, { "epoch": 1.29, "grad_norm": 0.6072916721819851, "learning_rate": 1.6492670638958924e-06, "loss": 0.441, "step": 3052 }, { "epoch": 1.29, "grad_norm": 0.5834453619650581, "learning_rate": 1.647500497826267e-06, "loss": 0.4159, "step": 3053 }, { "epoch": 1.29, "eval_loss": 0.4601620137691498, "eval_runtime": 6944.585, "eval_samples_per_second": 41.815, "eval_steps_per_second": 2.091, "step": 3053 }, { "epoch": 1.29, "grad_norm": 0.558687925184084, "learning_rate": 1.6457344133065395e-06, "loss": 0.4053, "step": 3054 }, { "epoch": 1.29, "grad_norm": 0.5674610161351168, "learning_rate": 1.643968811334315e-06, "loss": 0.417, "step": 3055 }, { "epoch": 1.29, "grad_norm": 0.5640253416506891, "learning_rate": 1.642203692906927e-06, "loss": 0.3994, "step": 3056 }, { "epoch": 1.29, "grad_norm": 0.5508590652466895, "learning_rate": 1.640439059021433e-06, "loss": 0.4221, "step": 3057 }, { "epoch": 1.29, "grad_norm": 0.5938928064517502, "learning_rate": 1.6386749106746214e-06, "loss": 0.4267, "step": 3058 }, { "epoch": 1.29, "grad_norm": 0.5570296609089503, "learning_rate": 1.6369112488630009e-06, "loss": 0.4032, "step": 3059 }, { "epoch": 1.29, "grad_norm": 0.5854299661276051, "learning_rate": 1.6351480745828098e-06, "loss": 0.4095, "step": 3060 }, { "epoch": 1.29, "grad_norm": 0.559667720654985, "learning_rate": 1.6333853888300083e-06, "loss": 0.4221, "step": 3061 }, { "epoch": 1.29, "grad_norm": 0.5735241871511867, "learning_rate": 1.6316231926002823e-06, "loss": 0.395, "step": 3062 }, { "epoch": 1.29, "grad_norm": 0.5416537755969776, "learning_rate": 1.629861486889039e-06, "loss": 0.3963, "step": 3063 }, { "epoch": 1.29, "grad_norm": 0.5673587035921341, "learning_rate": 1.6281002726914125e-06, "loss": 0.4109, "step": 3064 }, { "epoch": 1.3, "grad_norm": 0.5433392636524627, "learning_rate": 1.6263395510022546e-06, "loss": 0.4001, "step": 3065 }, { "epoch": 1.3, "grad_norm": 0.5550782192229988, "learning_rate": 1.6245793228161421e-06, "loss": 0.4045, "step": 3066 }, { "epoch": 1.3, "grad_norm": 0.5861247355677058, "learning_rate": 1.622819589127372e-06, "loss": 0.4346, "step": 3067 }, { "epoch": 1.3, "grad_norm": 0.607146634130001, "learning_rate": 1.6210603509299604e-06, "loss": 0.4115, "step": 3068 }, { "epoch": 1.3, "grad_norm": 0.5562854235248, "learning_rate": 1.6193016092176484e-06, "loss": 0.4203, "step": 3069 }, { "epoch": 1.3, "grad_norm": 0.5627927077991777, "learning_rate": 1.6175433649838901e-06, "loss": 0.4262, "step": 3070 }, { "epoch": 1.3, "grad_norm": 0.5750081673543436, "learning_rate": 1.615785619221864e-06, "loss": 0.3833, "step": 3071 }, { "epoch": 1.3, "grad_norm": 0.5936159776999882, "learning_rate": 1.6140283729244638e-06, "loss": 0.4203, "step": 3072 }, { "epoch": 1.3, "grad_norm": 0.5718250335162808, "learning_rate": 1.6122716270843025e-06, "loss": 0.4065, "step": 3073 }, { "epoch": 1.3, "grad_norm": 0.5771598792199492, "learning_rate": 1.6105153826937087e-06, "loss": 0.4185, "step": 3074 }, { "epoch": 1.3, "grad_norm": 0.5759622276853105, "learning_rate": 1.6087596407447314e-06, "loss": 0.42, "step": 3075 }, { "epoch": 1.3, "grad_norm": 0.611717304812912, "learning_rate": 1.607004402229132e-06, "loss": 0.4072, "step": 3076 }, { "epoch": 1.3, "grad_norm": 0.5872035309615434, "learning_rate": 1.60524966813839e-06, "loss": 0.4354, "step": 3077 }, { "epoch": 1.3, "grad_norm": 0.5811950216335751, "learning_rate": 1.6034954394636977e-06, "loss": 0.4073, "step": 3078 }, { "epoch": 1.3, "grad_norm": 0.5706512241287943, "learning_rate": 1.6017417171959643e-06, "loss": 0.4203, "step": 3079 }, { "epoch": 1.3, "grad_norm": 0.5826675112936385, "learning_rate": 1.5999885023258099e-06, "loss": 0.3995, "step": 3080 }, { "epoch": 1.3, "grad_norm": 0.5682279192683956, "learning_rate": 1.5982357958435723e-06, "loss": 0.3878, "step": 3081 }, { "epoch": 1.3, "grad_norm": 0.5713141319558672, "learning_rate": 1.5964835987392991e-06, "loss": 0.4014, "step": 3082 }, { "epoch": 1.3, "grad_norm": 0.5747223757155583, "learning_rate": 1.59473191200275e-06, "loss": 0.4146, "step": 3083 }, { "epoch": 1.3, "grad_norm": 0.5809784351004627, "learning_rate": 1.5929807366233979e-06, "loss": 0.4445, "step": 3084 }, { "epoch": 1.3, "grad_norm": 0.5632117792726165, "learning_rate": 1.5912300735904252e-06, "loss": 0.3946, "step": 3085 }, { "epoch": 1.3, "grad_norm": 0.581279032610451, "learning_rate": 1.5894799238927277e-06, "loss": 0.395, "step": 3086 }, { "epoch": 1.3, "grad_norm": 0.5877431541038257, "learning_rate": 1.5877302885189077e-06, "loss": 0.4212, "step": 3087 }, { "epoch": 1.31, "grad_norm": 0.6178627095748951, "learning_rate": 1.5859811684572796e-06, "loss": 0.4237, "step": 3088 }, { "epoch": 1.31, "grad_norm": 0.5791179237596596, "learning_rate": 1.584232564695865e-06, "loss": 0.4243, "step": 3089 }, { "epoch": 1.31, "grad_norm": 0.5657862133711392, "learning_rate": 1.5824844782223956e-06, "loss": 0.4234, "step": 3090 }, { "epoch": 1.31, "grad_norm": 0.5767524507801428, "learning_rate": 1.5807369100243084e-06, "loss": 0.4211, "step": 3091 }, { "epoch": 1.31, "grad_norm": 0.5697331723482661, "learning_rate": 1.578989861088751e-06, "loss": 0.414, "step": 3092 }, { "epoch": 1.31, "grad_norm": 0.5476753293523711, "learning_rate": 1.5772433324025748e-06, "loss": 0.406, "step": 3093 }, { "epoch": 1.31, "grad_norm": 0.5561716385297626, "learning_rate": 1.5754973249523387e-06, "loss": 0.4155, "step": 3094 }, { "epoch": 1.31, "grad_norm": 0.572512956688457, "learning_rate": 1.5737518397243074e-06, "loss": 0.4286, "step": 3095 }, { "epoch": 1.31, "grad_norm": 0.5939672288238211, "learning_rate": 1.5720068777044479e-06, "loss": 0.4276, "step": 3096 }, { "epoch": 1.31, "grad_norm": 0.5966500948903306, "learning_rate": 1.570262439878437e-06, "loss": 0.4211, "step": 3097 }, { "epoch": 1.31, "grad_norm": 0.5952090362230588, "learning_rate": 1.56851852723165e-06, "loss": 0.4447, "step": 3098 }, { "epoch": 1.31, "grad_norm": 0.5673871145350723, "learning_rate": 1.5667751407491689e-06, "loss": 0.4408, "step": 3099 }, { "epoch": 1.31, "grad_norm": 0.5663044725908228, "learning_rate": 1.5650322814157764e-06, "loss": 0.4219, "step": 3100 }, { "epoch": 1.31, "grad_norm": 0.5902068842643253, "learning_rate": 1.5632899502159594e-06, "loss": 0.4104, "step": 3101 }, { "epoch": 1.31, "grad_norm": 0.5961011802361124, "learning_rate": 1.561548148133904e-06, "loss": 0.3999, "step": 3102 }, { "epoch": 1.31, "grad_norm": 0.5676333875668861, "learning_rate": 1.559806876153501e-06, "loss": 0.42, "step": 3103 }, { "epoch": 1.31, "grad_norm": 0.5717204537217901, "learning_rate": 1.5580661352583377e-06, "loss": 0.3927, "step": 3104 }, { "epoch": 1.31, "grad_norm": 0.5932365555296837, "learning_rate": 1.5563259264317048e-06, "loss": 0.4069, "step": 3105 }, { "epoch": 1.31, "grad_norm": 0.60609083336309, "learning_rate": 1.55458625065659e-06, "loss": 0.4232, "step": 3106 }, { "epoch": 1.31, "grad_norm": 0.5568592972383142, "learning_rate": 1.5528471089156805e-06, "loss": 0.4035, "step": 3107 }, { "epoch": 1.31, "grad_norm": 0.5811068115782658, "learning_rate": 1.5511085021913644e-06, "loss": 0.4381, "step": 3108 }, { "epoch": 1.31, "grad_norm": 0.5978207369401186, "learning_rate": 1.5493704314657232e-06, "loss": 0.438, "step": 3109 }, { "epoch": 1.31, "grad_norm": 0.5720438095205538, "learning_rate": 1.5476328977205396e-06, "loss": 0.4279, "step": 3110 }, { "epoch": 1.31, "grad_norm": 0.5893474852498958, "learning_rate": 1.5458959019372893e-06, "loss": 0.431, "step": 3111 }, { "epoch": 1.32, "grad_norm": 0.5490466454197072, "learning_rate": 1.544159445097148e-06, "loss": 0.4092, "step": 3112 }, { "epoch": 1.32, "grad_norm": 0.5384899530575747, "learning_rate": 1.542423528180983e-06, "loss": 0.3936, "step": 3113 }, { "epoch": 1.32, "grad_norm": 0.5850096033351098, "learning_rate": 1.5406881521693606e-06, "loss": 0.4019, "step": 3114 }, { "epoch": 1.32, "grad_norm": 0.587195704302166, "learning_rate": 1.5389533180425387e-06, "loss": 0.4144, "step": 3115 }, { "epoch": 1.32, "grad_norm": 0.5635366537239104, "learning_rate": 1.5372190267804704e-06, "loss": 0.4129, "step": 3116 }, { "epoch": 1.32, "grad_norm": 0.5718044990589992, "learning_rate": 1.5354852793628007e-06, "loss": 0.4009, "step": 3117 }, { "epoch": 1.32, "grad_norm": 0.5926450332610386, "learning_rate": 1.53375207676887e-06, "loss": 0.4348, "step": 3118 }, { "epoch": 1.32, "grad_norm": 0.564012302965777, "learning_rate": 1.5320194199777078e-06, "loss": 0.4008, "step": 3119 }, { "epoch": 1.32, "grad_norm": 0.5881564721462378, "learning_rate": 1.5302873099680378e-06, "loss": 0.4116, "step": 3120 }, { "epoch": 1.32, "grad_norm": 0.5805570882385258, "learning_rate": 1.5285557477182744e-06, "loss": 0.3955, "step": 3121 }, { "epoch": 1.32, "grad_norm": 0.601026663572398, "learning_rate": 1.5268247342065215e-06, "loss": 0.4222, "step": 3122 }, { "epoch": 1.32, "grad_norm": 0.5688916955974609, "learning_rate": 1.525094270410574e-06, "loss": 0.4077, "step": 3123 }, { "epoch": 1.32, "grad_norm": 0.6376814875861784, "learning_rate": 1.5233643573079148e-06, "loss": 0.4198, "step": 3124 }, { "epoch": 1.32, "eval_loss": 0.45952433347702026, "eval_runtime": 6938.4565, "eval_samples_per_second": 41.852, "eval_steps_per_second": 2.093, "step": 3124 }, { "epoch": 1.32, "grad_norm": 0.5675589935533134, "learning_rate": 1.5216349958757187e-06, "loss": 0.4276, "step": 3125 }, { "epoch": 1.32, "grad_norm": 0.5819253780607699, "learning_rate": 1.5199061870908457e-06, "loss": 0.4135, "step": 3126 }, { "epoch": 1.32, "grad_norm": 0.5473928976130037, "learning_rate": 1.518177931929846e-06, "loss": 0.39, "step": 3127 }, { "epoch": 1.32, "grad_norm": 0.5668895033063771, "learning_rate": 1.516450231368955e-06, "loss": 0.4145, "step": 3128 }, { "epoch": 1.32, "grad_norm": 0.5566976729596672, "learning_rate": 1.5147230863840968e-06, "loss": 0.417, "step": 3129 }, { "epoch": 1.32, "grad_norm": 0.5599445867608915, "learning_rate": 1.5129964979508792e-06, "loss": 0.4049, "step": 3130 }, { "epoch": 1.32, "grad_norm": 0.5689525691882638, "learning_rate": 1.5112704670445994e-06, "loss": 0.414, "step": 3131 }, { "epoch": 1.32, "grad_norm": 0.5745693019904166, "learning_rate": 1.509544994640236e-06, "loss": 0.415, "step": 3132 }, { "epoch": 1.32, "grad_norm": 0.5703134113139972, "learning_rate": 1.507820081712454e-06, "loss": 0.4202, "step": 3133 }, { "epoch": 1.32, "grad_norm": 0.5798680593678225, "learning_rate": 1.5060957292356021e-06, "loss": 0.4224, "step": 3134 }, { "epoch": 1.33, "grad_norm": 0.5634109776233843, "learning_rate": 1.5043719381837113e-06, "loss": 0.4165, "step": 3135 }, { "epoch": 1.33, "grad_norm": 0.6007001621304139, "learning_rate": 1.5026487095304982e-06, "loss": 0.4143, "step": 3136 }, { "epoch": 1.33, "grad_norm": 0.5816452933961378, "learning_rate": 1.5009260442493582e-06, "loss": 0.3987, "step": 3137 }, { "epoch": 1.33, "grad_norm": 0.5520751522048931, "learning_rate": 1.4992039433133715e-06, "loss": 0.4151, "step": 3138 }, { "epoch": 1.33, "grad_norm": 0.6053997190844257, "learning_rate": 1.497482407695297e-06, "loss": 0.4311, "step": 3139 }, { "epoch": 1.33, "grad_norm": 0.5574155007809499, "learning_rate": 1.495761438367577e-06, "loss": 0.3934, "step": 3140 }, { "epoch": 1.33, "grad_norm": 0.5612828220380264, "learning_rate": 1.4940410363023306e-06, "loss": 0.3898, "step": 3141 }, { "epoch": 1.33, "grad_norm": 0.5900288585719455, "learning_rate": 1.4923212024713602e-06, "loss": 0.4369, "step": 3142 }, { "epoch": 1.33, "grad_norm": 0.5818357215539803, "learning_rate": 1.4906019378461437e-06, "loss": 0.4448, "step": 3143 }, { "epoch": 1.33, "grad_norm": 0.553139692347851, "learning_rate": 1.4888832433978403e-06, "loss": 0.426, "step": 3144 }, { "epoch": 1.33, "grad_norm": 0.5952446958459735, "learning_rate": 1.4871651200972854e-06, "loss": 0.4246, "step": 3145 }, { "epoch": 1.33, "grad_norm": 0.6187484191420308, "learning_rate": 1.485447568914991e-06, "loss": 0.4145, "step": 3146 }, { "epoch": 1.33, "grad_norm": 0.6002358417607456, "learning_rate": 1.4837305908211502e-06, "loss": 0.407, "step": 3147 }, { "epoch": 1.33, "grad_norm": 0.55885003361605, "learning_rate": 1.4820141867856268e-06, "loss": 0.3977, "step": 3148 }, { "epoch": 1.33, "grad_norm": 0.5703620407770618, "learning_rate": 1.4802983577779651e-06, "loss": 0.4402, "step": 3149 }, { "epoch": 1.33, "grad_norm": 0.5752264024437074, "learning_rate": 1.47858310476738e-06, "loss": 0.4081, "step": 3150 }, { "epoch": 1.33, "grad_norm": 0.587484294352918, "learning_rate": 1.4768684287227652e-06, "loss": 0.4258, "step": 3151 }, { "epoch": 1.33, "grad_norm": 0.577277316397856, "learning_rate": 1.4751543306126856e-06, "loss": 0.4076, "step": 3152 }, { "epoch": 1.33, "grad_norm": 0.5585293413380346, "learning_rate": 1.4734408114053822e-06, "loss": 0.4066, "step": 3153 }, { "epoch": 1.33, "grad_norm": 0.5996038294832453, "learning_rate": 1.471727872068766e-06, "loss": 0.413, "step": 3154 }, { "epoch": 1.33, "grad_norm": 0.5908728837224524, "learning_rate": 1.470015513570424e-06, "loss": 0.4135, "step": 3155 }, { "epoch": 1.33, "grad_norm": 0.5973053416179713, "learning_rate": 1.468303736877611e-06, "loss": 0.4216, "step": 3156 }, { "epoch": 1.33, "grad_norm": 0.5783789948908563, "learning_rate": 1.466592542957257e-06, "loss": 0.4292, "step": 3157 }, { "epoch": 1.34, "grad_norm": 0.549412909563158, "learning_rate": 1.4648819327759589e-06, "loss": 0.4151, "step": 3158 }, { "epoch": 1.34, "grad_norm": 0.5638177225917886, "learning_rate": 1.4631719072999884e-06, "loss": 0.43, "step": 3159 }, { "epoch": 1.34, "grad_norm": 0.5811841444119168, "learning_rate": 1.4614624674952843e-06, "loss": 0.4199, "step": 3160 }, { "epoch": 1.34, "grad_norm": 0.5745757615240445, "learning_rate": 1.4597536143274537e-06, "loss": 0.4223, "step": 3161 }, { "epoch": 1.34, "grad_norm": 0.5695416398603609, "learning_rate": 1.4580453487617747e-06, "loss": 0.403, "step": 3162 }, { "epoch": 1.34, "grad_norm": 0.5643072461632591, "learning_rate": 1.4563376717631906e-06, "loss": 0.4024, "step": 3163 }, { "epoch": 1.34, "grad_norm": 0.5855374862753647, "learning_rate": 1.4546305842963156e-06, "loss": 0.4169, "step": 3164 }, { "epoch": 1.34, "grad_norm": 0.5640742939680888, "learning_rate": 1.452924087325428e-06, "loss": 0.3986, "step": 3165 }, { "epoch": 1.34, "grad_norm": 0.5854422692893917, "learning_rate": 1.4512181818144763e-06, "loss": 0.4092, "step": 3166 }, { "epoch": 1.34, "grad_norm": 0.6013198747012948, "learning_rate": 1.4495128687270682e-06, "loss": 0.4147, "step": 3167 }, { "epoch": 1.34, "grad_norm": 0.6349112878398844, "learning_rate": 1.4478081490264841e-06, "loss": 0.4013, "step": 3168 }, { "epoch": 1.34, "grad_norm": 0.6018833225607165, "learning_rate": 1.4461040236756643e-06, "loss": 0.4034, "step": 3169 }, { "epoch": 1.34, "grad_norm": 0.572757916476602, "learning_rate": 1.4444004936372166e-06, "loss": 0.4139, "step": 3170 }, { "epoch": 1.34, "grad_norm": 0.5964839570478256, "learning_rate": 1.4426975598734103e-06, "loss": 0.4403, "step": 3171 }, { "epoch": 1.34, "grad_norm": 0.6532887857894302, "learning_rate": 1.4409952233461777e-06, "loss": 0.4089, "step": 3172 }, { "epoch": 1.34, "grad_norm": 0.599992570422661, "learning_rate": 1.4392934850171161e-06, "loss": 0.4328, "step": 3173 }, { "epoch": 1.34, "grad_norm": 0.5924831441960442, "learning_rate": 1.4375923458474822e-06, "loss": 0.4397, "step": 3174 }, { "epoch": 1.34, "grad_norm": 0.5694039028586787, "learning_rate": 1.4358918067981969e-06, "loss": 0.4052, "step": 3175 }, { "epoch": 1.34, "grad_norm": 0.5638688789990264, "learning_rate": 1.434191868829839e-06, "loss": 0.4306, "step": 3176 }, { "epoch": 1.34, "grad_norm": 0.6070863428317758, "learning_rate": 1.4324925329026526e-06, "loss": 0.4069, "step": 3177 }, { "epoch": 1.34, "grad_norm": 0.5970977280572529, "learning_rate": 1.4307937999765343e-06, "loss": 0.4264, "step": 3178 }, { "epoch": 1.34, "grad_norm": 0.5400844435082709, "learning_rate": 1.4290956710110477e-06, "loss": 0.4114, "step": 3179 }, { "epoch": 1.34, "grad_norm": 0.6189005664696162, "learning_rate": 1.4273981469654093e-06, "loss": 0.4393, "step": 3180 }, { "epoch": 1.34, "grad_norm": 0.5551865309592269, "learning_rate": 1.4257012287984994e-06, "loss": 0.3815, "step": 3181 }, { "epoch": 1.35, "grad_norm": 0.5788766743190364, "learning_rate": 1.4240049174688514e-06, "loss": 0.4129, "step": 3182 }, { "epoch": 1.35, "grad_norm": 0.6145740654828784, "learning_rate": 1.4223092139346583e-06, "loss": 0.4465, "step": 3183 }, { "epoch": 1.35, "grad_norm": 0.56123366419593, "learning_rate": 1.4206141191537681e-06, "loss": 0.4226, "step": 3184 }, { "epoch": 1.35, "grad_norm": 0.5702082132713671, "learning_rate": 1.4189196340836866e-06, "loss": 0.4407, "step": 3185 }, { "epoch": 1.35, "grad_norm": 0.568086319678547, "learning_rate": 1.4172257596815762e-06, "loss": 0.4235, "step": 3186 }, { "epoch": 1.35, "grad_norm": 0.570650833706996, "learning_rate": 1.41553249690425e-06, "loss": 0.397, "step": 3187 }, { "epoch": 1.35, "grad_norm": 0.5758906759310528, "learning_rate": 1.413839846708182e-06, "loss": 0.4008, "step": 3188 }, { "epoch": 1.35, "grad_norm": 0.5867057883601531, "learning_rate": 1.4121478100494926e-06, "loss": 0.3931, "step": 3189 }, { "epoch": 1.35, "grad_norm": 0.6216370173036398, "learning_rate": 1.4104563878839623e-06, "loss": 0.432, "step": 3190 }, { "epoch": 1.35, "grad_norm": 0.5834896894589426, "learning_rate": 1.4087655811670196e-06, "loss": 0.3892, "step": 3191 }, { "epoch": 1.35, "grad_norm": 0.55572187008008, "learning_rate": 1.4070753908537498e-06, "loss": 0.3945, "step": 3192 }, { "epoch": 1.35, "grad_norm": 0.6195431150659669, "learning_rate": 1.4053858178988866e-06, "loss": 0.4252, "step": 3193 }, { "epoch": 1.35, "grad_norm": 0.5855307932366061, "learning_rate": 1.4036968632568163e-06, "loss": 0.4023, "step": 3194 }, { "epoch": 1.35, "grad_norm": 0.5785248405460688, "learning_rate": 1.4020085278815745e-06, "loss": 0.4187, "step": 3195 }, { "epoch": 1.35, "eval_loss": 0.4585270285606384, "eval_runtime": 6935.0493, "eval_samples_per_second": 41.873, "eval_steps_per_second": 2.094, "step": 3195 }, { "epoch": 1.35, "grad_norm": 0.5793269431905544, "learning_rate": 1.4003208127268503e-06, "loss": 0.4362, "step": 3196 }, { "epoch": 1.35, "grad_norm": 0.5635960927267711, "learning_rate": 1.3986337187459787e-06, "loss": 0.396, "step": 3197 }, { "epoch": 1.35, "grad_norm": 0.5675588328279935, "learning_rate": 1.3969472468919462e-06, "loss": 0.4357, "step": 3198 }, { "epoch": 1.35, "grad_norm": 0.5813371441566617, "learning_rate": 1.3952613981173894e-06, "loss": 0.4202, "step": 3199 }, { "epoch": 1.35, "grad_norm": 0.5567802235455163, "learning_rate": 1.3935761733745865e-06, "loss": 0.4131, "step": 3200 }, { "epoch": 1.35, "grad_norm": 0.5821477938771911, "learning_rate": 1.3918915736154704e-06, "loss": 0.4214, "step": 3201 }, { "epoch": 1.35, "grad_norm": 0.5964536001348693, "learning_rate": 1.3902075997916164e-06, "loss": 0.4336, "step": 3202 }, { "epoch": 1.35, "grad_norm": 0.6086865744806292, "learning_rate": 1.3885242528542497e-06, "loss": 0.4551, "step": 3203 }, { "epoch": 1.35, "grad_norm": 0.575959160272487, "learning_rate": 1.3868415337542382e-06, "loss": 0.4224, "step": 3204 }, { "epoch": 1.36, "grad_norm": 0.6026326885124447, "learning_rate": 1.3851594434420968e-06, "loss": 0.4202, "step": 3205 }, { "epoch": 1.36, "grad_norm": 0.5493404687568753, "learning_rate": 1.383477982867984e-06, "loss": 0.3954, "step": 3206 }, { "epoch": 1.36, "grad_norm": 0.5781348712939294, "learning_rate": 1.3817971529817054e-06, "loss": 0.3956, "step": 3207 }, { "epoch": 1.36, "grad_norm": 0.5848758929707726, "learning_rate": 1.380116954732706e-06, "loss": 0.4298, "step": 3208 }, { "epoch": 1.36, "grad_norm": 0.5839753363394727, "learning_rate": 1.3784373890700789e-06, "loss": 0.4159, "step": 3209 }, { "epoch": 1.36, "grad_norm": 0.5473659827496421, "learning_rate": 1.3767584569425562e-06, "loss": 0.3982, "step": 3210 }, { "epoch": 1.36, "grad_norm": 0.5854856789628843, "learning_rate": 1.375080159298513e-06, "loss": 0.3985, "step": 3211 }, { "epoch": 1.36, "grad_norm": 0.6223285089698908, "learning_rate": 1.3734024970859672e-06, "loss": 0.3924, "step": 3212 }, { "epoch": 1.36, "grad_norm": 0.5818260019212478, "learning_rate": 1.3717254712525758e-06, "loss": 0.4091, "step": 3213 }, { "epoch": 1.36, "grad_norm": 0.588671059523623, "learning_rate": 1.3700490827456393e-06, "loss": 0.4278, "step": 3214 }, { "epoch": 1.36, "grad_norm": 0.542860404907633, "learning_rate": 1.3683733325120934e-06, "loss": 0.4131, "step": 3215 }, { "epoch": 1.36, "grad_norm": 0.5562613227316864, "learning_rate": 1.3666982214985208e-06, "loss": 0.4245, "step": 3216 }, { "epoch": 1.36, "grad_norm": 4.123028508699682, "learning_rate": 1.3650237506511333e-06, "loss": 0.4637, "step": 3217 }, { "epoch": 1.36, "grad_norm": 0.590425927959101, "learning_rate": 1.3633499209157898e-06, "loss": 0.4267, "step": 3218 }, { "epoch": 1.36, "grad_norm": 0.5832844350075718, "learning_rate": 1.3616767332379815e-06, "loss": 0.4182, "step": 3219 }, { "epoch": 1.36, "grad_norm": 0.5960509584308913, "learning_rate": 1.360004188562841e-06, "loss": 0.4169, "step": 3220 }, { "epoch": 1.36, "grad_norm": 0.592810575458578, "learning_rate": 1.3583322878351346e-06, "loss": 0.427, "step": 3221 }, { "epoch": 1.36, "grad_norm": 0.5842647152175641, "learning_rate": 1.3566610319992658e-06, "loss": 0.4101, "step": 3222 }, { "epoch": 1.36, "grad_norm": 0.5687572730019927, "learning_rate": 1.3549904219992732e-06, "loss": 0.4027, "step": 3223 }, { "epoch": 1.36, "grad_norm": 0.563025461962013, "learning_rate": 1.3533204587788323e-06, "loss": 0.4068, "step": 3224 }, { "epoch": 1.36, "grad_norm": 0.5694816221991685, "learning_rate": 1.351651143281253e-06, "loss": 0.4101, "step": 3225 }, { "epoch": 1.36, "grad_norm": 0.5642547386099277, "learning_rate": 1.3499824764494773e-06, "loss": 0.4127, "step": 3226 }, { "epoch": 1.36, "grad_norm": 0.5480375629168616, "learning_rate": 1.3483144592260844e-06, "loss": 0.397, "step": 3227 }, { "epoch": 1.36, "grad_norm": 0.5653603918404666, "learning_rate": 1.346647092553281e-06, "loss": 0.4109, "step": 3228 }, { "epoch": 1.37, "grad_norm": 0.5806409790053582, "learning_rate": 1.3449803773729115e-06, "loss": 0.3967, "step": 3229 }, { "epoch": 1.37, "grad_norm": 0.5605529434144851, "learning_rate": 1.3433143146264494e-06, "loss": 0.4287, "step": 3230 }, { "epoch": 1.37, "grad_norm": 0.5722146410788806, "learning_rate": 1.3416489052550019e-06, "loss": 0.4006, "step": 3231 }, { "epoch": 1.37, "grad_norm": 0.575347179976502, "learning_rate": 1.3399841501993056e-06, "loss": 0.4116, "step": 3232 }, { "epoch": 1.37, "grad_norm": 0.5949791657298646, "learning_rate": 1.338320050399727e-06, "loss": 0.4303, "step": 3233 }, { "epoch": 1.37, "grad_norm": 0.5794211854463649, "learning_rate": 1.3366566067962628e-06, "loss": 0.4124, "step": 3234 }, { "epoch": 1.37, "grad_norm": 0.5793797643264423, "learning_rate": 1.3349938203285412e-06, "loss": 0.4275, "step": 3235 }, { "epoch": 1.37, "grad_norm": 0.5842493222488487, "learning_rate": 1.3333316919358159e-06, "loss": 0.4209, "step": 3236 }, { "epoch": 1.37, "grad_norm": 0.5663991497625657, "learning_rate": 1.3316702225569708e-06, "loss": 0.3918, "step": 3237 }, { "epoch": 1.37, "grad_norm": 0.5486915223283709, "learning_rate": 1.3300094131305196e-06, "loss": 0.4073, "step": 3238 }, { "epoch": 1.37, "grad_norm": 0.5350475982264186, "learning_rate": 1.3283492645945966e-06, "loss": 0.3985, "step": 3239 }, { "epoch": 1.37, "grad_norm": 0.5886624292798438, "learning_rate": 1.3266897778869704e-06, "loss": 0.393, "step": 3240 }, { "epoch": 1.37, "grad_norm": 0.5755756067045215, "learning_rate": 1.3250309539450298e-06, "loss": 0.4184, "step": 3241 }, { "epoch": 1.37, "grad_norm": 0.5677276610035034, "learning_rate": 1.3233727937057938e-06, "loss": 0.4299, "step": 3242 }, { "epoch": 1.37, "grad_norm": 0.5690350952201221, "learning_rate": 1.3217152981059043e-06, "loss": 0.4292, "step": 3243 }, { "epoch": 1.37, "grad_norm": 0.5739165525454375, "learning_rate": 1.320058468081627e-06, "loss": 0.4076, "step": 3244 }, { "epoch": 1.37, "grad_norm": 0.5613813103721713, "learning_rate": 1.3184023045688515e-06, "loss": 0.4214, "step": 3245 }, { "epoch": 1.37, "grad_norm": 0.5954222168994903, "learning_rate": 1.3167468085030948e-06, "loss": 0.4513, "step": 3246 }, { "epoch": 1.37, "grad_norm": 0.5502927352970999, "learning_rate": 1.3150919808194917e-06, "loss": 0.4156, "step": 3247 }, { "epoch": 1.37, "grad_norm": 0.5608089205143509, "learning_rate": 1.3134378224528026e-06, "loss": 0.416, "step": 3248 }, { "epoch": 1.37, "grad_norm": 0.5536001064152207, "learning_rate": 1.311784334337411e-06, "loss": 0.4126, "step": 3249 }, { "epoch": 1.37, "grad_norm": 0.5766677004057259, "learning_rate": 1.3101315174073162e-06, "loss": 0.4304, "step": 3250 }, { "epoch": 1.37, "grad_norm": 0.5776009437096029, "learning_rate": 1.3084793725961447e-06, "loss": 0.3962, "step": 3251 }, { "epoch": 1.38, "grad_norm": 0.5479613505888217, "learning_rate": 1.3068279008371387e-06, "loss": 0.3994, "step": 3252 }, { "epoch": 1.38, "grad_norm": 0.5690887730011218, "learning_rate": 1.3051771030631644e-06, "loss": 0.4088, "step": 3253 }, { "epoch": 1.38, "grad_norm": 0.5799412277505956, "learning_rate": 1.303526980206704e-06, "loss": 0.422, "step": 3254 }, { "epoch": 1.38, "grad_norm": 0.5570959933884828, "learning_rate": 1.301877533199859e-06, "loss": 0.4104, "step": 3255 }, { "epoch": 1.38, "grad_norm": 0.5643212278599219, "learning_rate": 1.3002287629743488e-06, "loss": 0.4161, "step": 3256 }, { "epoch": 1.38, "grad_norm": 0.5618675409375589, "learning_rate": 1.2985806704615139e-06, "loss": 0.4352, "step": 3257 }, { "epoch": 1.38, "grad_norm": 0.5795363521047754, "learning_rate": 1.2969332565923068e-06, "loss": 0.4149, "step": 3258 }, { "epoch": 1.38, "grad_norm": 0.5607134789936746, "learning_rate": 1.2952865222973015e-06, "loss": 0.3938, "step": 3259 }, { "epoch": 1.38, "grad_norm": 0.5774563064680311, "learning_rate": 1.2936404685066852e-06, "loss": 0.4031, "step": 3260 }, { "epoch": 1.38, "grad_norm": 0.5784824116412987, "learning_rate": 1.2919950961502603e-06, "loss": 0.4175, "step": 3261 }, { "epoch": 1.38, "grad_norm": 0.5685744391843301, "learning_rate": 1.2903504061574467e-06, "loss": 0.4102, "step": 3262 }, { "epoch": 1.38, "grad_norm": 0.6133989126312784, "learning_rate": 1.2887063994572765e-06, "loss": 0.4397, "step": 3263 }, { "epoch": 1.38, "grad_norm": 0.5576015312128622, "learning_rate": 1.2870630769783985e-06, "loss": 0.4254, "step": 3264 }, { "epoch": 1.38, "grad_norm": 0.5854336579154686, "learning_rate": 1.2854204396490722e-06, "loss": 0.425, "step": 3265 }, { "epoch": 1.38, "grad_norm": 0.5851979998505886, "learning_rate": 1.2837784883971716e-06, "loss": 0.4158, "step": 3266 }, { "epoch": 1.38, "eval_loss": 0.4578353464603424, "eval_runtime": 6934.1495, "eval_samples_per_second": 41.878, "eval_steps_per_second": 2.094, "step": 3266 }, { "epoch": 1.38, "grad_norm": 0.5605246475286948, "learning_rate": 1.2821372241501814e-06, "loss": 0.4197, "step": 3267 }, { "epoch": 1.38, "grad_norm": 0.5907220198204197, "learning_rate": 1.280496647835202e-06, "loss": 0.4239, "step": 3268 }, { "epoch": 1.38, "grad_norm": 0.5543362298552139, "learning_rate": 1.278856760378941e-06, "loss": 0.4027, "step": 3269 }, { "epoch": 1.38, "grad_norm": 0.5824324760965199, "learning_rate": 1.2772175627077204e-06, "loss": 0.3936, "step": 3270 }, { "epoch": 1.38, "grad_norm": 0.5596093376012918, "learning_rate": 1.27557905574747e-06, "loss": 0.3892, "step": 3271 }, { "epoch": 1.38, "grad_norm": 0.5847030750956056, "learning_rate": 1.2739412404237306e-06, "loss": 0.4517, "step": 3272 }, { "epoch": 1.38, "grad_norm": 0.5532504862482353, "learning_rate": 1.2723041176616508e-06, "loss": 0.4178, "step": 3273 }, { "epoch": 1.38, "grad_norm": 0.5842377189415942, "learning_rate": 1.2706676883859902e-06, "loss": 0.4135, "step": 3274 }, { "epoch": 1.39, "grad_norm": 0.5442392432462477, "learning_rate": 1.2690319535211171e-06, "loss": 0.3921, "step": 3275 }, { "epoch": 1.39, "grad_norm": 0.5655162202220977, "learning_rate": 1.2673969139910047e-06, "loss": 0.4281, "step": 3276 } ], "logging_steps": 1, "max_steps": 4680, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 468, "total_flos": 3429507897753600.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }