{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9995577178239717, "eval_steps": 142, "global_step": 565, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 4.6921162605285645, "learning_rate": 5e-05, "loss": 3.3182, "step": 1 }, { "epoch": 0.0, "eval_loss": 3.3362529277801514, "eval_runtime": 14.4412, "eval_samples_per_second": 33.031, "eval_steps_per_second": 8.31, "step": 1 }, { "epoch": 0.0, "grad_norm": 4.620742321014404, "learning_rate": 0.0001, "loss": 3.2788, "step": 2 }, { "epoch": 0.01, "grad_norm": 4.650161266326904, "learning_rate": 0.00015, "loss": 3.2271, "step": 3 }, { "epoch": 0.01, "grad_norm": 4.024933815002441, "learning_rate": 0.0002, "loss": 2.402, "step": 4 }, { "epoch": 0.01, "grad_norm": 2.751981496810913, "learning_rate": 0.00025, "loss": 1.0544, "step": 5 }, { "epoch": 0.01, "grad_norm": 1.4766970872879028, "learning_rate": 0.0003, "loss": 0.3549, "step": 6 }, { "epoch": 0.01, "grad_norm": 0.8064658641815186, "learning_rate": 0.00035, "loss": 0.1533, "step": 7 }, { "epoch": 0.01, "grad_norm": 2.232205390930176, "learning_rate": 0.0004, "loss": 0.31, "step": 8 }, { "epoch": 0.02, "grad_norm": 1.1898847818374634, "learning_rate": 0.00045000000000000004, "loss": 0.1818, "step": 9 }, { "epoch": 0.02, "grad_norm": 0.7394833564758301, "learning_rate": 0.0005, "loss": 0.1751, "step": 10 }, { "epoch": 0.02, "grad_norm": 0.16317571699619293, "learning_rate": 0.0004999995654799487, "loss": 0.1411, "step": 11 }, { "epoch": 0.02, "grad_norm": 0.10235322266817093, "learning_rate": 0.0004999982619213052, "loss": 0.1363, "step": 12 }, { "epoch": 0.02, "grad_norm": 0.19907887279987335, "learning_rate": 0.0004999960893286008, "loss": 0.128, "step": 13 }, { "epoch": 0.02, "grad_norm": 0.6823816299438477, "learning_rate": 0.0004999930477093878, "loss": 0.143, "step": 14 }, { "epoch": 0.03, "grad_norm": 0.10187644511461258, "learning_rate": 0.0004999891370742394, "loss": 0.1322, "step": 15 }, { "epoch": 0.03, "grad_norm": 0.09401004016399384, "learning_rate": 0.0004999843574367498, "loss": 0.1361, "step": 16 }, { "epoch": 0.03, "grad_norm": 0.17946797609329224, "learning_rate": 0.0004999787088135334, "loss": 0.1412, "step": 17 }, { "epoch": 0.03, "grad_norm": 0.890545666217804, "learning_rate": 0.0004999721912242259, "loss": 0.1593, "step": 18 }, { "epoch": 0.03, "grad_norm": 0.434042751789093, "learning_rate": 0.0004999648046914836, "loss": 0.1548, "step": 19 }, { "epoch": 0.04, "grad_norm": 0.28103551268577576, "learning_rate": 0.0004999565492409831, "loss": 0.1459, "step": 20 }, { "epoch": 0.04, "grad_norm": 0.2690610885620117, "learning_rate": 0.0004999474249014217, "loss": 0.1248, "step": 21 }, { "epoch": 0.04, "grad_norm": 0.37668731808662415, "learning_rate": 0.0004999374317045172, "loss": 0.1481, "step": 22 }, { "epoch": 0.04, "grad_norm": 0.23762015998363495, "learning_rate": 0.0004999265696850074, "loss": 0.1407, "step": 23 }, { "epoch": 0.04, "grad_norm": 0.1988176554441452, "learning_rate": 0.0004999148388806504, "loss": 0.1398, "step": 24 }, { "epoch": 0.04, "grad_norm": 0.3805619776248932, "learning_rate": 0.0004999022393322246, "loss": 0.1474, "step": 25 }, { "epoch": 0.05, "grad_norm": 0.5069771409034729, "learning_rate": 0.0004998887710835278, "loss": 0.1509, "step": 26 }, { "epoch": 0.05, "grad_norm": 0.42066043615341187, "learning_rate": 0.0004998744341813779, "loss": 0.1341, "step": 27 }, { "epoch": 0.05, "grad_norm": 0.0944904088973999, "learning_rate": 0.0004998592286756123, "loss": 0.1233, "step": 28 }, { "epoch": 0.05, "grad_norm": 0.849244236946106, "learning_rate": 0.0004998431546190875, "loss": 0.1999, "step": 29 }, { "epoch": 0.05, "grad_norm": 0.09785456210374832, "learning_rate": 0.00049982621206768, "loss": 0.1272, "step": 30 }, { "epoch": 0.05, "grad_norm": 0.38225457072257996, "learning_rate": 0.0004998084010802845, "loss": 0.1634, "step": 31 }, { "epoch": 0.06, "grad_norm": 0.08135183900594711, "learning_rate": 0.0004997897217188149, "loss": 0.1383, "step": 32 }, { "epoch": 0.06, "grad_norm": 0.17299437522888184, "learning_rate": 0.0004997701740482036, "loss": 0.1427, "step": 33 }, { "epoch": 0.06, "grad_norm": 0.11125747114419937, "learning_rate": 0.0004997497581364015, "loss": 0.1379, "step": 34 }, { "epoch": 0.06, "grad_norm": 0.08914893865585327, "learning_rate": 0.0004997284740543776, "loss": 0.1388, "step": 35 }, { "epoch": 0.06, "grad_norm": 0.034590039402246475, "learning_rate": 0.0004997063218761188, "loss": 0.1387, "step": 36 }, { "epoch": 0.07, "grad_norm": 0.08675777167081833, "learning_rate": 0.0004996833016786296, "loss": 0.1384, "step": 37 }, { "epoch": 0.07, "grad_norm": 0.4440009295940399, "learning_rate": 0.0004996594135419318, "loss": 0.152, "step": 38 }, { "epoch": 0.07, "grad_norm": 0.0814109519124031, "learning_rate": 0.0004996346575490646, "loss": 0.1373, "step": 39 }, { "epoch": 0.07, "grad_norm": 0.37724560499191284, "learning_rate": 0.0004996090337860836, "loss": 0.1362, "step": 40 }, { "epoch": 0.07, "grad_norm": 0.21177273988723755, "learning_rate": 0.0004995825423420613, "loss": 0.1423, "step": 41 }, { "epoch": 0.07, "grad_norm": 0.12168041616678238, "learning_rate": 0.000499555183309086, "loss": 0.1381, "step": 42 }, { "epoch": 0.08, "grad_norm": 0.21096466481685638, "learning_rate": 0.0004995269567822623, "loss": 0.139, "step": 43 }, { "epoch": 0.08, "grad_norm": 0.49395841360092163, "learning_rate": 0.0004994978628597099, "loss": 0.1016, "step": 44 }, { "epoch": 0.08, "grad_norm": 0.1108216792345047, "learning_rate": 0.0004994679016425642, "loss": 0.1334, "step": 45 }, { "epoch": 0.08, "grad_norm": 0.5518127679824829, "learning_rate": 0.000499437073234975, "loss": 0.1568, "step": 46 }, { "epoch": 0.08, "grad_norm": 0.4762812852859497, "learning_rate": 0.0004994053777441069, "loss": 0.1543, "step": 47 }, { "epoch": 0.08, "grad_norm": 0.3477722108364105, "learning_rate": 0.0004993728152801384, "loss": 0.1464, "step": 48 }, { "epoch": 0.09, "grad_norm": 0.4996407628059387, "learning_rate": 0.0004993393859562621, "loss": 0.154, "step": 49 }, { "epoch": 0.09, "grad_norm": 0.20425601303577423, "learning_rate": 0.0004993050898886833, "loss": 0.1372, "step": 50 }, { "epoch": 0.09, "grad_norm": 0.034631408751010895, "learning_rate": 0.000499269927196621, "loss": 0.137, "step": 51 }, { "epoch": 0.09, "grad_norm": 0.24027873575687408, "learning_rate": 0.0004992338980023062, "loss": 0.1468, "step": 52 }, { "epoch": 0.09, "grad_norm": 0.22242723405361176, "learning_rate": 0.000499197002430982, "loss": 0.1418, "step": 53 }, { "epoch": 0.1, "grad_norm": 0.6540514826774597, "learning_rate": 0.0004991592406109036, "loss": 0.1564, "step": 54 }, { "epoch": 0.1, "grad_norm": 0.030118577182292938, "learning_rate": 0.000499120612673337, "loss": 0.1365, "step": 55 }, { "epoch": 0.1, "grad_norm": 0.07544097304344177, "learning_rate": 0.0004990811187525592, "loss": 0.1334, "step": 56 }, { "epoch": 0.1, "grad_norm": 0.37415480613708496, "learning_rate": 0.0004990407589858572, "loss": 0.155, "step": 57 }, { "epoch": 0.1, "grad_norm": 0.557809054851532, "learning_rate": 0.0004989995335135282, "loss": 0.1603, "step": 58 }, { "epoch": 0.1, "grad_norm": 0.14802873134613037, "learning_rate": 0.0004989574424788787, "loss": 0.1387, "step": 59 }, { "epoch": 0.11, "grad_norm": 0.3581993281841278, "learning_rate": 0.0004989144860282236, "loss": 0.1475, "step": 60 }, { "epoch": 0.11, "grad_norm": 0.04818522185087204, "learning_rate": 0.0004988706643108864, "loss": 0.1362, "step": 61 }, { "epoch": 0.11, "grad_norm": 0.21908174455165863, "learning_rate": 0.0004988259774791987, "loss": 0.1386, "step": 62 }, { "epoch": 0.11, "grad_norm": 0.1852695643901825, "learning_rate": 0.0004987804256884988, "loss": 0.1387, "step": 63 }, { "epoch": 0.11, "grad_norm": 0.025747304782271385, "learning_rate": 0.0004987340090971323, "loss": 0.1393, "step": 64 }, { "epoch": 0.11, "grad_norm": 0.045346710830926895, "learning_rate": 0.0004986867278664504, "loss": 0.1354, "step": 65 }, { "epoch": 0.12, "grad_norm": 0.34946465492248535, "learning_rate": 0.0004986385821608106, "loss": 0.152, "step": 66 }, { "epoch": 0.12, "grad_norm": 0.2552882432937622, "learning_rate": 0.0004985895721475748, "loss": 0.1463, "step": 67 }, { "epoch": 0.12, "grad_norm": 0.0560542456805706, "learning_rate": 0.0004985396979971099, "loss": 0.1391, "step": 68 }, { "epoch": 0.12, "grad_norm": 0.14347511529922485, "learning_rate": 0.0004984889598827863, "loss": 0.1353, "step": 69 }, { "epoch": 0.12, "grad_norm": 0.12386342883110046, "learning_rate": 0.0004984373579809778, "loss": 0.1343, "step": 70 }, { "epoch": 0.13, "grad_norm": 0.03070697747170925, "learning_rate": 0.000498384892471061, "loss": 0.1356, "step": 71 }, { "epoch": 0.13, "grad_norm": 0.0531514473259449, "learning_rate": 0.0004983315635354144, "loss": 0.1346, "step": 72 }, { "epoch": 0.13, "grad_norm": 0.24197503924369812, "learning_rate": 0.0004982773713594178, "loss": 0.1217, "step": 73 }, { "epoch": 0.13, "grad_norm": 0.08417380601167679, "learning_rate": 0.0004982223161314522, "loss": 0.1223, "step": 74 }, { "epoch": 0.13, "grad_norm": 0.40045711398124695, "learning_rate": 0.000498166398042898, "loss": 0.1513, "step": 75 }, { "epoch": 0.13, "grad_norm": 0.12452740222215652, "learning_rate": 0.0004981096172881358, "loss": 0.1296, "step": 76 }, { "epoch": 0.14, "grad_norm": 0.21590262651443481, "learning_rate": 0.0004980519740645444, "loss": 0.1375, "step": 77 }, { "epoch": 0.14, "grad_norm": 0.07704459875822067, "learning_rate": 0.0004979934685725011, "loss": 0.1299, "step": 78 }, { "epoch": 0.14, "grad_norm": 0.28334081172943115, "learning_rate": 0.0004979341010153801, "loss": 0.1387, "step": 79 }, { "epoch": 0.14, "grad_norm": 0.12374007701873779, "learning_rate": 0.0004978738715995526, "loss": 0.1383, "step": 80 }, { "epoch": 0.14, "grad_norm": 0.040613267570734024, "learning_rate": 0.000497812780534386, "loss": 0.1367, "step": 81 }, { "epoch": 0.15, "grad_norm": 0.09974126517772675, "learning_rate": 0.0004977508280322423, "loss": 0.1248, "step": 82 }, { "epoch": 0.15, "grad_norm": 0.2616259753704071, "learning_rate": 0.0004976880143084786, "loss": 0.1311, "step": 83 }, { "epoch": 0.15, "grad_norm": 0.15635579824447632, "learning_rate": 0.0004976243395814452, "loss": 0.1189, "step": 84 }, { "epoch": 0.15, "grad_norm": 0.259250670671463, "learning_rate": 0.000497559804072486, "loss": 0.1099, "step": 85 }, { "epoch": 0.15, "grad_norm": 1.2778699398040771, "learning_rate": 0.0004974944080059365, "loss": 0.1416, "step": 86 }, { "epoch": 0.15, "grad_norm": 0.2155281901359558, "learning_rate": 0.000497428151609124, "loss": 0.1253, "step": 87 }, { "epoch": 0.16, "grad_norm": 0.17533721029758453, "learning_rate": 0.0004973610351123664, "loss": 0.1446, "step": 88 }, { "epoch": 0.16, "grad_norm": 0.07620590180158615, "learning_rate": 0.0004972930587489715, "loss": 0.1309, "step": 89 }, { "epoch": 0.16, "grad_norm": 0.2370779663324356, "learning_rate": 0.0004972242227552358, "loss": 0.149, "step": 90 }, { "epoch": 0.16, "grad_norm": 0.06374065577983856, "learning_rate": 0.0004971545273704444, "loss": 0.1307, "step": 91 }, { "epoch": 0.16, "grad_norm": 0.22728750109672546, "learning_rate": 0.0004970839728368697, "loss": 0.1438, "step": 92 }, { "epoch": 0.16, "grad_norm": 0.16872233152389526, "learning_rate": 0.0004970125593997706, "loss": 0.1364, "step": 93 }, { "epoch": 0.17, "grad_norm": 0.18773947656154633, "learning_rate": 0.0004969402873073914, "loss": 0.146, "step": 94 }, { "epoch": 0.17, "grad_norm": 0.1468167006969452, "learning_rate": 0.0004968671568109616, "loss": 0.1401, "step": 95 }, { "epoch": 0.17, "grad_norm": 0.09030504524707794, "learning_rate": 0.0004967931681646948, "loss": 0.1318, "step": 96 }, { "epoch": 0.17, "grad_norm": 0.061796192079782486, "learning_rate": 0.000496718321625787, "loss": 0.1244, "step": 97 }, { "epoch": 0.17, "grad_norm": 0.045495226979255676, "learning_rate": 0.0004966426174544171, "loss": 0.1265, "step": 98 }, { "epoch": 0.18, "grad_norm": 0.08449025452136993, "learning_rate": 0.0004965660559137448, "loss": 0.1276, "step": 99 }, { "epoch": 0.18, "grad_norm": 0.09982559829950333, "learning_rate": 0.0004964886372699102, "loss": 0.1253, "step": 100 }, { "epoch": 0.18, "grad_norm": 0.05831208825111389, "learning_rate": 0.0004964103617920332, "loss": 0.1271, "step": 101 }, { "epoch": 0.18, "grad_norm": 0.20548835396766663, "learning_rate": 0.0004963312297522116, "loss": 0.1415, "step": 102 }, { "epoch": 0.18, "grad_norm": 0.09664470702409744, "learning_rate": 0.0004962512414255214, "loss": 0.1083, "step": 103 }, { "epoch": 0.18, "grad_norm": 0.16931602358818054, "learning_rate": 0.0004961703970900145, "loss": 0.1431, "step": 104 }, { "epoch": 0.19, "grad_norm": 0.10859667509794235, "learning_rate": 0.000496088697026719, "loss": 0.12, "step": 105 }, { "epoch": 0.19, "grad_norm": 0.21958191692829132, "learning_rate": 0.0004960061415196374, "loss": 0.1492, "step": 106 }, { "epoch": 0.19, "grad_norm": 0.06437578052282333, "learning_rate": 0.0004959227308557459, "loss": 0.1083, "step": 107 }, { "epoch": 0.19, "grad_norm": 0.14975550770759583, "learning_rate": 0.0004958384653249932, "loss": 0.1155, "step": 108 }, { "epoch": 0.19, "grad_norm": 0.11868852376937866, "learning_rate": 0.0004957533452203, "loss": 0.1237, "step": 109 }, { "epoch": 0.19, "grad_norm": 0.2610260546207428, "learning_rate": 0.0004956673708375574, "loss": 0.1264, "step": 110 }, { "epoch": 0.2, "grad_norm": 0.378467321395874, "learning_rate": 0.000495580542475626, "loss": 0.1222, "step": 111 }, { "epoch": 0.2, "grad_norm": 0.0926096960902214, "learning_rate": 0.0004954928604363352, "loss": 0.1096, "step": 112 }, { "epoch": 0.2, "grad_norm": 0.06858692318201065, "learning_rate": 0.0004954043250244819, "loss": 0.1144, "step": 113 }, { "epoch": 0.2, "grad_norm": 0.3068992495536804, "learning_rate": 0.0004953149365478293, "loss": 0.1563, "step": 114 }, { "epoch": 0.2, "grad_norm": 0.15458936989307404, "learning_rate": 0.0004952246953171061, "loss": 0.1216, "step": 115 }, { "epoch": 0.21, "grad_norm": 0.10287577658891678, "learning_rate": 0.0004951336016460053, "loss": 0.0893, "step": 116 }, { "epoch": 0.21, "grad_norm": 0.1215134710073471, "learning_rate": 0.0004950416558511833, "loss": 0.1016, "step": 117 }, { "epoch": 0.21, "grad_norm": 0.1392650604248047, "learning_rate": 0.000494948858252258, "loss": 0.1111, "step": 118 }, { "epoch": 0.21, "grad_norm": 0.4350431263446808, "learning_rate": 0.0004948552091718092, "loss": 0.1192, "step": 119 }, { "epoch": 0.21, "grad_norm": 0.21448662877082825, "learning_rate": 0.0004947607089353758, "loss": 0.07, "step": 120 }, { "epoch": 0.21, "grad_norm": 1.6086686849594116, "learning_rate": 0.0004946653578714559, "loss": 0.1352, "step": 121 }, { "epoch": 0.22, "grad_norm": 0.25963085889816284, "learning_rate": 0.0004945691563115051, "loss": 0.1447, "step": 122 }, { "epoch": 0.22, "grad_norm": 0.11575956642627716, "learning_rate": 0.0004944721045899356, "loss": 0.1055, "step": 123 }, { "epoch": 0.22, "grad_norm": 0.11230157315731049, "learning_rate": 0.0004943742030441145, "loss": 0.0917, "step": 124 }, { "epoch": 0.22, "grad_norm": 0.3376341760158539, "learning_rate": 0.0004942754520143634, "loss": 0.1364, "step": 125 }, { "epoch": 0.22, "grad_norm": 0.2757412791252136, "learning_rate": 0.0004941758518439566, "loss": 0.1418, "step": 126 }, { "epoch": 0.22, "grad_norm": 0.1438644975423813, "learning_rate": 0.0004940754028791205, "loss": 0.1162, "step": 127 }, { "epoch": 0.23, "grad_norm": 0.14210884273052216, "learning_rate": 0.0004939741054690316, "loss": 0.1312, "step": 128 }, { "epoch": 0.23, "grad_norm": 0.1861649751663208, "learning_rate": 0.0004938719599658162, "loss": 0.1447, "step": 129 }, { "epoch": 0.23, "grad_norm": 0.19665485620498657, "learning_rate": 0.0004937689667245481, "loss": 0.1439, "step": 130 }, { "epoch": 0.23, "grad_norm": 0.22447055578231812, "learning_rate": 0.0004936651261032486, "loss": 0.1568, "step": 131 }, { "epoch": 0.23, "grad_norm": 0.10008269548416138, "learning_rate": 0.0004935604384628843, "loss": 0.1081, "step": 132 }, { "epoch": 0.24, "grad_norm": 0.0549234002828598, "learning_rate": 0.0004934549041673661, "loss": 0.1216, "step": 133 }, { "epoch": 0.24, "grad_norm": 0.11616212874650955, "learning_rate": 0.0004933485235835483, "loss": 0.1108, "step": 134 }, { "epoch": 0.24, "grad_norm": 0.08554813265800476, "learning_rate": 0.0004932412970812269, "loss": 0.135, "step": 135 }, { "epoch": 0.24, "grad_norm": 0.08642842620611191, "learning_rate": 0.0004931332250331382, "loss": 0.1205, "step": 136 }, { "epoch": 0.24, "grad_norm": 0.20417262613773346, "learning_rate": 0.0004930243078149582, "loss": 0.1169, "step": 137 }, { "epoch": 0.24, "grad_norm": 0.11088764667510986, "learning_rate": 0.0004929145458053005, "loss": 0.1014, "step": 138 }, { "epoch": 0.25, "grad_norm": 0.3510516881942749, "learning_rate": 0.0004928039393857155, "loss": 0.0967, "step": 139 }, { "epoch": 0.25, "grad_norm": 0.2401883453130722, "learning_rate": 0.0004926924889406888, "loss": 0.106, "step": 140 }, { "epoch": 0.25, "grad_norm": 0.28403300046920776, "learning_rate": 0.0004925801948576402, "loss": 0.079, "step": 141 }, { "epoch": 0.25, "grad_norm": 0.46027252078056335, "learning_rate": 0.0004924670575269217, "loss": 0.0899, "step": 142 }, { "epoch": 0.25, "eval_loss": 0.09421269595623016, "eval_runtime": 14.7696, "eval_samples_per_second": 32.296, "eval_steps_per_second": 8.125, "step": 142 }, { "epoch": 0.25, "grad_norm": 0.29767730832099915, "learning_rate": 0.0004923530773418169, "loss": 0.1265, "step": 143 }, { "epoch": 0.25, "grad_norm": 0.37391072511672974, "learning_rate": 0.0004922382546985394, "loss": 0.1244, "step": 144 }, { "epoch": 0.26, "grad_norm": 0.8874172568321228, "learning_rate": 0.0004921225899962308, "loss": 0.1796, "step": 145 }, { "epoch": 0.26, "grad_norm": 0.2554258704185486, "learning_rate": 0.0004920060836369603, "loss": 0.0528, "step": 146 }, { "epoch": 0.26, "grad_norm": 0.1981816440820694, "learning_rate": 0.0004918887360257228, "loss": 0.1159, "step": 147 }, { "epoch": 0.26, "grad_norm": 0.14500874280929565, "learning_rate": 0.0004917705475704373, "loss": 0.0992, "step": 148 }, { "epoch": 0.26, "grad_norm": 0.1315220594406128, "learning_rate": 0.000491651518681946, "loss": 0.1248, "step": 149 }, { "epoch": 0.27, "grad_norm": 0.0798826813697815, "learning_rate": 0.0004915316497740121, "loss": 0.1151, "step": 150 }, { "epoch": 0.27, "grad_norm": 0.10213784873485565, "learning_rate": 0.0004914109412633194, "loss": 0.1098, "step": 151 }, { "epoch": 0.27, "grad_norm": 0.23167072236537933, "learning_rate": 0.00049128939356947, "loss": 0.1236, "step": 152 }, { "epoch": 0.27, "grad_norm": 0.173340305685997, "learning_rate": 0.0004911670071149831, "loss": 0.1098, "step": 153 }, { "epoch": 0.27, "grad_norm": 0.1079009547829628, "learning_rate": 0.0004910437823252937, "loss": 0.1014, "step": 154 }, { "epoch": 0.27, "grad_norm": 0.320765882730484, "learning_rate": 0.0004909197196287509, "loss": 0.1285, "step": 155 }, { "epoch": 0.28, "grad_norm": 0.40041017532348633, "learning_rate": 0.0004907948194566166, "loss": 0.1421, "step": 156 }, { "epoch": 0.28, "grad_norm": 0.4091287851333618, "learning_rate": 0.0004906690822430638, "loss": 0.1451, "step": 157 }, { "epoch": 0.28, "grad_norm": 0.39893922209739685, "learning_rate": 0.0004905425084251753, "loss": 0.1289, "step": 158 }, { "epoch": 0.28, "grad_norm": 0.14173893630504608, "learning_rate": 0.0004904150984429419, "loss": 0.0712, "step": 159 }, { "epoch": 0.28, "grad_norm": 0.4723054766654968, "learning_rate": 0.0004902868527392611, "loss": 0.2141, "step": 160 }, { "epoch": 0.28, "grad_norm": 0.13493523001670837, "learning_rate": 0.0004901577717599355, "loss": 0.0881, "step": 161 }, { "epoch": 0.29, "grad_norm": 0.10770414024591446, "learning_rate": 0.0004900278559536716, "loss": 0.0746, "step": 162 }, { "epoch": 0.29, "grad_norm": 0.5121994614601135, "learning_rate": 0.0004898971057720773, "loss": 0.1705, "step": 163 }, { "epoch": 0.29, "grad_norm": 0.09419309347867966, "learning_rate": 0.0004897655216696612, "loss": 0.1085, "step": 164 }, { "epoch": 0.29, "grad_norm": 0.3557867407798767, "learning_rate": 0.0004896331041038309, "loss": 0.1027, "step": 165 }, { "epoch": 0.29, "grad_norm": 0.082126185297966, "learning_rate": 0.000489499853534891, "loss": 0.1113, "step": 166 }, { "epoch": 0.3, "grad_norm": 0.8520584106445312, "learning_rate": 0.0004893657704260419, "loss": 0.1291, "step": 167 }, { "epoch": 0.3, "grad_norm": 0.4607222080230713, "learning_rate": 0.000489230855243378, "loss": 0.1241, "step": 168 }, { "epoch": 0.3, "grad_norm": 0.5181136727333069, "learning_rate": 0.0004890951084558859, "loss": 0.0957, "step": 169 }, { "epoch": 0.3, "grad_norm": 0.42894089221954346, "learning_rate": 0.0004889585305354435, "loss": 0.0895, "step": 170 }, { "epoch": 0.3, "grad_norm": 0.14509521424770355, "learning_rate": 0.0004888211219568175, "loss": 0.0732, "step": 171 }, { "epoch": 0.3, "grad_norm": 0.24262909591197968, "learning_rate": 0.0004886828831976621, "loss": 0.0917, "step": 172 }, { "epoch": 0.31, "grad_norm": 0.44387635588645935, "learning_rate": 0.0004885438147385175, "loss": 0.0636, "step": 173 }, { "epoch": 0.31, "grad_norm": 0.1804012507200241, "learning_rate": 0.0004884039170628077, "loss": 0.0295, "step": 174 }, { "epoch": 0.31, "grad_norm": 0.5566735863685608, "learning_rate": 0.0004882631906568398, "loss": 0.1104, "step": 175 }, { "epoch": 0.31, "grad_norm": 0.9653083682060242, "learning_rate": 0.0004881216360098012, "loss": 0.2236, "step": 176 }, { "epoch": 0.31, "grad_norm": 0.27046507596969604, "learning_rate": 0.0004879792536137585, "loss": 0.1082, "step": 177 }, { "epoch": 0.31, "grad_norm": 0.47974228858947754, "learning_rate": 0.00048783604396365586, "loss": 0.0884, "step": 178 }, { "epoch": 0.32, "grad_norm": 0.23638087511062622, "learning_rate": 0.0004876920075573129, "loss": 0.0968, "step": 179 }, { "epoch": 0.32, "grad_norm": 0.12476328015327454, "learning_rate": 0.0004875471448954234, "loss": 0.1078, "step": 180 }, { "epoch": 0.32, "grad_norm": 0.2455732375383377, "learning_rate": 0.00048740145648155307, "loss": 0.1124, "step": 181 }, { "epoch": 0.32, "grad_norm": 0.2744804620742798, "learning_rate": 0.0004872549428221384, "loss": 0.0797, "step": 182 }, { "epoch": 0.32, "grad_norm": 0.19536937773227692, "learning_rate": 0.00048710760442648415, "loss": 0.1091, "step": 183 }, { "epoch": 0.33, "grad_norm": 0.5277348160743713, "learning_rate": 0.0004869594418067623, "loss": 0.1261, "step": 184 }, { "epoch": 0.33, "grad_norm": 0.13960392773151398, "learning_rate": 0.00048681045547801003, "loss": 0.0879, "step": 185 }, { "epoch": 0.33, "grad_norm": 0.2567049562931061, "learning_rate": 0.00048666064595812746, "loss": 0.083, "step": 186 }, { "epoch": 0.33, "grad_norm": 0.3075740337371826, "learning_rate": 0.00048651001376787676, "loss": 0.1167, "step": 187 }, { "epoch": 0.33, "grad_norm": 0.5257586240768433, "learning_rate": 0.0004863585594308794, "loss": 0.1019, "step": 188 }, { "epoch": 0.33, "grad_norm": 0.41611766815185547, "learning_rate": 0.00048620628347361496, "loss": 0.1392, "step": 189 }, { "epoch": 0.34, "grad_norm": 0.30399614572525024, "learning_rate": 0.00048605318642541917, "loss": 0.1339, "step": 190 }, { "epoch": 0.34, "grad_norm": 0.41276878118515015, "learning_rate": 0.00048589926881848194, "loss": 0.1028, "step": 191 }, { "epoch": 0.34, "grad_norm": 0.19717253744602203, "learning_rate": 0.0004857445311878456, "loss": 0.1032, "step": 192 }, { "epoch": 0.34, "grad_norm": 0.3766873776912689, "learning_rate": 0.0004855889740714028, "loss": 0.1486, "step": 193 }, { "epoch": 0.34, "grad_norm": 0.17443525791168213, "learning_rate": 0.0004854325980098951, "loss": 0.096, "step": 194 }, { "epoch": 0.34, "grad_norm": 0.1278471201658249, "learning_rate": 0.0004852754035469109, "loss": 0.0746, "step": 195 }, { "epoch": 0.35, "grad_norm": 0.14356929063796997, "learning_rate": 0.0004851173912288833, "loss": 0.0857, "step": 196 }, { "epoch": 0.35, "grad_norm": 0.20514866709709167, "learning_rate": 0.0004849585616050884, "loss": 0.0833, "step": 197 }, { "epoch": 0.35, "grad_norm": 0.4683605134487152, "learning_rate": 0.0004847989152276435, "loss": 0.1538, "step": 198 }, { "epoch": 0.35, "grad_norm": 0.29194721579551697, "learning_rate": 0.00048463845265150495, "loss": 0.1035, "step": 199 }, { "epoch": 0.35, "grad_norm": 0.22838515043258667, "learning_rate": 0.0004844771744344666, "loss": 0.0762, "step": 200 }, { "epoch": 0.36, "grad_norm": 0.3635599911212921, "learning_rate": 0.0004843150811371572, "loss": 0.1165, "step": 201 }, { "epoch": 0.36, "grad_norm": 0.2508073151111603, "learning_rate": 0.0004841521733230391, "loss": 0.0736, "step": 202 }, { "epoch": 0.36, "grad_norm": 0.24161550402641296, "learning_rate": 0.000483988451558406, "loss": 0.1309, "step": 203 }, { "epoch": 0.36, "grad_norm": 0.4697308838367462, "learning_rate": 0.0004838239164123811, "loss": 0.1731, "step": 204 }, { "epoch": 0.36, "grad_norm": 0.17773008346557617, "learning_rate": 0.0004836585684569148, "loss": 0.1158, "step": 205 }, { "epoch": 0.36, "grad_norm": 0.21285519003868103, "learning_rate": 0.0004834924082667833, "loss": 0.0949, "step": 206 }, { "epoch": 0.37, "grad_norm": 0.2403111308813095, "learning_rate": 0.0004833254364195859, "loss": 0.0801, "step": 207 }, { "epoch": 0.37, "grad_norm": 0.2033465951681137, "learning_rate": 0.0004831576534957437, "loss": 0.069, "step": 208 }, { "epoch": 0.37, "grad_norm": 0.5510303378105164, "learning_rate": 0.000482989060078497, "loss": 0.1766, "step": 209 }, { "epoch": 0.37, "grad_norm": 0.32342344522476196, "learning_rate": 0.0004828196567539034, "loss": 0.1229, "step": 210 }, { "epoch": 0.37, "grad_norm": 0.3102104663848877, "learning_rate": 0.00048264944411083625, "loss": 0.1297, "step": 211 }, { "epoch": 0.38, "grad_norm": 0.32639122009277344, "learning_rate": 0.00048247842274098187, "loss": 0.1011, "step": 212 }, { "epoch": 0.38, "grad_norm": 0.4720034897327423, "learning_rate": 0.00048230659323883804, "loss": 0.1282, "step": 213 }, { "epoch": 0.38, "grad_norm": 0.5249712467193604, "learning_rate": 0.00048213395620171166, "loss": 0.1376, "step": 214 }, { "epoch": 0.38, "grad_norm": 0.3953443467617035, "learning_rate": 0.00048196051222971673, "loss": 0.1186, "step": 215 }, { "epoch": 0.38, "grad_norm": 0.15697738528251648, "learning_rate": 0.0004817862619257723, "loss": 0.1079, "step": 216 }, { "epoch": 0.38, "grad_norm": 0.32511651515960693, "learning_rate": 0.0004816112058956005, "loss": 0.1052, "step": 217 }, { "epoch": 0.39, "grad_norm": 0.1850031018257141, "learning_rate": 0.00048143534474772397, "loss": 0.1236, "step": 218 }, { "epoch": 0.39, "grad_norm": 0.10901057720184326, "learning_rate": 0.0004812586790934645, "loss": 0.1094, "step": 219 }, { "epoch": 0.39, "grad_norm": 0.23395784199237823, "learning_rate": 0.00048108120954694014, "loss": 0.0556, "step": 220 }, { "epoch": 0.39, "grad_norm": 0.21469372510910034, "learning_rate": 0.00048090293672506347, "loss": 0.0594, "step": 221 }, { "epoch": 0.39, "grad_norm": 0.17289988696575165, "learning_rate": 0.00048072386124753944, "loss": 0.0219, "step": 222 }, { "epoch": 0.39, "grad_norm": 0.21490757167339325, "learning_rate": 0.0004805439837368631, "loss": 0.0203, "step": 223 }, { "epoch": 0.4, "grad_norm": 1.1259506940841675, "learning_rate": 0.0004803633048183176, "loss": 0.1576, "step": 224 }, { "epoch": 0.4, "grad_norm": 1.2934038639068604, "learning_rate": 0.00048018182511997185, "loss": 0.1233, "step": 225 }, { "epoch": 0.4, "grad_norm": 0.4250846207141876, "learning_rate": 0.0004799995452726783, "loss": 0.1023, "step": 226 }, { "epoch": 0.4, "grad_norm": 1.4675579071044922, "learning_rate": 0.000479816465910071, "loss": 0.1242, "step": 227 }, { "epoch": 0.4, "grad_norm": 0.7030429840087891, "learning_rate": 0.0004796325876685632, "loss": 0.0514, "step": 228 }, { "epoch": 0.41, "grad_norm": 0.5683910846710205, "learning_rate": 0.00047944791118734517, "loss": 0.0923, "step": 229 }, { "epoch": 0.41, "grad_norm": 0.8425244092941284, "learning_rate": 0.0004792624371083819, "loss": 0.0976, "step": 230 }, { "epoch": 0.41, "grad_norm": 0.21189981698989868, "learning_rate": 0.00047907616607641113, "loss": 0.1016, "step": 231 }, { "epoch": 0.41, "grad_norm": 0.36100390553474426, "learning_rate": 0.0004788890987389408, "loss": 0.1015, "step": 232 }, { "epoch": 0.41, "grad_norm": 0.42600420117378235, "learning_rate": 0.000478701235746247, "loss": 0.1401, "step": 233 }, { "epoch": 0.41, "grad_norm": 0.649318516254425, "learning_rate": 0.0004785125777513716, "loss": 0.1012, "step": 234 }, { "epoch": 0.42, "grad_norm": 0.3490477204322815, "learning_rate": 0.00047832312541012007, "loss": 0.1015, "step": 235 }, { "epoch": 0.42, "grad_norm": 0.6937799453735352, "learning_rate": 0.0004781328793810592, "loss": 0.1188, "step": 236 }, { "epoch": 0.42, "grad_norm": 1.0924077033996582, "learning_rate": 0.0004779418403255146, "loss": 0.1093, "step": 237 }, { "epoch": 0.42, "grad_norm": 0.36075183749198914, "learning_rate": 0.0004777500089075687, "loss": 0.0971, "step": 238 }, { "epoch": 0.42, "grad_norm": 0.41673243045806885, "learning_rate": 0.00047755738579405836, "loss": 0.0953, "step": 239 }, { "epoch": 0.42, "grad_norm": 0.13159583508968353, "learning_rate": 0.0004773639716545723, "loss": 0.0571, "step": 240 }, { "epoch": 0.43, "grad_norm": 0.9338862895965576, "learning_rate": 0.00047716976716144917, "loss": 0.202, "step": 241 }, { "epoch": 0.43, "grad_norm": 0.3190581798553467, "learning_rate": 0.0004769747729897749, "loss": 0.1071, "step": 242 }, { "epoch": 0.43, "grad_norm": 0.23796042799949646, "learning_rate": 0.0004767789898173806, "loss": 0.0659, "step": 243 }, { "epoch": 0.43, "grad_norm": 0.19194231927394867, "learning_rate": 0.0004765824183248399, "loss": 0.0611, "step": 244 }, { "epoch": 0.43, "grad_norm": 0.16703608632087708, "learning_rate": 0.0004763850591954668, "loss": 0.0855, "step": 245 }, { "epoch": 0.44, "grad_norm": 0.3395439684391022, "learning_rate": 0.0004761869131153135, "loss": 0.0926, "step": 246 }, { "epoch": 0.44, "grad_norm": 0.2820179760456085, "learning_rate": 0.0004759879807731673, "loss": 0.0508, "step": 247 }, { "epoch": 0.44, "grad_norm": 0.20656561851501465, "learning_rate": 0.00047578826286054897, "loss": 0.068, "step": 248 }, { "epoch": 0.44, "grad_norm": 0.4477837383747101, "learning_rate": 0.00047558776007171024, "loss": 0.0918, "step": 249 }, { "epoch": 0.44, "grad_norm": 0.18997950851917267, "learning_rate": 0.0004753864731036307, "loss": 0.0734, "step": 250 }, { "epoch": 0.44, "grad_norm": 0.2841518521308899, "learning_rate": 0.0004751844026560163, "loss": 0.1194, "step": 251 }, { "epoch": 0.45, "grad_norm": 0.29770052433013916, "learning_rate": 0.0004749815494312963, "loss": 0.0996, "step": 252 }, { "epoch": 0.45, "grad_norm": 0.2982254922389984, "learning_rate": 0.00047477791413462104, "loss": 0.0945, "step": 253 }, { "epoch": 0.45, "grad_norm": 0.4625980854034424, "learning_rate": 0.00047457349747385936, "loss": 0.131, "step": 254 }, { "epoch": 0.45, "grad_norm": 0.29756709933280945, "learning_rate": 0.00047436830015959653, "loss": 0.1057, "step": 255 }, { "epoch": 0.45, "grad_norm": 0.19971434772014618, "learning_rate": 0.00047416232290513127, "loss": 0.0794, "step": 256 }, { "epoch": 0.45, "grad_norm": 0.12171836197376251, "learning_rate": 0.0004739555664264736, "loss": 0.0527, "step": 257 }, { "epoch": 0.46, "grad_norm": 0.23848529160022736, "learning_rate": 0.00047374803144234213, "loss": 0.134, "step": 258 }, { "epoch": 0.46, "grad_norm": 0.12673752009868622, "learning_rate": 0.0004735397186741618, "loss": 0.0774, "step": 259 }, { "epoch": 0.46, "grad_norm": 0.11961629241704941, "learning_rate": 0.00047333062884606116, "loss": 0.0661, "step": 260 }, { "epoch": 0.46, "grad_norm": 0.18004140257835388, "learning_rate": 0.00047312076268487, "loss": 0.1132, "step": 261 }, { "epoch": 0.46, "grad_norm": 0.1698005348443985, "learning_rate": 0.00047291012092011685, "loss": 0.057, "step": 262 }, { "epoch": 0.47, "grad_norm": 0.1949334442615509, "learning_rate": 0.0004726987042840263, "loss": 0.0703, "step": 263 }, { "epoch": 0.47, "grad_norm": 0.4016534686088562, "learning_rate": 0.0004724865135115163, "loss": 0.1178, "step": 264 }, { "epoch": 0.47, "grad_norm": 0.36885496973991394, "learning_rate": 0.00047227354934019605, "loss": 0.1303, "step": 265 }, { "epoch": 0.47, "grad_norm": 0.3214585483074188, "learning_rate": 0.00047205981251036334, "loss": 0.1019, "step": 266 }, { "epoch": 0.47, "grad_norm": 0.15313082933425903, "learning_rate": 0.0004718453037650016, "loss": 0.0581, "step": 267 }, { "epoch": 0.47, "grad_norm": 0.3251878321170807, "learning_rate": 0.0004716300238497775, "loss": 0.099, "step": 268 }, { "epoch": 0.48, "grad_norm": 0.20356950163841248, "learning_rate": 0.0004714139735130388, "loss": 0.0767, "step": 269 }, { "epoch": 0.48, "grad_norm": 0.2644464373588562, "learning_rate": 0.00047119715350581095, "loss": 0.1003, "step": 270 }, { "epoch": 0.48, "grad_norm": 0.22035302221775055, "learning_rate": 0.000470979564581795, "loss": 0.0722, "step": 271 }, { "epoch": 0.48, "grad_norm": 0.5284466743469238, "learning_rate": 0.0004707612074973653, "loss": 0.1282, "step": 272 }, { "epoch": 0.48, "grad_norm": 0.34032565355300903, "learning_rate": 0.0004705420830115658, "loss": 0.099, "step": 273 }, { "epoch": 0.48, "grad_norm": 0.26527565717697144, "learning_rate": 0.00047032219188610836, "loss": 0.0911, "step": 274 }, { "epoch": 0.49, "grad_norm": 0.2254990190267563, "learning_rate": 0.0004701015348853699, "loss": 0.0667, "step": 275 }, { "epoch": 0.49, "grad_norm": 0.21334387362003326, "learning_rate": 0.0004698801127763895, "loss": 0.0659, "step": 276 }, { "epoch": 0.49, "grad_norm": 0.2917044758796692, "learning_rate": 0.0004696579263288661, "loss": 0.1159, "step": 277 }, { "epoch": 0.49, "grad_norm": 0.14027804136276245, "learning_rate": 0.00046943497631515526, "loss": 0.0323, "step": 278 }, { "epoch": 0.49, "grad_norm": 0.3988366425037384, "learning_rate": 0.00046921126351026697, "loss": 0.0887, "step": 279 }, { "epoch": 0.5, "grad_norm": 0.36629319190979004, "learning_rate": 0.00046898678869186297, "loss": 0.1079, "step": 280 }, { "epoch": 0.5, "grad_norm": 0.35548141598701477, "learning_rate": 0.0004687615526402536, "loss": 0.1056, "step": 281 }, { "epoch": 0.5, "grad_norm": 0.21030637621879578, "learning_rate": 0.0004685355561383956, "loss": 0.0717, "step": 282 }, { "epoch": 0.5, "grad_norm": 0.24192889034748077, "learning_rate": 0.000468308799971889, "loss": 0.1047, "step": 283 }, { "epoch": 0.5, "grad_norm": 0.16289295256137848, "learning_rate": 0.00046808128492897464, "loss": 0.0519, "step": 284 }, { "epoch": 0.5, "eval_loss": 0.08938124030828476, "eval_runtime": 14.7518, "eval_samples_per_second": 32.335, "eval_steps_per_second": 8.135, "step": 284 }, { "epoch": 0.5, "grad_norm": 0.23021583259105682, "learning_rate": 0.00046785301180053126, "loss": 0.1161, "step": 285 }, { "epoch": 0.51, "grad_norm": 0.3577558398246765, "learning_rate": 0.0004676239813800729, "loss": 0.1239, "step": 286 }, { "epoch": 0.51, "grad_norm": 0.15293735265731812, "learning_rate": 0.0004673941944637461, "loss": 0.0401, "step": 287 }, { "epoch": 0.51, "grad_norm": 0.342631459236145, "learning_rate": 0.00046716365185032696, "loss": 0.1358, "step": 288 }, { "epoch": 0.51, "grad_norm": 0.4987104833126068, "learning_rate": 0.0004669323543412186, "loss": 0.1312, "step": 289 }, { "epoch": 0.51, "grad_norm": 0.21678434312343597, "learning_rate": 0.0004667003027404483, "loss": 0.0791, "step": 290 }, { "epoch": 0.51, "grad_norm": 0.2781723141670227, "learning_rate": 0.00046646749785466464, "loss": 0.0809, "step": 291 }, { "epoch": 0.52, "grad_norm": 0.3997693359851837, "learning_rate": 0.00046623394049313474, "loss": 0.0938, "step": 292 }, { "epoch": 0.52, "grad_norm": 0.2478984147310257, "learning_rate": 0.00046599963146774136, "loss": 0.0671, "step": 293 }, { "epoch": 0.52, "grad_norm": 0.35655421018600464, "learning_rate": 0.0004657645715929805, "loss": 0.107, "step": 294 }, { "epoch": 0.52, "grad_norm": 0.41986069083213806, "learning_rate": 0.0004655287616859577, "loss": 0.1381, "step": 295 }, { "epoch": 0.52, "grad_norm": 0.2831580340862274, "learning_rate": 0.00046529220256638626, "loss": 0.1012, "step": 296 }, { "epoch": 0.53, "grad_norm": 0.2183172106742859, "learning_rate": 0.0004650548950565835, "loss": 0.0883, "step": 297 }, { "epoch": 0.53, "grad_norm": 0.1485687792301178, "learning_rate": 0.0004648168399814684, "loss": 0.094, "step": 298 }, { "epoch": 0.53, "grad_norm": 0.3192533552646637, "learning_rate": 0.0004645780381685586, "loss": 0.1144, "step": 299 }, { "epoch": 0.53, "grad_norm": 0.20768460631370544, "learning_rate": 0.0004643384904479675, "loss": 0.1119, "step": 300 }, { "epoch": 0.53, "grad_norm": 0.16704390943050385, "learning_rate": 0.00046409819765240147, "loss": 0.0852, "step": 301 }, { "epoch": 0.53, "grad_norm": 0.33123648166656494, "learning_rate": 0.0004638571606171567, "loss": 0.1608, "step": 302 }, { "epoch": 0.54, "grad_norm": 0.408978134393692, "learning_rate": 0.0004636153801801167, "loss": 0.0906, "step": 303 }, { "epoch": 0.54, "grad_norm": 0.29201096296310425, "learning_rate": 0.00046337285718174896, "loss": 0.1237, "step": 304 }, { "epoch": 0.54, "grad_norm": 0.45836058259010315, "learning_rate": 0.00046312959246510237, "loss": 0.0926, "step": 305 }, { "epoch": 0.54, "grad_norm": 0.5405777096748352, "learning_rate": 0.0004628855868758041, "loss": 0.0727, "step": 306 }, { "epoch": 0.54, "grad_norm": 0.3068138062953949, "learning_rate": 0.00046264084126205676, "loss": 0.1006, "step": 307 }, { "epoch": 0.54, "grad_norm": 0.2990975081920624, "learning_rate": 0.00046239535647463534, "loss": 0.1033, "step": 308 }, { "epoch": 0.55, "grad_norm": 0.2938540279865265, "learning_rate": 0.00046214913336688424, "loss": 0.1084, "step": 309 }, { "epoch": 0.55, "grad_norm": 0.49840983748435974, "learning_rate": 0.00046190217279471466, "loss": 0.1066, "step": 310 }, { "epoch": 0.55, "grad_norm": 0.4558626711368561, "learning_rate": 0.000461654475616601, "loss": 0.121, "step": 311 }, { "epoch": 0.55, "grad_norm": 0.20964759588241577, "learning_rate": 0.0004614060426935786, "loss": 0.0843, "step": 312 }, { "epoch": 0.55, "grad_norm": 0.2254151701927185, "learning_rate": 0.00046115687488923983, "loss": 0.0781, "step": 313 }, { "epoch": 0.56, "grad_norm": 0.6117066740989685, "learning_rate": 0.0004609069730697322, "loss": 0.1208, "step": 314 }, { "epoch": 0.56, "grad_norm": 0.5352897644042969, "learning_rate": 0.0004606563381037544, "loss": 0.1056, "step": 315 }, { "epoch": 0.56, "grad_norm": 0.31001269817352295, "learning_rate": 0.00046040497086255385, "loss": 0.1213, "step": 316 }, { "epoch": 0.56, "grad_norm": 0.22637054324150085, "learning_rate": 0.0004601528722199234, "loss": 0.105, "step": 317 }, { "epoch": 0.56, "grad_norm": 0.20077432692050934, "learning_rate": 0.0004599000430521984, "loss": 0.0837, "step": 318 }, { "epoch": 0.56, "grad_norm": 0.24702684581279755, "learning_rate": 0.0004596464842382534, "loss": 0.0695, "step": 319 }, { "epoch": 0.57, "grad_norm": 0.253387987613678, "learning_rate": 0.0004593921966594997, "loss": 0.1184, "step": 320 }, { "epoch": 0.57, "grad_norm": 0.2703799605369568, "learning_rate": 0.0004591371811998817, "loss": 0.117, "step": 321 }, { "epoch": 0.57, "grad_norm": 0.23513701558113098, "learning_rate": 0.00045888143874587396, "loss": 0.1359, "step": 322 }, { "epoch": 0.57, "grad_norm": 0.25604313611984253, "learning_rate": 0.00045862497018647833, "loss": 0.1018, "step": 323 }, { "epoch": 0.57, "grad_norm": 0.16947636008262634, "learning_rate": 0.0004583677764132207, "loss": 0.0958, "step": 324 }, { "epoch": 0.57, "grad_norm": 0.20054908096790314, "learning_rate": 0.0004581098583201478, "loss": 0.0803, "step": 325 }, { "epoch": 0.58, "grad_norm": 0.12656472623348236, "learning_rate": 0.00045785121680382436, "loss": 0.0679, "step": 326 }, { "epoch": 0.58, "grad_norm": 0.1423499882221222, "learning_rate": 0.0004575918527633297, "loss": 0.0959, "step": 327 }, { "epoch": 0.58, "grad_norm": 0.36370569467544556, "learning_rate": 0.0004573317671002549, "loss": 0.1088, "step": 328 }, { "epoch": 0.58, "grad_norm": 0.18775340914726257, "learning_rate": 0.0004570709607186995, "loss": 0.0905, "step": 329 }, { "epoch": 0.58, "grad_norm": 0.14833571016788483, "learning_rate": 0.0004568094345252681, "loss": 0.0661, "step": 330 }, { "epoch": 0.59, "grad_norm": 0.2987270653247833, "learning_rate": 0.00045654718942906794, "loss": 0.0872, "step": 331 }, { "epoch": 0.59, "grad_norm": 0.21985827386379242, "learning_rate": 0.000456284226341705, "loss": 0.0882, "step": 332 }, { "epoch": 0.59, "grad_norm": 0.2726268470287323, "learning_rate": 0.00045602054617728093, "loss": 0.0864, "step": 333 }, { "epoch": 0.59, "grad_norm": 0.2882244884967804, "learning_rate": 0.00045575614985239057, "loss": 0.1032, "step": 334 }, { "epoch": 0.59, "grad_norm": 0.427500456571579, "learning_rate": 0.0004554910382861178, "loss": 0.1309, "step": 335 }, { "epoch": 0.59, "grad_norm": 0.43029338121414185, "learning_rate": 0.000455225212400033, "loss": 0.1071, "step": 336 }, { "epoch": 0.6, "grad_norm": 0.2297673523426056, "learning_rate": 0.0004549586731181896, "loss": 0.0526, "step": 337 }, { "epoch": 0.6, "grad_norm": 0.4533613920211792, "learning_rate": 0.0004546914213671209, "loss": 0.1154, "step": 338 }, { "epoch": 0.6, "grad_norm": 0.3973630666732788, "learning_rate": 0.0004544234580758367, "loss": 0.0707, "step": 339 }, { "epoch": 0.6, "grad_norm": 0.40036290884017944, "learning_rate": 0.0004541547841758207, "loss": 0.0932, "step": 340 }, { "epoch": 0.6, "grad_norm": 0.4273395240306854, "learning_rate": 0.0004538854006010262, "loss": 0.1112, "step": 341 }, { "epoch": 0.61, "grad_norm": 0.28109779953956604, "learning_rate": 0.0004536153082878738, "loss": 0.1003, "step": 342 }, { "epoch": 0.61, "grad_norm": 0.21950216591358185, "learning_rate": 0.00045334450817524776, "loss": 0.0538, "step": 343 }, { "epoch": 0.61, "grad_norm": 0.2968471646308899, "learning_rate": 0.00045307300120449263, "loss": 0.0775, "step": 344 }, { "epoch": 0.61, "grad_norm": 0.1488364040851593, "learning_rate": 0.00045280078831941024, "loss": 0.0513, "step": 345 }, { "epoch": 0.61, "grad_norm": 0.22750218212604523, "learning_rate": 0.00045252787046625624, "loss": 0.0943, "step": 346 }, { "epoch": 0.61, "grad_norm": 0.3048767149448395, "learning_rate": 0.0004522542485937369, "loss": 0.079, "step": 347 }, { "epoch": 0.62, "grad_norm": 0.33520030975341797, "learning_rate": 0.0004519799236530057, "loss": 0.1584, "step": 348 }, { "epoch": 0.62, "grad_norm": 0.20777581632137299, "learning_rate": 0.00045170489659766003, "loss": 0.0903, "step": 349 }, { "epoch": 0.62, "grad_norm": 0.1602245271205902, "learning_rate": 0.00045142916838373826, "loss": 0.0446, "step": 350 }, { "epoch": 0.62, "grad_norm": 0.2512218952178955, "learning_rate": 0.0004511527399697158, "loss": 0.069, "step": 351 }, { "epoch": 0.62, "grad_norm": 0.17349962890148163, "learning_rate": 0.0004508756123165021, "loss": 0.0765, "step": 352 }, { "epoch": 0.62, "grad_norm": 0.26563215255737305, "learning_rate": 0.00045059778638743744, "loss": 0.0966, "step": 353 }, { "epoch": 0.63, "grad_norm": 0.23987066745758057, "learning_rate": 0.00045031926314828926, "loss": 0.0702, "step": 354 }, { "epoch": 0.63, "grad_norm": 0.21901372075080872, "learning_rate": 0.000450040043567249, "loss": 0.0457, "step": 355 }, { "epoch": 0.63, "grad_norm": 0.24179872870445251, "learning_rate": 0.00044976012861492877, "loss": 0.0651, "step": 356 }, { "epoch": 0.63, "grad_norm": 0.3544818162918091, "learning_rate": 0.0004494795192643578, "loss": 0.0622, "step": 357 }, { "epoch": 0.63, "grad_norm": 0.4363332986831665, "learning_rate": 0.00044919821649097916, "loss": 0.0972, "step": 358 }, { "epoch": 0.64, "grad_norm": 0.43788430094718933, "learning_rate": 0.0004489162212726465, "loss": 0.0843, "step": 359 }, { "epoch": 0.64, "grad_norm": 0.5084832906723022, "learning_rate": 0.00044863353458962044, "loss": 0.0888, "step": 360 }, { "epoch": 0.64, "grad_norm": 0.44660842418670654, "learning_rate": 0.0004483501574245652, "loss": 0.113, "step": 361 }, { "epoch": 0.64, "grad_norm": 0.7528813481330872, "learning_rate": 0.0004480660907625452, "loss": 0.0512, "step": 362 }, { "epoch": 0.64, "grad_norm": 0.9723535776138306, "learning_rate": 0.0004477813355910219, "loss": 0.1154, "step": 363 }, { "epoch": 0.64, "grad_norm": 0.2641480565071106, "learning_rate": 0.0004474958928998498, "loss": 0.0575, "step": 364 }, { "epoch": 0.65, "grad_norm": 0.12234170734882355, "learning_rate": 0.00044720976368127355, "loss": 0.0441, "step": 365 }, { "epoch": 0.65, "grad_norm": 0.26976636052131653, "learning_rate": 0.00044692294892992416, "loss": 0.0676, "step": 366 }, { "epoch": 0.65, "grad_norm": 0.22729526460170746, "learning_rate": 0.00044663544964281573, "loss": 0.098, "step": 367 }, { "epoch": 0.65, "grad_norm": 0.2270442545413971, "learning_rate": 0.0004463472668193419, "loss": 0.0842, "step": 368 }, { "epoch": 0.65, "grad_norm": 0.19249562919139862, "learning_rate": 0.0004460584014612724, "loss": 0.0537, "step": 369 }, { "epoch": 0.65, "grad_norm": 0.22312623262405396, "learning_rate": 0.0004457688545727496, "loss": 0.0547, "step": 370 }, { "epoch": 0.66, "grad_norm": 0.281658411026001, "learning_rate": 0.0004454786271602849, "loss": 0.089, "step": 371 }, { "epoch": 0.66, "grad_norm": 0.49952250719070435, "learning_rate": 0.00044518772023275526, "loss": 0.1298, "step": 372 }, { "epoch": 0.66, "grad_norm": 0.186232328414917, "learning_rate": 0.0004448961348013999, "loss": 0.0628, "step": 373 }, { "epoch": 0.66, "grad_norm": 0.2980823814868927, "learning_rate": 0.0004446038718798166, "loss": 0.0828, "step": 374 }, { "epoch": 0.66, "grad_norm": 0.3794187605381012, "learning_rate": 0.00044431093248395806, "loss": 0.0776, "step": 375 }, { "epoch": 0.67, "grad_norm": 0.29262277483940125, "learning_rate": 0.0004440173176321287, "loss": 0.0924, "step": 376 }, { "epoch": 0.67, "grad_norm": 0.30543988943099976, "learning_rate": 0.0004437230283449808, "loss": 0.1264, "step": 377 }, { "epoch": 0.67, "grad_norm": 0.3436485826969147, "learning_rate": 0.0004434280656455111, "loss": 0.1066, "step": 378 }, { "epoch": 0.67, "grad_norm": 0.23679965734481812, "learning_rate": 0.0004431324305590572, "loss": 0.075, "step": 379 }, { "epoch": 0.67, "grad_norm": 0.4399561882019043, "learning_rate": 0.0004428361241132943, "loss": 0.1445, "step": 380 }, { "epoch": 0.67, "grad_norm": 0.39203163981437683, "learning_rate": 0.0004425391473382309, "loss": 0.0995, "step": 381 }, { "epoch": 0.68, "grad_norm": 0.4687665104866028, "learning_rate": 0.0004422415012662061, "loss": 0.1489, "step": 382 }, { "epoch": 0.68, "grad_norm": 0.2634904086589813, "learning_rate": 0.00044194318693188526, "loss": 0.1164, "step": 383 }, { "epoch": 0.68, "grad_norm": 0.23031170666217804, "learning_rate": 0.0004416442053722569, "loss": 0.0742, "step": 384 }, { "epoch": 0.68, "grad_norm": 0.30467960238456726, "learning_rate": 0.00044134455762662894, "loss": 0.0984, "step": 385 }, { "epoch": 0.68, "grad_norm": 0.16692829132080078, "learning_rate": 0.0004410442447366249, "loss": 0.0732, "step": 386 }, { "epoch": 0.68, "grad_norm": 0.20100833475589752, "learning_rate": 0.00044074326774618065, "loss": 0.1082, "step": 387 }, { "epoch": 0.69, "grad_norm": 0.29799607396125793, "learning_rate": 0.0004404416277015404, "loss": 0.0761, "step": 388 }, { "epoch": 0.69, "grad_norm": 0.647639274597168, "learning_rate": 0.0004401393256512534, "loss": 0.1218, "step": 389 }, { "epoch": 0.69, "grad_norm": 0.2610540986061096, "learning_rate": 0.00043983636264617013, "loss": 0.0923, "step": 390 }, { "epoch": 0.69, "grad_norm": 0.4049086570739746, "learning_rate": 0.0004395327397394384, "loss": 0.1091, "step": 391 }, { "epoch": 0.69, "grad_norm": 0.36092105507850647, "learning_rate": 0.00043922845798650034, "loss": 0.0927, "step": 392 }, { "epoch": 0.7, "grad_norm": 0.542421281337738, "learning_rate": 0.00043892351844508805, "loss": 0.1014, "step": 393 }, { "epoch": 0.7, "grad_norm": 0.291595995426178, "learning_rate": 0.0004386179221752202, "loss": 0.0902, "step": 394 }, { "epoch": 0.7, "grad_norm": 0.17152707278728485, "learning_rate": 0.0004383116702391987, "loss": 0.0651, "step": 395 }, { "epoch": 0.7, "grad_norm": 0.16654878854751587, "learning_rate": 0.00043800476370160416, "loss": 0.0824, "step": 396 }, { "epoch": 0.7, "grad_norm": 0.18530108034610748, "learning_rate": 0.000437697203629293, "loss": 0.0549, "step": 397 }, { "epoch": 0.7, "grad_norm": 0.5760988593101501, "learning_rate": 0.0004373889910913934, "loss": 0.0803, "step": 398 }, { "epoch": 0.71, "grad_norm": 0.4253963232040405, "learning_rate": 0.00043708012715930154, "loss": 0.0728, "step": 399 }, { "epoch": 0.71, "grad_norm": 0.7932385206222534, "learning_rate": 0.00043677061290667805, "loss": 0.1442, "step": 400 }, { "epoch": 0.71, "grad_norm": 0.5904709696769714, "learning_rate": 0.00043646044940944407, "loss": 0.0999, "step": 401 }, { "epoch": 0.71, "grad_norm": 0.9570127129554749, "learning_rate": 0.0004361496377457777, "loss": 0.1298, "step": 402 }, { "epoch": 0.71, "grad_norm": 0.5049470663070679, "learning_rate": 0.00043583817899611017, "loss": 0.0263, "step": 403 }, { "epoch": 0.71, "grad_norm": 0.589408814907074, "learning_rate": 0.00043552607424312195, "loss": 0.1051, "step": 404 }, { "epoch": 0.72, "grad_norm": 0.43722283840179443, "learning_rate": 0.0004352133245717393, "loss": 0.0715, "step": 405 }, { "epoch": 0.72, "grad_norm": 1.3537758588790894, "learning_rate": 0.00043489993106913036, "loss": 0.0322, "step": 406 }, { "epoch": 0.72, "grad_norm": 0.3382836580276489, "learning_rate": 0.000434585894824701, "loss": 0.0818, "step": 407 }, { "epoch": 0.72, "grad_norm": 0.9946733713150024, "learning_rate": 0.00043427121693009164, "loss": 0.1536, "step": 408 }, { "epoch": 0.72, "grad_norm": 0.9138526320457458, "learning_rate": 0.0004339558984791732, "loss": 0.1299, "step": 409 }, { "epoch": 0.73, "grad_norm": 0.35993850231170654, "learning_rate": 0.0004336399405680432, "loss": 0.0654, "step": 410 }, { "epoch": 0.73, "grad_norm": 0.30418309569358826, "learning_rate": 0.0004333233442950219, "loss": 0.1026, "step": 411 }, { "epoch": 0.73, "grad_norm": 0.256728857755661, "learning_rate": 0.00043300611076064886, "loss": 0.083, "step": 412 }, { "epoch": 0.73, "grad_norm": 0.3472314774990082, "learning_rate": 0.00043268824106767865, "loss": 0.0637, "step": 413 }, { "epoch": 0.73, "grad_norm": 0.5615578293800354, "learning_rate": 0.00043236973632107735, "loss": 0.1028, "step": 414 }, { "epoch": 0.73, "grad_norm": 0.35775747895240784, "learning_rate": 0.00043205059762801854, "loss": 0.0829, "step": 415 }, { "epoch": 0.74, "grad_norm": 0.287270724773407, "learning_rate": 0.0004317308260978795, "loss": 0.0718, "step": 416 }, { "epoch": 0.74, "grad_norm": 0.3237059414386749, "learning_rate": 0.00043141042284223737, "loss": 0.0797, "step": 417 }, { "epoch": 0.74, "grad_norm": 0.2153153419494629, "learning_rate": 0.0004310893889748653, "loss": 0.0778, "step": 418 }, { "epoch": 0.74, "grad_norm": 0.33600860834121704, "learning_rate": 0.00043076772561172845, "loss": 0.0594, "step": 419 }, { "epoch": 0.74, "grad_norm": 0.8778895139694214, "learning_rate": 0.00043044543387098027, "loss": 0.1722, "step": 420 }, { "epoch": 0.74, "grad_norm": 0.503434419631958, "learning_rate": 0.0004301225148729586, "loss": 0.1228, "step": 421 }, { "epoch": 0.75, "grad_norm": 0.3694842457771301, "learning_rate": 0.00042979896974018166, "loss": 0.1033, "step": 422 }, { "epoch": 0.75, "grad_norm": 0.329818457365036, "learning_rate": 0.00042947479959734423, "loss": 0.0471, "step": 423 }, { "epoch": 0.75, "grad_norm": 0.19436267018318176, "learning_rate": 0.0004291500055713138, "loss": 0.0409, "step": 424 }, { "epoch": 0.75, "grad_norm": 0.3269858658313751, "learning_rate": 0.0004288245887911263, "loss": 0.096, "step": 425 }, { "epoch": 0.75, "grad_norm": 0.3542991578578949, "learning_rate": 0.00042849855038798283, "loss": 0.0986, "step": 426 }, { "epoch": 0.75, "eval_loss": 0.07792978733778, "eval_runtime": 14.8115, "eval_samples_per_second": 32.205, "eval_steps_per_second": 8.102, "step": 426 }, { "epoch": 0.76, "grad_norm": 0.29685521125793457, "learning_rate": 0.00042817189149524517, "loss": 0.11, "step": 427 }, { "epoch": 0.76, "grad_norm": 0.2116887867450714, "learning_rate": 0.00042784461324843194, "loss": 0.0686, "step": 428 }, { "epoch": 0.76, "grad_norm": 0.4866883456707001, "learning_rate": 0.00042751671678521486, "loss": 0.0824, "step": 429 }, { "epoch": 0.76, "grad_norm": 0.1293468475341797, "learning_rate": 0.00042718820324541475, "loss": 0.0464, "step": 430 }, { "epoch": 0.76, "grad_norm": 0.3253125250339508, "learning_rate": 0.0004268590737709972, "loss": 0.0996, "step": 431 }, { "epoch": 0.76, "grad_norm": 0.25559771060943604, "learning_rate": 0.00042652932950606917, "loss": 0.0545, "step": 432 }, { "epoch": 0.77, "grad_norm": 0.2788093686103821, "learning_rate": 0.0004261989715968746, "loss": 0.0502, "step": 433 }, { "epoch": 0.77, "grad_norm": 0.6902124285697937, "learning_rate": 0.00042586800119179046, "loss": 0.1598, "step": 434 }, { "epoch": 0.77, "grad_norm": 0.4788605570793152, "learning_rate": 0.00042553641944132316, "loss": 0.1552, "step": 435 }, { "epoch": 0.77, "grad_norm": 0.43495067954063416, "learning_rate": 0.00042520422749810395, "loss": 0.0907, "step": 436 }, { "epoch": 0.77, "grad_norm": 0.3549440801143646, "learning_rate": 0.0004248714265168853, "loss": 0.1152, "step": 437 }, { "epoch": 0.77, "grad_norm": 0.7210204601287842, "learning_rate": 0.00042453801765453687, "loss": 0.1891, "step": 438 }, { "epoch": 0.78, "grad_norm": 0.4578750729560852, "learning_rate": 0.00042420400207004126, "loss": 0.1383, "step": 439 }, { "epoch": 0.78, "grad_norm": 0.3323976993560791, "learning_rate": 0.00042386938092449036, "loss": 0.0936, "step": 440 }, { "epoch": 0.78, "grad_norm": 0.15145371854305267, "learning_rate": 0.00042353415538108076, "loss": 0.0608, "step": 441 }, { "epoch": 0.78, "grad_norm": 0.10744435340166092, "learning_rate": 0.00042319832660511037, "loss": 0.0865, "step": 442 }, { "epoch": 0.78, "grad_norm": 0.13599476218223572, "learning_rate": 0.0004228618957639738, "loss": 0.0763, "step": 443 }, { "epoch": 0.79, "grad_norm": 0.18250028789043427, "learning_rate": 0.00042252486402715865, "loss": 0.0813, "step": 444 }, { "epoch": 0.79, "grad_norm": 0.5180188417434692, "learning_rate": 0.00042218723256624136, "loss": 0.1603, "step": 445 }, { "epoch": 0.79, "grad_norm": 0.2943187355995178, "learning_rate": 0.000421849002554883, "loss": 0.1031, "step": 446 }, { "epoch": 0.79, "grad_norm": 0.14898087084293365, "learning_rate": 0.0004215101751688253, "loss": 0.071, "step": 447 }, { "epoch": 0.79, "grad_norm": 0.2951905131340027, "learning_rate": 0.00042117075158588663, "loss": 0.0772, "step": 448 }, { "epoch": 0.79, "grad_norm": 0.39807453751564026, "learning_rate": 0.00042083073298595787, "loss": 0.0561, "step": 449 }, { "epoch": 0.8, "grad_norm": 0.45217999815940857, "learning_rate": 0.0004204901205509981, "loss": 0.1076, "step": 450 }, { "epoch": 0.8, "grad_norm": 0.24114732444286346, "learning_rate": 0.000420148915465031, "loss": 0.1169, "step": 451 }, { "epoch": 0.8, "grad_norm": 0.6120204329490662, "learning_rate": 0.00041980711891413994, "loss": 0.1144, "step": 452 }, { "epoch": 0.8, "grad_norm": 0.3900619447231293, "learning_rate": 0.0004194647320864646, "loss": 0.0806, "step": 453 }, { "epoch": 0.8, "grad_norm": 0.3331635296344757, "learning_rate": 0.0004191217561721967, "loss": 0.0655, "step": 454 }, { "epoch": 0.8, "grad_norm": 0.29893186688423157, "learning_rate": 0.0004187781923635753, "loss": 0.0482, "step": 455 }, { "epoch": 0.81, "grad_norm": 0.20024164021015167, "learning_rate": 0.00041843404185488346, "loss": 0.0773, "step": 456 }, { "epoch": 0.81, "grad_norm": 0.3644329905509949, "learning_rate": 0.0004180893058424435, "loss": 0.1062, "step": 457 }, { "epoch": 0.81, "grad_norm": 0.5457159280776978, "learning_rate": 0.0004177439855246132, "loss": 0.1901, "step": 458 }, { "epoch": 0.81, "grad_norm": 0.282032310962677, "learning_rate": 0.0004173980821017812, "loss": 0.0656, "step": 459 }, { "epoch": 0.81, "grad_norm": 0.1957680881023407, "learning_rate": 0.00041705159677636334, "loss": 0.0725, "step": 460 }, { "epoch": 0.82, "grad_norm": 0.2736223042011261, "learning_rate": 0.00041670453075279827, "loss": 0.0897, "step": 461 }, { "epoch": 0.82, "grad_norm": 0.2145017832517624, "learning_rate": 0.0004163568852375431, "loss": 0.046, "step": 462 }, { "epoch": 0.82, "grad_norm": 0.1434750258922577, "learning_rate": 0.00041600866143906947, "loss": 0.0483, "step": 463 }, { "epoch": 0.82, "grad_norm": 0.2438279092311859, "learning_rate": 0.000415659860567859, "loss": 0.0935, "step": 464 }, { "epoch": 0.82, "grad_norm": 0.24830487370491028, "learning_rate": 0.00041531048383639966, "loss": 0.1061, "step": 465 }, { "epoch": 0.82, "grad_norm": 0.25185227394104004, "learning_rate": 0.000414960532459181, "loss": 0.082, "step": 466 }, { "epoch": 0.83, "grad_norm": 0.391631156206131, "learning_rate": 0.00041461000765269, "loss": 0.1274, "step": 467 }, { "epoch": 0.83, "grad_norm": 0.30484774708747864, "learning_rate": 0.0004142589106354071, "loss": 0.0672, "step": 468 }, { "epoch": 0.83, "grad_norm": 0.2584599554538727, "learning_rate": 0.0004139072426278021, "loss": 0.0863, "step": 469 }, { "epoch": 0.83, "grad_norm": 0.27182772755622864, "learning_rate": 0.0004135550048523292, "loss": 0.0996, "step": 470 }, { "epoch": 0.83, "grad_norm": 0.2670001685619354, "learning_rate": 0.00041320219853342347, "loss": 0.0592, "step": 471 }, { "epoch": 0.84, "grad_norm": 0.19571639597415924, "learning_rate": 0.0004128488248974962, "loss": 0.0618, "step": 472 }, { "epoch": 0.84, "grad_norm": 0.436814546585083, "learning_rate": 0.00041249488517293095, "loss": 0.1131, "step": 473 }, { "epoch": 0.84, "grad_norm": 0.21684250235557556, "learning_rate": 0.0004121403805900789, "loss": 0.0759, "step": 474 }, { "epoch": 0.84, "grad_norm": 0.39313605427742004, "learning_rate": 0.0004117853123812549, "loss": 0.0992, "step": 475 }, { "epoch": 0.84, "grad_norm": 0.3653202950954437, "learning_rate": 0.00041142968178073294, "loss": 0.099, "step": 476 }, { "epoch": 0.84, "grad_norm": 0.36615628004074097, "learning_rate": 0.00041107349002474206, "loss": 0.06, "step": 477 }, { "epoch": 0.85, "grad_norm": 0.2431243658065796, "learning_rate": 0.00041071673835146194, "loss": 0.0689, "step": 478 }, { "epoch": 0.85, "grad_norm": 0.7869367599487305, "learning_rate": 0.00041035942800101864, "loss": 0.1308, "step": 479 }, { "epoch": 0.85, "grad_norm": 0.2831230163574219, "learning_rate": 0.0004100015602154802, "loss": 0.087, "step": 480 }, { "epoch": 0.85, "grad_norm": 0.3709629774093628, "learning_rate": 0.0004096431362388525, "loss": 0.0822, "step": 481 }, { "epoch": 0.85, "grad_norm": 0.4082586467266083, "learning_rate": 0.0004092841573170748, "loss": 0.1114, "step": 482 }, { "epoch": 0.85, "grad_norm": 0.2919554114341736, "learning_rate": 0.0004089246246980154, "loss": 0.1059, "step": 483 }, { "epoch": 0.86, "grad_norm": 0.3750731945037842, "learning_rate": 0.0004085645396314673, "loss": 0.082, "step": 484 }, { "epoch": 0.86, "grad_norm": 0.21013659238815308, "learning_rate": 0.000408203903369144, "loss": 0.0819, "step": 485 }, { "epoch": 0.86, "grad_norm": 0.20771674811840057, "learning_rate": 0.00040784271716467503, "loss": 0.0687, "step": 486 }, { "epoch": 0.86, "grad_norm": 0.157434344291687, "learning_rate": 0.00040748098227360154, "loss": 0.0826, "step": 487 }, { "epoch": 0.86, "grad_norm": 0.40467727184295654, "learning_rate": 0.000407118699953372, "loss": 0.1131, "step": 488 }, { "epoch": 0.87, "grad_norm": 0.17521728575229645, "learning_rate": 0.0004067558714633378, "loss": 0.116, "step": 489 }, { "epoch": 0.87, "grad_norm": 0.2975709140300751, "learning_rate": 0.0004063924980647492, "loss": 0.0787, "step": 490 }, { "epoch": 0.87, "grad_norm": 0.22513332962989807, "learning_rate": 0.0004060285810207503, "loss": 0.0754, "step": 491 }, { "epoch": 0.87, "grad_norm": 0.2939409613609314, "learning_rate": 0.00040566412159637514, "loss": 0.0505, "step": 492 }, { "epoch": 0.87, "grad_norm": 0.21415212750434875, "learning_rate": 0.000405299121058543, "loss": 0.0486, "step": 493 }, { "epoch": 0.87, "grad_norm": 0.24846945703029633, "learning_rate": 0.00040493358067605445, "loss": 0.0645, "step": 494 }, { "epoch": 0.88, "grad_norm": 0.42928287386894226, "learning_rate": 0.00040456750171958655, "loss": 0.1455, "step": 495 }, { "epoch": 0.88, "grad_norm": 0.30920714139938354, "learning_rate": 0.0004042008854616883, "loss": 0.0743, "step": 496 }, { "epoch": 0.88, "grad_norm": 0.43211719393730164, "learning_rate": 0.00040383373317677687, "loss": 0.1037, "step": 497 }, { "epoch": 0.88, "grad_norm": 0.49942275881767273, "learning_rate": 0.00040346604614113215, "loss": 0.123, "step": 498 }, { "epoch": 0.88, "grad_norm": 0.18615621328353882, "learning_rate": 0.00040309782563289353, "loss": 0.0783, "step": 499 }, { "epoch": 0.88, "grad_norm": 0.22238926589488983, "learning_rate": 0.0004027290729320545, "loss": 0.0698, "step": 500 }, { "epoch": 0.89, "grad_norm": 0.31746548414230347, "learning_rate": 0.0004023597893204586, "loss": 0.1682, "step": 501 }, { "epoch": 0.89, "grad_norm": 0.19328100979328156, "learning_rate": 0.00040198997608179477, "loss": 0.1028, "step": 502 }, { "epoch": 0.89, "grad_norm": 0.15466806292533875, "learning_rate": 0.00040161963450159333, "loss": 0.065, "step": 503 }, { "epoch": 0.89, "grad_norm": 0.3000398874282837, "learning_rate": 0.00040124876586722103, "loss": 0.1071, "step": 504 }, { "epoch": 0.89, "grad_norm": 0.16753748059272766, "learning_rate": 0.00040087737146787654, "loss": 0.056, "step": 505 }, { "epoch": 0.9, "grad_norm": 0.17570586502552032, "learning_rate": 0.00040050545259458654, "loss": 0.0732, "step": 506 }, { "epoch": 0.9, "grad_norm": 0.19240190088748932, "learning_rate": 0.00040013301054020055, "loss": 0.0444, "step": 507 }, { "epoch": 0.9, "grad_norm": 0.23935984075069427, "learning_rate": 0.00039976004659938714, "loss": 0.0583, "step": 508 }, { "epoch": 0.9, "grad_norm": 0.22633028030395508, "learning_rate": 0.00039938656206862857, "loss": 0.065, "step": 509 }, { "epoch": 0.9, "grad_norm": 0.18621531128883362, "learning_rate": 0.000399012558246217, "loss": 0.0489, "step": 510 }, { "epoch": 0.9, "grad_norm": 0.37711310386657715, "learning_rate": 0.0003986380364322498, "loss": 0.1367, "step": 511 }, { "epoch": 0.91, "grad_norm": 0.26448771357536316, "learning_rate": 0.00039826299792862475, "loss": 0.076, "step": 512 }, { "epoch": 0.91, "grad_norm": 0.22461633384227753, "learning_rate": 0.00039788744403903604, "loss": 0.0734, "step": 513 }, { "epoch": 0.91, "grad_norm": 0.23908165097236633, "learning_rate": 0.00039751137606896907, "loss": 0.0718, "step": 514 }, { "epoch": 0.91, "grad_norm": 0.37807080149650574, "learning_rate": 0.00039713479532569646, "loss": 0.1495, "step": 515 }, { "epoch": 0.91, "grad_norm": 0.16840259730815887, "learning_rate": 0.00039675770311827337, "loss": 0.0491, "step": 516 }, { "epoch": 0.91, "grad_norm": 0.35179728269577026, "learning_rate": 0.00039638010075753274, "loss": 0.0839, "step": 517 }, { "epoch": 0.92, "grad_norm": 0.3631207048892975, "learning_rate": 0.00039600198955608084, "loss": 0.1348, "step": 518 }, { "epoch": 0.92, "grad_norm": 0.38650691509246826, "learning_rate": 0.00039562337082829304, "loss": 0.15, "step": 519 }, { "epoch": 0.92, "grad_norm": 0.2523843050003052, "learning_rate": 0.00039524424589030866, "loss": 0.1172, "step": 520 }, { "epoch": 0.92, "grad_norm": 0.2690166234970093, "learning_rate": 0.00039486461606002686, "loss": 0.0619, "step": 521 }, { "epoch": 0.92, "grad_norm": 0.31193405389785767, "learning_rate": 0.0003944844826571018, "loss": 0.0834, "step": 522 }, { "epoch": 0.93, "grad_norm": 0.21751855313777924, "learning_rate": 0.00039410384700293814, "loss": 0.068, "step": 523 }, { "epoch": 0.93, "grad_norm": 0.34191232919692993, "learning_rate": 0.0003937227104206865, "loss": 0.1337, "step": 524 }, { "epoch": 0.93, "grad_norm": 0.34457269310951233, "learning_rate": 0.0003933410742352388, "loss": 0.0929, "step": 525 }, { "epoch": 0.93, "grad_norm": 0.22599942982196808, "learning_rate": 0.0003929589397732236, "loss": 0.0899, "step": 526 }, { "epoch": 0.93, "grad_norm": 0.23162932693958282, "learning_rate": 0.0003925763083630017, "loss": 0.0869, "step": 527 }, { "epoch": 0.93, "grad_norm": 0.19502510130405426, "learning_rate": 0.00039219318133466104, "loss": 0.0834, "step": 528 }, { "epoch": 0.94, "grad_norm": 0.2539670169353485, "learning_rate": 0.0003918095600200128, "loss": 0.0589, "step": 529 }, { "epoch": 0.94, "grad_norm": 0.15578749775886536, "learning_rate": 0.00039142544575258614, "loss": 0.0471, "step": 530 }, { "epoch": 0.94, "grad_norm": 0.41006144881248474, "learning_rate": 0.00039104083986762396, "loss": 0.1215, "step": 531 }, { "epoch": 0.94, "grad_norm": 0.3161672055721283, "learning_rate": 0.00039065574370207785, "loss": 0.0599, "step": 532 }, { "epoch": 0.94, "grad_norm": 0.2556127607822418, "learning_rate": 0.00039027015859460394, "loss": 0.0882, "step": 533 }, { "epoch": 0.94, "grad_norm": 0.5484500527381897, "learning_rate": 0.000389884085885558, "loss": 0.1342, "step": 534 }, { "epoch": 0.95, "grad_norm": 0.3688224256038666, "learning_rate": 0.0003894975269169906, "loss": 0.062, "step": 535 }, { "epoch": 0.95, "grad_norm": 0.6328185796737671, "learning_rate": 0.0003891104830326427, "loss": 0.1068, "step": 536 }, { "epoch": 0.95, "grad_norm": 0.5094593167304993, "learning_rate": 0.00038872295557794103, "loss": 0.0593, "step": 537 }, { "epoch": 0.95, "grad_norm": 0.44920942187309265, "learning_rate": 0.0003883349458999931, "loss": 0.1134, "step": 538 }, { "epoch": 0.95, "grad_norm": 0.25559201836586, "learning_rate": 0.0003879464553475828, "loss": 0.0842, "step": 539 }, { "epoch": 0.96, "grad_norm": 0.24992522597312927, "learning_rate": 0.0003875574852711656, "loss": 0.0684, "step": 540 }, { "epoch": 0.96, "grad_norm": 0.7482407093048096, "learning_rate": 0.0003871680370228639, "loss": 0.1698, "step": 541 }, { "epoch": 0.96, "grad_norm": 0.42716777324676514, "learning_rate": 0.00038677811195646233, "loss": 0.1335, "step": 542 }, { "epoch": 0.96, "grad_norm": 0.5867021083831787, "learning_rate": 0.0003863877114274029, "loss": 0.153, "step": 543 }, { "epoch": 0.96, "grad_norm": 0.14882822334766388, "learning_rate": 0.0003859968367927805, "loss": 0.0548, "step": 544 }, { "epoch": 0.96, "grad_norm": 0.16213174164295197, "learning_rate": 0.0003856054894113381, "loss": 0.0859, "step": 545 }, { "epoch": 0.97, "grad_norm": 0.13216906785964966, "learning_rate": 0.0003852136706434619, "loss": 0.0837, "step": 546 }, { "epoch": 0.97, "grad_norm": 0.28230682015419006, "learning_rate": 0.00038482138185117685, "loss": 0.0746, "step": 547 }, { "epoch": 0.97, "grad_norm": 0.15776745975017548, "learning_rate": 0.0003844286243981417, "loss": 0.0758, "step": 548 }, { "epoch": 0.97, "grad_norm": 0.38748612999916077, "learning_rate": 0.0003840353996496444, "loss": 0.0946, "step": 549 }, { "epoch": 0.97, "grad_norm": 0.4377779960632324, "learning_rate": 0.0003836417089725971, "loss": 0.078, "step": 550 }, { "epoch": 0.97, "grad_norm": 0.4776962101459503, "learning_rate": 0.0003832475537355319, "loss": 0.0996, "step": 551 }, { "epoch": 0.98, "grad_norm": 0.16078083217144012, "learning_rate": 0.00038285293530859553, "loss": 0.0813, "step": 552 }, { "epoch": 0.98, "grad_norm": 0.19620949029922485, "learning_rate": 0.00038245785506354514, "loss": 0.0716, "step": 553 }, { "epoch": 0.98, "grad_norm": 0.23539945483207703, "learning_rate": 0.0003820623143737427, "loss": 0.0727, "step": 554 }, { "epoch": 0.98, "grad_norm": 0.2797366678714752, "learning_rate": 0.0003816663146141514, "loss": 0.0307, "step": 555 }, { "epoch": 0.98, "grad_norm": 0.31704849004745483, "learning_rate": 0.00038126985716132976, "loss": 0.0522, "step": 556 }, { "epoch": 0.99, "grad_norm": 1.038294792175293, "learning_rate": 0.00038087294339342765, "loss": 0.1602, "step": 557 }, { "epoch": 0.99, "grad_norm": 0.39535316824913025, "learning_rate": 0.00038047557469018077, "loss": 0.0672, "step": 558 }, { "epoch": 0.99, "grad_norm": 0.5337291359901428, "learning_rate": 0.00038007775243290666, "loss": 0.238, "step": 559 }, { "epoch": 0.99, "grad_norm": 0.7618711590766907, "learning_rate": 0.0003796794780044992, "loss": 0.0741, "step": 560 }, { "epoch": 0.99, "grad_norm": 0.3507292568683624, "learning_rate": 0.0003792807527894242, "loss": 0.1035, "step": 561 }, { "epoch": 0.99, "grad_norm": 0.29699352383613586, "learning_rate": 0.00037888157817371455, "loss": 0.0732, "step": 562 }, { "epoch": 1.0, "grad_norm": 0.1690889596939087, "learning_rate": 0.0003784819555449651, "loss": 0.0625, "step": 563 }, { "epoch": 1.0, "grad_norm": 0.28516581654548645, "learning_rate": 0.0003780818862923284, "loss": 0.0705, "step": 564 }, { "epoch": 1.0, "grad_norm": 0.3408360481262207, "learning_rate": 0.00037768137180650913, "loss": 0.1025, "step": 565 } ], "logging_steps": 1, "max_steps": 1695, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 565, "total_flos": 5.169945694856806e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }