{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 372040, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "grad_norm": 0.772857129573822, "learning_rate": 3.125e-05, "loss": 6.1994, "step": 1000 }, { "epoch": 0.11, "grad_norm": 0.9309808611869812, "learning_rate": 6.25e-05, "loss": 4.9973, "step": 2000 }, { "epoch": 0.16, "grad_norm": 0.893162190914154, "learning_rate": 9.375e-05, "loss": 4.6739, "step": 3000 }, { "epoch": 0.22, "grad_norm": 0.7998523116111755, "learning_rate": 0.000125, "loss": 4.4521, "step": 4000 }, { "epoch": 0.27, "grad_norm": 0.7740798592567444, "learning_rate": 0.00015625, "loss": 4.2992, "step": 5000 }, { "epoch": 0.32, "grad_norm": 0.725793719291687, "learning_rate": 0.0001875, "loss": 4.181, "step": 6000 }, { "epoch": 0.38, "grad_norm": 0.7274988889694214, "learning_rate": 0.00021875, "loss": 4.0768, "step": 7000 }, { "epoch": 0.43, "grad_norm": 0.6781344413757324, "learning_rate": 0.00025, "loss": 3.9903, "step": 8000 }, { "epoch": 0.48, "grad_norm": 0.6776793599128723, "learning_rate": 0.00028121875, "loss": 3.9081, "step": 9000 }, { "epoch": 0.54, "grad_norm": 0.6176994442939758, "learning_rate": 0.00031246875000000003, "loss": 3.8477, "step": 10000 }, { "epoch": 0.59, "grad_norm": 0.5840777158737183, "learning_rate": 0.00034368749999999997, "loss": 3.8126, "step": 11000 }, { "epoch": 0.65, "grad_norm": 0.5407571792602539, "learning_rate": 0.00037490625, "loss": 3.7654, "step": 12000 }, { "epoch": 0.7, "grad_norm": 0.5007668137550354, "learning_rate": 0.00040615625, "loss": 3.728, "step": 13000 }, { "epoch": 0.75, "grad_norm": 0.46616649627685547, "learning_rate": 0.00043737500000000005, "loss": 3.6994, "step": 14000 }, { "epoch": 0.81, "grad_norm": 0.40752965211868286, "learning_rate": 0.000468625, "loss": 3.6746, "step": 15000 }, { "epoch": 0.86, "grad_norm": 0.4074413478374481, "learning_rate": 0.00049984375, "loss": 3.6544, "step": 16000 }, { "epoch": 0.91, "grad_norm": 0.38538795709609985, "learning_rate": 0.00053109375, "loss": 3.6247, "step": 17000 }, { "epoch": 0.97, "grad_norm": 0.39283987879753113, "learning_rate": 0.0005623125, "loss": 3.6071, "step": 18000 }, { "epoch": 1.0, "eval_accuracy": 0.35732314046161223, "eval_loss": 3.788609266281128, "eval_runtime": 153.8947, "eval_samples_per_second": 376.368, "eval_steps_per_second": 5.887, "step": 18602 }, { "epoch": 1.02, "grad_norm": 0.33553165197372437, "learning_rate": 0.0005935625, "loss": 3.5753, "step": 19000 }, { "epoch": 1.08, "grad_norm": 0.33198973536491394, "learning_rate": 0.0006248125, "loss": 3.549, "step": 20000 }, { "epoch": 1.13, "grad_norm": 0.30177804827690125, "learning_rate": 0.000656, "loss": 3.5373, "step": 21000 }, { "epoch": 1.18, "grad_norm": 0.2844315767288208, "learning_rate": 0.00068725, "loss": 3.5339, "step": 22000 }, { "epoch": 1.24, "grad_norm": 0.2813923954963684, "learning_rate": 0.0007185000000000001, "loss": 3.5241, "step": 23000 }, { "epoch": 1.29, "grad_norm": 0.25613418221473694, "learning_rate": 0.00074971875, "loss": 3.5079, "step": 24000 }, { "epoch": 1.34, "grad_norm": 0.26424020528793335, "learning_rate": 0.00078096875, "loss": 3.4997, "step": 25000 }, { "epoch": 1.4, "grad_norm": 0.24137265980243683, "learning_rate": 0.00081221875, "loss": 3.4857, "step": 26000 }, { "epoch": 1.45, "grad_norm": 0.23364698886871338, "learning_rate": 0.00084346875, "loss": 3.4751, "step": 27000 }, { "epoch": 1.51, "grad_norm": 0.21730710566043854, "learning_rate": 0.0008746874999999999, "loss": 3.4681, "step": 28000 }, { "epoch": 1.56, "grad_norm": 0.2167159467935562, "learning_rate": 0.0009059375, "loss": 3.4576, "step": 29000 }, { "epoch": 1.61, "grad_norm": 0.2263418734073639, "learning_rate": 0.0009371562500000001, "loss": 3.4514, "step": 30000 }, { "epoch": 1.67, "grad_norm": 0.2026355117559433, "learning_rate": 0.0009684062500000001, "loss": 3.4421, "step": 31000 }, { "epoch": 1.72, "grad_norm": 0.2083364725112915, "learning_rate": 0.0009996562500000001, "loss": 3.4402, "step": 32000 }, { "epoch": 1.77, "grad_norm": 0.1914134919643402, "learning_rate": 0.0009970944594753558, "loss": 3.4246, "step": 33000 }, { "epoch": 1.83, "grad_norm": 0.2077130675315857, "learning_rate": 0.0009941536289848254, "loss": 3.4173, "step": 34000 }, { "epoch": 1.88, "grad_norm": 0.19742459058761597, "learning_rate": 0.0009912127984942948, "loss": 3.4043, "step": 35000 }, { "epoch": 1.94, "grad_norm": 0.18328115344047546, "learning_rate": 0.000988274908834255, "loss": 3.3924, "step": 36000 }, { "epoch": 1.99, "grad_norm": 0.17804497480392456, "learning_rate": 0.0009853340783437243, "loss": 3.379, "step": 37000 }, { "epoch": 2.0, "eval_accuracy": 0.3797682433009351, "eval_loss": 3.5653116703033447, "eval_runtime": 155.3698, "eval_samples_per_second": 372.794, "eval_steps_per_second": 5.831, "step": 37204 }, { "epoch": 2.04, "grad_norm": 0.1852262169122696, "learning_rate": 0.0009823932478531937, "loss": 3.3362, "step": 38000 }, { "epoch": 2.1, "grad_norm": 0.18120849132537842, "learning_rate": 0.0009794524173626633, "loss": 3.3286, "step": 39000 }, { "epoch": 2.15, "grad_norm": 0.1793203055858612, "learning_rate": 0.0009765145277026233, "loss": 3.3231, "step": 40000 }, { "epoch": 2.2, "grad_norm": 0.17847037315368652, "learning_rate": 0.0009735736972120928, "loss": 3.3184, "step": 41000 }, { "epoch": 2.26, "grad_norm": 0.17294944822788239, "learning_rate": 0.0009706358075520528, "loss": 3.3086, "step": 42000 }, { "epoch": 2.31, "grad_norm": 0.16597671806812286, "learning_rate": 0.0009676949770615222, "loss": 3.3067, "step": 43000 }, { "epoch": 2.37, "grad_norm": 0.16891473531723022, "learning_rate": 0.0009647570874014821, "loss": 3.3, "step": 44000 }, { "epoch": 2.42, "grad_norm": 0.17510074377059937, "learning_rate": 0.0009618162569109516, "loss": 3.3023, "step": 45000 }, { "epoch": 2.47, "grad_norm": 0.17169316112995148, "learning_rate": 0.0009588754264204211, "loss": 3.295, "step": 46000 }, { "epoch": 2.53, "grad_norm": 0.174598827958107, "learning_rate": 0.0009559345959298906, "loss": 3.2861, "step": 47000 }, { "epoch": 2.58, "grad_norm": 0.18967097997665405, "learning_rate": 0.0009529967062698506, "loss": 3.2845, "step": 48000 }, { "epoch": 2.63, "grad_norm": 0.1692247986793518, "learning_rate": 0.0009500617574403012, "loss": 3.2772, "step": 49000 }, { "epoch": 2.69, "grad_norm": 0.1717061698436737, "learning_rate": 0.0009471209269497706, "loss": 3.2767, "step": 50000 }, { "epoch": 2.74, "grad_norm": 0.18360279500484467, "learning_rate": 0.0009441800964592401, "loss": 3.2632, "step": 51000 }, { "epoch": 2.8, "grad_norm": 0.19667457044124603, "learning_rate": 0.0009412392659687096, "loss": 3.2638, "step": 52000 }, { "epoch": 2.85, "grad_norm": 0.16959011554718018, "learning_rate": 0.0009382984354781791, "loss": 3.2597, "step": 53000 }, { "epoch": 2.9, "grad_norm": 0.19635465741157532, "learning_rate": 0.0009353576049876486, "loss": 3.2503, "step": 54000 }, { "epoch": 2.96, "grad_norm": 0.17817912995815277, "learning_rate": 0.0009324167744971181, "loss": 3.2515, "step": 55000 }, { "epoch": 3.0, "eval_accuracy": 0.39181961046231156, "eval_loss": 3.4691600799560547, "eval_runtime": 155.8093, "eval_samples_per_second": 371.743, "eval_steps_per_second": 5.815, "step": 55806 }, { "epoch": 3.01, "grad_norm": 0.18061339855194092, "learning_rate": 0.0009294759440065876, "loss": 3.2344, "step": 56000 }, { "epoch": 3.06, "grad_norm": 0.17083428800106049, "learning_rate": 0.0009265380543465475, "loss": 3.1818, "step": 57000 }, { "epoch": 3.12, "grad_norm": 0.18623469769954681, "learning_rate": 0.000923597223856017, "loss": 3.1867, "step": 58000 }, { "epoch": 3.17, "grad_norm": 0.16947512328624725, "learning_rate": 0.0009206593341959769, "loss": 3.1896, "step": 59000 }, { "epoch": 3.23, "grad_norm": 0.18858687579631805, "learning_rate": 0.0009177185037054464, "loss": 3.1933, "step": 60000 }, { "epoch": 3.28, "grad_norm": 0.1784023642539978, "learning_rate": 0.0009147776732149159, "loss": 3.1876, "step": 61000 }, { "epoch": 3.33, "grad_norm": 0.18037760257720947, "learning_rate": 0.0009118427243853664, "loss": 3.1871, "step": 62000 }, { "epoch": 3.39, "grad_norm": 0.1854478120803833, "learning_rate": 0.0009089018938948359, "loss": 3.1915, "step": 63000 }, { "epoch": 3.44, "grad_norm": 0.17140689492225647, "learning_rate": 0.0009059610634043054, "loss": 3.1875, "step": 64000 }, { "epoch": 3.49, "grad_norm": 0.18045330047607422, "learning_rate": 0.0009030202329137749, "loss": 3.1863, "step": 65000 }, { "epoch": 3.55, "grad_norm": 0.1827496588230133, "learning_rate": 0.0009000794024232444, "loss": 3.1871, "step": 66000 }, { "epoch": 3.6, "grad_norm": 0.20431484282016754, "learning_rate": 0.0008971415127632044, "loss": 3.1825, "step": 67000 }, { "epoch": 3.66, "grad_norm": 0.17922812700271606, "learning_rate": 0.0008942006822726738, "loss": 3.181, "step": 68000 }, { "epoch": 3.71, "grad_norm": 0.18560312688350677, "learning_rate": 0.0008912627926126339, "loss": 3.1767, "step": 69000 }, { "epoch": 3.76, "grad_norm": 0.182236447930336, "learning_rate": 0.0008883219621221033, "loss": 3.1808, "step": 70000 }, { "epoch": 3.82, "grad_norm": 0.1626577526330948, "learning_rate": 0.0008853870132925539, "loss": 3.1743, "step": 71000 }, { "epoch": 3.87, "grad_norm": 0.1721421480178833, "learning_rate": 0.0008824461828020232, "loss": 3.1734, "step": 72000 }, { "epoch": 3.92, "grad_norm": 0.17298929393291473, "learning_rate": 0.0008795053523114927, "loss": 3.1718, "step": 73000 }, { "epoch": 3.98, "grad_norm": 0.19776953756809235, "learning_rate": 0.0008765645218209622, "loss": 3.1729, "step": 74000 }, { "epoch": 4.0, "eval_accuracy": 0.3983427178451906, "eval_loss": 3.419321060180664, "eval_runtime": 154.6724, "eval_samples_per_second": 374.475, "eval_steps_per_second": 5.858, "step": 74408 }, { "epoch": 4.03, "grad_norm": 0.19919687509536743, "learning_rate": 0.0008736236913304317, "loss": 3.1245, "step": 75000 }, { "epoch": 4.09, "grad_norm": 0.18255846202373505, "learning_rate": 0.0008706858016703917, "loss": 3.1058, "step": 76000 }, { "epoch": 4.14, "grad_norm": 0.22742678225040436, "learning_rate": 0.0008677449711798612, "loss": 3.108, "step": 77000 }, { "epoch": 4.19, "grad_norm": 0.23507148027420044, "learning_rate": 0.0008648041406893307, "loss": 3.1162, "step": 78000 }, { "epoch": 4.25, "grad_norm": 0.17946326732635498, "learning_rate": 0.0008618662510292908, "loss": 3.1139, "step": 79000 }, { "epoch": 4.3, "grad_norm": 0.20196717977523804, "learning_rate": 0.0008589254205387602, "loss": 3.1171, "step": 80000 }, { "epoch": 4.35, "grad_norm": 0.1969883292913437, "learning_rate": 0.0008559845900482296, "loss": 3.1167, "step": 81000 }, { "epoch": 4.41, "grad_norm": 0.20422372221946716, "learning_rate": 0.0008530437595576991, "loss": 3.1144, "step": 82000 }, { "epoch": 4.46, "grad_norm": 0.2068530172109604, "learning_rate": 0.0008501088107281496, "loss": 3.1169, "step": 83000 }, { "epoch": 4.52, "grad_norm": 0.20425015687942505, "learning_rate": 0.0008471679802376191, "loss": 3.1188, "step": 84000 }, { "epoch": 4.57, "grad_norm": 0.20677131414413452, "learning_rate": 0.0008442271497470886, "loss": 3.1181, "step": 85000 }, { "epoch": 4.62, "grad_norm": 0.1946183443069458, "learning_rate": 0.000841286319256558, "loss": 3.1165, "step": 86000 }, { "epoch": 4.68, "grad_norm": 0.19427254796028137, "learning_rate": 0.000838348429596518, "loss": 3.1205, "step": 87000 }, { "epoch": 4.73, "grad_norm": 0.2612309455871582, "learning_rate": 0.0008354075991059875, "loss": 3.1168, "step": 88000 }, { "epoch": 4.78, "grad_norm": 0.19651499390602112, "learning_rate": 0.0008324697094459476, "loss": 3.1181, "step": 89000 }, { "epoch": 4.84, "grad_norm": 0.21289727091789246, "learning_rate": 0.0008295288789554171, "loss": 3.1144, "step": 90000 }, { "epoch": 4.89, "grad_norm": 0.18825019896030426, "learning_rate": 0.000826590989295377, "loss": 3.1141, "step": 91000 }, { "epoch": 4.95, "grad_norm": 0.20688407123088837, "learning_rate": 0.0008236501588048465, "loss": 3.114, "step": 92000 }, { "epoch": 5.0, "grad_norm": 0.18017753958702087, "learning_rate": 0.000820709328314316, "loss": 3.1139, "step": 93000 }, { "epoch": 5.0, "eval_accuracy": 0.40260902798169274, "eval_loss": 3.3907480239868164, "eval_runtime": 155.5488, "eval_samples_per_second": 372.365, "eval_steps_per_second": 5.825, "step": 93010 }, { "epoch": 5.05, "grad_norm": 0.2159438282251358, "learning_rate": 0.0008177714386542759, "loss": 3.0438, "step": 94000 }, { "epoch": 5.11, "grad_norm": 0.1938624382019043, "learning_rate": 0.0008148306081637454, "loss": 3.0471, "step": 95000 }, { "epoch": 5.16, "grad_norm": 0.1948939859867096, "learning_rate": 0.0008118897776732149, "loss": 3.0519, "step": 96000 }, { "epoch": 5.21, "grad_norm": 0.21813590824604034, "learning_rate": 0.000808951888013175, "loss": 3.0615, "step": 97000 }, { "epoch": 5.27, "grad_norm": 0.25585538148880005, "learning_rate": 0.0008060110575226443, "loss": 3.0615, "step": 98000 }, { "epoch": 5.32, "grad_norm": 0.2105061262845993, "learning_rate": 0.0008030702270321138, "loss": 3.0617, "step": 99000 }, { "epoch": 5.38, "grad_norm": 0.18835408985614777, "learning_rate": 0.0008001293965415833, "loss": 3.0677, "step": 100000 }, { "epoch": 5.43, "grad_norm": 0.1814804971218109, "learning_rate": 0.0007971915068815434, "loss": 3.07, "step": 101000 }, { "epoch": 5.48, "grad_norm": 0.2165517359972, "learning_rate": 0.0007942536172215034, "loss": 3.0654, "step": 102000 }, { "epoch": 5.54, "grad_norm": 0.21530425548553467, "learning_rate": 0.0007913127867309729, "loss": 3.0686, "step": 103000 }, { "epoch": 5.59, "grad_norm": NaN, "learning_rate": 0.0007883748970709328, "loss": 3.0676, "step": 104000 }, { "epoch": 5.64, "grad_norm": 0.20055846869945526, "learning_rate": 0.0007854340665804023, "loss": 3.0705, "step": 105000 }, { "epoch": 5.7, "grad_norm": 0.19475945830345154, "learning_rate": 0.0007824932360898718, "loss": 3.0684, "step": 106000 }, { "epoch": 5.75, "grad_norm": 0.2444075047969818, "learning_rate": 0.0007795553464298318, "loss": 3.0644, "step": 107000 }, { "epoch": 5.81, "grad_norm": 0.3142812252044678, "learning_rate": 0.0007766145159393013, "loss": 3.0701, "step": 108000 }, { "epoch": 5.86, "grad_norm": 0.1972387135028839, "learning_rate": 0.0007736736854487707, "loss": 3.0648, "step": 109000 }, { "epoch": 5.91, "grad_norm": 0.18938925862312317, "learning_rate": 0.0007707328549582402, "loss": 3.0686, "step": 110000 }, { "epoch": 5.97, "grad_norm": 0.22070257365703583, "learning_rate": 0.0007677949652982002, "loss": 3.0709, "step": 111000 }, { "epoch": 6.0, "eval_accuracy": 0.40427472089482946, "eval_loss": 3.3642256259918213, "eval_runtime": 154.6657, "eval_samples_per_second": 374.492, "eval_steps_per_second": 5.858, "step": 111612 }, { "epoch": 6.02, "grad_norm": 0.20117591321468353, "learning_rate": 0.0007648541348076697, "loss": 3.0385, "step": 112000 }, { "epoch": 6.07, "grad_norm": 0.2254948914051056, "learning_rate": 0.0007619133043171391, "loss": 2.9981, "step": 113000 }, { "epoch": 6.13, "grad_norm": 0.21827402710914612, "learning_rate": 0.0007589754146570992, "loss": 3.007, "step": 114000 }, { "epoch": 6.18, "grad_norm": 0.2094143182039261, "learning_rate": 0.0007560345841665687, "loss": 3.0119, "step": 115000 }, { "epoch": 6.24, "grad_norm": 0.23713943362236023, "learning_rate": 0.0007530937536760382, "loss": 3.0158, "step": 116000 }, { "epoch": 6.29, "grad_norm": 0.21203161776065826, "learning_rate": 0.0007501558640159981, "loss": 3.0179, "step": 117000 }, { "epoch": 6.34, "grad_norm": 0.223785862326622, "learning_rate": 0.0007472150335254676, "loss": 3.0211, "step": 118000 }, { "epoch": 6.4, "grad_norm": 0.3047547936439514, "learning_rate": 0.0007442742030349371, "loss": 3.0232, "step": 119000 }, { "epoch": 6.45, "grad_norm": 0.22039389610290527, "learning_rate": 0.0007413333725444066, "loss": 3.0261, "step": 120000 }, { "epoch": 6.5, "grad_norm": 0.22225935757160187, "learning_rate": 0.0007383954828843665, "loss": 3.0268, "step": 121000 }, { "epoch": 6.56, "grad_norm": 0.30233263969421387, "learning_rate": 0.000735454652393836, "loss": 3.0299, "step": 122000 }, { "epoch": 6.61, "grad_norm": 0.23030564188957214, "learning_rate": 0.0007325138219033055, "loss": 3.0301, "step": 123000 }, { "epoch": 6.67, "grad_norm": 0.2047109752893448, "learning_rate": 0.0007295759322432654, "loss": 3.027, "step": 124000 }, { "epoch": 6.72, "grad_norm": 0.21085889637470245, "learning_rate": 0.0007266380425832255, "loss": 3.0278, "step": 125000 }, { "epoch": 6.77, "grad_norm": 0.2596847712993622, "learning_rate": 0.000723697212092695, "loss": 3.0358, "step": 126000 }, { "epoch": 6.83, "grad_norm": 0.1978030651807785, "learning_rate": 0.0007207563816021645, "loss": 3.0315, "step": 127000 }, { "epoch": 6.88, "grad_norm": 0.21782337129116058, "learning_rate": 0.000717815551111634, "loss": 3.028, "step": 128000 }, { "epoch": 6.93, "grad_norm": 0.2578461766242981, "learning_rate": 0.000714877661451594, "loss": 3.0329, "step": 129000 }, { "epoch": 6.99, "grad_norm": 0.20211657881736755, "learning_rate": 0.0007119368309610634, "loss": 3.0297, "step": 130000 }, { "epoch": 7.0, "eval_accuracy": 0.40667091369655045, "eval_loss": 3.3544673919677734, "eval_runtime": 155.1034, "eval_samples_per_second": 373.435, "eval_steps_per_second": 5.841, "step": 130214 }, { "epoch": 7.04, "grad_norm": 0.2451675981283188, "learning_rate": 0.0007089989413010234, "loss": 2.9744, "step": 131000 }, { "epoch": 7.1, "grad_norm": 0.26030394434928894, "learning_rate": 0.0007060581108104929, "loss": 2.9677, "step": 132000 }, { "epoch": 7.15, "grad_norm": 0.22699125111103058, "learning_rate": 0.0007031172803199624, "loss": 2.9753, "step": 133000 }, { "epoch": 7.2, "grad_norm": 0.22514161467552185, "learning_rate": 0.0007001764498294319, "loss": 2.974, "step": 134000 }, { "epoch": 7.26, "grad_norm": 0.23178595304489136, "learning_rate": 0.0006972356193389014, "loss": 2.9784, "step": 135000 }, { "epoch": 7.31, "grad_norm": 0.27629485726356506, "learning_rate": 0.0006942977296788612, "loss": 2.9856, "step": 136000 }, { "epoch": 7.36, "grad_norm": 0.2211887091398239, "learning_rate": 0.0006913598400188213, "loss": 2.9859, "step": 137000 }, { "epoch": 7.42, "grad_norm": 0.2578117847442627, "learning_rate": 0.0006884190095282908, "loss": 2.985, "step": 138000 }, { "epoch": 7.47, "grad_norm": 0.25054463744163513, "learning_rate": 0.0006854781790377603, "loss": 2.9902, "step": 139000 }, { "epoch": 7.53, "grad_norm": 0.21288973093032837, "learning_rate": 0.0006825402893777203, "loss": 2.9922, "step": 140000 }, { "epoch": 7.58, "grad_norm": 0.21266783773899078, "learning_rate": 0.0006795994588871898, "loss": 2.994, "step": 141000 }, { "epoch": 7.63, "grad_norm": 0.26795753836631775, "learning_rate": 0.0006766615692271497, "loss": 2.9945, "step": 142000 }, { "epoch": 7.69, "grad_norm": 0.2051943689584732, "learning_rate": 0.0006737207387366192, "loss": 2.994, "step": 143000 }, { "epoch": 7.74, "grad_norm": 0.2313968390226364, "learning_rate": 0.0006707799082460887, "loss": 2.9988, "step": 144000 }, { "epoch": 7.79, "grad_norm": 0.24470819532871246, "learning_rate": 0.0006678390777555582, "loss": 2.998, "step": 145000 }, { "epoch": 7.85, "grad_norm": 0.203665092587471, "learning_rate": 0.0006649011880955182, "loss": 2.9962, "step": 146000 }, { "epoch": 7.9, "grad_norm": 0.21225391328334808, "learning_rate": 0.0006619603576049877, "loss": 2.9975, "step": 147000 }, { "epoch": 7.96, "grad_norm": 0.25866907835006714, "learning_rate": 0.0006590195271144572, "loss": 2.9988, "step": 148000 }, { "epoch": 8.0, "eval_accuracy": 0.4079528153770697, "eval_loss": 3.359600782394409, "eval_runtime": 154.5906, "eval_samples_per_second": 374.673, "eval_steps_per_second": 5.861, "step": 148816 }, { "epoch": 8.01, "grad_norm": 0.25523641705513, "learning_rate": 0.0006560816374544171, "loss": 2.9863, "step": 149000 }, { "epoch": 8.06, "grad_norm": 0.22232286632061005, "learning_rate": 0.0006531408069638866, "loss": 2.9316, "step": 150000 }, { "epoch": 8.12, "grad_norm": 0.22783271968364716, "learning_rate": 0.000650199976473356, "loss": 2.9369, "step": 151000 }, { "epoch": 8.17, "grad_norm": 0.21875810623168945, "learning_rate": 0.0006472591459828255, "loss": 2.9418, "step": 152000 }, { "epoch": 8.22, "grad_norm": 0.2610374093055725, "learning_rate": 0.0006443212563227856, "loss": 2.9468, "step": 153000 }, { "epoch": 8.28, "grad_norm": 0.2271444946527481, "learning_rate": 0.0006413804258322551, "loss": 2.9532, "step": 154000 }, { "epoch": 8.33, "grad_norm": 0.23558945953845978, "learning_rate": 0.000638442536172215, "loss": 2.9513, "step": 155000 }, { "epoch": 8.39, "grad_norm": 0.23239487409591675, "learning_rate": 0.0006355046465121751, "loss": 2.9544, "step": 156000 }, { "epoch": 8.44, "grad_norm": 0.2163064181804657, "learning_rate": 0.0006325638160216445, "loss": 2.9594, "step": 157000 }, { "epoch": 8.49, "grad_norm": 0.25110113620758057, "learning_rate": 0.000629622985531114, "loss": 2.9587, "step": 158000 }, { "epoch": 8.55, "grad_norm": 0.23023970425128937, "learning_rate": 0.0006266850958710741, "loss": 2.9635, "step": 159000 }, { "epoch": 8.6, "grad_norm": 0.2888554036617279, "learning_rate": 0.0006237442653805435, "loss": 2.9645, "step": 160000 }, { "epoch": 8.65, "grad_norm": 0.22880807518959045, "learning_rate": 0.000620803434890013, "loss": 2.9623, "step": 161000 }, { "epoch": 8.71, "grad_norm": 0.25806036591529846, "learning_rate": 0.0006178626043994823, "loss": 2.971, "step": 162000 }, { "epoch": 8.76, "grad_norm": 0.25819092988967896, "learning_rate": 0.0006149247147394424, "loss": 2.9689, "step": 163000 }, { "epoch": 8.82, "grad_norm": 0.2252822071313858, "learning_rate": 0.0006119838842489119, "loss": 2.9672, "step": 164000 }, { "epoch": 8.87, "grad_norm": 0.22955825924873352, "learning_rate": 0.0006090489354193625, "loss": 2.9666, "step": 165000 }, { "epoch": 8.92, "grad_norm": 0.33369356393814087, "learning_rate": 0.000606108104928832, "loss": 2.9677, "step": 166000 }, { "epoch": 8.98, "grad_norm": 0.2391689568758011, "learning_rate": 0.0006031672744383015, "loss": 2.9717, "step": 167000 }, { "epoch": 9.0, "eval_accuracy": 0.40866631908468914, "eval_loss": 3.3723206520080566, "eval_runtime": 155.2809, "eval_samples_per_second": 373.008, "eval_steps_per_second": 5.835, "step": 167418 }, { "epoch": 9.03, "grad_norm": 0.22217118740081787, "learning_rate": 0.0006002293847782614, "loss": 2.9298, "step": 168000 }, { "epoch": 9.09, "grad_norm": 0.249114990234375, "learning_rate": 0.0005972885542877309, "loss": 2.9048, "step": 169000 }, { "epoch": 9.14, "grad_norm": 0.28138816356658936, "learning_rate": 0.0005943506646276909, "loss": 2.9122, "step": 170000 }, { "epoch": 9.19, "grad_norm": 0.26497504115104675, "learning_rate": 0.0005914098341371603, "loss": 2.9165, "step": 171000 }, { "epoch": 9.25, "grad_norm": 0.33365172147750854, "learning_rate": 0.0005884690036466298, "loss": 2.9174, "step": 172000 }, { "epoch": 9.3, "grad_norm": 0.257129043340683, "learning_rate": 0.0005855281731560992, "loss": 2.924, "step": 173000 }, { "epoch": 9.35, "grad_norm": 0.29947561025619507, "learning_rate": 0.0005825902834960592, "loss": 2.9282, "step": 174000 }, { "epoch": 9.41, "grad_norm": 0.23127929866313934, "learning_rate": 0.0005796494530055287, "loss": 2.9272, "step": 175000 }, { "epoch": 9.46, "grad_norm": 0.2609017491340637, "learning_rate": 0.0005767086225149982, "loss": 2.9311, "step": 176000 }, { "epoch": 9.52, "grad_norm": NaN, "learning_rate": 0.0005737707328549583, "loss": 2.9321, "step": 177000 }, { "epoch": 9.57, "grad_norm": 0.27535927295684814, "learning_rate": 0.0005708299023644278, "loss": 2.9343, "step": 178000 }, { "epoch": 9.62, "grad_norm": 0.2315855473279953, "learning_rate": 0.0005678890718738972, "loss": 2.9342, "step": 179000 }, { "epoch": 9.68, "grad_norm": 0.2345881462097168, "learning_rate": 0.0005649482413833666, "loss": 2.9377, "step": 180000 }, { "epoch": 9.73, "grad_norm": 0.36270418763160706, "learning_rate": 0.0005620103517233267, "loss": 2.9415, "step": 181000 }, { "epoch": 9.78, "grad_norm": 0.22433358430862427, "learning_rate": 0.0005590695212327962, "loss": 2.9433, "step": 182000 }, { "epoch": 9.84, "grad_norm": 0.23198537528514862, "learning_rate": 0.0005561286907422656, "loss": 2.9435, "step": 183000 }, { "epoch": 9.89, "grad_norm": 0.3433922827243805, "learning_rate": 0.0005531908010822257, "loss": 2.9422, "step": 184000 }, { "epoch": 9.95, "grad_norm": 0.2841480076313019, "learning_rate": 0.0005502499705916952, "loss": 2.9457, "step": 185000 }, { "epoch": 10.0, "grad_norm": 0.2232799530029297, "learning_rate": 0.0005473091401011647, "loss": 2.9432, "step": 186000 }, { "epoch": 10.0, "eval_accuracy": 0.4092650817761506, "eval_loss": 3.357872486114502, "eval_runtime": 155.1007, "eval_samples_per_second": 373.441, "eval_steps_per_second": 5.841, "step": 186020 }, { "epoch": 10.05, "grad_norm": 0.2610397934913635, "learning_rate": 0.0005443712504411245, "loss": 2.8782, "step": 187000 }, { "epoch": 10.11, "grad_norm": 0.3216498792171478, "learning_rate": 0.000541430419950594, "loss": 2.8815, "step": 188000 }, { "epoch": 10.16, "grad_norm": 0.2474890649318695, "learning_rate": 0.0005384895894600635, "loss": 2.8881, "step": 189000 }, { "epoch": 10.21, "grad_norm": 0.35237178206443787, "learning_rate": 0.000535548758969533, "loss": 2.8933, "step": 190000 }, { "epoch": 10.27, "grad_norm": 0.3191303610801697, "learning_rate": 0.000532610869309493, "loss": 2.8989, "step": 191000 }, { "epoch": 10.32, "grad_norm": 0.2325054258108139, "learning_rate": 0.0005296700388189625, "loss": 2.8978, "step": 192000 }, { "epoch": 10.38, "grad_norm": 0.24588368833065033, "learning_rate": 0.0005267321491589225, "loss": 2.9019, "step": 193000 }, { "epoch": 10.43, "grad_norm": 0.2485446035861969, "learning_rate": 0.0005237913186683919, "loss": 2.9035, "step": 194000 }, { "epoch": 10.48, "grad_norm": 0.24470220506191254, "learning_rate": 0.000520853429008352, "loss": 2.9077, "step": 195000 }, { "epoch": 10.54, "grad_norm": 0.248806893825531, "learning_rate": 0.0005179125985178215, "loss": 2.9089, "step": 196000 }, { "epoch": 10.59, "grad_norm": 0.22887374460697174, "learning_rate": 0.000514971768027291, "loss": 2.908, "step": 197000 }, { "epoch": 10.64, "grad_norm": 0.2529465854167938, "learning_rate": 0.0005120309375367605, "loss": 2.9127, "step": 198000 }, { "epoch": 10.7, "grad_norm": 0.25075119733810425, "learning_rate": 0.0005090901070462299, "loss": 2.911, "step": 199000 }, { "epoch": 10.75, "grad_norm": 0.23723989725112915, "learning_rate": 0.0005061522173861898, "loss": 2.9152, "step": 200000 }, { "epoch": 10.81, "grad_norm": 0.25637319684028625, "learning_rate": 0.0005032143277261498, "loss": 2.9149, "step": 201000 }, { "epoch": 10.86, "grad_norm": 0.28713148832321167, "learning_rate": 0.0005002734972356193, "loss": 2.9178, "step": 202000 }, { "epoch": 10.91, "grad_norm": 0.27700188755989075, "learning_rate": 0.0004973356075755794, "loss": 2.9214, "step": 203000 }, { "epoch": 10.97, "grad_norm": 0.33580395579338074, "learning_rate": 0.0004943947770850489, "loss": 2.9217, "step": 204000 }, { "epoch": 11.0, "eval_accuracy": 0.4097510516302999, "eval_loss": 3.370105028152466, "eval_runtime": 155.2297, "eval_samples_per_second": 373.131, "eval_steps_per_second": 5.837, "step": 204622 }, { "epoch": 11.02, "grad_norm": 0.23724067211151123, "learning_rate": 0.0004914539465945184, "loss": 2.8953, "step": 205000 }, { "epoch": 11.07, "grad_norm": 0.33235830068588257, "learning_rate": 0.0004885160569344783, "loss": 2.8533, "step": 206000 }, { "epoch": 11.13, "grad_norm": 0.2660665810108185, "learning_rate": 0.0004855752264439478, "loss": 2.8628, "step": 207000 }, { "epoch": 11.18, "grad_norm": 0.24949678778648376, "learning_rate": 0.00048263439595341726, "loss": 2.8692, "step": 208000 }, { "epoch": 11.24, "grad_norm": 0.2526322901248932, "learning_rate": 0.00047969356546288675, "loss": 2.8702, "step": 209000 }, { "epoch": 11.29, "grad_norm": 0.29348504543304443, "learning_rate": 0.00047676155746382784, "loss": 2.8716, "step": 210000 }, { "epoch": 11.34, "grad_norm": 0.2396526336669922, "learning_rate": 0.0004738207269732973, "loss": 2.8788, "step": 211000 }, { "epoch": 11.4, "grad_norm": 0.24263983964920044, "learning_rate": 0.00047087989648276677, "loss": 2.8842, "step": 212000 }, { "epoch": 11.45, "grad_norm": 0.24339748919010162, "learning_rate": 0.00046793906599223626, "loss": 2.8805, "step": 213000 }, { "epoch": 11.5, "grad_norm": 0.35075515508651733, "learning_rate": 0.00046499823550170564, "loss": 2.8861, "step": 214000 }, { "epoch": 11.56, "grad_norm": 0.2587752938270569, "learning_rate": 0.00046205740501117514, "loss": 2.8861, "step": 215000 }, { "epoch": 11.61, "grad_norm": 0.2334243208169937, "learning_rate": 0.00045911657452064463, "loss": 2.8904, "step": 216000 }, { "epoch": 11.67, "grad_norm": 0.2610035836696625, "learning_rate": 0.00045617868486060465, "loss": 2.8888, "step": 217000 }, { "epoch": 11.72, "grad_norm": 0.2527429461479187, "learning_rate": 0.00045323785437007414, "loss": 2.8869, "step": 218000 }, { "epoch": 11.77, "grad_norm": 0.27201133966445923, "learning_rate": 0.00045030290554052464, "loss": 2.8921, "step": 219000 }, { "epoch": 11.83, "grad_norm": 0.25473257899284363, "learning_rate": 0.00044736207504999414, "loss": 2.8933, "step": 220000 }, { "epoch": 11.88, "grad_norm": 0.24522466957569122, "learning_rate": 0.0004444212445594636, "loss": 2.8943, "step": 221000 }, { "epoch": 11.93, "grad_norm": 0.2609509825706482, "learning_rate": 0.00044148041406893307, "loss": 2.8972, "step": 222000 }, { "epoch": 11.99, "grad_norm": 0.252936989068985, "learning_rate": 0.0004385425244088931, "loss": 2.8986, "step": 223000 }, { "epoch": 12.0, "eval_accuracy": 0.41030937281606633, "eval_loss": 3.364614963531494, "eval_runtime": 154.8866, "eval_samples_per_second": 373.957, "eval_steps_per_second": 5.849, "step": 223224 }, { "epoch": 12.04, "grad_norm": 0.2545512914657593, "learning_rate": 0.0004356016939183626, "loss": 2.8465, "step": 224000 }, { "epoch": 12.1, "grad_norm": 0.28531843423843384, "learning_rate": 0.0004326638042583226, "loss": 2.8391, "step": 225000 }, { "epoch": 12.15, "grad_norm": 0.26895156502723694, "learning_rate": 0.000429722973767792, "loss": 2.8444, "step": 226000 }, { "epoch": 12.2, "grad_norm": 0.28283581137657166, "learning_rate": 0.0004267821432772615, "loss": 2.849, "step": 227000 }, { "epoch": 12.26, "grad_norm": 0.3257947564125061, "learning_rate": 0.0004238442536172215, "loss": 2.8473, "step": 228000 }, { "epoch": 12.31, "grad_norm": 0.26338285207748413, "learning_rate": 0.000420903423126691, "loss": 2.8562, "step": 229000 }, { "epoch": 12.36, "grad_norm": 0.24919289350509644, "learning_rate": 0.0004179625926361605, "loss": 2.8545, "step": 230000 }, { "epoch": 12.42, "grad_norm": 0.24178272485733032, "learning_rate": 0.00041502176214562993, "loss": 2.8579, "step": 231000 }, { "epoch": 12.47, "grad_norm": 0.2602218985557556, "learning_rate": 0.00041208387248558995, "loss": 2.8588, "step": 232000 }, { "epoch": 12.53, "grad_norm": 0.23942138254642487, "learning_rate": 0.0004091459828255499, "loss": 2.8592, "step": 233000 }, { "epoch": 12.58, "grad_norm": 0.24679596722126007, "learning_rate": 0.0004062051523350194, "loss": 2.8692, "step": 234000 }, { "epoch": 12.63, "grad_norm": 0.24444027245044708, "learning_rate": 0.0004032643218444889, "loss": 2.8655, "step": 235000 }, { "epoch": 12.69, "grad_norm": 0.30420973896980286, "learning_rate": 0.00040032349135395834, "loss": 2.8658, "step": 236000 }, { "epoch": 12.74, "grad_norm": 0.2618982493877411, "learning_rate": 0.00039738560169391837, "loss": 2.8709, "step": 237000 }, { "epoch": 12.79, "grad_norm": 0.2887171804904938, "learning_rate": 0.00039444477120338786, "loss": 2.8724, "step": 238000 }, { "epoch": 12.85, "grad_norm": 0.26698777079582214, "learning_rate": 0.00039150394071285735, "loss": 2.8689, "step": 239000 }, { "epoch": 12.9, "grad_norm": 0.2446519136428833, "learning_rate": 0.0003885631102223268, "loss": 2.8778, "step": 240000 }, { "epoch": 12.96, "grad_norm": 0.25453439354896545, "learning_rate": 0.00038562522056228676, "loss": 2.8745, "step": 241000 }, { "epoch": 13.0, "eval_accuracy": 0.4104802079472664, "eval_loss": 3.367594003677368, "eval_runtime": 154.8681, "eval_samples_per_second": 374.002, "eval_steps_per_second": 5.85, "step": 241826 }, { "epoch": 13.01, "grad_norm": 0.2726215124130249, "learning_rate": 0.0003826873309022468, "loss": 2.8634, "step": 242000 }, { "epoch": 13.06, "grad_norm": 0.2711837887763977, "learning_rate": 0.0003797465004117163, "loss": 2.8164, "step": 243000 }, { "epoch": 13.12, "grad_norm": 0.2655225694179535, "learning_rate": 0.00037680566992118577, "loss": 2.8182, "step": 244000 }, { "epoch": 13.17, "grad_norm": 0.3055564761161804, "learning_rate": 0.00037386778026114574, "loss": 2.8278, "step": 245000 }, { "epoch": 13.22, "grad_norm": 0.3115604817867279, "learning_rate": 0.00037092694977061523, "loss": 2.8282, "step": 246000 }, { "epoch": 13.28, "grad_norm": 0.27207228541374207, "learning_rate": 0.00036798611928008467, "loss": 2.8323, "step": 247000 }, { "epoch": 13.33, "grad_norm": 0.2732410132884979, "learning_rate": 0.00036504528878955416, "loss": 2.8363, "step": 248000 }, { "epoch": 13.39, "grad_norm": 0.24663548171520233, "learning_rate": 0.0003621073991295142, "loss": 2.8382, "step": 249000 }, { "epoch": 13.44, "grad_norm": 0.2780630588531494, "learning_rate": 0.0003591665686389837, "loss": 2.8379, "step": 250000 }, { "epoch": 13.49, "grad_norm": 0.2623123526573181, "learning_rate": 0.0003562257381484531, "loss": 2.8383, "step": 251000 }, { "epoch": 13.55, "grad_norm": 0.26465824246406555, "learning_rate": 0.00035329078931890367, "loss": 2.8445, "step": 252000 }, { "epoch": 13.6, "grad_norm": 0.2646311819553375, "learning_rate": 0.0003503499588283731, "loss": 2.8417, "step": 253000 }, { "epoch": 13.65, "grad_norm": 0.2561984956264496, "learning_rate": 0.00034741206916833313, "loss": 2.8439, "step": 254000 }, { "epoch": 13.71, "grad_norm": 0.3358238637447357, "learning_rate": 0.0003444712386778026, "loss": 2.8495, "step": 255000 }, { "epoch": 13.76, "grad_norm": 0.3044319748878479, "learning_rate": 0.0003415304081872721, "loss": 2.8472, "step": 256000 }, { "epoch": 13.82, "grad_norm": 0.2482803910970688, "learning_rate": 0.0003385925185272321, "loss": 2.8555, "step": 257000 }, { "epoch": 13.87, "grad_norm": 0.2835788130760193, "learning_rate": 0.0003356516880367016, "loss": 2.8494, "step": 258000 }, { "epoch": 13.92, "grad_norm": 0.2810549736022949, "learning_rate": 0.000332710857546171, "loss": 2.8522, "step": 259000 }, { "epoch": 13.98, "grad_norm": 0.30381470918655396, "learning_rate": 0.0003297700270556405, "loss": 2.8518, "step": 260000 }, { "epoch": 14.0, "eval_accuracy": 0.4109941912696463, "eval_loss": 3.3750314712524414, "eval_runtime": 154.897, "eval_samples_per_second": 373.932, "eval_steps_per_second": 5.849, "step": 260428 }, { "epoch": 14.03, "grad_norm": 0.27919963002204895, "learning_rate": 0.00032683213739560053, "loss": 2.8197, "step": 261000 }, { "epoch": 14.08, "grad_norm": 0.27463871240615845, "learning_rate": 0.00032389424773556055, "loss": 2.7989, "step": 262000 }, { "epoch": 14.14, "grad_norm": 0.26133376359939575, "learning_rate": 0.00032095341724503005, "loss": 2.8029, "step": 263000 }, { "epoch": 14.19, "grad_norm": 0.34036701917648315, "learning_rate": 0.00031801258675449943, "loss": 2.8059, "step": 264000 }, { "epoch": 14.25, "grad_norm": 0.293765664100647, "learning_rate": 0.00031507469709445945, "loss": 2.8085, "step": 265000 }, { "epoch": 14.3, "grad_norm": 0.26148098707199097, "learning_rate": 0.00031213386660392895, "loss": 2.8132, "step": 266000 }, { "epoch": 14.35, "grad_norm": 0.25458094477653503, "learning_rate": 0.00030919303611339844, "loss": 2.8154, "step": 267000 }, { "epoch": 14.41, "grad_norm": 0.26752936840057373, "learning_rate": 0.00030625220562286793, "loss": 2.8209, "step": 268000 }, { "epoch": 14.46, "grad_norm": 0.26262667775154114, "learning_rate": 0.00030331137513233737, "loss": 2.8175, "step": 269000 }, { "epoch": 14.51, "grad_norm": 0.3177816867828369, "learning_rate": 0.00030037054464180686, "loss": 2.8221, "step": 270000 }, { "epoch": 14.57, "grad_norm": 0.34930920600891113, "learning_rate": 0.0002974326549817669, "loss": 2.8277, "step": 271000 }, { "epoch": 14.62, "grad_norm": 0.25501158833503723, "learning_rate": 0.0002944918244912364, "loss": 2.8237, "step": 272000 }, { "epoch": 14.68, "grad_norm": 0.25757795572280884, "learning_rate": 0.00029155393483119635, "loss": 2.823, "step": 273000 }, { "epoch": 14.73, "grad_norm": 0.262439489364624, "learning_rate": 0.0002886131043406658, "loss": 2.8256, "step": 274000 }, { "epoch": 14.78, "grad_norm": 0.27561119198799133, "learning_rate": 0.0002856722738501353, "loss": 2.8325, "step": 275000 }, { "epoch": 14.84, "grad_norm": 0.30357280373573303, "learning_rate": 0.0002827343841900953, "loss": 2.8292, "step": 276000 }, { "epoch": 14.89, "grad_norm": 0.27511313557624817, "learning_rate": 0.0002797935536995648, "loss": 2.8319, "step": 277000 }, { "epoch": 14.94, "grad_norm": 0.26999151706695557, "learning_rate": 0.0002768527232090343, "loss": 2.8341, "step": 278000 }, { "epoch": 15.0, "grad_norm": 0.27331700921058655, "learning_rate": 0.00027391483354899425, "loss": 2.8328, "step": 279000 }, { "epoch": 15.0, "eval_accuracy": 0.4111497096844038, "eval_loss": 3.3722476959228516, "eval_runtime": 154.7468, "eval_samples_per_second": 374.295, "eval_steps_per_second": 5.855, "step": 279030 }, { "epoch": 15.05, "grad_norm": 0.2742418944835663, "learning_rate": 0.0002709769438889542, "loss": 2.7834, "step": 280000 }, { "epoch": 15.11, "grad_norm": 0.2747434675693512, "learning_rate": 0.0002680361133984237, "loss": 2.7848, "step": 281000 }, { "epoch": 15.16, "grad_norm": 0.308196485042572, "learning_rate": 0.0002650952829078932, "loss": 2.7865, "step": 282000 }, { "epoch": 15.21, "grad_norm": 0.3813375234603882, "learning_rate": 0.0002621544524173627, "loss": 2.7904, "step": 283000 }, { "epoch": 15.27, "grad_norm": 0.25685274600982666, "learning_rate": 0.00025921656275732267, "loss": 2.7944, "step": 284000 }, { "epoch": 15.32, "grad_norm": 0.2853144109249115, "learning_rate": 0.0002562757322667921, "loss": 2.7967, "step": 285000 }, { "epoch": 15.37, "grad_norm": 0.2811448276042938, "learning_rate": 0.0002533349017762616, "loss": 2.7989, "step": 286000 }, { "epoch": 15.43, "grad_norm": 0.32674285769462585, "learning_rate": 0.0002503970121162216, "loss": 2.8013, "step": 287000 }, { "epoch": 15.48, "grad_norm": 0.2677679657936096, "learning_rate": 0.0002474561816256911, "loss": 2.8003, "step": 288000 }, { "epoch": 15.54, "grad_norm": 0.2848023474216461, "learning_rate": 0.00024451829196565114, "loss": 2.8051, "step": 289000 }, { "epoch": 15.59, "grad_norm": 0.3296356499195099, "learning_rate": 0.00024157746147512058, "loss": 2.8004, "step": 290000 }, { "epoch": 15.64, "grad_norm": 0.3256130516529083, "learning_rate": 0.00023863663098459004, "loss": 2.8051, "step": 291000 }, { "epoch": 15.7, "grad_norm": 0.30205684900283813, "learning_rate": 0.0002357016821550406, "loss": 2.8075, "step": 292000 }, { "epoch": 15.75, "grad_norm": 0.2870996296405792, "learning_rate": 0.00023276085166451006, "loss": 2.8081, "step": 293000 }, { "epoch": 15.8, "grad_norm": 0.26530566811561584, "learning_rate": 0.00022982002117397955, "loss": 2.8119, "step": 294000 }, { "epoch": 15.86, "grad_norm": 0.27719756960868835, "learning_rate": 0.000226879190683449, "loss": 2.8085, "step": 295000 }, { "epoch": 15.91, "grad_norm": 0.27279022336006165, "learning_rate": 0.00022393836019291848, "loss": 2.8137, "step": 296000 }, { "epoch": 15.97, "grad_norm": 0.2712692618370056, "learning_rate": 0.0002210004705328785, "loss": 2.8089, "step": 297000 }, { "epoch": 16.0, "eval_accuracy": 0.4115371957389701, "eval_loss": 3.3797037601470947, "eval_runtime": 154.9333, "eval_samples_per_second": 373.845, "eval_steps_per_second": 5.848, "step": 297632 }, { "epoch": 16.02, "grad_norm": 0.2609094977378845, "learning_rate": 0.0002180625808728385, "loss": 2.7927, "step": 298000 }, { "epoch": 16.07, "grad_norm": 0.33733800053596497, "learning_rate": 0.00021512175038230797, "loss": 2.7628, "step": 299000 }, { "epoch": 16.13, "grad_norm": 0.27339014410972595, "learning_rate": 0.00021218091989177743, "loss": 2.7711, "step": 300000 }, { "epoch": 16.18, "grad_norm": 0.27283820509910583, "learning_rate": 0.00020924303023173743, "loss": 2.7731, "step": 301000 }, { "epoch": 16.23, "grad_norm": 0.29361584782600403, "learning_rate": 0.00020630219974120692, "loss": 2.7739, "step": 302000 }, { "epoch": 16.29, "grad_norm": 0.2751254439353943, "learning_rate": 0.0002033613692506764, "loss": 2.7733, "step": 303000 }, { "epoch": 16.34, "grad_norm": 0.31571540236473083, "learning_rate": 0.00020042053876014588, "loss": 2.7803, "step": 304000 }, { "epoch": 16.4, "grad_norm": 0.2743053138256073, "learning_rate": 0.00019748264910010588, "loss": 2.7825, "step": 305000 }, { "epoch": 16.45, "grad_norm": 0.30572009086608887, "learning_rate": 0.00019454181860957534, "loss": 2.7831, "step": 306000 }, { "epoch": 16.5, "grad_norm": 0.27128228545188904, "learning_rate": 0.00019160392894953534, "loss": 2.7858, "step": 307000 }, { "epoch": 16.56, "grad_norm": 0.28637221455574036, "learning_rate": 0.00018866603928949536, "loss": 2.7873, "step": 308000 }, { "epoch": 16.61, "grad_norm": 0.30166834592819214, "learning_rate": 0.00018572520879896485, "loss": 2.7859, "step": 309000 }, { "epoch": 16.66, "grad_norm": 0.26175713539123535, "learning_rate": 0.00018278437830843432, "loss": 2.7902, "step": 310000 }, { "epoch": 16.72, "grad_norm": 0.3338593542575836, "learning_rate": 0.00017984648864839432, "loss": 2.7876, "step": 311000 }, { "epoch": 16.77, "grad_norm": 0.291354775428772, "learning_rate": 0.00017690565815786378, "loss": 2.789, "step": 312000 }, { "epoch": 16.83, "grad_norm": 0.3241804242134094, "learning_rate": 0.00017396482766733327, "loss": 2.7875, "step": 313000 }, { "epoch": 16.88, "grad_norm": 0.27583855390548706, "learning_rate": 0.00017102399717680274, "loss": 2.7928, "step": 314000 }, { "epoch": 16.93, "grad_norm": 0.38700050115585327, "learning_rate": 0.00016808610751676274, "loss": 2.7902, "step": 315000 }, { "epoch": 16.99, "grad_norm": 0.2652926445007324, "learning_rate": 0.00016514527702623223, "loss": 2.7911, "step": 316000 }, { "epoch": 17.0, "eval_accuracy": 0.4108795174320692, "eval_loss": 3.3881633281707764, "eval_runtime": 154.5367, "eval_samples_per_second": 374.804, "eval_steps_per_second": 5.863, "step": 316234 }, { "epoch": 17.04, "grad_norm": 0.2926689684391022, "learning_rate": 0.0001622044465357017, "loss": 2.7624, "step": 317000 }, { "epoch": 17.09, "grad_norm": 0.2626492977142334, "learning_rate": 0.0001592665568756617, "loss": 2.7546, "step": 318000 }, { "epoch": 17.15, "grad_norm": 0.27373695373535156, "learning_rate": 0.00015632572638513115, "loss": 2.7568, "step": 319000 }, { "epoch": 17.2, "grad_norm": 0.3076825737953186, "learning_rate": 0.00015338489589460065, "loss": 2.7601, "step": 320000 }, { "epoch": 17.26, "grad_norm": 0.2832266092300415, "learning_rate": 0.0001504440654040701, "loss": 2.7627, "step": 321000 }, { "epoch": 17.31, "grad_norm": 0.3134935200214386, "learning_rate": 0.0001475032349135396, "loss": 2.7633, "step": 322000 }, { "epoch": 17.36, "grad_norm": 0.28282949328422546, "learning_rate": 0.0001445653452534996, "loss": 2.7616, "step": 323000 }, { "epoch": 17.42, "grad_norm": 0.2717437744140625, "learning_rate": 0.00014162451476296906, "loss": 2.7664, "step": 324000 }, { "epoch": 17.47, "grad_norm": 0.3018600642681122, "learning_rate": 0.00013868662510292906, "loss": 2.7672, "step": 325000 }, { "epoch": 17.52, "grad_norm": 0.41076505184173584, "learning_rate": 0.00013574579461239855, "loss": 2.7662, "step": 326000 }, { "epoch": 17.58, "grad_norm": 0.28314197063446045, "learning_rate": 0.00013280790495235855, "loss": 2.7679, "step": 327000 }, { "epoch": 17.63, "grad_norm": 0.2659911811351776, "learning_rate": 0.00012986707446182802, "loss": 2.7656, "step": 328000 }, { "epoch": 17.69, "grad_norm": 0.28878411650657654, "learning_rate": 0.00012692918480178804, "loss": 2.7627, "step": 329000 }, { "epoch": 17.74, "grad_norm": 0.2714906930923462, "learning_rate": 0.0001239883543112575, "loss": 2.7715, "step": 330000 }, { "epoch": 17.79, "grad_norm": 0.3475322425365448, "learning_rate": 0.00012105046465121751, "loss": 2.7704, "step": 331000 }, { "epoch": 17.85, "grad_norm": 0.2924496531486511, "learning_rate": 0.00011810963416068698, "loss": 2.7695, "step": 332000 }, { "epoch": 17.9, "grad_norm": 0.2651541829109192, "learning_rate": 0.00011516880367015646, "loss": 2.7744, "step": 333000 }, { "epoch": 17.96, "grad_norm": 0.27891284227371216, "learning_rate": 0.00011223091401011645, "loss": 2.773, "step": 334000 }, { "epoch": 18.0, "eval_accuracy": 0.41146578490748537, "eval_loss": 3.395080804824829, "eval_runtime": 154.4425, "eval_samples_per_second": 375.033, "eval_steps_per_second": 5.866, "step": 334836 }, { "epoch": 18.01, "grad_norm": 0.3232966363430023, "learning_rate": 0.00010929008351958593, "loss": 2.767, "step": 335000 }, { "epoch": 18.06, "grad_norm": 0.3170246481895447, "learning_rate": 0.00010635219385954593, "loss": 2.7387, "step": 336000 }, { "epoch": 18.12, "grad_norm": 0.28522101044654846, "learning_rate": 0.00010341136336901541, "loss": 2.7415, "step": 337000 }, { "epoch": 18.17, "grad_norm": 0.35791322588920593, "learning_rate": 0.00010047347370897542, "loss": 2.741, "step": 338000 }, { "epoch": 18.22, "grad_norm": 0.2994053065776825, "learning_rate": 9.75326432184449e-05, "loss": 2.7415, "step": 339000 }, { "epoch": 18.28, "grad_norm": 0.31925782561302185, "learning_rate": 9.459181272791436e-05, "loss": 2.7482, "step": 340000 }, { "epoch": 18.33, "grad_norm": 0.27817079424858093, "learning_rate": 9.165098223738384e-05, "loss": 2.7465, "step": 341000 }, { "epoch": 18.39, "grad_norm": 0.27790531516075134, "learning_rate": 8.871309257734385e-05, "loss": 2.7479, "step": 342000 }, { "epoch": 18.44, "grad_norm": 0.33701515197753906, "learning_rate": 8.577226208681333e-05, "loss": 2.749, "step": 343000 }, { "epoch": 18.49, "grad_norm": 0.32259926199913025, "learning_rate": 8.283143159628278e-05, "loss": 2.7484, "step": 344000 }, { "epoch": 18.55, "grad_norm": 0.2922397255897522, "learning_rate": 7.989060110575226e-05, "loss": 2.7539, "step": 345000 }, { "epoch": 18.6, "grad_norm": 0.27831122279167175, "learning_rate": 7.694977061522174e-05, "loss": 2.7501, "step": 346000 }, { "epoch": 18.65, "grad_norm": 0.29930371046066284, "learning_rate": 7.401188095518175e-05, "loss": 2.7528, "step": 347000 }, { "epoch": 18.71, "grad_norm": 0.2781750559806824, "learning_rate": 7.107105046465121e-05, "loss": 2.7542, "step": 348000 }, { "epoch": 18.76, "grad_norm": 0.34259232878685, "learning_rate": 6.813316080461122e-05, "loss": 2.7519, "step": 349000 }, { "epoch": 18.82, "grad_norm": 0.27175283432006836, "learning_rate": 6.51923303140807e-05, "loss": 2.7517, "step": 350000 }, { "epoch": 18.87, "grad_norm": 0.28221696615219116, "learning_rate": 6.225149982355017e-05, "loss": 2.7564, "step": 351000 }, { "epoch": 18.92, "grad_norm": 0.29943257570266724, "learning_rate": 5.931361016351018e-05, "loss": 2.7532, "step": 352000 }, { "epoch": 18.98, "grad_norm": 0.31146806478500366, "learning_rate": 5.637277967297965e-05, "loss": 2.7517, "step": 353000 }, { "epoch": 19.0, "eval_accuracy": 0.41124107255441245, "eval_loss": 3.40229868888855, "eval_runtime": 154.6135, "eval_samples_per_second": 374.618, "eval_steps_per_second": 5.86, "step": 353438 }, { "epoch": 19.03, "grad_norm": 0.2778169512748718, "learning_rate": 5.343194918244913e-05, "loss": 2.7402, "step": 354000 }, { "epoch": 19.08, "grad_norm": 0.27510160207748413, "learning_rate": 5.04911186919186e-05, "loss": 2.733, "step": 355000 }, { "epoch": 19.14, "grad_norm": 0.29721584916114807, "learning_rate": 4.75532290318786e-05, "loss": 2.7334, "step": 356000 }, { "epoch": 19.19, "grad_norm": 0.3098326623439789, "learning_rate": 4.4612398541348075e-05, "loss": 2.7346, "step": 357000 }, { "epoch": 19.25, "grad_norm": 0.3339441418647766, "learning_rate": 4.1671568050817554e-05, "loss": 2.7324, "step": 358000 }, { "epoch": 19.3, "grad_norm": 0.29926493763923645, "learning_rate": 3.873367839077756e-05, "loss": 2.7342, "step": 359000 }, { "epoch": 19.35, "grad_norm": 0.32908961176872253, "learning_rate": 3.579578873073756e-05, "loss": 2.7342, "step": 360000 }, { "epoch": 19.41, "grad_norm": 0.3179979622364044, "learning_rate": 3.285495824020703e-05, "loss": 2.7333, "step": 361000 }, { "epoch": 19.46, "grad_norm": 0.34039121866226196, "learning_rate": 2.9914127749676508e-05, "loss": 2.7342, "step": 362000 }, { "epoch": 19.51, "grad_norm": 0.29654234647750854, "learning_rate": 2.6973297259145983e-05, "loss": 2.7365, "step": 363000 }, { "epoch": 19.57, "grad_norm": 0.26721277832984924, "learning_rate": 2.403540759910599e-05, "loss": 2.7342, "step": 364000 }, { "epoch": 19.62, "grad_norm": 0.33862730860710144, "learning_rate": 2.109457710857546e-05, "loss": 2.7342, "step": 365000 }, { "epoch": 19.68, "grad_norm": 0.35417601466178894, "learning_rate": 1.8153746618044937e-05, "loss": 2.7328, "step": 366000 }, { "epoch": 19.73, "grad_norm": 0.2858126163482666, "learning_rate": 1.5215856958004942e-05, "loss": 2.7381, "step": 367000 }, { "epoch": 19.78, "grad_norm": 0.28896602988243103, "learning_rate": 1.2275026467474415e-05, "loss": 2.7363, "step": 368000 }, { "epoch": 19.84, "grad_norm": 0.28532859683036804, "learning_rate": 9.334195976943889e-06, "loss": 2.7336, "step": 369000 }, { "epoch": 19.89, "grad_norm": 0.28909268975257874, "learning_rate": 6.393365486413364e-06, "loss": 2.7353, "step": 370000 }, { "epoch": 19.94, "grad_norm": 0.32802271842956543, "learning_rate": 3.455475826373368e-06, "loss": 2.7323, "step": 371000 }, { "epoch": 20.0, "grad_norm": 0.28372275829315186, "learning_rate": 5.146453358428421e-07, "loss": 2.7342, "step": 372000 }, { "epoch": 20.0, "eval_accuracy": 0.4110677518157195, "eval_loss": 3.411550760269165, "eval_runtime": 155.3905, "eval_samples_per_second": 372.745, "eval_steps_per_second": 5.83, "step": 372040 }, { "epoch": 20.0, "step": 372040, "total_flos": 1.56748665397248e+18, "train_loss": 3.012804690259411, "train_runtime": 81277.7938, "train_samples_per_second": 146.472, "train_steps_per_second": 4.577 } ], "logging_steps": 1000, "max_steps": 372040, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 5000, "total_flos": 1.56748665397248e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }