{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9986731534719153, "eval_steps": 142, "global_step": 1695, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 4.705155372619629, "learning_rate": 0.0001, "loss": 3.3182, "step": 1 }, { "epoch": 0.0, "eval_loss": 3.3362529277801514, "eval_runtime": 14.4366, "eval_samples_per_second": 33.041, "eval_steps_per_second": 8.312, "step": 1 }, { "epoch": 0.0, "grad_norm": 4.644563674926758, "learning_rate": 0.0002, "loss": 3.2788, "step": 2 }, { "epoch": 0.01, "grad_norm": 4.3825764656066895, "learning_rate": 0.0003, "loss": 2.9231, "step": 3 }, { "epoch": 0.01, "grad_norm": 2.904296636581421, "learning_rate": 0.0004, "loss": 1.1824, "step": 4 }, { "epoch": 0.01, "grad_norm": 1.6988284587860107, "learning_rate": 0.0005, "loss": 0.3472, "step": 5 }, { "epoch": 0.01, "grad_norm": 0.5742101073265076, "learning_rate": 0.0006, "loss": 0.1478, "step": 6 }, { "epoch": 0.01, "grad_norm": 0.6511944532394409, "learning_rate": 0.0007, "loss": 0.1532, "step": 7 }, { "epoch": 0.01, "grad_norm": 2.305083751678467, "learning_rate": 0.0008, "loss": 0.2397, "step": 8 }, { "epoch": 0.02, "grad_norm": 4.7435078620910645, "learning_rate": 0.0009000000000000001, "loss": 0.434, "step": 9 }, { "epoch": 0.02, "grad_norm": 0.6199779510498047, "learning_rate": 0.001, "loss": 0.1743, "step": 10 }, { "epoch": 0.02, "grad_norm": 0.14406554400920868, "learning_rate": 0.0009999991309598973, "loss": 0.1404, "step": 11 }, { "epoch": 0.02, "grad_norm": 0.1965201050043106, "learning_rate": 0.0009999965238426103, "loss": 0.1418, "step": 12 }, { "epoch": 0.02, "grad_norm": 26.520109176635742, "learning_rate": 0.0009999921786572016, "loss": 0.2689, "step": 13 }, { "epoch": 0.02, "grad_norm": 0.0870603695511818, "learning_rate": 0.0009999860954187755, "loss": 0.1338, "step": 14 }, { "epoch": 0.03, "grad_norm": 0.05200817808508873, "learning_rate": 0.0009999782741484788, "loss": 0.1308, "step": 15 }, { "epoch": 0.03, "grad_norm": 0.2145700752735138, "learning_rate": 0.0009999687148734995, "loss": 0.1375, "step": 16 }, { "epoch": 0.03, "grad_norm": 0.19921083748340607, "learning_rate": 0.0009999574176270667, "loss": 0.1388, "step": 17 }, { "epoch": 0.03, "grad_norm": 1.193419337272644, "learning_rate": 0.0009999443824484518, "loss": 0.1978, "step": 18 }, { "epoch": 0.03, "grad_norm": 0.4399484395980835, "learning_rate": 0.0009999296093829671, "loss": 0.1518, "step": 19 }, { "epoch": 0.04, "grad_norm": 44.88853073120117, "learning_rate": 0.0009999130984819661, "loss": 0.9033, "step": 20 }, { "epoch": 0.04, "grad_norm": 0.3220385015010834, "learning_rate": 0.0009998948498028434, "loss": 0.1234, "step": 21 }, { "epoch": 0.04, "grad_norm": 0.5420748591423035, "learning_rate": 0.0009998748634090344, "loss": 0.1602, "step": 22 }, { "epoch": 0.04, "grad_norm": 0.5249865651130676, "learning_rate": 0.0009998531393700149, "loss": 0.1538, "step": 23 }, { "epoch": 0.04, "grad_norm": 0.056158341467380524, "learning_rate": 0.000999829677761301, "loss": 0.1374, "step": 24 }, { "epoch": 0.04, "grad_norm": 0.19818872213363647, "learning_rate": 0.0009998044786644492, "loss": 0.1413, "step": 25 }, { "epoch": 0.05, "grad_norm": 0.27901849150657654, "learning_rate": 0.0009997775421670557, "loss": 0.1395, "step": 26 }, { "epoch": 0.05, "grad_norm": 0.22768354415893555, "learning_rate": 0.0009997488683627558, "loss": 0.1241, "step": 27 }, { "epoch": 0.05, "grad_norm": 0.14878959953784943, "learning_rate": 0.0009997184573512245, "loss": 0.1243, "step": 28 }, { "epoch": 0.05, "grad_norm": 1.0589066743850708, "learning_rate": 0.000999686309238175, "loss": 0.2499, "step": 29 }, { "epoch": 0.05, "grad_norm": 0.11455405503511429, "learning_rate": 0.00099965242413536, "loss": 0.1254, "step": 30 }, { "epoch": 0.05, "grad_norm": 0.16566088795661926, "learning_rate": 0.000999616802160569, "loss": 0.1416, "step": 31 }, { "epoch": 0.06, "grad_norm": 1.3691716194152832, "learning_rate": 0.0009995794434376297, "loss": 0.1465, "step": 32 }, { "epoch": 0.06, "grad_norm": 0.09674070030450821, "learning_rate": 0.000999540348096407, "loss": 0.1373, "step": 33 }, { "epoch": 0.06, "grad_norm": 0.5034632086753845, "learning_rate": 0.000999499516272803, "loss": 0.1471, "step": 34 }, { "epoch": 0.06, "grad_norm": 0.26572930812835693, "learning_rate": 0.0009994569481087553, "loss": 0.1424, "step": 35 }, { "epoch": 0.06, "grad_norm": 0.20631802082061768, "learning_rate": 0.0009994126437522376, "loss": 0.1449, "step": 36 }, { "epoch": 0.07, "grad_norm": 0.11268749833106995, "learning_rate": 0.0009993666033572591, "loss": 0.1403, "step": 37 }, { "epoch": 0.07, "grad_norm": 0.6610996723175049, "learning_rate": 0.0009993188270838635, "loss": 0.1424, "step": 38 }, { "epoch": 0.07, "grad_norm": 98.93838500976562, "learning_rate": 0.0009992693150981291, "loss": 2.775, "step": 39 }, { "epoch": 0.07, "grad_norm": 32.53168869018555, "learning_rate": 0.0009992180675721671, "loss": 0.6932, "step": 40 }, { "epoch": 0.07, "grad_norm": 54.8778076171875, "learning_rate": 0.0009991650846841226, "loss": 5.7008, "step": 41 }, { "epoch": 0.07, "grad_norm": 2.0524775981903076, "learning_rate": 0.000999110366618172, "loss": 0.1623, "step": 42 }, { "epoch": 0.08, "grad_norm": 0.404278427362442, "learning_rate": 0.0009990539135645246, "loss": 0.1427, "step": 43 }, { "epoch": 0.08, "grad_norm": 1.7963409423828125, "learning_rate": 0.0009989957257194198, "loss": 0.174, "step": 44 }, { "epoch": 0.08, "grad_norm": 0.11620022356510162, "learning_rate": 0.0009989358032851284, "loss": 0.1339, "step": 45 }, { "epoch": 0.08, "grad_norm": 0.5025681853294373, "learning_rate": 0.00099887414646995, "loss": 0.1558, "step": 46 }, { "epoch": 0.08, "grad_norm": 78.1165771484375, "learning_rate": 0.0009988107554882138, "loss": 2.2938, "step": 47 }, { "epoch": 0.08, "grad_norm": 0.08389786630868912, "learning_rate": 0.0009987456305602768, "loss": 0.1409, "step": 48 }, { "epoch": 0.09, "grad_norm": 7.123101711273193, "learning_rate": 0.0009986787719125242, "loss": 0.1524, "step": 49 }, { "epoch": 0.09, "grad_norm": 0.5341290235519409, "learning_rate": 0.0009986101797773666, "loss": 0.1598, "step": 50 }, { "epoch": 0.09, "grad_norm": 0.05239284038543701, "learning_rate": 0.000998539854393242, "loss": 0.1386, "step": 51 }, { "epoch": 0.09, "grad_norm": 0.0722254291176796, "learning_rate": 0.0009984677960046123, "loss": 0.1385, "step": 52 }, { "epoch": 0.09, "grad_norm": 0.11535236239433289, "learning_rate": 0.000998394004861964, "loss": 0.1369, "step": 53 }, { "epoch": 0.1, "grad_norm": 0.7584894299507141, "learning_rate": 0.0009983184812218072, "loss": 0.108, "step": 54 }, { "epoch": 0.1, "grad_norm": 0.8361538052558899, "learning_rate": 0.000998241225346674, "loss": 0.1703, "step": 55 }, { "epoch": 0.1, "grad_norm": 0.37683162093162537, "learning_rate": 0.0009981622375051184, "loss": 0.1368, "step": 56 }, { "epoch": 0.1, "grad_norm": 0.5335961580276489, "learning_rate": 0.0009980815179717144, "loss": 0.1559, "step": 57 }, { "epoch": 0.1, "grad_norm": 0.2806299328804016, "learning_rate": 0.0009979990670270565, "loss": 0.1397, "step": 58 }, { "epoch": 0.1, "grad_norm": 0.4967437982559204, "learning_rate": 0.0009979148849577574, "loss": 0.1543, "step": 59 }, { "epoch": 0.11, "grad_norm": 0.10350017994642258, "learning_rate": 0.0009978289720564471, "loss": 0.1367, "step": 60 }, { "epoch": 0.11, "grad_norm": 78.35698699951172, "learning_rate": 0.0009977413286217727, "loss": 2.2474, "step": 61 }, { "epoch": 0.11, "grad_norm": 0.15264186263084412, "learning_rate": 0.0009976519549583973, "loss": 0.1311, "step": 62 }, { "epoch": 0.11, "grad_norm": 0.31865784525871277, "learning_rate": 0.0009975608513769975, "loss": 0.1407, "step": 63 }, { "epoch": 0.11, "grad_norm": 0.32891547679901123, "learning_rate": 0.0009974680181942645, "loss": 0.1423, "step": 64 }, { "epoch": 0.11, "grad_norm": 0.15653717517852783, "learning_rate": 0.0009973734557329008, "loss": 0.1365, "step": 65 }, { "epoch": 0.12, "grad_norm": 0.3237778842449188, "learning_rate": 0.0009972771643216212, "loss": 0.1407, "step": 66 }, { "epoch": 0.12, "grad_norm": 0.13634416460990906, "learning_rate": 0.0009971791442951496, "loss": 0.1378, "step": 67 }, { "epoch": 0.12, "grad_norm": 0.3488883376121521, "learning_rate": 0.0009970793959942197, "loss": 0.1429, "step": 68 }, { "epoch": 0.12, "grad_norm": 0.5150622129440308, "learning_rate": 0.0009969779197655725, "loss": 0.1492, "step": 69 }, { "epoch": 0.12, "grad_norm": 0.3482552468776703, "learning_rate": 0.0009968747159619555, "loss": 0.1415, "step": 70 }, { "epoch": 0.13, "grad_norm": 0.22551549971103668, "learning_rate": 0.000996769784942122, "loss": 0.1418, "step": 71 }, { "epoch": 0.13, "grad_norm": 0.20759086310863495, "learning_rate": 0.0009966631270708287, "loss": 0.1366, "step": 72 }, { "epoch": 0.13, "grad_norm": 13.050313949584961, "learning_rate": 0.0009965547427188356, "loss": 0.1375, "step": 73 }, { "epoch": 0.13, "grad_norm": 0.18372055888175964, "learning_rate": 0.0009964446322629043, "loss": 0.1285, "step": 74 }, { "epoch": 0.13, "grad_norm": 0.4404817819595337, "learning_rate": 0.000996332796085796, "loss": 0.1501, "step": 75 }, { "epoch": 0.13, "grad_norm": 1.269240379333496, "learning_rate": 0.0009962192345762716, "loss": 0.1346, "step": 76 }, { "epoch": 0.14, "grad_norm": 32.20164108276367, "learning_rate": 0.0009961039481290888, "loss": 0.3348, "step": 77 }, { "epoch": 0.14, "grad_norm": 82.92976379394531, "learning_rate": 0.0009959869371450021, "loss": 5.8309, "step": 78 }, { "epoch": 0.14, "grad_norm": 0.3416314721107483, "learning_rate": 0.0009958682020307602, "loss": 0.1418, "step": 79 }, { "epoch": 0.14, "grad_norm": 31.961870193481445, "learning_rate": 0.0009957477431991053, "loss": 0.1899, "step": 80 }, { "epoch": 0.14, "grad_norm": 38.58375930786133, "learning_rate": 0.000995625561068772, "loss": 0.5641, "step": 81 }, { "epoch": 0.15, "grad_norm": 0.32622194290161133, "learning_rate": 0.0009955016560644846, "loss": 0.1144, "step": 82 }, { "epoch": 0.15, "grad_norm": 6.264970779418945, "learning_rate": 0.0009953760286169572, "loss": 0.4788, "step": 83 }, { "epoch": 0.15, "grad_norm": 0.07168668508529663, "learning_rate": 0.0009952486791628904, "loss": 0.1326, "step": 84 }, { "epoch": 0.15, "grad_norm": 35.18340301513672, "learning_rate": 0.000995119608144972, "loss": 0.3884, "step": 85 }, { "epoch": 0.15, "grad_norm": 0.03896519914269447, "learning_rate": 0.000994988816011873, "loss": 0.1249, "step": 86 }, { "epoch": 0.15, "grad_norm": 14.499520301818848, "learning_rate": 0.000994856303218248, "loss": 0.3756, "step": 87 }, { "epoch": 0.16, "grad_norm": 0.3134947419166565, "learning_rate": 0.000994722070224733, "loss": 0.1539, "step": 88 }, { "epoch": 0.16, "grad_norm": 117.39696502685547, "learning_rate": 0.000994586117497943, "loss": 0.5885, "step": 89 }, { "epoch": 0.16, "grad_norm": 37.93465805053711, "learning_rate": 0.0009944484455104716, "loss": 0.7709, "step": 90 }, { "epoch": 0.16, "grad_norm": 236.63330078125, "learning_rate": 0.0009943090547408888, "loss": 6.0182, "step": 91 }, { "epoch": 0.16, "grad_norm": 1.1088515520095825, "learning_rate": 0.0009941679456737394, "loss": 0.1931, "step": 92 }, { "epoch": 0.16, "grad_norm": 0.11310256272554398, "learning_rate": 0.0009940251187995411, "loss": 0.1293, "step": 93 }, { "epoch": 0.17, "grad_norm": 0.6143047213554382, "learning_rate": 0.0009938805746147828, "loss": 0.2364, "step": 94 }, { "epoch": 0.17, "grad_norm": 0.2461577206850052, "learning_rate": 0.0009937343136219232, "loss": 0.1504, "step": 95 }, { "epoch": 0.17, "grad_norm": 97.17162322998047, "learning_rate": 0.0009935863363293895, "loss": 5.764, "step": 96 }, { "epoch": 0.17, "grad_norm": 0.5417380928993225, "learning_rate": 0.000993436643251574, "loss": 0.1576, "step": 97 }, { "epoch": 0.17, "grad_norm": 0.2737255096435547, "learning_rate": 0.0009932852349088341, "loss": 0.1437, "step": 98 }, { "epoch": 0.18, "grad_norm": 138.00778198242188, "learning_rate": 0.0009931321118274896, "loss": 4.1331, "step": 99 }, { "epoch": 0.18, "grad_norm": 46.4688606262207, "learning_rate": 0.0009929772745398205, "loss": 0.6178, "step": 100 }, { "epoch": 0.18, "grad_norm": 0.49907386302948, "learning_rate": 0.0009928207235840663, "loss": 0.1445, "step": 101 }, { "epoch": 0.18, "grad_norm": 0.33814460039138794, "learning_rate": 0.0009926624595044233, "loss": 0.139, "step": 102 }, { "epoch": 0.18, "grad_norm": 0.3241071403026581, "learning_rate": 0.0009925024828510427, "loss": 0.1404, "step": 103 }, { "epoch": 0.18, "grad_norm": 78.4036865234375, "learning_rate": 0.000992340794180029, "loss": 1.2663, "step": 104 }, { "epoch": 0.19, "grad_norm": 1.113776445388794, "learning_rate": 0.000992177394053438, "loss": 0.162, "step": 105 }, { "epoch": 0.19, "grad_norm": 16.512048721313477, "learning_rate": 0.0009920122830392748, "loss": 3.3373, "step": 106 }, { "epoch": 0.19, "grad_norm": 111.53176879882812, "learning_rate": 0.0009918454617114918, "loss": 2.3969, "step": 107 }, { "epoch": 0.19, "grad_norm": 14.91741943359375, "learning_rate": 0.0009916769306499865, "loss": 1.8837, "step": 108 }, { "epoch": 0.19, "grad_norm": 61.30055618286133, "learning_rate": 0.0009915066904406, "loss": 10.4922, "step": 109 }, { "epoch": 0.19, "grad_norm": 0.6948704123497009, "learning_rate": 0.0009913347416751147, "loss": 0.1536, "step": 110 }, { "epoch": 0.2, "grad_norm": 0.7721084356307983, "learning_rate": 0.000991161084951252, "loss": 0.1356, "step": 111 }, { "epoch": 0.2, "grad_norm": 0.24614596366882324, "learning_rate": 0.0009909857208726704, "loss": 0.1339, "step": 112 }, { "epoch": 0.2, "grad_norm": 7.189969062805176, "learning_rate": 0.0009908086500489638, "loss": 0.2551, "step": 113 }, { "epoch": 0.2, "grad_norm": 0.8675662279129028, "learning_rate": 0.0009906298730956585, "loss": 0.1668, "step": 114 }, { "epoch": 0.2, "grad_norm": 0.605249285697937, "learning_rate": 0.0009904493906342123, "loss": 0.1478, "step": 115 }, { "epoch": 0.21, "grad_norm": 0.8765722513198853, "learning_rate": 0.0009902672032920106, "loss": 0.1598, "step": 116 }, { "epoch": 0.21, "grad_norm": 0.6021157503128052, "learning_rate": 0.0009900833117023665, "loss": 0.1506, "step": 117 }, { "epoch": 0.21, "grad_norm": 0.28180792927742004, "learning_rate": 0.000989897716504516, "loss": 0.1389, "step": 118 }, { "epoch": 0.21, "grad_norm": 0.21730898320674896, "learning_rate": 0.0009897104183436184, "loss": 0.1377, "step": 119 }, { "epoch": 0.21, "grad_norm": 0.977118730545044, "learning_rate": 0.0009895214178707516, "loss": 0.1698, "step": 120 }, { "epoch": 0.21, "grad_norm": 2.674729585647583, "learning_rate": 0.0009893307157429118, "loss": 0.1559, "step": 121 }, { "epoch": 0.22, "grad_norm": 0.9852035045623779, "learning_rate": 0.0009891383126230102, "loss": 0.2027, "step": 122 }, { "epoch": 0.22, "grad_norm": 0.36689773201942444, "learning_rate": 0.0009889442091798712, "loss": 0.1498, "step": 123 }, { "epoch": 0.22, "grad_norm": 0.104621522128582, "learning_rate": 0.000988748406088229, "loss": 0.1379, "step": 124 }, { "epoch": 0.22, "grad_norm": 74.17496490478516, "learning_rate": 0.0009885509040287268, "loss": 0.7724, "step": 125 }, { "epoch": 0.22, "grad_norm": 1.2943025827407837, "learning_rate": 0.0009883517036879132, "loss": 0.2643, "step": 126 }, { "epoch": 0.22, "grad_norm": 0.828774094581604, "learning_rate": 0.000988150805758241, "loss": 0.1852, "step": 127 }, { "epoch": 0.23, "grad_norm": 0.13165877759456635, "learning_rate": 0.0009879482109380632, "loss": 0.1429, "step": 128 }, { "epoch": 0.23, "grad_norm": 0.662426769733429, "learning_rate": 0.0009877439199316323, "loss": 0.1643, "step": 129 }, { "epoch": 0.23, "grad_norm": 0.6256189942359924, "learning_rate": 0.0009875379334490962, "loss": 0.157, "step": 130 }, { "epoch": 0.23, "grad_norm": 0.5049256086349487, "learning_rate": 0.0009873302522064972, "loss": 0.1484, "step": 131 }, { "epoch": 0.23, "grad_norm": 1.4133671522140503, "learning_rate": 0.0009871208769257685, "loss": 0.1736, "step": 132 }, { "epoch": 0.24, "grad_norm": 0.7930824756622314, "learning_rate": 0.0009869098083347323, "loss": 0.1543, "step": 133 }, { "epoch": 0.24, "grad_norm": 0.5717449188232422, "learning_rate": 0.0009866970471670965, "loss": 0.1338, "step": 134 }, { "epoch": 0.24, "grad_norm": 0.582081139087677, "learning_rate": 0.0009864825941624537, "loss": 0.1692, "step": 135 }, { "epoch": 0.24, "grad_norm": 10.226588249206543, "learning_rate": 0.0009862664500662763, "loss": 0.2425, "step": 136 }, { "epoch": 0.24, "grad_norm": 1.1186953783035278, "learning_rate": 0.0009860486156299164, "loss": 0.2052, "step": 137 }, { "epoch": 0.24, "grad_norm": 0.2953661382198334, "learning_rate": 0.000985829091610601, "loss": 0.1408, "step": 138 }, { "epoch": 0.25, "grad_norm": 0.8647088408470154, "learning_rate": 0.000985607878771431, "loss": 0.1571, "step": 139 }, { "epoch": 0.25, "grad_norm": 0.41964420676231384, "learning_rate": 0.0009853849778813776, "loss": 0.1477, "step": 140 }, { "epoch": 0.25, "grad_norm": 0.25675931572914124, "learning_rate": 0.0009851603897152803, "loss": 0.1398, "step": 141 }, { "epoch": 0.25, "grad_norm": 0.2311631143093109, "learning_rate": 0.0009849341150538434, "loss": 0.1432, "step": 142 }, { "epoch": 0.25, "eval_loss": 1.5366541147232056, "eval_runtime": 14.6962, "eval_samples_per_second": 32.457, "eval_steps_per_second": 8.165, "step": 142 }, { "epoch": 0.25, "grad_norm": 41.83562469482422, "learning_rate": 0.0009847061546836339, "loss": 1.1525, "step": 143 }, { "epoch": 0.25, "grad_norm": 0.27440375089645386, "learning_rate": 0.0009844765093970787, "loss": 0.1452, "step": 144 }, { "epoch": 0.26, "grad_norm": 0.27643319964408875, "learning_rate": 0.0009842451799924616, "loss": 0.1069, "step": 145 }, { "epoch": 0.26, "grad_norm": 0.21519601345062256, "learning_rate": 0.0009840121672739207, "loss": 0.1358, "step": 146 }, { "epoch": 0.26, "grad_norm": 0.4073689877986908, "learning_rate": 0.0009837774720514456, "loss": 0.1545, "step": 147 }, { "epoch": 0.26, "grad_norm": 0.13685636222362518, "learning_rate": 0.0009835410951408747, "loss": 0.1259, "step": 148 }, { "epoch": 0.26, "grad_norm": 0.07474564015865326, "learning_rate": 0.000983303037363892, "loss": 0.1356, "step": 149 }, { "epoch": 0.27, "grad_norm": 0.45116662979125977, "learning_rate": 0.0009830632995480241, "loss": 0.1379, "step": 150 }, { "epoch": 0.27, "grad_norm": 0.1297813504934311, "learning_rate": 0.0009828218825266388, "loss": 0.1343, "step": 151 }, { "epoch": 0.27, "grad_norm": 0.5846492052078247, "learning_rate": 0.00098257878713894, "loss": 0.1563, "step": 152 }, { "epoch": 0.27, "grad_norm": 0.38457778096199036, "learning_rate": 0.0009823340142299662, "loss": 0.1477, "step": 153 }, { "epoch": 0.27, "grad_norm": 0.09184035658836365, "learning_rate": 0.0009820875646505873, "loss": 0.1376, "step": 154 }, { "epoch": 0.27, "grad_norm": 0.5166211128234863, "learning_rate": 0.0009818394392575019, "loss": 0.1498, "step": 155 }, { "epoch": 0.28, "grad_norm": 0.2788640260696411, "learning_rate": 0.0009815896389132332, "loss": 0.1434, "step": 156 }, { "epoch": 0.28, "grad_norm": 0.3762676417827606, "learning_rate": 0.0009813381644861276, "loss": 0.1482, "step": 157 }, { "epoch": 0.28, "grad_norm": 0.3615610897541046, "learning_rate": 0.0009810850168503506, "loss": 0.1312, "step": 158 }, { "epoch": 0.28, "grad_norm": 0.03483320027589798, "learning_rate": 0.0009808301968858837, "loss": 0.1239, "step": 159 }, { "epoch": 0.28, "grad_norm": 0.5616227984428406, "learning_rate": 0.0009805737054785222, "loss": 0.1881, "step": 160 }, { "epoch": 0.28, "grad_norm": 0.029542161151766777, "learning_rate": 0.000980315543519871, "loss": 0.1254, "step": 161 }, { "epoch": 0.29, "grad_norm": 0.142581045627594, "learning_rate": 0.0009800557119073433, "loss": 0.1258, "step": 162 }, { "epoch": 0.29, "grad_norm": 0.7289375066757202, "learning_rate": 0.0009797942115441546, "loss": 0.1526, "step": 163 }, { "epoch": 0.29, "grad_norm": 0.6975064873695374, "learning_rate": 0.0009795310433393224, "loss": 0.1487, "step": 164 }, { "epoch": 0.29, "grad_norm": 1.3072260618209839, "learning_rate": 0.0009792662082076617, "loss": 0.1712, "step": 165 }, { "epoch": 0.29, "grad_norm": 0.2993917465209961, "learning_rate": 0.000978999707069782, "loss": 0.1424, "step": 166 }, { "epoch": 0.3, "grad_norm": 0.3258236050605774, "learning_rate": 0.0009787315408520839, "loss": 0.135, "step": 167 }, { "epoch": 0.3, "grad_norm": 0.26566603779792786, "learning_rate": 0.000978461710486756, "loss": 0.1441, "step": 168 }, { "epoch": 0.3, "grad_norm": 1.1709599494934082, "learning_rate": 0.0009781902169117718, "loss": 0.2084, "step": 169 }, { "epoch": 0.3, "grad_norm": 0.6554279923439026, "learning_rate": 0.000977917061070887, "loss": 0.1634, "step": 170 }, { "epoch": 0.3, "grad_norm": 0.1635073721408844, "learning_rate": 0.000977642243913635, "loss": 0.1371, "step": 171 }, { "epoch": 0.3, "grad_norm": 0.4419834017753601, "learning_rate": 0.0009773657663953242, "loss": 0.1523, "step": 172 }, { "epoch": 0.31, "grad_norm": 0.839259147644043, "learning_rate": 0.000977087629477035, "loss": 0.1628, "step": 173 }, { "epoch": 0.31, "grad_norm": 0.1979222148656845, "learning_rate": 0.0009768078341256155, "loss": 0.1367, "step": 174 }, { "epoch": 0.31, "grad_norm": 0.2939910888671875, "learning_rate": 0.0009765263813136795, "loss": 0.1349, "step": 175 }, { "epoch": 0.31, "grad_norm": 0.19882674515247345, "learning_rate": 0.0009762432720196024, "loss": 0.1424, "step": 176 }, { "epoch": 0.31, "grad_norm": 0.07146954536437988, "learning_rate": 0.000975958507227517, "loss": 0.1237, "step": 177 }, { "epoch": 0.31, "grad_norm": 0.5031868815422058, "learning_rate": 0.0009756720879273117, "loss": 0.1592, "step": 178 }, { "epoch": 0.32, "grad_norm": 0.14860151708126068, "learning_rate": 0.0009753840151146258, "loss": 0.1396, "step": 179 }, { "epoch": 0.32, "grad_norm": 0.10280521959066391, "learning_rate": 0.0009750942897908468, "loss": 0.1333, "step": 180 }, { "epoch": 0.32, "grad_norm": 0.4652903974056244, "learning_rate": 0.0009748029129631061, "loss": 0.1421, "step": 181 }, { "epoch": 0.32, "grad_norm": 0.3985591530799866, "learning_rate": 0.0009745098856442768, "loss": 0.1459, "step": 182 }, { "epoch": 0.32, "grad_norm": 0.20321591198444366, "learning_rate": 0.0009742152088529683, "loss": 0.1381, "step": 183 }, { "epoch": 0.33, "grad_norm": 0.7694361805915833, "learning_rate": 0.0009739188836135246, "loss": 0.1676, "step": 184 }, { "epoch": 0.33, "grad_norm": 0.04469340294599533, "learning_rate": 0.0009736209109560201, "loss": 0.136, "step": 185 }, { "epoch": 0.33, "grad_norm": 0.08576061576604843, "learning_rate": 0.0009733212919162549, "loss": 0.1408, "step": 186 }, { "epoch": 0.33, "grad_norm": 0.042906519025564194, "learning_rate": 0.0009730200275357535, "loss": 0.1364, "step": 187 }, { "epoch": 0.33, "grad_norm": 0.30054494738578796, "learning_rate": 0.0009727171188617588, "loss": 0.1539, "step": 188 }, { "epoch": 0.33, "grad_norm": 0.05149005725979805, "learning_rate": 0.0009724125669472299, "loss": 0.1352, "step": 189 }, { "epoch": 0.34, "grad_norm": 0.1381620466709137, "learning_rate": 0.0009721063728508383, "loss": 0.1409, "step": 190 }, { "epoch": 0.34, "grad_norm": 0.37344205379486084, "learning_rate": 0.0009717985376369639, "loss": 0.1299, "step": 191 }, { "epoch": 0.34, "grad_norm": 0.1037706583738327, "learning_rate": 0.0009714890623756912, "loss": 0.1341, "step": 192 }, { "epoch": 0.34, "grad_norm": 0.14189712703227997, "learning_rate": 0.0009711779481428056, "loss": 0.1418, "step": 193 }, { "epoch": 0.34, "grad_norm": 0.15108801424503326, "learning_rate": 0.0009708651960197903, "loss": 0.142, "step": 194 }, { "epoch": 0.34, "grad_norm": 0.037045519798994064, "learning_rate": 0.0009705508070938218, "loss": 0.1315, "step": 195 }, { "epoch": 0.35, "grad_norm": 0.23301652073860168, "learning_rate": 0.0009702347824577666, "loss": 0.1396, "step": 196 }, { "epoch": 0.35, "grad_norm": 0.08476269990205765, "learning_rate": 0.0009699171232101768, "loss": 0.1392, "step": 197 }, { "epoch": 0.35, "grad_norm": 0.4222690463066101, "learning_rate": 0.000969597830455287, "loss": 0.1463, "step": 198 }, { "epoch": 0.35, "grad_norm": 0.3234136402606964, "learning_rate": 0.0009692769053030099, "loss": 0.1257, "step": 199 }, { "epoch": 0.35, "grad_norm": 0.04025443643331528, "learning_rate": 0.0009689543488689332, "loss": 0.1303, "step": 200 }, { "epoch": 0.36, "grad_norm": 0.07074520736932755, "learning_rate": 0.0009686301622743144, "loss": 0.1289, "step": 201 }, { "epoch": 0.36, "grad_norm": 0.0788850486278534, "learning_rate": 0.0009683043466460782, "loss": 0.1236, "step": 202 }, { "epoch": 0.36, "grad_norm": 0.525541365146637, "learning_rate": 0.000967976903116812, "loss": 0.1564, "step": 203 }, { "epoch": 0.36, "grad_norm": 0.6145509481430054, "learning_rate": 0.0009676478328247623, "loss": 0.156, "step": 204 }, { "epoch": 0.36, "grad_norm": 0.230132058262825, "learning_rate": 0.0009673171369138296, "loss": 0.1425, "step": 205 }, { "epoch": 0.36, "grad_norm": 0.03262978792190552, "learning_rate": 0.0009669848165335666, "loss": 0.1297, "step": 206 }, { "epoch": 0.37, "grad_norm": 0.0462469644844532, "learning_rate": 0.0009666508728391718, "loss": 0.1177, "step": 207 }, { "epoch": 0.37, "grad_norm": 0.06880385428667068, "learning_rate": 0.0009663153069914874, "loss": 0.1207, "step": 208 }, { "epoch": 0.37, "grad_norm": 0.4248260259628296, "learning_rate": 0.000965978120156994, "loss": 0.1571, "step": 209 }, { "epoch": 0.37, "grad_norm": 0.060492075979709625, "learning_rate": 0.0009656393135078068, "loss": 0.1219, "step": 210 }, { "epoch": 0.37, "grad_norm": 0.12135621905326843, "learning_rate": 0.0009652988882216725, "loss": 0.1323, "step": 211 }, { "epoch": 0.38, "grad_norm": 0.252119243144989, "learning_rate": 0.0009649568454819637, "loss": 0.1366, "step": 212 }, { "epoch": 0.38, "grad_norm": 0.5283567905426025, "learning_rate": 0.0009646131864776761, "loss": 0.1246, "step": 213 }, { "epoch": 0.38, "grad_norm": 2.224665880203247, "learning_rate": 0.0009642679124034233, "loss": 0.2582, "step": 214 }, { "epoch": 0.38, "grad_norm": 1.9277523756027222, "learning_rate": 0.0009639210244594335, "loss": 0.2131, "step": 215 }, { "epoch": 0.38, "grad_norm": 0.5668452978134155, "learning_rate": 0.0009635725238515446, "loss": 0.141, "step": 216 }, { "epoch": 0.38, "grad_norm": 0.13912492990493774, "learning_rate": 0.000963222411791201, "loss": 0.1418, "step": 217 }, { "epoch": 0.39, "grad_norm": 0.39307814836502075, "learning_rate": 0.0009628706894954479, "loss": 0.1477, "step": 218 }, { "epoch": 0.39, "grad_norm": 0.26248928904533386, "learning_rate": 0.000962517358186929, "loss": 0.1315, "step": 219 }, { "epoch": 0.39, "grad_norm": 0.2875257730484009, "learning_rate": 0.0009621624190938803, "loss": 0.1321, "step": 220 }, { "epoch": 0.39, "grad_norm": 0.6386964917182922, "learning_rate": 0.0009618058734501269, "loss": 0.1668, "step": 221 }, { "epoch": 0.39, "grad_norm": 0.16165001690387726, "learning_rate": 0.0009614477224950789, "loss": 0.1272, "step": 222 }, { "epoch": 0.39, "grad_norm": 0.6959558129310608, "learning_rate": 0.0009610879674737262, "loss": 0.1381, "step": 223 }, { "epoch": 0.4, "grad_norm": 0.1701437532901764, "learning_rate": 0.0009607266096366352, "loss": 0.1366, "step": 224 }, { "epoch": 0.4, "grad_norm": 0.2511409819126129, "learning_rate": 0.0009603636502399437, "loss": 0.126, "step": 225 }, { "epoch": 0.4, "grad_norm": 0.04554220288991928, "learning_rate": 0.0009599990905453566, "loss": 0.1321, "step": 226 }, { "epoch": 0.4, "grad_norm": 0.3964705765247345, "learning_rate": 0.000959632931820142, "loss": 0.1383, "step": 227 }, { "epoch": 0.4, "grad_norm": 0.10925984382629395, "learning_rate": 0.0009592651753371264, "loss": 0.1226, "step": 228 }, { "epoch": 0.41, "grad_norm": 0.19012318551540375, "learning_rate": 0.0009588958223746903, "loss": 0.1255, "step": 229 }, { "epoch": 0.41, "grad_norm": 0.23432157933712006, "learning_rate": 0.0009585248742167639, "loss": 0.1152, "step": 230 }, { "epoch": 0.41, "grad_norm": 0.1737753301858902, "learning_rate": 0.0009581523321528223, "loss": 0.1468, "step": 231 }, { "epoch": 0.41, "grad_norm": 0.2625434100627899, "learning_rate": 0.0009577781974778817, "loss": 0.1296, "step": 232 }, { "epoch": 0.41, "grad_norm": 0.3056884706020355, "learning_rate": 0.000957402471492494, "loss": 0.1574, "step": 233 }, { "epoch": 0.41, "grad_norm": 0.4111999273300171, "learning_rate": 0.0009570251555027432, "loss": 0.1434, "step": 234 }, { "epoch": 0.42, "grad_norm": 0.056673482060432434, "learning_rate": 0.0009566462508202401, "loss": 0.1337, "step": 235 }, { "epoch": 0.42, "grad_norm": 0.3861597180366516, "learning_rate": 0.0009562657587621184, "loss": 0.1609, "step": 236 }, { "epoch": 0.42, "grad_norm": 0.35893362760543823, "learning_rate": 0.0009558836806510292, "loss": 0.1189, "step": 237 }, { "epoch": 0.42, "grad_norm": 0.40538331866264343, "learning_rate": 0.0009555000178151374, "loss": 0.1504, "step": 238 }, { "epoch": 0.42, "grad_norm": 81.36141967773438, "learning_rate": 0.0009551147715881167, "loss": 4.7235, "step": 239 }, { "epoch": 0.42, "grad_norm": 0.21178042888641357, "learning_rate": 0.0009547279433091446, "loss": 0.1139, "step": 240 }, { "epoch": 0.43, "grad_norm": 0.27380529046058655, "learning_rate": 0.0009543395343228983, "loss": 0.1504, "step": 241 }, { "epoch": 0.43, "grad_norm": 41.42683410644531, "learning_rate": 0.0009539495459795498, "loss": 1.2477, "step": 242 }, { "epoch": 0.43, "grad_norm": 0.14853385090827942, "learning_rate": 0.0009535579796347612, "loss": 0.1343, "step": 243 }, { "epoch": 0.43, "grad_norm": 0.3484509289264679, "learning_rate": 0.0009531648366496798, "loss": 0.15, "step": 244 }, { "epoch": 0.43, "grad_norm": 0.20152732729911804, "learning_rate": 0.0009527701183909336, "loss": 0.1399, "step": 245 }, { "epoch": 0.44, "grad_norm": 80.84031677246094, "learning_rate": 0.000952373826230627, "loss": 3.1939, "step": 246 }, { "epoch": 0.44, "grad_norm": 15.475607872009277, "learning_rate": 0.0009519759615463346, "loss": 3.3935, "step": 247 }, { "epoch": 0.44, "grad_norm": 77.19477081298828, "learning_rate": 0.0009515765257210979, "loss": 6.5034, "step": 248 }, { "epoch": 0.44, "grad_norm": 0.1174071803689003, "learning_rate": 0.0009511755201434205, "loss": 0.1212, "step": 249 }, { "epoch": 0.44, "grad_norm": 16.503982543945312, "learning_rate": 0.0009507729462072614, "loss": 0.3753, "step": 250 }, { "epoch": 0.44, "grad_norm": 76.65412902832031, "learning_rate": 0.0009503688053120326, "loss": 0.9386, "step": 251 }, { "epoch": 0.45, "grad_norm": 94.82160186767578, "learning_rate": 0.0009499630988625925, "loss": 4.7449, "step": 252 }, { "epoch": 0.45, "grad_norm": 0.2721010148525238, "learning_rate": 0.0009495558282692421, "loss": 0.1358, "step": 253 }, { "epoch": 0.45, "grad_norm": 0.5150814056396484, "learning_rate": 0.0009491469949477187, "loss": 0.1622, "step": 254 }, { "epoch": 0.45, "grad_norm": 51.050167083740234, "learning_rate": 0.0009487366003191931, "loss": 0.7818, "step": 255 }, { "epoch": 0.45, "grad_norm": 11.698090553283691, "learning_rate": 0.0009483246458102625, "loss": 0.3862, "step": 256 }, { "epoch": 0.45, "grad_norm": 0.648543655872345, "learning_rate": 0.0009479111328529472, "loss": 0.1884, "step": 257 }, { "epoch": 0.46, "grad_norm": 0.745293140411377, "learning_rate": 0.0009474960628846843, "loss": 0.1562, "step": 258 }, { "epoch": 0.46, "grad_norm": 0.17890043556690216, "learning_rate": 0.0009470794373483235, "loss": 0.1425, "step": 259 }, { "epoch": 0.46, "grad_norm": 0.5058090686798096, "learning_rate": 0.0009466612576921223, "loss": 0.17, "step": 260 }, { "epoch": 0.46, "grad_norm": 1.3177820444107056, "learning_rate": 0.00094624152536974, "loss": 0.15, "step": 261 }, { "epoch": 0.46, "grad_norm": 0.49652573466300964, "learning_rate": 0.0009458202418402337, "loss": 0.145, "step": 262 }, { "epoch": 0.47, "grad_norm": 11.423394203186035, "learning_rate": 0.0009453974085680526, "loss": 0.349, "step": 263 }, { "epoch": 0.47, "grad_norm": 1.5422337055206299, "learning_rate": 0.0009449730270230326, "loss": 0.211, "step": 264 }, { "epoch": 0.47, "grad_norm": 103.68435668945312, "learning_rate": 0.0009445470986803921, "loss": 17.4069, "step": 265 }, { "epoch": 0.47, "grad_norm": 54.51758575439453, "learning_rate": 0.0009441196250207267, "loss": 15.685, "step": 266 }, { "epoch": 0.47, "grad_norm": 14.596623420715332, "learning_rate": 0.0009436906075300032, "loss": 0.791, "step": 267 }, { "epoch": 0.47, "grad_norm": 3.3164780139923096, "learning_rate": 0.000943260047699555, "loss": 0.3611, "step": 268 }, { "epoch": 0.48, "grad_norm": 0.3087855577468872, "learning_rate": 0.0009428279470260776, "loss": 0.1332, "step": 269 }, { "epoch": 0.48, "grad_norm": 1.1544523239135742, "learning_rate": 0.0009423943070116219, "loss": 0.2405, "step": 270 }, { "epoch": 0.48, "grad_norm": 0.27010253071784973, "learning_rate": 0.00094195912916359, "loss": 0.1241, "step": 271 }, { "epoch": 0.48, "grad_norm": 0.2287709265947342, "learning_rate": 0.0009415224149947306, "loss": 0.1366, "step": 272 }, { "epoch": 0.48, "grad_norm": 0.5216432809829712, "learning_rate": 0.0009410841660231316, "loss": 0.1641, "step": 273 }, { "epoch": 0.48, "grad_norm": 1.3091949224472046, "learning_rate": 0.0009406443837722167, "loss": 0.2524, "step": 274 }, { "epoch": 0.49, "grad_norm": 0.11813609302043915, "learning_rate": 0.0009402030697707398, "loss": 0.1353, "step": 275 }, { "epoch": 0.49, "grad_norm": 1.3709551095962524, "learning_rate": 0.000939760225552779, "loss": 0.2714, "step": 276 }, { "epoch": 0.49, "grad_norm": 8.527563095092773, "learning_rate": 0.0009393158526577322, "loss": 0.1955, "step": 277 }, { "epoch": 0.49, "grad_norm": 21.874027252197266, "learning_rate": 0.0009388699526303105, "loss": 0.2398, "step": 278 }, { "epoch": 0.49, "grad_norm": 51.793731689453125, "learning_rate": 0.0009384225270205339, "loss": 1.3069, "step": 279 }, { "epoch": 0.5, "grad_norm": 0.6711062788963318, "learning_rate": 0.0009379735773837259, "loss": 0.1664, "step": 280 }, { "epoch": 0.5, "grad_norm": 5.93789005279541, "learning_rate": 0.0009375231052805072, "loss": 0.2455, "step": 281 }, { "epoch": 0.5, "grad_norm": 62.527198791503906, "learning_rate": 0.0009370711122767912, "loss": 6.6447, "step": 282 }, { "epoch": 0.5, "grad_norm": 22.35348129272461, "learning_rate": 0.000936617599943778, "loss": 2.5015, "step": 283 }, { "epoch": 0.5, "grad_norm": 0.7277780175209045, "learning_rate": 0.0009361625698579493, "loss": 0.1667, "step": 284 }, { "epoch": 0.5, "eval_loss": 0.14179374277591705, "eval_runtime": 14.7139, "eval_samples_per_second": 32.418, "eval_steps_per_second": 8.156, "step": 284 }, { "epoch": 0.5, "grad_norm": 0.26271358132362366, "learning_rate": 0.0009357060236010625, "loss": 0.1429, "step": 285 }, { "epoch": 0.51, "grad_norm": 21.24464988708496, "learning_rate": 0.0009352479627601457, "loss": 2.0706, "step": 286 }, { "epoch": 0.51, "grad_norm": 6.5764265060424805, "learning_rate": 0.0009347883889274922, "loss": 0.3337, "step": 287 }, { "epoch": 0.51, "grad_norm": 0.6868380904197693, "learning_rate": 0.0009343273037006539, "loss": 0.1994, "step": 288 }, { "epoch": 0.51, "grad_norm": 0.9018234610557556, "learning_rate": 0.0009338647086824372, "loss": 0.1908, "step": 289 }, { "epoch": 0.51, "grad_norm": 1.7751502990722656, "learning_rate": 0.0009334006054808966, "loss": 0.2028, "step": 290 }, { "epoch": 0.51, "grad_norm": 0.5386408567428589, "learning_rate": 0.0009329349957093293, "loss": 0.1853, "step": 291 }, { "epoch": 0.52, "grad_norm": 1.4171103239059448, "learning_rate": 0.0009324678809862695, "loss": 0.3597, "step": 292 }, { "epoch": 0.52, "grad_norm": 0.4105970561504364, "learning_rate": 0.0009319992629354827, "loss": 0.1344, "step": 293 }, { "epoch": 0.52, "grad_norm": 0.26628127694129944, "learning_rate": 0.000931529143185961, "loss": 0.1453, "step": 294 }, { "epoch": 0.52, "grad_norm": 14.981964111328125, "learning_rate": 0.0009310575233719154, "loss": 0.2563, "step": 295 }, { "epoch": 0.52, "grad_norm": 0.6945788264274597, "learning_rate": 0.0009305844051327725, "loss": 0.1229, "step": 296 }, { "epoch": 0.53, "grad_norm": 31.034496307373047, "learning_rate": 0.000930109790113167, "loss": 1.2974, "step": 297 }, { "epoch": 0.53, "grad_norm": 1.5794603824615479, "learning_rate": 0.0009296336799629368, "loss": 0.22, "step": 298 }, { "epoch": 0.53, "grad_norm": 0.33219394087791443, "learning_rate": 0.0009291560763371172, "loss": 0.1262, "step": 299 }, { "epoch": 0.53, "grad_norm": 2.597118377685547, "learning_rate": 0.000928676980895935, "loss": 0.4026, "step": 300 }, { "epoch": 0.53, "grad_norm": 13.547090530395508, "learning_rate": 0.0009281963953048029, "loss": 1.3086, "step": 301 }, { "epoch": 0.53, "grad_norm": 1.289302945137024, "learning_rate": 0.0009277143212343134, "loss": 0.2215, "step": 302 }, { "epoch": 0.54, "grad_norm": 1.2176313400268555, "learning_rate": 0.0009272307603602334, "loss": 0.15, "step": 303 }, { "epoch": 0.54, "grad_norm": 4.436944007873535, "learning_rate": 0.0009267457143634979, "loss": 0.514, "step": 304 }, { "epoch": 0.54, "grad_norm": 29.960241317749023, "learning_rate": 0.0009262591849302047, "loss": 3.5389, "step": 305 }, { "epoch": 0.54, "grad_norm": 5.514049530029297, "learning_rate": 0.0009257711737516082, "loss": 0.2902, "step": 306 }, { "epoch": 0.54, "grad_norm": 2.331019401550293, "learning_rate": 0.0009252816825241135, "loss": 0.2775, "step": 307 }, { "epoch": 0.54, "grad_norm": 0.5708584189414978, "learning_rate": 0.0009247907129492707, "loss": 0.1438, "step": 308 }, { "epoch": 0.55, "grad_norm": 2.16607928276062, "learning_rate": 0.0009242982667337685, "loss": 0.2383, "step": 309 }, { "epoch": 0.55, "grad_norm": 1.5346423387527466, "learning_rate": 0.0009238043455894293, "loss": 0.1793, "step": 310 }, { "epoch": 0.55, "grad_norm": 0.6200052499771118, "learning_rate": 0.000923308951233202, "loss": 0.1473, "step": 311 }, { "epoch": 0.55, "grad_norm": 64.87612915039062, "learning_rate": 0.0009228120853871572, "loss": 0.8875, "step": 312 }, { "epoch": 0.55, "grad_norm": 1.077471137046814, "learning_rate": 0.0009223137497784797, "loss": 0.2114, "step": 313 }, { "epoch": 0.56, "grad_norm": 3.5934722423553467, "learning_rate": 0.0009218139461394644, "loss": 0.2852, "step": 314 }, { "epoch": 0.56, "grad_norm": 0.10276800394058228, "learning_rate": 0.0009213126762075088, "loss": 0.1365, "step": 315 }, { "epoch": 0.56, "grad_norm": 3.9422831535339355, "learning_rate": 0.0009208099417251077, "loss": 0.2949, "step": 316 }, { "epoch": 0.56, "grad_norm": 1.7574914693832397, "learning_rate": 0.0009203057444398468, "loss": 0.2621, "step": 317 }, { "epoch": 0.56, "grad_norm": 0.29479530453681946, "learning_rate": 0.0009198000861043967, "loss": 0.1341, "step": 318 }, { "epoch": 0.56, "grad_norm": 0.5362827181816101, "learning_rate": 0.0009192929684765068, "loss": 0.1398, "step": 319 }, { "epoch": 0.57, "grad_norm": 0.8159481287002563, "learning_rate": 0.0009187843933189994, "loss": 0.1863, "step": 320 }, { "epoch": 0.57, "grad_norm": 0.9413295388221741, "learning_rate": 0.0009182743623997634, "loss": 0.2104, "step": 321 }, { "epoch": 0.57, "grad_norm": 0.5306220650672913, "learning_rate": 0.0009177628774917479, "loss": 0.1537, "step": 322 }, { "epoch": 0.57, "grad_norm": 0.8887706398963928, "learning_rate": 0.0009172499403729567, "loss": 0.1963, "step": 323 }, { "epoch": 0.57, "grad_norm": 0.8467744588851929, "learning_rate": 0.0009167355528264414, "loss": 0.204, "step": 324 }, { "epoch": 0.57, "grad_norm": 0.19867151975631714, "learning_rate": 0.0009162197166402956, "loss": 0.1407, "step": 325 }, { "epoch": 0.58, "grad_norm": 0.13638383150100708, "learning_rate": 0.0009157024336076487, "loss": 0.1408, "step": 326 }, { "epoch": 0.58, "grad_norm": 0.2027496099472046, "learning_rate": 0.0009151837055266594, "loss": 0.1444, "step": 327 }, { "epoch": 0.58, "grad_norm": 0.370151549577713, "learning_rate": 0.0009146635342005098, "loss": 0.158, "step": 328 }, { "epoch": 0.58, "grad_norm": 0.3114052414894104, "learning_rate": 0.000914141921437399, "loss": 0.1464, "step": 329 }, { "epoch": 0.58, "grad_norm": 0.15394961833953857, "learning_rate": 0.0009136188690505362, "loss": 0.1341, "step": 330 }, { "epoch": 0.59, "grad_norm": 0.46498528122901917, "learning_rate": 0.0009130943788581359, "loss": 0.1426, "step": 331 }, { "epoch": 0.59, "grad_norm": 0.28067877888679504, "learning_rate": 0.00091256845268341, "loss": 0.1409, "step": 332 }, { "epoch": 0.59, "grad_norm": 0.061186857521533966, "learning_rate": 0.0009120410923545619, "loss": 0.1401, "step": 333 }, { "epoch": 0.59, "grad_norm": 0.26736098527908325, "learning_rate": 0.0009115122997047811, "loss": 0.1467, "step": 334 }, { "epoch": 0.59, "grad_norm": 0.5139696598052979, "learning_rate": 0.0009109820765722356, "loss": 0.1585, "step": 335 }, { "epoch": 0.59, "grad_norm": 0.40007275342941284, "learning_rate": 0.000910450424800066, "loss": 0.1473, "step": 336 }, { "epoch": 0.6, "grad_norm": 0.66825270652771, "learning_rate": 0.0009099173462363792, "loss": 0.1572, "step": 337 }, { "epoch": 0.6, "grad_norm": 0.5313024520874023, "learning_rate": 0.0009093828427342418, "loss": 0.1555, "step": 338 }, { "epoch": 0.6, "grad_norm": 0.4224655330181122, "learning_rate": 0.0009088469161516735, "loss": 0.1429, "step": 339 }, { "epoch": 0.6, "grad_norm": 0.03462248668074608, "learning_rate": 0.0009083095683516414, "loss": 0.1325, "step": 340 }, { "epoch": 0.6, "grad_norm": 0.542322039604187, "learning_rate": 0.0009077708012020524, "loss": 0.1755, "step": 341 }, { "epoch": 0.61, "grad_norm": 0.2164747267961502, "learning_rate": 0.0009072306165757476, "loss": 0.1458, "step": 342 }, { "epoch": 0.61, "grad_norm": 0.27414461970329285, "learning_rate": 0.0009066890163504955, "loss": 0.1512, "step": 343 }, { "epoch": 0.61, "grad_norm": 0.1911482959985733, "learning_rate": 0.0009061460024089853, "loss": 0.1185, "step": 344 }, { "epoch": 0.61, "grad_norm": 0.1287711262702942, "learning_rate": 0.0009056015766388205, "loss": 0.1372, "step": 345 }, { "epoch": 0.61, "grad_norm": 0.18598809838294983, "learning_rate": 0.0009050557409325125, "loss": 0.1341, "step": 346 }, { "epoch": 0.61, "grad_norm": 0.18694853782653809, "learning_rate": 0.0009045084971874737, "loss": 0.141, "step": 347 }, { "epoch": 0.62, "grad_norm": 0.06479912996292114, "learning_rate": 0.0009039598473060113, "loss": 0.1368, "step": 348 }, { "epoch": 0.62, "grad_norm": 0.17768733203411102, "learning_rate": 0.0009034097931953201, "loss": 0.1381, "step": 349 }, { "epoch": 0.62, "grad_norm": 0.28938984870910645, "learning_rate": 0.0009028583367674765, "loss": 0.1365, "step": 350 }, { "epoch": 0.62, "grad_norm": 0.2924034893512726, "learning_rate": 0.0009023054799394316, "loss": 0.1282, "step": 351 }, { "epoch": 0.62, "grad_norm": 0.28439652919769287, "learning_rate": 0.0009017512246330042, "loss": 0.151, "step": 352 }, { "epoch": 0.62, "grad_norm": 0.14329224824905396, "learning_rate": 0.0009011955727748749, "loss": 0.1419, "step": 353 }, { "epoch": 0.63, "grad_norm": 0.15245947241783142, "learning_rate": 0.0009006385262965785, "loss": 0.1163, "step": 354 }, { "epoch": 0.63, "grad_norm": 0.052399642765522, "learning_rate": 0.000900080087134498, "loss": 0.1241, "step": 355 }, { "epoch": 0.63, "grad_norm": 0.030301153659820557, "learning_rate": 0.0008995202572298575, "loss": 0.1232, "step": 356 }, { "epoch": 0.63, "grad_norm": 0.41738417744636536, "learning_rate": 0.0008989590385287155, "loss": 0.1675, "step": 357 }, { "epoch": 0.63, "grad_norm": 0.19307875633239746, "learning_rate": 0.0008983964329819583, "loss": 0.1328, "step": 358 }, { "epoch": 0.64, "grad_norm": 0.05682377517223358, "learning_rate": 0.000897832442545293, "loss": 0.1322, "step": 359 }, { "epoch": 0.64, "grad_norm": 0.15418089926242828, "learning_rate": 0.0008972670691792409, "loss": 0.1414, "step": 360 }, { "epoch": 0.64, "grad_norm": 0.07167459279298782, "learning_rate": 0.0008967003148491304, "loss": 0.1414, "step": 361 }, { "epoch": 0.64, "grad_norm": 0.2866109609603882, "learning_rate": 0.0008961321815250904, "loss": 0.1381, "step": 362 }, { "epoch": 0.64, "grad_norm": 0.281264990568161, "learning_rate": 0.0008955626711820438, "loss": 0.1365, "step": 363 }, { "epoch": 0.64, "grad_norm": 0.19263768196105957, "learning_rate": 0.0008949917857996997, "loss": 0.1394, "step": 364 }, { "epoch": 0.65, "grad_norm": 0.30531641840934753, "learning_rate": 0.0008944195273625471, "loss": 0.1478, "step": 365 }, { "epoch": 0.65, "grad_norm": 0.16229306161403656, "learning_rate": 0.0008938458978598483, "loss": 0.1412, "step": 366 }, { "epoch": 0.65, "grad_norm": 0.09315463900566101, "learning_rate": 0.0008932708992856315, "loss": 0.1397, "step": 367 }, { "epoch": 0.65, "grad_norm": 0.04228806868195534, "learning_rate": 0.0008926945336386838, "loss": 0.1383, "step": 368 }, { "epoch": 0.65, "grad_norm": 0.2209407389163971, "learning_rate": 0.0008921168029225448, "loss": 0.1434, "step": 369 }, { "epoch": 0.65, "grad_norm": 0.04254443198442459, "learning_rate": 0.0008915377091454992, "loss": 0.1326, "step": 370 }, { "epoch": 0.66, "grad_norm": 0.09651175886392593, "learning_rate": 0.0008909572543205698, "loss": 0.134, "step": 371 }, { "epoch": 0.66, "grad_norm": 0.2821654975414276, "learning_rate": 0.0008903754404655105, "loss": 0.1498, "step": 372 }, { "epoch": 0.66, "grad_norm": 0.43042680621147156, "learning_rate": 0.0008897922696027998, "loss": 0.1571, "step": 373 }, { "epoch": 0.66, "grad_norm": 0.06591568142175674, "learning_rate": 0.0008892077437596332, "loss": 0.1391, "step": 374 }, { "epoch": 0.66, "grad_norm": 0.08771979063749313, "learning_rate": 0.0008886218649679161, "loss": 0.1375, "step": 375 }, { "epoch": 0.67, "grad_norm": 0.03339942544698715, "learning_rate": 0.0008880346352642574, "loss": 0.1368, "step": 376 }, { "epoch": 0.67, "grad_norm": 0.15352453291416168, "learning_rate": 0.0008874460566899616, "loss": 0.1447, "step": 377 }, { "epoch": 0.67, "grad_norm": 0.1778584122657776, "learning_rate": 0.0008868561312910222, "loss": 0.1189, "step": 378 }, { "epoch": 0.67, "grad_norm": 0.11893154680728912, "learning_rate": 0.0008862648611181144, "loss": 0.1167, "step": 379 }, { "epoch": 0.67, "grad_norm": 0.4323861598968506, "learning_rate": 0.0008856722482265886, "loss": 0.1691, "step": 380 }, { "epoch": 0.67, "grad_norm": 0.28813356161117554, "learning_rate": 0.0008850782946764618, "loss": 0.1505, "step": 381 }, { "epoch": 0.68, "grad_norm": 0.5008757710456848, "learning_rate": 0.0008844830025324122, "loss": 0.1671, "step": 382 }, { "epoch": 0.68, "grad_norm": 0.12061876803636551, "learning_rate": 0.0008838863738637705, "loss": 0.1375, "step": 383 }, { "epoch": 0.68, "grad_norm": 0.6747052073478699, "learning_rate": 0.0008832884107445138, "loss": 0.1663, "step": 384 }, { "epoch": 0.68, "grad_norm": 0.18846777081489563, "learning_rate": 0.0008826891152532579, "loss": 0.1148, "step": 385 }, { "epoch": 0.68, "grad_norm": 0.0950111448764801, "learning_rate": 0.0008820884894732497, "loss": 0.1138, "step": 386 }, { "epoch": 0.68, "grad_norm": 0.42371127009391785, "learning_rate": 0.0008814865354923613, "loss": 0.142, "step": 387 }, { "epoch": 0.69, "grad_norm": 0.17662374675273895, "learning_rate": 0.0008808832554030808, "loss": 0.1255, "step": 388 }, { "epoch": 0.69, "grad_norm": 0.7766286134719849, "learning_rate": 0.0008802786513025068, "loss": 0.1613, "step": 389 }, { "epoch": 0.69, "grad_norm": 0.49581214785575867, "learning_rate": 0.0008796727252923403, "loss": 0.1346, "step": 390 }, { "epoch": 0.69, "grad_norm": 0.6148929595947266, "learning_rate": 0.0008790654794788768, "loss": 0.1426, "step": 391 }, { "epoch": 0.69, "grad_norm": 0.15860037505626678, "learning_rate": 0.0008784569159730007, "loss": 0.1382, "step": 392 }, { "epoch": 0.7, "grad_norm": 0.6793199777603149, "learning_rate": 0.0008778470368901761, "loss": 0.1398, "step": 393 }, { "epoch": 0.7, "grad_norm": 0.40314817428588867, "learning_rate": 0.0008772358443504404, "loss": 0.1428, "step": 394 }, { "epoch": 0.7, "grad_norm": 0.6403933167457581, "learning_rate": 0.0008766233404783974, "loss": 0.1556, "step": 395 }, { "epoch": 0.7, "grad_norm": 0.33554157614707947, "learning_rate": 0.0008760095274032083, "loss": 0.1439, "step": 396 }, { "epoch": 0.7, "grad_norm": 0.45690324902534485, "learning_rate": 0.000875394407258586, "loss": 0.1374, "step": 397 }, { "epoch": 0.7, "grad_norm": 0.0541120283305645, "learning_rate": 0.0008747779821827868, "loss": 0.1314, "step": 398 }, { "epoch": 0.71, "grad_norm": 0.6533159613609314, "learning_rate": 0.0008741602543186031, "loss": 0.169, "step": 399 }, { "epoch": 0.71, "grad_norm": 0.4919282793998718, "learning_rate": 0.0008735412258133561, "loss": 0.1569, "step": 400 }, { "epoch": 0.71, "grad_norm": 0.30325594544410706, "learning_rate": 0.0008729208988188881, "loss": 0.1471, "step": 401 }, { "epoch": 0.71, "grad_norm": 0.3497300148010254, "learning_rate": 0.0008722992754915554, "loss": 0.1457, "step": 402 }, { "epoch": 0.71, "grad_norm": 0.22892774641513824, "learning_rate": 0.0008716763579922203, "loss": 0.1334, "step": 403 }, { "epoch": 0.71, "grad_norm": 0.20050272345542908, "learning_rate": 0.0008710521484862439, "loss": 0.1446, "step": 404 }, { "epoch": 0.72, "grad_norm": 0.5029633641242981, "learning_rate": 0.0008704266491434787, "loss": 0.171, "step": 405 }, { "epoch": 0.72, "grad_norm": 0.2720576226711273, "learning_rate": 0.0008697998621382607, "loss": 0.144, "step": 406 }, { "epoch": 0.72, "grad_norm": 0.10961242765188217, "learning_rate": 0.000869171789649402, "loss": 0.1349, "step": 407 }, { "epoch": 0.72, "grad_norm": 0.13584192097187042, "learning_rate": 0.0008685424338601833, "loss": 0.1385, "step": 408 }, { "epoch": 0.72, "grad_norm": 0.6586437821388245, "learning_rate": 0.0008679117969583464, "loss": 0.1459, "step": 409 }, { "epoch": 0.73, "grad_norm": 0.24006032943725586, "learning_rate": 0.0008672798811360864, "loss": 0.1344, "step": 410 }, { "epoch": 0.73, "grad_norm": 0.1859387755393982, "learning_rate": 0.0008666466885900438, "loss": 0.1358, "step": 411 }, { "epoch": 0.73, "grad_norm": 0.5095134973526001, "learning_rate": 0.0008660122215212977, "loss": 0.1387, "step": 412 }, { "epoch": 0.73, "grad_norm": 0.1827729493379593, "learning_rate": 0.0008653764821353573, "loss": 0.1377, "step": 413 }, { "epoch": 0.73, "grad_norm": 0.14332665503025055, "learning_rate": 0.0008647394726421547, "loss": 0.131, "step": 414 }, { "epoch": 0.73, "grad_norm": 0.383101224899292, "learning_rate": 0.0008641011952560371, "loss": 0.146, "step": 415 }, { "epoch": 0.74, "grad_norm": 0.19079791009426117, "learning_rate": 0.000863461652195759, "loss": 0.1255, "step": 416 }, { "epoch": 0.74, "grad_norm": 0.49537310004234314, "learning_rate": 0.0008628208456844747, "loss": 0.1602, "step": 417 }, { "epoch": 0.74, "grad_norm": 0.5658069849014282, "learning_rate": 0.0008621787779497306, "loss": 0.1518, "step": 418 }, { "epoch": 0.74, "grad_norm": 0.2572256326675415, "learning_rate": 0.0008615354512234569, "loss": 0.1369, "step": 419 }, { "epoch": 0.74, "grad_norm": 1.1088945865631104, "learning_rate": 0.0008608908677419605, "loss": 0.1773, "step": 420 }, { "epoch": 0.74, "grad_norm": 0.35405099391937256, "learning_rate": 0.0008602450297459173, "loss": 0.1441, "step": 421 }, { "epoch": 0.75, "grad_norm": 0.39150556921958923, "learning_rate": 0.0008595979394803633, "loss": 0.147, "step": 422 }, { "epoch": 0.75, "grad_norm": 0.07459918409585953, "learning_rate": 0.0008589495991946885, "loss": 0.1338, "step": 423 }, { "epoch": 0.75, "grad_norm": 0.2999761402606964, "learning_rate": 0.0008583000111426276, "loss": 0.1357, "step": 424 }, { "epoch": 0.75, "grad_norm": 0.28417065739631653, "learning_rate": 0.0008576491775822525, "loss": 0.1411, "step": 425 }, { "epoch": 0.75, "grad_norm": 0.32605019211769104, "learning_rate": 0.0008569971007759657, "loss": 0.1329, "step": 426 }, { "epoch": 0.75, "eval_loss": 0.13750587403774261, "eval_runtime": 15.1749, "eval_samples_per_second": 31.433, "eval_steps_per_second": 7.908, "step": 426 }, { "epoch": 0.76, "grad_norm": 0.047430120408535004, "learning_rate": 0.0008563437829904903, "loss": 0.1373, "step": 427 }, { "epoch": 0.76, "grad_norm": 0.4616542160511017, "learning_rate": 0.0008556892264968639, "loss": 0.1534, "step": 428 }, { "epoch": 0.76, "grad_norm": 0.12317585945129395, "learning_rate": 0.0008550334335704297, "loss": 0.1338, "step": 429 }, { "epoch": 0.76, "grad_norm": 0.39604276418685913, "learning_rate": 0.0008543764064908295, "loss": 0.1434, "step": 430 }, { "epoch": 0.76, "grad_norm": 0.3490678369998932, "learning_rate": 0.0008537181475419944, "loss": 0.1365, "step": 431 }, { "epoch": 0.76, "grad_norm": 0.15001270174980164, "learning_rate": 0.0008530586590121383, "loss": 0.1358, "step": 432 }, { "epoch": 0.77, "grad_norm": 0.33340635895729065, "learning_rate": 0.0008523979431937492, "loss": 0.1367, "step": 433 }, { "epoch": 0.77, "grad_norm": 0.06029750779271126, "learning_rate": 0.0008517360023835809, "loss": 0.1366, "step": 434 }, { "epoch": 0.77, "grad_norm": 0.07978738099336624, "learning_rate": 0.0008510728388826463, "loss": 0.1345, "step": 435 }, { "epoch": 0.77, "grad_norm": 0.27599036693573, "learning_rate": 0.0008504084549962079, "loss": 0.1447, "step": 436 }, { "epoch": 0.77, "grad_norm": 0.13302059471607208, "learning_rate": 0.0008497428530337706, "loss": 0.1407, "step": 437 }, { "epoch": 0.77, "grad_norm": 0.20869582891464233, "learning_rate": 0.0008490760353090737, "loss": 0.1374, "step": 438 }, { "epoch": 0.78, "grad_norm": 0.10881117731332779, "learning_rate": 0.0008484080041400825, "loss": 0.1429, "step": 439 }, { "epoch": 0.78, "grad_norm": 0.20344361662864685, "learning_rate": 0.0008477387618489807, "loss": 0.139, "step": 440 }, { "epoch": 0.78, "grad_norm": 0.07153432071208954, "learning_rate": 0.0008470683107621615, "loss": 0.1315, "step": 441 }, { "epoch": 0.78, "grad_norm": 0.08688751608133316, "learning_rate": 0.0008463966532102207, "loss": 0.1346, "step": 442 }, { "epoch": 0.78, "grad_norm": 0.06495650112628937, "learning_rate": 0.0008457237915279476, "loss": 0.1307, "step": 443 }, { "epoch": 0.79, "grad_norm": 0.1892390102148056, "learning_rate": 0.0008450497280543173, "loss": 0.12, "step": 444 }, { "epoch": 0.79, "grad_norm": 0.2579623758792877, "learning_rate": 0.0008443744651324827, "loss": 0.1531, "step": 445 }, { "epoch": 0.79, "grad_norm": 0.149379700422287, "learning_rate": 0.000843698005109766, "loss": 0.1385, "step": 446 }, { "epoch": 0.79, "grad_norm": 0.19281132519245148, "learning_rate": 0.0008430203503376506, "loss": 0.1033, "step": 447 }, { "epoch": 0.79, "grad_norm": 0.33208444714546204, "learning_rate": 0.0008423415031717733, "loss": 0.1525, "step": 448 }, { "epoch": 0.79, "grad_norm": 0.15149784088134766, "learning_rate": 0.0008416614659719157, "loss": 0.1282, "step": 449 }, { "epoch": 0.8, "grad_norm": 0.24646438658237457, "learning_rate": 0.0008409802411019962, "loss": 0.1393, "step": 450 }, { "epoch": 0.8, "grad_norm": 0.2505553662776947, "learning_rate": 0.000840297830930062, "loss": 0.1453, "step": 451 }, { "epoch": 0.8, "grad_norm": 0.1632508784532547, "learning_rate": 0.0008396142378282799, "loss": 0.1274, "step": 452 }, { "epoch": 0.8, "grad_norm": 0.12370573729276657, "learning_rate": 0.0008389294641729292, "loss": 0.1201, "step": 453 }, { "epoch": 0.8, "grad_norm": 0.08046772330999374, "learning_rate": 0.0008382435123443934, "loss": 0.1263, "step": 454 }, { "epoch": 0.8, "grad_norm": 0.19015488028526306, "learning_rate": 0.0008375563847271506, "loss": 0.1318, "step": 455 }, { "epoch": 0.81, "grad_norm": 0.3562954366207123, "learning_rate": 0.0008368680837097669, "loss": 0.132, "step": 456 }, { "epoch": 0.81, "grad_norm": 0.06315189599990845, "learning_rate": 0.000836178611684887, "loss": 0.1113, "step": 457 }, { "epoch": 0.81, "grad_norm": 0.43667125701904297, "learning_rate": 0.0008354879710492264, "loss": 0.1908, "step": 458 }, { "epoch": 0.81, "grad_norm": 0.0708879753947258, "learning_rate": 0.0008347961642035624, "loss": 0.1399, "step": 459 }, { "epoch": 0.81, "grad_norm": 0.04855835437774658, "learning_rate": 0.0008341031935527267, "loss": 0.1258, "step": 460 }, { "epoch": 0.82, "grad_norm": 0.1364990919828415, "learning_rate": 0.0008334090615055965, "loss": 0.1344, "step": 461 }, { "epoch": 0.82, "grad_norm": 0.08166524022817612, "learning_rate": 0.0008327137704750862, "loss": 0.134, "step": 462 }, { "epoch": 0.82, "grad_norm": 0.09308458864688873, "learning_rate": 0.0008320173228781389, "loss": 0.1507, "step": 463 }, { "epoch": 0.82, "grad_norm": 0.07796576619148254, "learning_rate": 0.000831319721135718, "loss": 0.1284, "step": 464 }, { "epoch": 0.82, "grad_norm": 0.12168626487255096, "learning_rate": 0.0008306209676727993, "loss": 0.148, "step": 465 }, { "epoch": 0.82, "grad_norm": 0.18862847983837128, "learning_rate": 0.000829921064918362, "loss": 0.1229, "step": 466 }, { "epoch": 0.83, "grad_norm": 0.23615515232086182, "learning_rate": 0.00082922001530538, "loss": 0.1322, "step": 467 }, { "epoch": 0.83, "grad_norm": 0.34108766913414, "learning_rate": 0.0008285178212708142, "loss": 0.1338, "step": 468 }, { "epoch": 0.83, "grad_norm": 0.39579400420188904, "learning_rate": 0.0008278144852556042, "loss": 0.1341, "step": 469 }, { "epoch": 0.83, "grad_norm": 0.2620592713356018, "learning_rate": 0.0008271100097046585, "loss": 0.1395, "step": 470 }, { "epoch": 0.83, "grad_norm": 0.08778171986341476, "learning_rate": 0.0008264043970668469, "loss": 0.1328, "step": 471 }, { "epoch": 0.84, "grad_norm": 0.6086364388465881, "learning_rate": 0.0008256976497949924, "loss": 0.1271, "step": 472 }, { "epoch": 0.84, "grad_norm": 0.08982394635677338, "learning_rate": 0.0008249897703458619, "loss": 0.1346, "step": 473 }, { "epoch": 0.84, "grad_norm": 0.054080091416835785, "learning_rate": 0.0008242807611801578, "loss": 0.1218, "step": 474 }, { "epoch": 0.84, "grad_norm": 0.5981457829475403, "learning_rate": 0.0008235706247625098, "loss": 0.1715, "step": 475 }, { "epoch": 0.84, "grad_norm": 0.9139420986175537, "learning_rate": 0.0008228593635614659, "loss": 0.1983, "step": 476 }, { "epoch": 0.84, "grad_norm": 0.05938498303294182, "learning_rate": 0.0008221469800494841, "loss": 0.1308, "step": 477 }, { "epoch": 0.85, "grad_norm": 0.11526026576757431, "learning_rate": 0.0008214334767029239, "loss": 0.1422, "step": 478 }, { "epoch": 0.85, "grad_norm": 0.3049907386302948, "learning_rate": 0.0008207188560020373, "loss": 0.1419, "step": 479 }, { "epoch": 0.85, "grad_norm": 0.04782035946846008, "learning_rate": 0.0008200031204309604, "loss": 0.138, "step": 480 }, { "epoch": 0.85, "grad_norm": 0.12950918078422546, "learning_rate": 0.000819286272477705, "loss": 0.1315, "step": 481 }, { "epoch": 0.85, "grad_norm": 0.0429329015314579, "learning_rate": 0.0008185683146341496, "loss": 0.1354, "step": 482 }, { "epoch": 0.85, "grad_norm": 0.4792588949203491, "learning_rate": 0.0008178492493960308, "loss": 0.1476, "step": 483 }, { "epoch": 0.86, "grad_norm": 0.19784927368164062, "learning_rate": 0.0008171290792629346, "loss": 0.1394, "step": 484 }, { "epoch": 0.86, "grad_norm": 0.1172945499420166, "learning_rate": 0.000816407806738288, "loss": 0.1302, "step": 485 }, { "epoch": 0.86, "grad_norm": 0.3732689917087555, "learning_rate": 0.0008156854343293501, "loss": 0.1416, "step": 486 }, { "epoch": 0.86, "grad_norm": 0.5152392983436584, "learning_rate": 0.0008149619645472031, "loss": 0.1403, "step": 487 }, { "epoch": 0.86, "grad_norm": 0.15429601073265076, "learning_rate": 0.000814237399906744, "loss": 0.1322, "step": 488 }, { "epoch": 0.87, "grad_norm": 1.0002127885818481, "learning_rate": 0.0008135117429266756, "loss": 0.1303, "step": 489 }, { "epoch": 0.87, "grad_norm": 0.7232715487480164, "learning_rate": 0.0008127849961294984, "loss": 0.143, "step": 490 }, { "epoch": 0.87, "grad_norm": 0.13510456681251526, "learning_rate": 0.0008120571620415006, "loss": 0.1536, "step": 491 }, { "epoch": 0.87, "grad_norm": 0.5168789625167847, "learning_rate": 0.0008113282431927503, "loss": 0.1312, "step": 492 }, { "epoch": 0.87, "grad_norm": 0.7039850950241089, "learning_rate": 0.000810598242117086, "loss": 0.118, "step": 493 }, { "epoch": 0.87, "grad_norm": 1.5126641988754272, "learning_rate": 0.0008098671613521089, "loss": 0.2343, "step": 494 }, { "epoch": 0.88, "grad_norm": 0.6958308815956116, "learning_rate": 0.0008091350034391731, "loss": 0.1648, "step": 495 }, { "epoch": 0.88, "grad_norm": 6.979303359985352, "learning_rate": 0.0008084017709233766, "loss": 0.2261, "step": 496 }, { "epoch": 0.88, "grad_norm": 0.3389752507209778, "learning_rate": 0.0008076674663535537, "loss": 0.146, "step": 497 }, { "epoch": 0.88, "grad_norm": 0.19990071654319763, "learning_rate": 0.0008069320922822643, "loss": 0.1429, "step": 498 }, { "epoch": 0.88, "grad_norm": 0.33689868450164795, "learning_rate": 0.0008061956512657871, "loss": 0.147, "step": 499 }, { "epoch": 0.88, "grad_norm": 0.09925112873315811, "learning_rate": 0.000805458145864109, "loss": 0.1342, "step": 500 }, { "epoch": 0.89, "grad_norm": 1.961702585220337, "learning_rate": 0.0008047195786409172, "loss": 0.1361, "step": 501 }, { "epoch": 0.89, "grad_norm": 0.4342229962348938, "learning_rate": 0.0008039799521635895, "loss": 0.1485, "step": 502 }, { "epoch": 0.89, "grad_norm": 0.1798858642578125, "learning_rate": 0.0008032392690031867, "loss": 0.1314, "step": 503 }, { "epoch": 0.89, "grad_norm": 1.3653756380081177, "learning_rate": 0.0008024975317344421, "loss": 0.1388, "step": 504 }, { "epoch": 0.89, "grad_norm": 9.677605628967285, "learning_rate": 0.0008017547429357531, "loss": 0.4186, "step": 505 }, { "epoch": 0.9, "grad_norm": 8.348475456237793, "learning_rate": 0.0008010109051891731, "loss": 0.3806, "step": 506 }, { "epoch": 0.9, "grad_norm": 35.19770050048828, "learning_rate": 0.0008002660210804011, "loss": 3.6145, "step": 507 }, { "epoch": 0.9, "grad_norm": 9.18663501739502, "learning_rate": 0.0007995200931987743, "loss": 0.6162, "step": 508 }, { "epoch": 0.9, "grad_norm": 0.05997322499752045, "learning_rate": 0.0007987731241372571, "loss": 0.1129, "step": 509 }, { "epoch": 0.9, "grad_norm": 0.41408172249794006, "learning_rate": 0.000798025116492434, "loss": 0.1512, "step": 510 }, { "epoch": 0.9, "grad_norm": 0.4445393979549408, "learning_rate": 0.0007972760728644996, "loss": 0.1463, "step": 511 }, { "epoch": 0.91, "grad_norm": 0.19678063690662384, "learning_rate": 0.0007965259958572495, "loss": 0.1386, "step": 512 }, { "epoch": 0.91, "grad_norm": 0.45497119426727295, "learning_rate": 0.0007957748880780721, "loss": 0.1373, "step": 513 }, { "epoch": 0.91, "grad_norm": 0.6455509066581726, "learning_rate": 0.0007950227521379381, "loss": 0.1584, "step": 514 }, { "epoch": 0.91, "grad_norm": 0.3793765604496002, "learning_rate": 0.0007942695906513929, "loss": 0.1236, "step": 515 }, { "epoch": 0.91, "grad_norm": 0.20562775433063507, "learning_rate": 0.0007935154062365467, "loss": 0.1364, "step": 516 }, { "epoch": 0.91, "grad_norm": 1.3131325244903564, "learning_rate": 0.0007927602015150655, "loss": 0.1556, "step": 517 }, { "epoch": 0.92, "grad_norm": 0.1705670803785324, "learning_rate": 0.0007920039791121617, "loss": 0.1372, "step": 518 }, { "epoch": 0.92, "grad_norm": 6.6207499504089355, "learning_rate": 0.0007912467416565861, "loss": 0.22, "step": 519 }, { "epoch": 0.92, "grad_norm": 0.34343230724334717, "learning_rate": 0.0007904884917806173, "loss": 0.1453, "step": 520 }, { "epoch": 0.92, "grad_norm": 0.4290754497051239, "learning_rate": 0.0007897292321200537, "loss": 0.1177, "step": 521 }, { "epoch": 0.92, "grad_norm": 0.24469922482967377, "learning_rate": 0.0007889689653142036, "loss": 0.1369, "step": 522 }, { "epoch": 0.93, "grad_norm": 0.5307168960571289, "learning_rate": 0.0007882076940058763, "loss": 0.1542, "step": 523 }, { "epoch": 0.93, "grad_norm": 0.13802866637706757, "learning_rate": 0.000787445420841373, "loss": 0.1372, "step": 524 }, { "epoch": 0.93, "grad_norm": 0.36055922508239746, "learning_rate": 0.0007866821484704776, "loss": 0.1413, "step": 525 }, { "epoch": 0.93, "grad_norm": 0.36655113101005554, "learning_rate": 0.0007859178795464472, "loss": 0.1438, "step": 526 }, { "epoch": 0.93, "grad_norm": 0.6237390637397766, "learning_rate": 0.0007851526167260034, "loss": 0.1382, "step": 527 }, { "epoch": 0.93, "grad_norm": 0.42217007279396057, "learning_rate": 0.0007843863626693221, "loss": 0.1408, "step": 528 }, { "epoch": 0.94, "grad_norm": 24.023250579833984, "learning_rate": 0.0007836191200400256, "loss": 0.1517, "step": 529 }, { "epoch": 0.94, "grad_norm": 0.31599146127700806, "learning_rate": 0.0007828508915051723, "loss": 0.1353, "step": 530 }, { "epoch": 0.94, "grad_norm": 0.6795622706413269, "learning_rate": 0.0007820816797352479, "loss": 0.1515, "step": 531 }, { "epoch": 0.94, "grad_norm": 0.37493640184402466, "learning_rate": 0.0007813114874041557, "loss": 0.141, "step": 532 }, { "epoch": 0.94, "grad_norm": 1.7365546226501465, "learning_rate": 0.0007805403171892079, "loss": 0.1347, "step": 533 }, { "epoch": 0.94, "grad_norm": 18.393390655517578, "learning_rate": 0.000779768171771116, "loss": 0.1753, "step": 534 }, { "epoch": 0.95, "grad_norm": 2.2978413105010986, "learning_rate": 0.0007789950538339812, "loss": 0.1418, "step": 535 }, { "epoch": 0.95, "grad_norm": 0.495151162147522, "learning_rate": 0.0007782209660652854, "loss": 0.146, "step": 536 }, { "epoch": 0.95, "grad_norm": 7.705572605133057, "learning_rate": 0.0007774459111558821, "loss": 0.2042, "step": 537 }, { "epoch": 0.95, "grad_norm": 0.6036086678504944, "learning_rate": 0.0007766698917999862, "loss": 0.1695, "step": 538 }, { "epoch": 0.95, "grad_norm": 127.21215057373047, "learning_rate": 0.0007758929106951656, "loss": 18.5136, "step": 539 }, { "epoch": 0.96, "grad_norm": 40.58448791503906, "learning_rate": 0.0007751149705423312, "loss": 0.5973, "step": 540 }, { "epoch": 0.96, "grad_norm": 0.6296218633651733, "learning_rate": 0.0007743360740457278, "loss": 0.1849, "step": 541 }, { "epoch": 0.96, "grad_norm": 0.4533160924911499, "learning_rate": 0.0007735562239129247, "loss": 0.1464, "step": 542 }, { "epoch": 0.96, "grad_norm": 0.2379036247730255, "learning_rate": 0.0007727754228548058, "loss": 0.1267, "step": 543 }, { "epoch": 0.96, "grad_norm": 0.8904889225959778, "learning_rate": 0.000771993673585561, "loss": 0.2181, "step": 544 }, { "epoch": 0.96, "grad_norm": 0.8934443593025208, "learning_rate": 0.0007712109788226762, "loss": 0.2158, "step": 545 }, { "epoch": 0.97, "grad_norm": 0.3368353545665741, "learning_rate": 0.0007704273412869238, "loss": 0.1489, "step": 546 }, { "epoch": 0.97, "grad_norm": 0.2570180594921112, "learning_rate": 0.0007696427637023537, "loss": 0.144, "step": 547 }, { "epoch": 0.97, "grad_norm": 2.865034580230713, "learning_rate": 0.0007688572487962834, "loss": 0.1664, "step": 548 }, { "epoch": 0.97, "grad_norm": 0.4369525611400604, "learning_rate": 0.0007680707992992888, "loss": 0.1777, "step": 549 }, { "epoch": 0.97, "grad_norm": 0.2545509934425354, "learning_rate": 0.0007672834179451942, "loss": 0.1536, "step": 550 }, { "epoch": 0.97, "grad_norm": 0.14455465972423553, "learning_rate": 0.0007664951074710638, "loss": 0.1256, "step": 551 }, { "epoch": 0.98, "grad_norm": 0.16001886129379272, "learning_rate": 0.0007657058706171911, "loss": 0.1356, "step": 552 }, { "epoch": 0.98, "grad_norm": 0.2537885308265686, "learning_rate": 0.0007649157101270903, "loss": 0.1393, "step": 553 }, { "epoch": 0.98, "grad_norm": 0.33060047030448914, "learning_rate": 0.0007641246287474854, "loss": 0.148, "step": 554 }, { "epoch": 0.98, "grad_norm": 1.691941499710083, "learning_rate": 0.0007633326292283028, "loss": 0.1764, "step": 555 }, { "epoch": 0.98, "grad_norm": 0.20472805202007294, "learning_rate": 0.0007625397143226595, "loss": 0.1424, "step": 556 }, { "epoch": 0.99, "grad_norm": 0.7124485969543457, "learning_rate": 0.0007617458867868553, "loss": 0.1482, "step": 557 }, { "epoch": 0.99, "grad_norm": 0.09631184488534927, "learning_rate": 0.0007609511493803615, "loss": 0.1392, "step": 558 }, { "epoch": 0.99, "grad_norm": 0.20814809203147888, "learning_rate": 0.0007601555048658133, "loss": 0.1384, "step": 559 }, { "epoch": 0.99, "grad_norm": 0.19566737115383148, "learning_rate": 0.0007593589560089984, "loss": 0.1394, "step": 560 }, { "epoch": 0.99, "grad_norm": 0.13406091928482056, "learning_rate": 0.0007585615055788484, "loss": 0.1389, "step": 561 }, { "epoch": 0.99, "grad_norm": 0.07635807991027832, "learning_rate": 0.0007577631563474291, "loss": 0.1376, "step": 562 }, { "epoch": 1.0, "grad_norm": 0.11265091598033905, "learning_rate": 0.0007569639110899302, "loss": 0.1395, "step": 563 }, { "epoch": 1.0, "grad_norm": 0.31152746081352234, "learning_rate": 0.0007561637725846567, "loss": 0.1407, "step": 564 }, { "epoch": 1.0, "grad_norm": 0.13474373519420624, "learning_rate": 0.0007553627436130183, "loss": 0.1386, "step": 565 }, { "epoch": 1.0, "grad_norm": 0.23706336319446564, "learning_rate": 0.0007545608269595201, "loss": 0.1417, "step": 566 }, { "epoch": 1.0, "grad_norm": 0.30558836460113525, "learning_rate": 0.0007537580254117531, "loss": 0.1452, "step": 567 }, { "epoch": 1.0, "grad_norm": 0.106146439909935, "learning_rate": 0.0007529543417603843, "loss": 0.1372, "step": 568 }, { "epoch": 1.0, "eval_loss": 0.13968442380428314, "eval_runtime": 15.1558, "eval_samples_per_second": 31.473, "eval_steps_per_second": 7.918, "step": 568 }, { "epoch": 1.01, "grad_norm": 0.3243511915206909, "learning_rate": 0.0007521497787991472, "loss": 0.1424, "step": 569 }, { "epoch": 1.01, "grad_norm": 0.19688986241817474, "learning_rate": 0.0007513443393248312, "loss": 0.1403, "step": 570 }, { "epoch": 1.01, "grad_norm": 0.1128445565700531, "learning_rate": 0.0007505380261372734, "loss": 0.1397, "step": 571 }, { "epoch": 1.01, "grad_norm": 0.11025507003068924, "learning_rate": 0.0007497308420393477, "loss": 0.1391, "step": 572 }, { "epoch": 1.01, "grad_norm": 0.19862700998783112, "learning_rate": 0.0007489227898369558, "loss": 0.1345, "step": 573 }, { "epoch": 1.02, "grad_norm": 0.11129032075405121, "learning_rate": 0.0007481138723390164, "loss": 0.1342, "step": 574 }, { "epoch": 1.02, "grad_norm": 0.21451863646507263, "learning_rate": 0.0007473040923574567, "loss": 0.132, "step": 575 }, { "epoch": 1.02, "grad_norm": 0.6781334280967712, "learning_rate": 0.0007464934527072016, "loss": 0.1688, "step": 576 }, { "epoch": 1.02, "grad_norm": 0.3881673812866211, "learning_rate": 0.0007456819562061648, "loss": 0.15, "step": 577 }, { "epoch": 1.02, "grad_norm": 0.0530267171561718, "learning_rate": 0.0007448696056752383, "loss": 0.139, "step": 578 }, { "epoch": 1.02, "grad_norm": 0.2782767415046692, "learning_rate": 0.0007440564039382827, "loss": 0.1334, "step": 579 }, { "epoch": 1.03, "grad_norm": 0.693821370601654, "learning_rate": 0.0007432423538221178, "loss": 0.1673, "step": 580 }, { "epoch": 1.03, "grad_norm": 0.27020275592803955, "learning_rate": 0.0007424274581565122, "loss": 0.1464, "step": 581 }, { "epoch": 1.03, "grad_norm": 0.42129820585250854, "learning_rate": 0.0007416117197741742, "loss": 0.1507, "step": 582 }, { "epoch": 1.03, "grad_norm": 0.21161474287509918, "learning_rate": 0.0007407951415107412, "loss": 0.1398, "step": 583 }, { "epoch": 1.03, "grad_norm": 0.13954728841781616, "learning_rate": 0.00073997772620477, "loss": 0.1387, "step": 584 }, { "epoch": 1.03, "grad_norm": 0.08771730959415436, "learning_rate": 0.0007391594766977276, "loss": 0.1419, "step": 585 }, { "epoch": 1.04, "grad_norm": 0.2630119025707245, "learning_rate": 0.0007383403958339806, "loss": 0.1483, "step": 586 }, { "epoch": 1.04, "grad_norm": 0.07496945559978485, "learning_rate": 0.0007375204864607851, "loss": 0.1339, "step": 587 }, { "epoch": 1.04, "grad_norm": 0.25115033984184265, "learning_rate": 0.0007366997514282782, "loss": 0.129, "step": 588 }, { "epoch": 1.04, "grad_norm": 0.24599210917949677, "learning_rate": 0.0007358781935894659, "loss": 0.1491, "step": 589 }, { "epoch": 1.04, "grad_norm": 0.14762777090072632, "learning_rate": 0.0007350558158002153, "loss": 0.1287, "step": 590 }, { "epoch": 1.05, "grad_norm": 0.03114377148449421, "learning_rate": 0.0007342326209192435, "loss": 0.1379, "step": 591 }, { "epoch": 1.05, "grad_norm": 0.5076407194137573, "learning_rate": 0.000733408611808108, "loss": 0.1122, "step": 592 }, { "epoch": 1.05, "grad_norm": 0.10492309182882309, "learning_rate": 0.0007325837913311966, "loss": 0.1284, "step": 593 }, { "epoch": 1.05, "grad_norm": 0.1740669459104538, "learning_rate": 0.0007317581623557177, "loss": 0.1458, "step": 594 }, { "epoch": 1.05, "grad_norm": 0.20419681072235107, "learning_rate": 0.00073093172775169, "loss": 0.1511, "step": 595 }, { "epoch": 1.05, "grad_norm": 0.1906755119562149, "learning_rate": 0.0007301044903919325, "loss": 0.1052, "step": 596 }, { "epoch": 1.06, "grad_norm": 0.10201478004455566, "learning_rate": 0.0007292764531520552, "loss": 0.1147, "step": 597 }, { "epoch": 1.06, "grad_norm": 0.4594266712665558, "learning_rate": 0.0007284476189104485, "loss": 0.1739, "step": 598 }, { "epoch": 1.06, "grad_norm": 0.1697234809398651, "learning_rate": 0.0007276179905482729, "loss": 0.1049, "step": 599 }, { "epoch": 1.06, "grad_norm": 0.09107261896133423, "learning_rate": 0.0007267875709494499, "loss": 0.1319, "step": 600 }, { "epoch": 1.06, "grad_norm": 0.07888934016227722, "learning_rate": 0.0007259563630006512, "loss": 0.1323, "step": 601 }, { "epoch": 1.07, "grad_norm": 0.06149132549762726, "learning_rate": 0.0007251243695912886, "loss": 0.1239, "step": 602 }, { "epoch": 1.07, "grad_norm": 0.3240460157394409, "learning_rate": 0.0007242915936135052, "loss": 0.1614, "step": 603 }, { "epoch": 1.07, "grad_norm": 0.04239710047841072, "learning_rate": 0.0007234580379621635, "loss": 0.1336, "step": 604 }, { "epoch": 1.07, "grad_norm": 0.04415787383913994, "learning_rate": 0.000722623705534837, "loss": 0.1336, "step": 605 }, { "epoch": 1.07, "grad_norm": 0.13500471413135529, "learning_rate": 0.0007217885992317985, "loss": 0.1429, "step": 606 }, { "epoch": 1.07, "grad_norm": 0.09405327588319778, "learning_rate": 0.0007209527219560119, "loss": 0.1399, "step": 607 }, { "epoch": 1.08, "grad_norm": 0.16369308531284332, "learning_rate": 0.0007201160766131207, "loss": 0.1389, "step": 608 }, { "epoch": 1.08, "grad_norm": 0.24509336054325104, "learning_rate": 0.0007192786661114383, "loss": 0.1376, "step": 609 }, { "epoch": 1.08, "grad_norm": 0.29961100220680237, "learning_rate": 0.0007184404933619377, "loss": 0.133, "step": 610 }, { "epoch": 1.08, "grad_norm": 0.4641360640525818, "learning_rate": 0.0007176015612782421, "loss": 0.1491, "step": 611 }, { "epoch": 1.08, "grad_norm": 0.059663355350494385, "learning_rate": 0.0007167618727766138, "loss": 0.1365, "step": 612 }, { "epoch": 1.08, "grad_norm": 0.16221192479133606, "learning_rate": 0.0007159214307759448, "loss": 0.1395, "step": 613 }, { "epoch": 1.09, "grad_norm": 0.04930780455470085, "learning_rate": 0.0007150802381977463, "loss": 0.1368, "step": 614 }, { "epoch": 1.09, "grad_norm": 0.6152715086936951, "learning_rate": 0.0007142382979661386, "loss": 0.1291, "step": 615 }, { "epoch": 1.09, "grad_norm": 0.15247471630573273, "learning_rate": 0.0007133956130078411, "loss": 0.1404, "step": 616 }, { "epoch": 1.09, "grad_norm": 0.7167736887931824, "learning_rate": 0.000712552186252162, "loss": 0.1642, "step": 617 }, { "epoch": 1.09, "grad_norm": 0.2419363558292389, "learning_rate": 0.0007117080206309878, "loss": 0.1317, "step": 618 }, { "epoch": 1.1, "grad_norm": 0.5636677742004395, "learning_rate": 0.0007108631190787735, "loss": 0.147, "step": 619 }, { "epoch": 1.1, "grad_norm": 0.26012521982192993, "learning_rate": 0.0007100174845325327, "loss": 0.1344, "step": 620 }, { "epoch": 1.1, "grad_norm": 0.14845141768455505, "learning_rate": 0.0007091711199318265, "loss": 0.1299, "step": 621 }, { "epoch": 1.1, "grad_norm": 0.1344316601753235, "learning_rate": 0.0007083240282187542, "loss": 0.1401, "step": 622 }, { "epoch": 1.1, "grad_norm": 0.08974921703338623, "learning_rate": 0.0007074762123379423, "loss": 0.1257, "step": 623 }, { "epoch": 1.1, "grad_norm": 0.3263636529445648, "learning_rate": 0.0007066276752365351, "loss": 0.1587, "step": 624 }, { "epoch": 1.11, "grad_norm": 0.12418147176504135, "learning_rate": 0.0007057784198641835, "loss": 0.1361, "step": 625 }, { "epoch": 1.11, "grad_norm": 0.3086402714252472, "learning_rate": 0.0007049284491730353, "loss": 0.1496, "step": 626 }, { "epoch": 1.11, "grad_norm": 0.3529713749885559, "learning_rate": 0.000704077766117725, "loss": 0.1523, "step": 627 }, { "epoch": 1.11, "grad_norm": 0.11717434972524643, "learning_rate": 0.0007032263736553634, "loss": 0.1402, "step": 628 }, { "epoch": 1.11, "grad_norm": 0.19153334200382233, "learning_rate": 0.0007023742747455275, "loss": 0.1407, "step": 629 }, { "epoch": 1.11, "grad_norm": 0.22798961400985718, "learning_rate": 0.0007015214723502495, "loss": 0.1262, "step": 630 }, { "epoch": 1.12, "grad_norm": 0.4415830373764038, "learning_rate": 0.0007006679694340073, "loss": 0.1494, "step": 631 }, { "epoch": 1.12, "grad_norm": 0.554614782333374, "learning_rate": 0.0006998137689637142, "loss": 0.1324, "step": 632 }, { "epoch": 1.12, "grad_norm": 0.32604262232780457, "learning_rate": 0.0006989588739087078, "loss": 0.1433, "step": 633 }, { "epoch": 1.12, "grad_norm": 0.5145484209060669, "learning_rate": 0.0006981032872407406, "loss": 0.152, "step": 634 }, { "epoch": 1.12, "grad_norm": 0.3538295030593872, "learning_rate": 0.0006972470119339691, "loss": 0.137, "step": 635 }, { "epoch": 1.13, "grad_norm": 0.2451559156179428, "learning_rate": 0.0006963900509649435, "loss": 0.1418, "step": 636 }, { "epoch": 1.13, "grad_norm": 0.2240092009305954, "learning_rate": 0.0006955324073125978, "loss": 0.1406, "step": 637 }, { "epoch": 1.13, "grad_norm": 0.5672935843467712, "learning_rate": 0.0006946740839582387, "loss": 0.1682, "step": 638 }, { "epoch": 1.13, "grad_norm": 0.3396548628807068, "learning_rate": 0.000693815083885536, "loss": 0.1538, "step": 639 }, { "epoch": 1.13, "grad_norm": 0.2464788407087326, "learning_rate": 0.0006929554100805117, "loss": 0.145, "step": 640 }, { "epoch": 1.13, "grad_norm": 0.08380208164453506, "learning_rate": 0.0006920950655315297, "loss": 0.1333, "step": 641 }, { "epoch": 1.14, "grad_norm": 0.04563472419977188, "learning_rate": 0.000691234053229286, "loss": 0.1371, "step": 642 }, { "epoch": 1.14, "grad_norm": 0.0336502380669117, "learning_rate": 0.0006903723761667972, "loss": 0.1383, "step": 643 }, { "epoch": 1.14, "grad_norm": 0.11504160612821579, "learning_rate": 0.0006895100373393912, "loss": 0.1366, "step": 644 }, { "epoch": 1.14, "grad_norm": 0.4302406311035156, "learning_rate": 0.0006886470397446957, "loss": 0.1464, "step": 645 }, { "epoch": 1.14, "grad_norm": 0.13670873641967773, "learning_rate": 0.0006877833863826295, "loss": 0.1399, "step": 646 }, { "epoch": 1.14, "grad_norm": 0.11441440135240555, "learning_rate": 0.0006869190802553894, "loss": 0.1389, "step": 647 }, { "epoch": 1.15, "grad_norm": 0.07245034724473953, "learning_rate": 0.0006860541243674426, "loss": 0.1376, "step": 648 }, { "epoch": 1.15, "grad_norm": 0.12628068029880524, "learning_rate": 0.0006851885217255144, "loss": 0.1314, "step": 649 }, { "epoch": 1.15, "grad_norm": 0.345865935087204, "learning_rate": 0.0006843222753385784, "loss": 0.1469, "step": 650 }, { "epoch": 1.15, "grad_norm": 0.18721798062324524, "learning_rate": 0.0006834553882178463, "loss": 0.129, "step": 651 }, { "epoch": 1.15, "grad_norm": 0.1566080003976822, "learning_rate": 0.0006825878633767564, "loss": 0.1296, "step": 652 }, { "epoch": 1.16, "grad_norm": 0.13990430533885956, "learning_rate": 0.0006817197038309643, "loss": 0.1245, "step": 653 }, { "epoch": 1.16, "grad_norm": 0.26073744893074036, "learning_rate": 0.000680850912598332, "loss": 0.1437, "step": 654 }, { "epoch": 1.16, "grad_norm": 0.05034814029932022, "learning_rate": 0.0006799814926989171, "loss": 0.1209, "step": 655 }, { "epoch": 1.16, "grad_norm": 0.29498428106307983, "learning_rate": 0.0006791114471549626, "loss": 0.1476, "step": 656 }, { "epoch": 1.16, "grad_norm": 0.24109311401844025, "learning_rate": 0.0006782407789908863, "loss": 0.1421, "step": 657 }, { "epoch": 1.16, "grad_norm": 0.2070060819387436, "learning_rate": 0.0006773694912332707, "loss": 0.1174, "step": 658 }, { "epoch": 1.17, "grad_norm": 0.05099210515618324, "learning_rate": 0.0006764975869108514, "loss": 0.1325, "step": 659 }, { "epoch": 1.17, "grad_norm": 0.03778371214866638, "learning_rate": 0.0006756250690545078, "loss": 0.1326, "step": 660 }, { "epoch": 1.17, "grad_norm": 0.23074184358119965, "learning_rate": 0.0006747519406972524, "loss": 0.1417, "step": 661 }, { "epoch": 1.17, "grad_norm": 0.162948340177536, "learning_rate": 0.0006738782048742187, "loss": 0.1422, "step": 662 }, { "epoch": 1.17, "grad_norm": 0.1257455050945282, "learning_rate": 0.0006730038646226531, "loss": 0.1352, "step": 663 }, { "epoch": 1.17, "grad_norm": 0.1732119917869568, "learning_rate": 0.0006721289229819024, "loss": 0.1313, "step": 664 }, { "epoch": 1.18, "grad_norm": 0.15348908305168152, "learning_rate": 0.0006712533829934043, "loss": 0.139, "step": 665 }, { "epoch": 1.18, "grad_norm": 0.06923094391822815, "learning_rate": 0.0006703772477006757, "loss": 0.1381, "step": 666 }, { "epoch": 1.18, "grad_norm": 0.307449609041214, "learning_rate": 0.0006695005201493037, "loss": 0.1365, "step": 667 }, { "epoch": 1.18, "grad_norm": 0.09788268059492111, "learning_rate": 0.0006686232033869343, "loss": 0.1358, "step": 668 }, { "epoch": 1.18, "grad_norm": 0.23847998678684235, "learning_rate": 0.0006677453004632608, "loss": 0.1399, "step": 669 }, { "epoch": 1.19, "grad_norm": 0.08175510168075562, "learning_rate": 0.0006668668144300149, "loss": 0.1372, "step": 670 }, { "epoch": 1.19, "grad_norm": 0.18189309537410736, "learning_rate": 0.0006659877483409545, "loss": 0.1401, "step": 671 }, { "epoch": 1.19, "grad_norm": 0.08665986359119415, "learning_rate": 0.000665108105251855, "loss": 0.1345, "step": 672 }, { "epoch": 1.19, "grad_norm": 0.40454381704330444, "learning_rate": 0.0006642278882204963, "loss": 0.1342, "step": 673 }, { "epoch": 1.19, "grad_norm": 0.27606263756752014, "learning_rate": 0.0006633471003066543, "loss": 0.1363, "step": 674 }, { "epoch": 1.19, "grad_norm": 0.06796804070472717, "learning_rate": 0.000662465744572089, "loss": 0.1353, "step": 675 }, { "epoch": 1.2, "grad_norm": 0.4458450376987457, "learning_rate": 0.0006615838240805343, "loss": 0.1521, "step": 676 }, { "epoch": 1.2, "grad_norm": 0.3369523286819458, "learning_rate": 0.0006607013418976873, "loss": 0.1489, "step": 677 }, { "epoch": 1.2, "grad_norm": 0.20170435309410095, "learning_rate": 0.0006598183010911978, "loss": 0.1263, "step": 678 }, { "epoch": 1.2, "grad_norm": 0.11186213046312332, "learning_rate": 0.0006589347047306571, "loss": 0.1344, "step": 679 }, { "epoch": 1.2, "grad_norm": 0.12327159941196442, "learning_rate": 0.0006580505558875878, "loss": 0.1354, "step": 680 }, { "epoch": 1.2, "grad_norm": 0.05389246717095375, "learning_rate": 0.0006571658576354334, "loss": 0.1333, "step": 681 }, { "epoch": 1.21, "grad_norm": 0.20890717208385468, "learning_rate": 0.0006562806130495466, "loss": 0.1428, "step": 682 }, { "epoch": 1.21, "grad_norm": 0.12948615849018097, "learning_rate": 0.0006553948252071799, "loss": 0.1372, "step": 683 }, { "epoch": 1.21, "grad_norm": 0.16449519991874695, "learning_rate": 0.0006545084971874737, "loss": 0.1418, "step": 684 }, { "epoch": 1.21, "grad_norm": 0.04887047037482262, "learning_rate": 0.0006536216320714466, "loss": 0.139, "step": 685 }, { "epoch": 1.21, "grad_norm": 0.1712215691804886, "learning_rate": 0.0006527342329419836, "loss": 0.1389, "step": 686 }, { "epoch": 1.22, "grad_norm": 0.14935021102428436, "learning_rate": 0.000651846302883827, "loss": 0.1369, "step": 687 }, { "epoch": 1.22, "grad_norm": 0.16822853684425354, "learning_rate": 0.0006509578449835636, "loss": 0.1393, "step": 688 }, { "epoch": 1.22, "grad_norm": 0.04274258390069008, "learning_rate": 0.0006500688623296158, "loss": 0.1339, "step": 689 }, { "epoch": 1.22, "grad_norm": 0.20485758781433105, "learning_rate": 0.00064917935801223, "loss": 0.1232, "step": 690 }, { "epoch": 1.22, "grad_norm": 0.16438162326812744, "learning_rate": 0.0006482893351234658, "loss": 0.1272, "step": 691 }, { "epoch": 1.22, "grad_norm": 0.0820753276348114, "learning_rate": 0.0006473987967571855, "loss": 0.1368, "step": 692 }, { "epoch": 1.23, "grad_norm": 0.5247365832328796, "learning_rate": 0.000646507746009043, "loss": 0.1702, "step": 693 }, { "epoch": 1.23, "grad_norm": 0.21259160339832306, "learning_rate": 0.0006456161859764745, "loss": 0.1384, "step": 694 }, { "epoch": 1.23, "grad_norm": 0.10756111145019531, "learning_rate": 0.0006447241197586847, "loss": 0.1316, "step": 695 }, { "epoch": 1.23, "grad_norm": 0.32431429624557495, "learning_rate": 0.0006438315504566397, "loss": 0.1505, "step": 696 }, { "epoch": 1.23, "grad_norm": 0.09354525059461594, "learning_rate": 0.0006429384811730528, "loss": 0.1338, "step": 697 }, { "epoch": 1.23, "grad_norm": 0.25492650270462036, "learning_rate": 0.0006420449150123767, "loss": 0.1391, "step": 698 }, { "epoch": 1.24, "grad_norm": 0.28658854961395264, "learning_rate": 0.0006411508550807905, "loss": 0.1336, "step": 699 }, { "epoch": 1.24, "grad_norm": 0.21230942010879517, "learning_rate": 0.0006402563044861899, "loss": 0.1369, "step": 700 }, { "epoch": 1.24, "grad_norm": 0.13693292438983917, "learning_rate": 0.0006393612663381763, "loss": 0.1347, "step": 701 }, { "epoch": 1.24, "grad_norm": 0.20328965783119202, "learning_rate": 0.0006384657437480457, "loss": 0.1349, "step": 702 }, { "epoch": 1.24, "grad_norm": 0.1463640034198761, "learning_rate": 0.0006375697398287788, "loss": 0.1316, "step": 703 }, { "epoch": 1.25, "grad_norm": 0.47083455324172974, "learning_rate": 0.0006366732576950283, "loss": 0.1538, "step": 704 }, { "epoch": 1.25, "grad_norm": 0.18148604035377502, "learning_rate": 0.0006357763004631105, "loss": 0.1264, "step": 705 }, { "epoch": 1.25, "grad_norm": 0.10440527647733688, "learning_rate": 0.000634878871250992, "loss": 0.1209, "step": 706 }, { "epoch": 1.25, "grad_norm": 0.42732179164886475, "learning_rate": 0.000633980973178281, "loss": 0.1581, "step": 707 }, { "epoch": 1.25, "grad_norm": 0.09864400327205658, "learning_rate": 0.0006330826093662157, "loss": 0.1398, "step": 708 }, { "epoch": 1.25, "grad_norm": 0.2839510142803192, "learning_rate": 0.000632183782937652, "loss": 0.1448, "step": 709 }, { "epoch": 1.26, "grad_norm": 0.18296539783477783, "learning_rate": 0.0006312844970170551, "loss": 0.1369, "step": 710 }, { "epoch": 1.26, "eval_loss": 0.13823845982551575, "eval_runtime": 15.0864, "eval_samples_per_second": 31.618, "eval_steps_per_second": 7.954, "step": 710 }, { "epoch": 1.26, "grad_norm": 0.2305176705121994, "learning_rate": 0.0006303847547304872, "loss": 0.1253, "step": 711 }, { "epoch": 1.26, "grad_norm": 0.2792215049266815, "learning_rate": 0.0006294845592055967, "loss": 0.1292, "step": 712 }, { "epoch": 1.26, "grad_norm": 0.3560260236263275, "learning_rate": 0.0006285839135716078, "loss": 0.142, "step": 713 }, { "epoch": 1.26, "grad_norm": 0.5769198536872864, "learning_rate": 0.000627682820959309, "loss": 0.1629, "step": 714 }, { "epoch": 1.26, "grad_norm": 0.24223147332668304, "learning_rate": 0.000626781284501043, "loss": 0.1436, "step": 715 }, { "epoch": 1.27, "grad_norm": 0.20025451481342316, "learning_rate": 0.0006258793073306948, "loss": 0.1271, "step": 716 }, { "epoch": 1.27, "grad_norm": 0.5235323905944824, "learning_rate": 0.0006249768925836822, "loss": 0.1362, "step": 717 }, { "epoch": 1.27, "grad_norm": 0.12457533925771713, "learning_rate": 0.0006240740433969432, "loss": 0.1267, "step": 718 }, { "epoch": 1.27, "grad_norm": 0.22851230204105377, "learning_rate": 0.0006231707629089263, "loss": 0.1368, "step": 719 }, { "epoch": 1.27, "grad_norm": 0.12162783741950989, "learning_rate": 0.0006222670542595799, "loss": 0.1348, "step": 720 }, { "epoch": 1.28, "grad_norm": 0.21811127662658691, "learning_rate": 0.0006213629205903399, "loss": 0.1302, "step": 721 }, { "epoch": 1.28, "grad_norm": 0.1042797714471817, "learning_rate": 0.0006204583650441201, "loss": 0.1227, "step": 722 }, { "epoch": 1.28, "grad_norm": 0.5917842388153076, "learning_rate": 0.0006195533907653003, "loss": 0.1218, "step": 723 }, { "epoch": 1.28, "grad_norm": 0.6369093656539917, "learning_rate": 0.000618648000899717, "loss": 0.1309, "step": 724 }, { "epoch": 1.28, "grad_norm": 0.298677921295166, "learning_rate": 0.0006177421985946498, "loss": 0.1329, "step": 725 }, { "epoch": 1.28, "grad_norm": 0.5087531208992004, "learning_rate": 0.0006168359869988133, "loss": 0.1619, "step": 726 }, { "epoch": 1.29, "grad_norm": 0.5805624723434448, "learning_rate": 0.0006159293692623443, "loss": 0.1388, "step": 727 }, { "epoch": 1.29, "grad_norm": 0.595432698726654, "learning_rate": 0.0006150223485367914, "loss": 0.1363, "step": 728 }, { "epoch": 1.29, "grad_norm": 2.0664656162261963, "learning_rate": 0.0006141149279751042, "loss": 0.1373, "step": 729 }, { "epoch": 1.29, "grad_norm": 1.3190929889678955, "learning_rate": 0.0006132071107316221, "loss": 0.1434, "step": 730 }, { "epoch": 1.29, "grad_norm": 0.19045250117778778, "learning_rate": 0.0006122988999620634, "loss": 0.1177, "step": 731 }, { "epoch": 1.3, "grad_norm": 1.3130842447280884, "learning_rate": 0.0006113902988235145, "loss": 0.1542, "step": 732 }, { "epoch": 1.3, "grad_norm": 0.5767085552215576, "learning_rate": 0.0006104813104744187, "loss": 0.1627, "step": 733 }, { "epoch": 1.3, "grad_norm": 0.45873621106147766, "learning_rate": 0.0006095719380745653, "loss": 0.1369, "step": 734 }, { "epoch": 1.3, "grad_norm": 0.4458267092704773, "learning_rate": 0.0006086621847850788, "loss": 0.1207, "step": 735 }, { "epoch": 1.3, "grad_norm": 0.13178426027297974, "learning_rate": 0.0006077520537684072, "loss": 0.1263, "step": 736 }, { "epoch": 1.3, "grad_norm": 0.19360630214214325, "learning_rate": 0.0006068415481883121, "loss": 0.1366, "step": 737 }, { "epoch": 1.31, "grad_norm": 0.12965673208236694, "learning_rate": 0.0006059306712098571, "loss": 0.1436, "step": 738 }, { "epoch": 1.31, "grad_norm": 0.13222691416740417, "learning_rate": 0.0006050194259993966, "loss": 0.1294, "step": 739 }, { "epoch": 1.31, "grad_norm": 0.14453792572021484, "learning_rate": 0.0006041078157245648, "loss": 0.1273, "step": 740 }, { "epoch": 1.31, "grad_norm": 0.27612432837486267, "learning_rate": 0.0006031958435542659, "loss": 0.1145, "step": 741 }, { "epoch": 1.31, "grad_norm": 0.6110266447067261, "learning_rate": 0.0006022835126586609, "loss": 0.1299, "step": 742 }, { "epoch": 1.31, "grad_norm": 0.29062649607658386, "learning_rate": 0.0006013708262091586, "loss": 0.132, "step": 743 }, { "epoch": 1.32, "grad_norm": 0.44451919198036194, "learning_rate": 0.0006004577873784034, "loss": 0.1235, "step": 744 }, { "epoch": 1.32, "grad_norm": 0.15329335629940033, "learning_rate": 0.0005995443993402648, "loss": 0.1462, "step": 745 }, { "epoch": 1.32, "grad_norm": 0.7718572616577148, "learning_rate": 0.000598630665269826, "loss": 0.1309, "step": 746 }, { "epoch": 1.32, "grad_norm": 0.427112340927124, "learning_rate": 0.0005977165883433733, "loss": 0.1565, "step": 747 }, { "epoch": 1.32, "grad_norm": 0.18447764217853546, "learning_rate": 0.0005968021717383849, "loss": 0.1431, "step": 748 }, { "epoch": 1.33, "grad_norm": 0.2808150053024292, "learning_rate": 0.0005958874186335193, "loss": 0.1429, "step": 749 }, { "epoch": 1.33, "grad_norm": 0.29161331057548523, "learning_rate": 0.0005949723322086053, "loss": 0.1238, "step": 750 }, { "epoch": 1.33, "grad_norm": 0.3760308027267456, "learning_rate": 0.0005940569156446298, "loss": 0.1416, "step": 751 }, { "epoch": 1.33, "grad_norm": 0.24679379165172577, "learning_rate": 0.0005931411721237279, "loss": 0.1366, "step": 752 }, { "epoch": 1.33, "grad_norm": 0.8587498664855957, "learning_rate": 0.0005922251048291707, "loss": 0.1525, "step": 753 }, { "epoch": 1.33, "grad_norm": 0.17934030294418335, "learning_rate": 0.0005913087169453553, "loss": 0.1287, "step": 754 }, { "epoch": 1.34, "grad_norm": 0.43589112162590027, "learning_rate": 0.0005903920116577931, "loss": 0.1472, "step": 755 }, { "epoch": 1.34, "grad_norm": 0.061591099947690964, "learning_rate": 0.0005894749921530983, "loss": 0.129, "step": 756 }, { "epoch": 1.34, "grad_norm": 2.9088215827941895, "learning_rate": 0.0005885576616189781, "loss": 0.1379, "step": 757 }, { "epoch": 1.34, "grad_norm": 0.5655280351638794, "learning_rate": 0.0005876400232442205, "loss": 0.138, "step": 758 }, { "epoch": 1.34, "grad_norm": 0.05325045809149742, "learning_rate": 0.0005867220802186837, "loss": 0.1269, "step": 759 }, { "epoch": 1.34, "grad_norm": 0.08003886044025421, "learning_rate": 0.000585803835733285, "loss": 0.1386, "step": 760 }, { "epoch": 1.35, "grad_norm": 0.09640295058488846, "learning_rate": 0.0005848852929799894, "loss": 0.1364, "step": 761 }, { "epoch": 1.35, "grad_norm": 0.5506378412246704, "learning_rate": 0.0005839664551517988, "loss": 0.1424, "step": 762 }, { "epoch": 1.35, "grad_norm": 0.06252831220626831, "learning_rate": 0.000583047325442741, "loss": 0.1384, "step": 763 }, { "epoch": 1.35, "grad_norm": 0.11976012587547302, "learning_rate": 0.0005821279070478583, "loss": 0.1422, "step": 764 }, { "epoch": 1.35, "grad_norm": 0.13022871315479279, "learning_rate": 0.0005812082031631966, "loss": 0.1357, "step": 765 }, { "epoch": 1.36, "grad_norm": 0.12146733701229095, "learning_rate": 0.0005802882169857938, "loss": 0.14, "step": 766 }, { "epoch": 1.36, "grad_norm": 0.17471055686473846, "learning_rate": 0.00057936795171367, "loss": 0.1432, "step": 767 }, { "epoch": 1.36, "grad_norm": 0.07833831012248993, "learning_rate": 0.0005784474105458143, "loss": 0.1425, "step": 768 }, { "epoch": 1.36, "grad_norm": 0.17084555327892303, "learning_rate": 0.000577526596682176, "loss": 0.1437, "step": 769 }, { "epoch": 1.36, "grad_norm": 0.112625353038311, "learning_rate": 0.0005766055133236513, "loss": 0.1429, "step": 770 }, { "epoch": 1.36, "grad_norm": 0.15991152822971344, "learning_rate": 0.000575684163672074, "loss": 0.1387, "step": 771 }, { "epoch": 1.37, "grad_norm": 0.18027663230895996, "learning_rate": 0.0005747625509302033, "loss": 0.1439, "step": 772 }, { "epoch": 1.37, "grad_norm": 0.4315040111541748, "learning_rate": 0.0005738406783017127, "loss": 0.1524, "step": 773 }, { "epoch": 1.37, "grad_norm": 0.3343091905117035, "learning_rate": 0.0005729185489911797, "loss": 0.1481, "step": 774 }, { "epoch": 1.37, "grad_norm": 0.16762638092041016, "learning_rate": 0.0005719961662040733, "loss": 0.1389, "step": 775 }, { "epoch": 1.37, "grad_norm": 0.11396286636590958, "learning_rate": 0.0005710735331467444, "loss": 0.1351, "step": 776 }, { "epoch": 1.37, "grad_norm": 0.11262958496809006, "learning_rate": 0.0005701506530264132, "loss": 0.1343, "step": 777 }, { "epoch": 1.38, "grad_norm": 0.06293229013681412, "learning_rate": 0.0005692275290511592, "loss": 0.1322, "step": 778 }, { "epoch": 1.38, "grad_norm": 0.037539321929216385, "learning_rate": 0.0005683041644299093, "loss": 0.13, "step": 779 }, { "epoch": 1.38, "grad_norm": 0.2424931526184082, "learning_rate": 0.0005673805623724272, "loss": 0.1333, "step": 780 }, { "epoch": 1.38, "grad_norm": 0.2825300991535187, "learning_rate": 0.0005664567260893019, "loss": 0.15, "step": 781 }, { "epoch": 1.38, "grad_norm": 0.12494263052940369, "learning_rate": 0.000565532658791936, "loss": 0.1317, "step": 782 }, { "epoch": 1.39, "grad_norm": 0.29031720757484436, "learning_rate": 0.0005646083636925362, "loss": 0.1593, "step": 783 }, { "epoch": 1.39, "grad_norm": 0.3038758933544159, "learning_rate": 0.0005636838440041004, "loss": 0.1551, "step": 784 }, { "epoch": 1.39, "grad_norm": 0.14983759820461273, "learning_rate": 0.0005627591029404071, "loss": 0.1402, "step": 785 }, { "epoch": 1.39, "grad_norm": 0.25971877574920654, "learning_rate": 0.0005618341437160049, "loss": 0.1389, "step": 786 }, { "epoch": 1.39, "grad_norm": 0.2425714135169983, "learning_rate": 0.0005609089695462002, "loss": 0.1399, "step": 787 }, { "epoch": 1.39, "grad_norm": 0.11737050861120224, "learning_rate": 0.0005599835836470469, "loss": 0.1237, "step": 788 }, { "epoch": 1.4, "grad_norm": 0.6737673878669739, "learning_rate": 0.0005590579892353348, "loss": 0.1837, "step": 789 }, { "epoch": 1.4, "grad_norm": 0.2361481785774231, "learning_rate": 0.0005581321895285787, "loss": 0.146, "step": 790 }, { "epoch": 1.4, "grad_norm": 0.47753140330314636, "learning_rate": 0.0005572061877450068, "loss": 0.1664, "step": 791 }, { "epoch": 1.4, "grad_norm": 0.2968634068965912, "learning_rate": 0.0005562799871035495, "loss": 0.1511, "step": 792 }, { "epoch": 1.4, "grad_norm": 0.20170801877975464, "learning_rate": 0.0005553535908238294, "loss": 0.1408, "step": 793 }, { "epoch": 1.4, "grad_norm": 0.11540532112121582, "learning_rate": 0.0005544270021261482, "loss": 0.1415, "step": 794 }, { "epoch": 1.41, "grad_norm": 0.10350099951028824, "learning_rate": 0.0005535002242314772, "loss": 0.1393, "step": 795 }, { "epoch": 1.41, "grad_norm": 0.06757602840662003, "learning_rate": 0.0005525732603614444, "loss": 0.1335, "step": 796 }, { "epoch": 1.41, "grad_norm": 0.11407013237476349, "learning_rate": 0.0005516461137383254, "loss": 0.1342, "step": 797 }, { "epoch": 1.41, "grad_norm": 0.40271708369255066, "learning_rate": 0.0005507187875850305, "loss": 0.1536, "step": 798 }, { "epoch": 1.41, "grad_norm": 0.35031354427337646, "learning_rate": 0.000549791285125094, "loss": 0.1489, "step": 799 }, { "epoch": 1.42, "grad_norm": 0.347901850938797, "learning_rate": 0.0005488636095826636, "loss": 0.1463, "step": 800 }, { "epoch": 1.42, "grad_norm": 0.17497143149375916, "learning_rate": 0.0005479357641824877, "loss": 0.1385, "step": 801 }, { "epoch": 1.42, "grad_norm": 0.42803797125816345, "learning_rate": 0.0005470077521499062, "loss": 0.1438, "step": 802 }, { "epoch": 1.42, "grad_norm": 0.77762371301651, "learning_rate": 0.0005460795767108378, "loss": 0.1616, "step": 803 }, { "epoch": 1.42, "grad_norm": 0.27612486481666565, "learning_rate": 0.0005451512410917691, "loss": 0.1424, "step": 804 }, { "epoch": 1.42, "grad_norm": 0.10936840623617172, "learning_rate": 0.0005442227485197435, "loss": 0.1379, "step": 805 }, { "epoch": 1.43, "grad_norm": 0.19322127103805542, "learning_rate": 0.0005432941022223503, "loss": 0.1279, "step": 806 }, { "epoch": 1.43, "grad_norm": 0.14601223170757294, "learning_rate": 0.000542365305427713, "loss": 0.1394, "step": 807 }, { "epoch": 1.43, "grad_norm": 0.05485713109374046, "learning_rate": 0.0005414363613644781, "loss": 0.1245, "step": 808 }, { "epoch": 1.43, "grad_norm": 3.4448142051696777, "learning_rate": 0.0005405072732618043, "loss": 0.3245, "step": 809 }, { "epoch": 1.43, "grad_norm": 0.3802805542945862, "learning_rate": 0.0005395780443493508, "loss": 0.1617, "step": 810 }, { "epoch": 1.43, "grad_norm": 2.19525146484375, "learning_rate": 0.0005386486778572665, "loss": 0.3246, "step": 811 }, { "epoch": 1.44, "grad_norm": 0.1237780973315239, "learning_rate": 0.0005377191770161783, "loss": 0.1348, "step": 812 }, { "epoch": 1.44, "grad_norm": 0.17096708714962006, "learning_rate": 0.0005367895450571801, "loss": 0.1417, "step": 813 }, { "epoch": 1.44, "grad_norm": 0.033038243651390076, "learning_rate": 0.0005358597852118219, "loss": 0.1308, "step": 814 }, { "epoch": 1.44, "grad_norm": 0.2681437134742737, "learning_rate": 0.000534929900712098, "loss": 0.1465, "step": 815 }, { "epoch": 1.44, "grad_norm": 0.11117050051689148, "learning_rate": 0.0005339998947904363, "loss": 0.1383, "step": 816 }, { "epoch": 1.45, "grad_norm": 0.25842952728271484, "learning_rate": 0.0005330697706796861, "loss": 0.1397, "step": 817 }, { "epoch": 1.45, "grad_norm": 0.08293187618255615, "learning_rate": 0.0005321395316131083, "loss": 0.1356, "step": 818 }, { "epoch": 1.45, "grad_norm": 0.14762446284294128, "learning_rate": 0.0005312091808243631, "loss": 0.1416, "step": 819 }, { "epoch": 1.45, "grad_norm": 0.4471145570278168, "learning_rate": 0.0005302787215474991, "loss": 0.1461, "step": 820 }, { "epoch": 1.45, "grad_norm": 0.2443719059228897, "learning_rate": 0.0005293481570169421, "loss": 0.1458, "step": 821 }, { "epoch": 1.45, "grad_norm": 0.22830860316753387, "learning_rate": 0.0005284174904674835, "loss": 0.139, "step": 822 }, { "epoch": 1.46, "grad_norm": 0.5184169411659241, "learning_rate": 0.0005274867251342694, "loss": 0.1417, "step": 823 }, { "epoch": 1.46, "grad_norm": 0.3812021017074585, "learning_rate": 0.0005265558642527897, "loss": 0.1346, "step": 824 }, { "epoch": 1.46, "grad_norm": 0.2922486662864685, "learning_rate": 0.0005256249110588659, "loss": 0.1294, "step": 825 }, { "epoch": 1.46, "grad_norm": 0.29819944500923157, "learning_rate": 0.0005246938687886409, "loss": 0.1401, "step": 826 }, { "epoch": 1.46, "grad_norm": 0.07050393521785736, "learning_rate": 0.0005237627406785666, "loss": 0.1307, "step": 827 }, { "epoch": 1.46, "grad_norm": 0.1988169550895691, "learning_rate": 0.0005228315299653941, "loss": 0.1359, "step": 828 }, { "epoch": 1.47, "grad_norm": 0.31983789801597595, "learning_rate": 0.0005219002398861611, "loss": 0.1459, "step": 829 }, { "epoch": 1.47, "grad_norm": 0.28960883617401123, "learning_rate": 0.000520968873678181, "loss": 0.1447, "step": 830 }, { "epoch": 1.47, "grad_norm": 0.36790764331817627, "learning_rate": 0.0005200374345790325, "loss": 0.1287, "step": 831 }, { "epoch": 1.47, "grad_norm": 0.055655404925346375, "learning_rate": 0.0005191059258265471, "loss": 0.1346, "step": 832 }, { "epoch": 1.47, "grad_norm": 0.42271995544433594, "learning_rate": 0.0005181743506587989, "loss": 0.1445, "step": 833 }, { "epoch": 1.48, "grad_norm": 0.4579026997089386, "learning_rate": 0.0005172427123140923, "loss": 0.1397, "step": 834 }, { "epoch": 1.48, "grad_norm": 0.30265891551971436, "learning_rate": 0.0005163110140309518, "loss": 0.1389, "step": 835 }, { "epoch": 1.48, "grad_norm": 0.23372715711593628, "learning_rate": 0.0005153792590481101, "loss": 0.1426, "step": 836 }, { "epoch": 1.48, "grad_norm": 0.22771018743515015, "learning_rate": 0.0005144474506044969, "loss": 0.1412, "step": 837 }, { "epoch": 1.48, "grad_norm": 0.32557952404022217, "learning_rate": 0.000513515591939228, "loss": 0.1382, "step": 838 }, { "epoch": 1.48, "grad_norm": 0.4409979283809662, "learning_rate": 0.0005125836862915934, "loss": 0.1382, "step": 839 }, { "epoch": 1.49, "grad_norm": 112.177978515625, "learning_rate": 0.0005116517369010466, "loss": 1.093, "step": 840 }, { "epoch": 1.49, "grad_norm": 0.13140363991260529, "learning_rate": 0.0005107197470071933, "loss": 0.1344, "step": 841 }, { "epoch": 1.49, "grad_norm": 0.0935206264257431, "learning_rate": 0.00050978771984978, "loss": 0.1315, "step": 842 }, { "epoch": 1.49, "grad_norm": 0.5304569602012634, "learning_rate": 0.0005088556586686822, "loss": 0.1549, "step": 843 }, { "epoch": 1.49, "grad_norm": 0.07438669353723526, "learning_rate": 0.0005079235667038944, "loss": 0.1311, "step": 844 }, { "epoch": 1.49, "grad_norm": 0.17763537168502808, "learning_rate": 0.0005069914471955179, "loss": 0.1342, "step": 845 }, { "epoch": 1.5, "grad_norm": 0.3256682753562927, "learning_rate": 0.0005060593033837493, "loss": 0.1435, "step": 846 }, { "epoch": 1.5, "grad_norm": 0.3771526515483856, "learning_rate": 0.0005051271385088701, "loss": 0.1434, "step": 847 }, { "epoch": 1.5, "grad_norm": 0.3716539740562439, "learning_rate": 0.0005041949558112351, "loss": 0.1329, "step": 848 }, { "epoch": 1.5, "grad_norm": 0.13685204088687897, "learning_rate": 0.0005032627585312608, "loss": 0.1415, "step": 849 }, { "epoch": 1.5, "grad_norm": 0.21241213381290436, "learning_rate": 0.0005023305499094144, "loss": 0.1384, "step": 850 }, { "epoch": 1.51, "grad_norm": 0.05780967324972153, "learning_rate": 0.0005013983331862026, "loss": 0.1366, "step": 851 }, { "epoch": 1.51, "grad_norm": 0.5117526650428772, "learning_rate": 0.0005004661116021605, "loss": 0.1537, "step": 852 }, { "epoch": 1.51, "eval_loss": 0.14083661139011383, "eval_runtime": 14.5613, "eval_samples_per_second": 32.758, "eval_steps_per_second": 8.241, "step": 852 }, { "epoch": 1.51, "grad_norm": 0.6577463150024414, "learning_rate": 0.0004995338883978395, "loss": 0.1461, "step": 853 }, { "epoch": 1.51, "grad_norm": 0.35039347410202026, "learning_rate": 0.0004986016668137974, "loss": 0.1345, "step": 854 }, { "epoch": 1.51, "grad_norm": 0.1379460096359253, "learning_rate": 0.0004976694500905857, "loss": 0.1425, "step": 855 }, { "epoch": 1.51, "grad_norm": 0.23959362506866455, "learning_rate": 0.0004967372414687393, "loss": 0.1535, "step": 856 }, { "epoch": 1.52, "grad_norm": 0.387977659702301, "learning_rate": 0.000495805044188765, "loss": 0.1709, "step": 857 }, { "epoch": 1.52, "grad_norm": 0.10591788589954376, "learning_rate": 0.0004948728614911299, "loss": 0.137, "step": 858 }, { "epoch": 1.52, "grad_norm": 0.1370454728603363, "learning_rate": 0.0004939406966162507, "loss": 0.1413, "step": 859 }, { "epoch": 1.52, "grad_norm": 0.10982546955347061, "learning_rate": 0.0004930085528044823, "loss": 0.1422, "step": 860 }, { "epoch": 1.52, "grad_norm": 0.1631615161895752, "learning_rate": 0.0004920764332961055, "loss": 0.1439, "step": 861 }, { "epoch": 1.52, "grad_norm": 0.4625565707683563, "learning_rate": 0.0004911443413313179, "loss": 0.13, "step": 862 }, { "epoch": 1.53, "grad_norm": 0.09019370377063751, "learning_rate": 0.0004902122801502201, "loss": 0.1367, "step": 863 }, { "epoch": 1.53, "grad_norm": 0.058873746544122696, "learning_rate": 0.0004892802529928067, "loss": 0.1388, "step": 864 }, { "epoch": 1.53, "grad_norm": 0.16651901602745056, "learning_rate": 0.0004883482630989535, "loss": 0.1383, "step": 865 }, { "epoch": 1.53, "grad_norm": 0.1222059577703476, "learning_rate": 0.00048741631370840676, "loss": 0.1391, "step": 866 }, { "epoch": 1.53, "grad_norm": 0.15417732298374176, "learning_rate": 0.00048648440806077226, "loss": 0.1368, "step": 867 }, { "epoch": 1.54, "grad_norm": 0.19719868898391724, "learning_rate": 0.00048555254939550326, "loss": 0.1423, "step": 868 }, { "epoch": 1.54, "grad_norm": 0.1811167150735855, "learning_rate": 0.0004846207409518899, "loss": 0.1382, "step": 869 }, { "epoch": 1.54, "grad_norm": 0.12746267020702362, "learning_rate": 0.0004836889859690483, "loss": 0.1375, "step": 870 }, { "epoch": 1.54, "grad_norm": 0.18294665217399597, "learning_rate": 0.00048275728768590776, "loss": 0.1376, "step": 871 }, { "epoch": 1.54, "grad_norm": 0.14346922934055328, "learning_rate": 0.0004818256493412011, "loss": 0.137, "step": 872 }, { "epoch": 1.54, "grad_norm": 0.07995035499334335, "learning_rate": 0.00048089407417345296, "loss": 0.1356, "step": 873 }, { "epoch": 1.55, "grad_norm": 0.14909203350543976, "learning_rate": 0.0004799625654209675, "loss": 0.1374, "step": 874 }, { "epoch": 1.55, "grad_norm": 0.06708569079637527, "learning_rate": 0.00047903112632181904, "loss": 0.1381, "step": 875 }, { "epoch": 1.55, "grad_norm": 0.22370800375938416, "learning_rate": 0.00047809976011383906, "loss": 0.1445, "step": 876 }, { "epoch": 1.55, "grad_norm": 0.05151727795600891, "learning_rate": 0.0004771684700346059, "loss": 0.1371, "step": 877 }, { "epoch": 1.55, "grad_norm": 0.12744151055812836, "learning_rate": 0.0004762372593214335, "loss": 0.1369, "step": 878 }, { "epoch": 1.56, "grad_norm": 0.13104400038719177, "learning_rate": 0.0004753061312113592, "loss": 0.1346, "step": 879 }, { "epoch": 1.56, "grad_norm": 0.11967893689870834, "learning_rate": 0.00047437508894113416, "loss": 0.1318, "step": 880 }, { "epoch": 1.56, "grad_norm": 0.035317592322826385, "learning_rate": 0.00047344413574721046, "loss": 0.1352, "step": 881 }, { "epoch": 1.56, "grad_norm": 0.15099988877773285, "learning_rate": 0.0004725132748657307, "loss": 0.1401, "step": 882 }, { "epoch": 1.56, "grad_norm": 0.24859024584293365, "learning_rate": 0.0004715825095325168, "loss": 0.1277, "step": 883 }, { "epoch": 1.56, "grad_norm": 0.11024681478738785, "learning_rate": 0.00047065184298305797, "loss": 0.1375, "step": 884 }, { "epoch": 1.57, "grad_norm": 0.031196558848023415, "learning_rate": 0.00046972127845250084, "loss": 0.133, "step": 885 }, { "epoch": 1.57, "grad_norm": 0.05172949284315109, "learning_rate": 0.00046879081917563695, "loss": 0.1324, "step": 886 }, { "epoch": 1.57, "grad_norm": 0.04595587030053139, "learning_rate": 0.0004678604683868918, "loss": 0.1361, "step": 887 }, { "epoch": 1.57, "grad_norm": 0.054625846445560455, "learning_rate": 0.00046693022932031415, "loss": 0.1334, "step": 888 }, { "epoch": 1.57, "grad_norm": 0.18956537544727325, "learning_rate": 0.0004660001052095639, "loss": 0.1419, "step": 889 }, { "epoch": 1.57, "grad_norm": 0.12293694168329239, "learning_rate": 0.00046507009928790195, "loss": 0.1234, "step": 890 }, { "epoch": 1.58, "grad_norm": 0.12140147387981415, "learning_rate": 0.00046414021478817817, "loss": 0.1282, "step": 891 }, { "epoch": 1.58, "grad_norm": 0.20622491836547852, "learning_rate": 0.00046321045494282, "loss": 0.1238, "step": 892 }, { "epoch": 1.58, "grad_norm": 0.1974942833185196, "learning_rate": 0.00046228082298382196, "loss": 0.1511, "step": 893 }, { "epoch": 1.58, "grad_norm": 0.13894042372703552, "learning_rate": 0.0004613513221427337, "loss": 0.1349, "step": 894 }, { "epoch": 1.58, "grad_norm": 0.023365622386336327, "learning_rate": 0.00046042195565064914, "loss": 0.1371, "step": 895 }, { "epoch": 1.59, "grad_norm": 0.16795076429843903, "learning_rate": 0.0004594927267381958, "loss": 0.1399, "step": 896 }, { "epoch": 1.59, "grad_norm": 0.06654185056686401, "learning_rate": 0.00045856363863552195, "loss": 0.1365, "step": 897 }, { "epoch": 1.59, "grad_norm": 0.21354056894779205, "learning_rate": 0.00045763469457228695, "loss": 0.1431, "step": 898 }, { "epoch": 1.59, "grad_norm": 0.1247892901301384, "learning_rate": 0.0004567058977776498, "loss": 0.1391, "step": 899 }, { "epoch": 1.59, "grad_norm": 0.1229679062962532, "learning_rate": 0.00045577725148025647, "loss": 0.1324, "step": 900 }, { "epoch": 1.59, "grad_norm": 0.0285334512591362, "learning_rate": 0.000454848758908231, "loss": 0.1417, "step": 901 }, { "epoch": 1.6, "grad_norm": 0.11522159725427628, "learning_rate": 0.0004539204232891622, "loss": 0.1349, "step": 902 }, { "epoch": 1.6, "grad_norm": 0.04999208077788353, "learning_rate": 0.00045299224785009374, "loss": 0.1395, "step": 903 }, { "epoch": 1.6, "grad_norm": 0.19387230277061462, "learning_rate": 0.00045206423581751245, "loss": 0.1367, "step": 904 }, { "epoch": 1.6, "grad_norm": 0.030587391927838326, "learning_rate": 0.0004511363904173366, "loss": 0.1392, "step": 905 }, { "epoch": 1.6, "grad_norm": 0.031090332195162773, "learning_rate": 0.0004502087148749061, "loss": 0.137, "step": 906 }, { "epoch": 1.6, "grad_norm": 0.07691047340631485, "learning_rate": 0.0004492812124149696, "loss": 0.144, "step": 907 }, { "epoch": 1.61, "grad_norm": 0.0668129101395607, "learning_rate": 0.0004483538862616747, "loss": 0.1337, "step": 908 }, { "epoch": 1.61, "grad_norm": 0.3449445962905884, "learning_rate": 0.00044742673963855576, "loss": 0.1526, "step": 909 }, { "epoch": 1.61, "grad_norm": 0.19746670126914978, "learning_rate": 0.000446499775768523, "loss": 0.1319, "step": 910 }, { "epoch": 1.61, "grad_norm": 0.11988267302513123, "learning_rate": 0.0004455729978738517, "loss": 0.1411, "step": 911 }, { "epoch": 1.61, "grad_norm": 0.17063240706920624, "learning_rate": 0.00044464640917617063, "loss": 0.1354, "step": 912 }, { "epoch": 1.62, "grad_norm": 0.26187554001808167, "learning_rate": 0.00044372001289645044, "loss": 0.136, "step": 913 }, { "epoch": 1.62, "grad_norm": 0.05965143442153931, "learning_rate": 0.00044279381225499344, "loss": 0.1398, "step": 914 }, { "epoch": 1.62, "grad_norm": 0.07176820188760757, "learning_rate": 0.00044186781047142134, "loss": 0.1388, "step": 915 }, { "epoch": 1.62, "grad_norm": 0.038787998259067535, "learning_rate": 0.0004409420107646652, "loss": 0.1383, "step": 916 }, { "epoch": 1.62, "grad_norm": 0.03987140208482742, "learning_rate": 0.0004400164163529532, "loss": 0.1366, "step": 917 }, { "epoch": 1.62, "grad_norm": 0.12179240584373474, "learning_rate": 0.00043909103045379987, "loss": 0.1306, "step": 918 }, { "epoch": 1.63, "grad_norm": 0.09804455190896988, "learning_rate": 0.0004381658562839953, "loss": 0.128, "step": 919 }, { "epoch": 1.63, "grad_norm": 0.08840085566043854, "learning_rate": 0.00043724089705959304, "loss": 0.1364, "step": 920 }, { "epoch": 1.63, "grad_norm": 0.18564368784427643, "learning_rate": 0.00043631615599589964, "loss": 0.1485, "step": 921 }, { "epoch": 1.63, "grad_norm": 0.3653159737586975, "learning_rate": 0.00043539163630746384, "loss": 0.1486, "step": 922 }, { "epoch": 1.63, "grad_norm": 0.08679798990488052, "learning_rate": 0.000434467341208064, "loss": 0.1291, "step": 923 }, { "epoch": 1.63, "grad_norm": 0.1024034321308136, "learning_rate": 0.00043354327391069826, "loss": 0.1275, "step": 924 }, { "epoch": 1.64, "grad_norm": 0.041372958570718765, "learning_rate": 0.0004326194376275729, "loss": 0.1328, "step": 925 }, { "epoch": 1.64, "grad_norm": 0.06509742885828018, "learning_rate": 0.0004316958355700906, "loss": 0.1324, "step": 926 }, { "epoch": 1.64, "grad_norm": 0.09408631920814514, "learning_rate": 0.0004307724709488409, "loss": 0.1405, "step": 927 }, { "epoch": 1.64, "grad_norm": 0.1963924914598465, "learning_rate": 0.0004298493469735869, "loss": 0.1436, "step": 928 }, { "epoch": 1.64, "grad_norm": 0.10209079831838608, "learning_rate": 0.0004289264668532557, "loss": 0.1277, "step": 929 }, { "epoch": 1.65, "grad_norm": 0.026920847594738007, "learning_rate": 0.00042800383379592677, "loss": 0.1295, "step": 930 }, { "epoch": 1.65, "grad_norm": 0.03551056608557701, "learning_rate": 0.00042708145100882035, "loss": 0.1281, "step": 931 }, { "epoch": 1.65, "grad_norm": 0.14194993674755096, "learning_rate": 0.00042615932169828743, "loss": 0.1398, "step": 932 }, { "epoch": 1.65, "grad_norm": 0.2725144326686859, "learning_rate": 0.00042523744906979683, "loss": 0.1217, "step": 933 }, { "epoch": 1.65, "grad_norm": 0.22893387079238892, "learning_rate": 0.00042431583632792605, "loss": 0.1517, "step": 934 }, { "epoch": 1.65, "grad_norm": 0.20985311269760132, "learning_rate": 0.00042339448667634886, "loss": 0.1433, "step": 935 }, { "epoch": 1.66, "grad_norm": 0.053967542946338654, "learning_rate": 0.00042247340331782416, "loss": 0.12, "step": 936 }, { "epoch": 1.66, "grad_norm": 0.22838272154331207, "learning_rate": 0.0004215525894541856, "loss": 0.1176, "step": 937 }, { "epoch": 1.66, "grad_norm": 0.3237338066101074, "learning_rate": 0.0004206320482863301, "loss": 0.1476, "step": 938 }, { "epoch": 1.66, "grad_norm": 0.09525377303361893, "learning_rate": 0.0004197117830142062, "loss": 0.1342, "step": 939 }, { "epoch": 1.66, "grad_norm": 0.05312574282288551, "learning_rate": 0.0004187917968368036, "loss": 0.1311, "step": 940 }, { "epoch": 1.66, "grad_norm": 0.11625714600086212, "learning_rate": 0.00041787209295214177, "loss": 0.1133, "step": 941 }, { "epoch": 1.67, "grad_norm": 0.04892723262310028, "learning_rate": 0.000416952674557259, "loss": 0.106, "step": 942 }, { "epoch": 1.67, "grad_norm": 0.44417399168014526, "learning_rate": 0.00041603354484820134, "loss": 0.1653, "step": 943 }, { "epoch": 1.67, "grad_norm": 0.07979090511798859, "learning_rate": 0.00041511470702001074, "loss": 0.1144, "step": 944 }, { "epoch": 1.67, "grad_norm": 3.163567304611206, "learning_rate": 0.00041419616426671517, "loss": 0.1576, "step": 945 }, { "epoch": 1.67, "grad_norm": 0.11929760128259659, "learning_rate": 0.0004132779197813164, "loss": 0.1329, "step": 946 }, { "epoch": 1.68, "grad_norm": 0.21021872758865356, "learning_rate": 0.0004123599767557795, "loss": 0.0956, "step": 947 }, { "epoch": 1.68, "grad_norm": 0.4803867042064667, "learning_rate": 0.00041144233838102197, "loss": 0.2027, "step": 948 }, { "epoch": 1.68, "grad_norm": 0.0795937329530716, "learning_rate": 0.0004105250078469018, "loss": 0.1226, "step": 949 }, { "epoch": 1.68, "grad_norm": 0.19914481043815613, "learning_rate": 0.00040960798834220705, "loss": 0.1457, "step": 950 }, { "epoch": 1.68, "grad_norm": 1.662695288658142, "learning_rate": 0.00040869128305464475, "loss": 0.1465, "step": 951 }, { "epoch": 1.68, "grad_norm": 0.1512700617313385, "learning_rate": 0.00040777489517082924, "loss": 0.1391, "step": 952 }, { "epoch": 1.69, "grad_norm": 0.3317195773124695, "learning_rate": 0.00040685882787627227, "loss": 0.1397, "step": 953 }, { "epoch": 1.69, "grad_norm": 0.2609153985977173, "learning_rate": 0.00040594308435537026, "loss": 0.1217, "step": 954 }, { "epoch": 1.69, "grad_norm": 0.40559151768684387, "learning_rate": 0.00040502766779139485, "loss": 0.1317, "step": 955 }, { "epoch": 1.69, "grad_norm": 0.21796320378780365, "learning_rate": 0.0004041125813664808, "loss": 0.1364, "step": 956 }, { "epoch": 1.69, "grad_norm": 0.6307505369186401, "learning_rate": 0.0004031978282616151, "loss": 0.1553, "step": 957 }, { "epoch": 1.69, "grad_norm": 0.28565332293510437, "learning_rate": 0.00040228341165662683, "loss": 0.1344, "step": 958 }, { "epoch": 1.7, "grad_norm": 0.4165158271789551, "learning_rate": 0.0004013693347301741, "loss": 0.1455, "step": 959 }, { "epoch": 1.7, "grad_norm": 0.821273922920227, "learning_rate": 0.0004004556006597353, "loss": 0.1287, "step": 960 }, { "epoch": 1.7, "grad_norm": 0.11364096403121948, "learning_rate": 0.0003995422126215967, "loss": 0.1177, "step": 961 }, { "epoch": 1.7, "grad_norm": 0.349627822637558, "learning_rate": 0.0003986291737908414, "loss": 0.1217, "step": 962 }, { "epoch": 1.7, "grad_norm": 0.10462171584367752, "learning_rate": 0.0003977164873413391, "loss": 0.1168, "step": 963 }, { "epoch": 1.71, "grad_norm": 0.11335984617471695, "learning_rate": 0.0003968041564457342, "loss": 0.1313, "step": 964 }, { "epoch": 1.71, "grad_norm": 0.37488850951194763, "learning_rate": 0.0003958921842754351, "loss": 0.133, "step": 965 }, { "epoch": 1.71, "grad_norm": 0.09337367117404938, "learning_rate": 0.00039498057400060363, "loss": 0.1464, "step": 966 }, { "epoch": 1.71, "grad_norm": 0.27405792474746704, "learning_rate": 0.000394069328790143, "loss": 0.1322, "step": 967 }, { "epoch": 1.71, "grad_norm": 0.5987095832824707, "learning_rate": 0.00039315845181168784, "loss": 0.1307, "step": 968 }, { "epoch": 1.71, "grad_norm": 0.3096538484096527, "learning_rate": 0.00039224794623159294, "loss": 0.1349, "step": 969 }, { "epoch": 1.72, "grad_norm": 0.5547122359275818, "learning_rate": 0.0003913378152149214, "loss": 0.1455, "step": 970 }, { "epoch": 1.72, "grad_norm": 0.4886229634284973, "learning_rate": 0.0003904280619254348, "loss": 0.1251, "step": 971 }, { "epoch": 1.72, "grad_norm": 0.18052807450294495, "learning_rate": 0.0003895186895255814, "loss": 0.1407, "step": 972 }, { "epoch": 1.72, "grad_norm": 0.09001462161540985, "learning_rate": 0.0003886097011764855, "loss": 0.1143, "step": 973 }, { "epoch": 1.72, "grad_norm": 0.3248112201690674, "learning_rate": 0.0003877011000379367, "loss": 0.1212, "step": 974 }, { "epoch": 1.72, "grad_norm": 0.11648620665073395, "learning_rate": 0.000386792889268378, "loss": 0.1167, "step": 975 }, { "epoch": 1.73, "grad_norm": 1.3816261291503906, "learning_rate": 0.00038588507202489585, "loss": 0.1518, "step": 976 }, { "epoch": 1.73, "grad_norm": 0.6389634013175964, "learning_rate": 0.00038497765146320873, "loss": 0.1372, "step": 977 }, { "epoch": 1.73, "grad_norm": 0.6133326888084412, "learning_rate": 0.0003840706307376557, "loss": 0.1252, "step": 978 }, { "epoch": 1.73, "grad_norm": 0.20734143257141113, "learning_rate": 0.00038316401300118674, "loss": 0.1115, "step": 979 }, { "epoch": 1.73, "grad_norm": 0.06368093192577362, "learning_rate": 0.0003822578014053502, "loss": 0.145, "step": 980 }, { "epoch": 1.74, "grad_norm": 0.07665737718343735, "learning_rate": 0.0003813519991002831, "loss": 0.1557, "step": 981 }, { "epoch": 1.74, "grad_norm": 0.09085717052221298, "learning_rate": 0.00038044660923469963, "loss": 0.1251, "step": 982 }, { "epoch": 1.74, "grad_norm": 0.08342912048101425, "learning_rate": 0.00037954163495588, "loss": 0.1256, "step": 983 }, { "epoch": 1.74, "grad_norm": 0.6068560481071472, "learning_rate": 0.00037863707940966024, "loss": 0.1506, "step": 984 }, { "epoch": 1.74, "grad_norm": 0.22720251977443695, "learning_rate": 0.00037773294574042015, "loss": 0.1151, "step": 985 }, { "epoch": 1.74, "grad_norm": 0.5528678297996521, "learning_rate": 0.00037682923709107363, "loss": 0.1436, "step": 986 }, { "epoch": 1.75, "grad_norm": 0.6791836619377136, "learning_rate": 0.00037592595660305707, "loss": 0.1405, "step": 987 }, { "epoch": 1.75, "grad_norm": 0.07115644961595535, "learning_rate": 0.0003750231074163179, "loss": 0.0997, "step": 988 }, { "epoch": 1.75, "grad_norm": 0.36401447653770447, "learning_rate": 0.00037412069266930514, "loss": 0.1471, "step": 989 }, { "epoch": 1.75, "grad_norm": 0.7831732630729675, "learning_rate": 0.00037321871549895715, "loss": 0.1314, "step": 990 }, { "epoch": 1.75, "grad_norm": 0.12779557704925537, "learning_rate": 0.00037231717904069096, "loss": 0.1446, "step": 991 }, { "epoch": 1.75, "grad_norm": 0.41478636860847473, "learning_rate": 0.0003714160864283923, "loss": 0.1429, "step": 992 }, { "epoch": 1.76, "grad_norm": 0.1117364913225174, "learning_rate": 0.00037051544079440334, "loss": 0.1148, "step": 993 }, { "epoch": 1.76, "grad_norm": 0.5920963287353516, "learning_rate": 0.00036961524526951277, "loss": 0.1204, "step": 994 }, { "epoch": 1.76, "eval_loss": 0.13537168502807617, "eval_runtime": 14.0251, "eval_samples_per_second": 34.01, "eval_steps_per_second": 8.556, "step": 994 }, { "epoch": 1.76, "grad_norm": 0.43262025713920593, "learning_rate": 0.000368715502982945, "loss": 0.1302, "step": 995 }, { "epoch": 1.76, "grad_norm": 0.10716990381479263, "learning_rate": 0.00036781621706234816, "loss": 0.133, "step": 996 }, { "epoch": 1.76, "grad_norm": 0.0918804183602333, "learning_rate": 0.0003669173906337846, "loss": 0.1354, "step": 997 }, { "epoch": 1.77, "grad_norm": 0.13421286642551422, "learning_rate": 0.0003660190268217189, "loss": 0.121, "step": 998 }, { "epoch": 1.77, "grad_norm": 0.15904036164283752, "learning_rate": 0.00036512112874900797, "loss": 0.1341, "step": 999 }, { "epoch": 1.77, "grad_norm": 0.3201177418231964, "learning_rate": 0.00036422369953688973, "loss": 0.1454, "step": 1000 }, { "epoch": 1.77, "grad_norm": 0.3361368477344513, "learning_rate": 0.0003633267423049717, "loss": 0.152, "step": 1001 }, { "epoch": 1.77, "grad_norm": 0.09073235839605331, "learning_rate": 0.0003624302601712213, "loss": 0.1331, "step": 1002 }, { "epoch": 1.77, "grad_norm": 0.2717398405075073, "learning_rate": 0.0003615342562519542, "loss": 0.1373, "step": 1003 }, { "epoch": 1.78, "grad_norm": 0.0666472539305687, "learning_rate": 0.0003606387336618237, "loss": 0.1496, "step": 1004 }, { "epoch": 1.78, "grad_norm": 0.15450868010520935, "learning_rate": 0.0003597436955138102, "loss": 0.1464, "step": 1005 }, { "epoch": 1.78, "grad_norm": 0.236429825425148, "learning_rate": 0.0003588491449192096, "loss": 0.1342, "step": 1006 }, { "epoch": 1.78, "grad_norm": 0.06421036273241043, "learning_rate": 0.0003579550849876233, "loss": 0.1352, "step": 1007 }, { "epoch": 1.78, "grad_norm": 0.07432877272367477, "learning_rate": 0.00035706151882694727, "loss": 0.131, "step": 1008 }, { "epoch": 1.79, "grad_norm": 0.10126742720603943, "learning_rate": 0.00035616844954336046, "loss": 0.1471, "step": 1009 }, { "epoch": 1.79, "grad_norm": 0.15761522948741913, "learning_rate": 0.0003552758802413154, "loss": 0.1358, "step": 1010 }, { "epoch": 1.79, "grad_norm": 0.02369426190853119, "learning_rate": 0.0003543838140235257, "loss": 0.1296, "step": 1011 }, { "epoch": 1.79, "grad_norm": 0.27005845308303833, "learning_rate": 0.0003534922539909569, "loss": 0.1412, "step": 1012 }, { "epoch": 1.79, "grad_norm": 0.0638512596487999, "learning_rate": 0.00035260120324281474, "loss": 0.1366, "step": 1013 }, { "epoch": 1.79, "grad_norm": 0.5134268403053284, "learning_rate": 0.00035171066487653423, "loss": 0.146, "step": 1014 }, { "epoch": 1.8, "grad_norm": 0.2569257915019989, "learning_rate": 0.00035082064198776997, "loss": 0.1507, "step": 1015 }, { "epoch": 1.8, "grad_norm": 0.16015255451202393, "learning_rate": 0.0003499311376703842, "loss": 0.1297, "step": 1016 }, { "epoch": 1.8, "grad_norm": 0.22499139606952667, "learning_rate": 0.0003490421550164364, "loss": 0.1357, "step": 1017 }, { "epoch": 1.8, "grad_norm": 0.5117542743682861, "learning_rate": 0.0003481536971161732, "loss": 0.1418, "step": 1018 }, { "epoch": 1.8, "grad_norm": 0.27242857217788696, "learning_rate": 0.00034726576705801636, "loss": 0.1358, "step": 1019 }, { "epoch": 1.8, "grad_norm": 0.3779907822608948, "learning_rate": 0.0003463783679285535, "loss": 0.1512, "step": 1020 }, { "epoch": 1.81, "grad_norm": 0.2148500680923462, "learning_rate": 0.00034549150281252633, "loss": 0.1445, "step": 1021 }, { "epoch": 1.81, "grad_norm": 0.08610748499631882, "learning_rate": 0.0003446051747928202, "loss": 0.1333, "step": 1022 }, { "epoch": 1.81, "grad_norm": 0.07877200841903687, "learning_rate": 0.0003437193869504535, "loss": 0.1333, "step": 1023 }, { "epoch": 1.81, "grad_norm": 0.1410919725894928, "learning_rate": 0.0003428341423645668, "loss": 0.1303, "step": 1024 }, { "epoch": 1.81, "grad_norm": 0.1181708350777626, "learning_rate": 0.00034194944411241213, "loss": 0.1234, "step": 1025 }, { "epoch": 1.82, "grad_norm": 0.13002073764801025, "learning_rate": 0.00034106529526934303, "loss": 0.1405, "step": 1026 }, { "epoch": 1.82, "grad_norm": 0.14099909365177155, "learning_rate": 0.00034018169890880225, "loss": 0.1342, "step": 1027 }, { "epoch": 1.82, "grad_norm": 0.24455852806568146, "learning_rate": 0.00033929865810231264, "loss": 0.1482, "step": 1028 }, { "epoch": 1.82, "grad_norm": 0.23751892149448395, "learning_rate": 0.0003384161759194658, "loss": 0.1509, "step": 1029 }, { "epoch": 1.82, "grad_norm": 0.40307796001434326, "learning_rate": 0.00033753425542791104, "loss": 0.1542, "step": 1030 }, { "epoch": 1.82, "grad_norm": 0.123422771692276, "learning_rate": 0.0003366528996933458, "loss": 0.129, "step": 1031 }, { "epoch": 1.83, "grad_norm": 0.15652324259281158, "learning_rate": 0.00033577211177950386, "loss": 0.1277, "step": 1032 }, { "epoch": 1.83, "grad_norm": 0.2987327575683594, "learning_rate": 0.0003348918947481452, "loss": 0.1395, "step": 1033 }, { "epoch": 1.83, "grad_norm": 0.2202194184064865, "learning_rate": 0.00033401225165904556, "loss": 0.1287, "step": 1034 }, { "epoch": 1.83, "grad_norm": 0.10783470422029495, "learning_rate": 0.0003331331855699852, "loss": 0.1423, "step": 1035 }, { "epoch": 1.83, "grad_norm": 0.26680612564086914, "learning_rate": 0.0003322546995367394, "loss": 0.1307, "step": 1036 }, { "epoch": 1.83, "grad_norm": 0.14138604700565338, "learning_rate": 0.00033137679661306575, "loss": 0.1273, "step": 1037 }, { "epoch": 1.84, "grad_norm": 0.2755129039287567, "learning_rate": 0.0003304994798506962, "loss": 0.1485, "step": 1038 }, { "epoch": 1.84, "grad_norm": 0.15637388825416565, "learning_rate": 0.00032962275229932446, "loss": 0.1233, "step": 1039 }, { "epoch": 1.84, "grad_norm": 0.10934972018003464, "learning_rate": 0.00032874661700659587, "loss": 0.1438, "step": 1040 }, { "epoch": 1.84, "grad_norm": 0.16785453259944916, "learning_rate": 0.00032787107701809755, "loss": 0.1257, "step": 1041 }, { "epoch": 1.84, "grad_norm": 0.2813141644001007, "learning_rate": 0.0003269961353773469, "loss": 0.1594, "step": 1042 }, { "epoch": 1.85, "grad_norm": 0.321236789226532, "learning_rate": 0.00032612179512578126, "loss": 0.1476, "step": 1043 }, { "epoch": 1.85, "grad_norm": 0.35957199335098267, "learning_rate": 0.0003252480593027478, "loss": 0.1614, "step": 1044 }, { "epoch": 1.85, "grad_norm": 0.2529314160346985, "learning_rate": 0.0003243749309454922, "loss": 0.1436, "step": 1045 }, { "epoch": 1.85, "grad_norm": 0.36090412735939026, "learning_rate": 0.00032350241308914864, "loss": 0.1578, "step": 1046 }, { "epoch": 1.85, "grad_norm": 0.47541487216949463, "learning_rate": 0.0003226305087667295, "loss": 0.1247, "step": 1047 }, { "epoch": 1.85, "grad_norm": 0.23453806340694427, "learning_rate": 0.0003217592210091137, "loss": 0.1435, "step": 1048 }, { "epoch": 1.86, "grad_norm": 0.12492989748716354, "learning_rate": 0.0003208885528450376, "loss": 0.1229, "step": 1049 }, { "epoch": 1.86, "grad_norm": 0.19712020456790924, "learning_rate": 0.00032001850730108307, "loss": 0.1292, "step": 1050 }, { "epoch": 1.86, "grad_norm": 0.09731408208608627, "learning_rate": 0.00031914908740166795, "loss": 0.1333, "step": 1051 }, { "epoch": 1.86, "grad_norm": 0.06944354623556137, "learning_rate": 0.0003182802961690357, "loss": 0.1292, "step": 1052 }, { "epoch": 1.86, "grad_norm": 0.07448045909404755, "learning_rate": 0.00031741213662324363, "loss": 0.1349, "step": 1053 }, { "epoch": 1.86, "grad_norm": 0.28523683547973633, "learning_rate": 0.0003165446117821538, "loss": 0.1452, "step": 1054 }, { "epoch": 1.87, "grad_norm": 0.09108186513185501, "learning_rate": 0.0003156777246614215, "loss": 0.1361, "step": 1055 }, { "epoch": 1.87, "grad_norm": 0.13375020027160645, "learning_rate": 0.0003148114782744855, "loss": 0.1381, "step": 1056 }, { "epoch": 1.87, "grad_norm": 0.06716307252645493, "learning_rate": 0.00031394587563255755, "loss": 0.1383, "step": 1057 }, { "epoch": 1.87, "grad_norm": 0.11596639454364777, "learning_rate": 0.00031308091974461064, "loss": 0.1313, "step": 1058 }, { "epoch": 1.87, "grad_norm": 0.39337942004203796, "learning_rate": 0.00031221661361737065, "loss": 0.1359, "step": 1059 }, { "epoch": 1.88, "grad_norm": 0.07525162398815155, "learning_rate": 0.00031135296025530424, "loss": 0.1326, "step": 1060 }, { "epoch": 1.88, "grad_norm": 0.020530417561531067, "learning_rate": 0.0003104899626606088, "loss": 0.1368, "step": 1061 }, { "epoch": 1.88, "grad_norm": 0.17400570213794708, "learning_rate": 0.00030962762383320285, "loss": 0.1309, "step": 1062 }, { "epoch": 1.88, "grad_norm": 0.1574063003063202, "learning_rate": 0.00030876594677071404, "loss": 0.1365, "step": 1063 }, { "epoch": 1.88, "grad_norm": 0.21009187400341034, "learning_rate": 0.0003079049344684702, "loss": 0.1382, "step": 1064 }, { "epoch": 1.88, "grad_norm": 0.047014713287353516, "learning_rate": 0.00030704458991948844, "loss": 0.1327, "step": 1065 }, { "epoch": 1.89, "grad_norm": 0.11032029241323471, "learning_rate": 0.0003061849161144641, "loss": 0.1252, "step": 1066 }, { "epoch": 1.89, "grad_norm": 0.08818018436431885, "learning_rate": 0.0003053259160417613, "loss": 0.1495, "step": 1067 }, { "epoch": 1.89, "grad_norm": 0.08112979680299759, "learning_rate": 0.0003044675926874023, "loss": 0.1408, "step": 1068 }, { "epoch": 1.89, "grad_norm": 0.11870189756155014, "learning_rate": 0.00030360994903505653, "loss": 0.1342, "step": 1069 }, { "epoch": 1.89, "grad_norm": 0.16511432826519012, "learning_rate": 0.000302752988066031, "loss": 0.148, "step": 1070 }, { "epoch": 1.89, "grad_norm": 0.31104427576065063, "learning_rate": 0.0003018967127592595, "loss": 0.15, "step": 1071 }, { "epoch": 1.9, "grad_norm": 0.1434139758348465, "learning_rate": 0.0003010411260912922, "loss": 0.142, "step": 1072 }, { "epoch": 1.9, "grad_norm": 0.29530733823776245, "learning_rate": 0.00030018623103628594, "loss": 0.1284, "step": 1073 }, { "epoch": 1.9, "grad_norm": 0.06294587254524231, "learning_rate": 0.00029933203056599274, "loss": 0.1379, "step": 1074 }, { "epoch": 1.9, "grad_norm": 0.0581539049744606, "learning_rate": 0.0002984785276497507, "loss": 0.1278, "step": 1075 }, { "epoch": 1.9, "grad_norm": 0.09157228469848633, "learning_rate": 0.0002976257252544726, "loss": 0.1348, "step": 1076 }, { "epoch": 1.91, "grad_norm": 0.10196894407272339, "learning_rate": 0.00029677362634463643, "loss": 0.1386, "step": 1077 }, { "epoch": 1.91, "grad_norm": 0.1152050793170929, "learning_rate": 0.00029592223388227504, "loss": 0.1337, "step": 1078 }, { "epoch": 1.91, "grad_norm": 0.34997934103012085, "learning_rate": 0.0002950715508269648, "loss": 0.1346, "step": 1079 }, { "epoch": 1.91, "grad_norm": 0.21548382937908173, "learning_rate": 0.00029422158013581656, "loss": 0.1309, "step": 1080 }, { "epoch": 1.91, "grad_norm": 0.3693360686302185, "learning_rate": 0.000293372324763465, "loss": 0.1587, "step": 1081 }, { "epoch": 1.91, "grad_norm": 0.2655669152736664, "learning_rate": 0.0002925237876620576, "loss": 0.1285, "step": 1082 }, { "epoch": 1.92, "grad_norm": 0.08570755273103714, "learning_rate": 0.00029167597178124583, "loss": 0.1359, "step": 1083 }, { "epoch": 1.92, "grad_norm": 0.2622168958187103, "learning_rate": 0.00029082888006817364, "loss": 0.1315, "step": 1084 }, { "epoch": 1.92, "grad_norm": 0.020985718816518784, "learning_rate": 0.0002899825154674674, "loss": 0.1308, "step": 1085 }, { "epoch": 1.92, "grad_norm": 0.15370753407478333, "learning_rate": 0.00028913688092122665, "loss": 0.1269, "step": 1086 }, { "epoch": 1.92, "grad_norm": 0.1030414029955864, "learning_rate": 0.0002882919793690123, "loss": 0.1143, "step": 1087 }, { "epoch": 1.92, "grad_norm": 0.31633636355400085, "learning_rate": 0.00028744781374783813, "loss": 0.1563, "step": 1088 }, { "epoch": 1.93, "grad_norm": 0.03694160282611847, "learning_rate": 0.00028660438699215895, "loss": 0.1276, "step": 1089 }, { "epoch": 1.93, "grad_norm": 0.14471565186977386, "learning_rate": 0.0002857617020338614, "loss": 0.137, "step": 1090 }, { "epoch": 1.93, "grad_norm": 0.08222481608390808, "learning_rate": 0.0002849197618022539, "loss": 0.1279, "step": 1091 }, { "epoch": 1.93, "grad_norm": 0.11603690683841705, "learning_rate": 0.00028407856922405526, "loss": 0.138, "step": 1092 }, { "epoch": 1.93, "grad_norm": 0.18144792318344116, "learning_rate": 0.0002832381272233864, "loss": 0.1481, "step": 1093 }, { "epoch": 1.94, "grad_norm": 0.05054265260696411, "learning_rate": 0.00028239843872175814, "loss": 0.1363, "step": 1094 }, { "epoch": 1.94, "grad_norm": 2.8396074771881104, "learning_rate": 0.00028155950663806236, "loss": 0.1345, "step": 1095 }, { "epoch": 1.94, "grad_norm": 0.30984073877334595, "learning_rate": 0.0002807213338885619, "loss": 0.1338, "step": 1096 }, { "epoch": 1.94, "grad_norm": 0.34585434198379517, "learning_rate": 0.00027988392338687925, "loss": 0.1389, "step": 1097 }, { "epoch": 1.94, "grad_norm": 0.0312834158539772, "learning_rate": 0.0002790472780439881, "loss": 0.1338, "step": 1098 }, { "epoch": 1.94, "grad_norm": 0.5297988057136536, "learning_rate": 0.0002782114007682016, "loss": 0.1321, "step": 1099 }, { "epoch": 1.95, "grad_norm": 0.10174748301506042, "learning_rate": 0.0002773762944651632, "loss": 0.1202, "step": 1100 }, { "epoch": 1.95, "grad_norm": 0.0418776273727417, "learning_rate": 0.0002765419620378366, "loss": 0.1257, "step": 1101 }, { "epoch": 1.95, "grad_norm": 0.3508782386779785, "learning_rate": 0.0002757084063864949, "loss": 0.1612, "step": 1102 }, { "epoch": 1.95, "grad_norm": 0.2867041230201721, "learning_rate": 0.00027487563040871145, "loss": 0.1574, "step": 1103 }, { "epoch": 1.95, "grad_norm": 0.24160172045230865, "learning_rate": 0.00027404363699934907, "loss": 0.1175, "step": 1104 }, { "epoch": 1.95, "grad_norm": 0.03382538631558418, "learning_rate": 0.0002732124290505501, "loss": 0.1363, "step": 1105 }, { "epoch": 1.96, "grad_norm": 0.05089818313717842, "learning_rate": 0.000272382009451727, "loss": 0.1384, "step": 1106 }, { "epoch": 1.96, "grad_norm": 0.08688928186893463, "learning_rate": 0.0002715523810895515, "loss": 0.1415, "step": 1107 }, { "epoch": 1.96, "grad_norm": 0.03926026448607445, "learning_rate": 0.00027072354684794486, "loss": 0.1357, "step": 1108 }, { "epoch": 1.96, "grad_norm": 0.05058757960796356, "learning_rate": 0.0002698955096080677, "loss": 0.1356, "step": 1109 }, { "epoch": 1.96, "grad_norm": 0.0489107221364975, "learning_rate": 0.00026906827224831023, "loss": 0.1394, "step": 1110 }, { "epoch": 1.97, "grad_norm": 0.09765997529029846, "learning_rate": 0.00026824183764428223, "loss": 0.1388, "step": 1111 }, { "epoch": 1.97, "grad_norm": 0.06232646107673645, "learning_rate": 0.00026741620866880335, "loss": 0.1322, "step": 1112 }, { "epoch": 1.97, "grad_norm": 0.08469201624393463, "learning_rate": 0.0002665913881918921, "loss": 0.1369, "step": 1113 }, { "epoch": 1.97, "grad_norm": 0.08312228322029114, "learning_rate": 0.00026576737908075667, "loss": 0.137, "step": 1114 }, { "epoch": 1.97, "grad_norm": 0.13206493854522705, "learning_rate": 0.00026494418419978485, "loss": 0.127, "step": 1115 }, { "epoch": 1.97, "grad_norm": 0.050156012177467346, "learning_rate": 0.0002641218064105341, "loss": 0.1304, "step": 1116 }, { "epoch": 1.98, "grad_norm": 0.22207175195217133, "learning_rate": 0.0002633002485717219, "loss": 0.1406, "step": 1117 }, { "epoch": 1.98, "grad_norm": 0.25118494033813477, "learning_rate": 0.0002624795135392148, "loss": 0.1373, "step": 1118 }, { "epoch": 1.98, "grad_norm": 0.09831628948450089, "learning_rate": 0.00026165960416601943, "loss": 0.1459, "step": 1119 }, { "epoch": 1.98, "grad_norm": 0.037262722849845886, "learning_rate": 0.00026084052330227237, "loss": 0.1314, "step": 1120 }, { "epoch": 1.98, "grad_norm": 0.05748564377427101, "learning_rate": 0.0002600222737952299, "loss": 0.1439, "step": 1121 }, { "epoch": 1.98, "grad_norm": 0.06702205538749695, "learning_rate": 0.00025920485848925914, "loss": 0.1265, "step": 1122 }, { "epoch": 1.99, "grad_norm": 0.11122670769691467, "learning_rate": 0.00025838828022582596, "loss": 0.1275, "step": 1123 }, { "epoch": 1.99, "grad_norm": 0.1843162626028061, "learning_rate": 0.0002575725418434878, "loss": 0.1419, "step": 1124 }, { "epoch": 1.99, "grad_norm": 0.11396101117134094, "learning_rate": 0.00025675764617788234, "loss": 0.1342, "step": 1125 }, { "epoch": 1.99, "grad_norm": 0.043184638023376465, "learning_rate": 0.00025594359606171725, "loss": 0.1295, "step": 1126 }, { "epoch": 1.99, "grad_norm": 0.15845736861228943, "learning_rate": 0.0002551303943247619, "loss": 0.1429, "step": 1127 }, { "epoch": 2.0, "grad_norm": 0.1975352019071579, "learning_rate": 0.0002543180437938352, "loss": 0.1153, "step": 1128 }, { "epoch": 2.0, "grad_norm": 0.13739655911922455, "learning_rate": 0.00025350654729279834, "loss": 0.1347, "step": 1129 }, { "epoch": 2.0, "grad_norm": 0.2761506140232086, "learning_rate": 0.0002526959076425434, "loss": 0.147, "step": 1130 }, { "epoch": 2.0, "grad_norm": 0.09016864746809006, "learning_rate": 0.0002518861276609837, "loss": 0.1298, "step": 1131 }, { "epoch": 2.0, "grad_norm": 0.15213081240653992, "learning_rate": 0.00025107721016304424, "loss": 0.126, "step": 1132 }, { "epoch": 2.0, "grad_norm": 0.10915911197662354, "learning_rate": 0.00025026915796065233, "loss": 0.1533, "step": 1133 }, { "epoch": 2.01, "grad_norm": 0.21597544848918915, "learning_rate": 0.0002494619738627266, "loss": 0.1352, "step": 1134 }, { "epoch": 2.01, "grad_norm": 0.14955481886863708, "learning_rate": 0.00024865566067516895, "loss": 0.115, "step": 1135 }, { "epoch": 2.01, "grad_norm": 0.07370211184024811, "learning_rate": 0.000247850221200853, "loss": 0.1262, "step": 1136 }, { "epoch": 2.01, "eval_loss": 0.13425856828689575, "eval_runtime": 14.0067, "eval_samples_per_second": 34.055, "eval_steps_per_second": 8.567, "step": 1136 }, { "epoch": 2.01, "grad_norm": 0.22561423480510712, "learning_rate": 0.00024704565823961564, "loss": 0.1442, "step": 1137 }, { "epoch": 2.01, "grad_norm": 0.1496383398771286, "learning_rate": 0.0002462419745882469, "loss": 0.1367, "step": 1138 }, { "epoch": 2.02, "grad_norm": 0.1709650307893753, "learning_rate": 0.00024543917304047986, "loss": 0.1334, "step": 1139 }, { "epoch": 2.02, "grad_norm": 0.2110009342432022, "learning_rate": 0.0002446372563869818, "loss": 0.1387, "step": 1140 }, { "epoch": 2.02, "grad_norm": 0.26138541102409363, "learning_rate": 0.00024383622741534343, "loss": 0.1366, "step": 1141 }, { "epoch": 2.02, "grad_norm": 0.13308700919151306, "learning_rate": 0.00024303608891006984, "loss": 0.1272, "step": 1142 }, { "epoch": 2.02, "grad_norm": 0.31121209263801575, "learning_rate": 0.0002422368436525711, "loss": 0.158, "step": 1143 }, { "epoch": 2.02, "grad_norm": 0.09018420428037643, "learning_rate": 0.00024143849442115158, "loss": 0.1333, "step": 1144 }, { "epoch": 2.03, "grad_norm": 0.19516992568969727, "learning_rate": 0.00024064104399100167, "loss": 0.1276, "step": 1145 }, { "epoch": 2.03, "grad_norm": 0.22115693986415863, "learning_rate": 0.00023984449513418687, "loss": 0.1184, "step": 1146 }, { "epoch": 2.03, "grad_norm": 0.08551298081874847, "learning_rate": 0.00023904885061963843, "loss": 0.1318, "step": 1147 }, { "epoch": 2.03, "grad_norm": 0.27814337611198425, "learning_rate": 0.00023825411321314489, "loss": 0.1454, "step": 1148 }, { "epoch": 2.03, "grad_norm": 0.39333808422088623, "learning_rate": 0.0002374602856773404, "loss": 0.1491, "step": 1149 }, { "epoch": 2.03, "grad_norm": 0.1820652335882187, "learning_rate": 0.00023666737077169726, "loss": 0.1464, "step": 1150 }, { "epoch": 2.04, "grad_norm": 0.21041136980056763, "learning_rate": 0.00023587537125251468, "loss": 0.1437, "step": 1151 }, { "epoch": 2.04, "grad_norm": 0.22204861044883728, "learning_rate": 0.00023508428987290987, "loss": 0.1375, "step": 1152 }, { "epoch": 2.04, "grad_norm": 0.1012771725654602, "learning_rate": 0.00023429412938280898, "loss": 0.1271, "step": 1153 }, { "epoch": 2.04, "grad_norm": 0.23188555240631104, "learning_rate": 0.0002335048925289362, "loss": 0.1254, "step": 1154 }, { "epoch": 2.04, "grad_norm": 0.22195938229560852, "learning_rate": 0.00023271658205480588, "loss": 0.1374, "step": 1155 }, { "epoch": 2.05, "grad_norm": 0.24288509786128998, "learning_rate": 0.00023192920070071144, "loss": 0.1386, "step": 1156 }, { "epoch": 2.05, "grad_norm": 0.0804426372051239, "learning_rate": 0.00023114275120371657, "loss": 0.128, "step": 1157 }, { "epoch": 2.05, "grad_norm": 0.2349734604358673, "learning_rate": 0.00023035723629764615, "loss": 0.139, "step": 1158 }, { "epoch": 2.05, "grad_norm": 0.21826253831386566, "learning_rate": 0.0002295726587130761, "loss": 0.1483, "step": 1159 }, { "epoch": 2.05, "grad_norm": 0.12499336153268814, "learning_rate": 0.0002287890211773238, "loss": 0.1446, "step": 1160 }, { "epoch": 2.05, "grad_norm": 0.22501634061336517, "learning_rate": 0.00022800632641443902, "loss": 0.1288, "step": 1161 }, { "epoch": 2.06, "grad_norm": 0.123602956533432, "learning_rate": 0.00022722457714519418, "loss": 0.1287, "step": 1162 }, { "epoch": 2.06, "grad_norm": 0.1110440120100975, "learning_rate": 0.00022644377608707522, "loss": 0.1375, "step": 1163 }, { "epoch": 2.06, "grad_norm": 0.10080868005752563, "learning_rate": 0.00022566392595427216, "loss": 0.1236, "step": 1164 }, { "epoch": 2.06, "grad_norm": 0.11740677803754807, "learning_rate": 0.00022488502945766892, "loss": 0.1362, "step": 1165 }, { "epoch": 2.06, "grad_norm": 0.08558549731969833, "learning_rate": 0.00022410708930483463, "loss": 0.1525, "step": 1166 }, { "epoch": 2.06, "grad_norm": 0.4037264883518219, "learning_rate": 0.00022333010820001394, "loss": 0.1366, "step": 1167 }, { "epoch": 2.07, "grad_norm": 0.12114354968070984, "learning_rate": 0.00022255408884411793, "loss": 0.1386, "step": 1168 }, { "epoch": 2.07, "grad_norm": 0.13077768683433533, "learning_rate": 0.0002217790339347146, "loss": 0.1281, "step": 1169 }, { "epoch": 2.07, "grad_norm": 0.15883688628673553, "learning_rate": 0.0002210049461660189, "loss": 0.129, "step": 1170 }, { "epoch": 2.07, "grad_norm": 0.1601915806531906, "learning_rate": 0.00022023182822888398, "loss": 0.1119, "step": 1171 }, { "epoch": 2.07, "grad_norm": 0.17367267608642578, "learning_rate": 0.0002194596828107921, "loss": 0.147, "step": 1172 }, { "epoch": 2.08, "grad_norm": 0.2555027902126312, "learning_rate": 0.00021868851259584427, "loss": 0.1495, "step": 1173 }, { "epoch": 2.08, "grad_norm": 0.10203690081834793, "learning_rate": 0.00021791832026475238, "loss": 0.1348, "step": 1174 }, { "epoch": 2.08, "grad_norm": 0.19506026804447174, "learning_rate": 0.00021714910849482776, "loss": 0.1332, "step": 1175 }, { "epoch": 2.08, "grad_norm": 0.1914072483778, "learning_rate": 0.00021638087995997442, "loss": 0.1545, "step": 1176 }, { "epoch": 2.08, "grad_norm": 0.1700693666934967, "learning_rate": 0.00021561363733067795, "loss": 0.1435, "step": 1177 }, { "epoch": 2.08, "grad_norm": 0.11551807075738907, "learning_rate": 0.00021484738327399682, "loss": 0.1335, "step": 1178 }, { "epoch": 2.09, "grad_norm": 0.039073869585990906, "learning_rate": 0.0002140821204535529, "loss": 0.1292, "step": 1179 }, { "epoch": 2.09, "grad_norm": 0.1664426028728485, "learning_rate": 0.00021331785152952244, "loss": 0.1322, "step": 1180 }, { "epoch": 2.09, "grad_norm": 0.10394078493118286, "learning_rate": 0.00021255457915862692, "loss": 0.1355, "step": 1181 }, { "epoch": 2.09, "grad_norm": 0.23373393714427948, "learning_rate": 0.00021179230599412374, "loss": 0.1345, "step": 1182 }, { "epoch": 2.09, "grad_norm": 0.19787871837615967, "learning_rate": 0.00021103103468579653, "loss": 0.1341, "step": 1183 }, { "epoch": 2.09, "grad_norm": 0.1043769121170044, "learning_rate": 0.00021027076787994632, "loss": 0.1305, "step": 1184 }, { "epoch": 2.1, "grad_norm": 0.14168131351470947, "learning_rate": 0.00020951150821938276, "loss": 0.1399, "step": 1185 }, { "epoch": 2.1, "grad_norm": 0.2587750554084778, "learning_rate": 0.0002087532583434139, "loss": 0.1381, "step": 1186 }, { "epoch": 2.1, "grad_norm": 0.13129281997680664, "learning_rate": 0.00020799602088783837, "loss": 0.1191, "step": 1187 }, { "epoch": 2.1, "grad_norm": 0.1005392074584961, "learning_rate": 0.00020723979848493473, "loss": 0.1342, "step": 1188 }, { "epoch": 2.1, "grad_norm": 0.10267619788646698, "learning_rate": 0.0002064845937634533, "loss": 0.1355, "step": 1189 }, { "epoch": 2.11, "grad_norm": 0.06988910585641861, "learning_rate": 0.00020573040934860715, "loss": 0.1301, "step": 1190 }, { "epoch": 2.11, "grad_norm": 0.10177844017744064, "learning_rate": 0.00020497724786206183, "loss": 0.1309, "step": 1191 }, { "epoch": 2.11, "grad_norm": 0.10639530420303345, "learning_rate": 0.000204225111921928, "loss": 0.1201, "step": 1192 }, { "epoch": 2.11, "grad_norm": 0.176835834980011, "learning_rate": 0.00020347400414275058, "loss": 0.134, "step": 1193 }, { "epoch": 2.11, "grad_norm": 0.0938374251127243, "learning_rate": 0.00020272392713550048, "loss": 0.1424, "step": 1194 }, { "epoch": 2.11, "grad_norm": 0.0616929791867733, "learning_rate": 0.00020197488350756615, "loss": 0.1422, "step": 1195 }, { "epoch": 2.12, "grad_norm": 0.07180429995059967, "learning_rate": 0.00020122687586274297, "loss": 0.1355, "step": 1196 }, { "epoch": 2.12, "grad_norm": 0.1905076801776886, "learning_rate": 0.00020047990680122584, "loss": 0.1155, "step": 1197 }, { "epoch": 2.12, "grad_norm": 0.3793567717075348, "learning_rate": 0.00019973397891959893, "loss": 0.1582, "step": 1198 }, { "epoch": 2.12, "grad_norm": 0.27994590997695923, "learning_rate": 0.000198989094810827, "loss": 0.1186, "step": 1199 }, { "epoch": 2.12, "grad_norm": 0.0879906564950943, "learning_rate": 0.000198245257064247, "loss": 0.1313, "step": 1200 }, { "epoch": 2.12, "grad_norm": 0.20328675210475922, "learning_rate": 0.000197502468265558, "loss": 0.1494, "step": 1201 }, { "epoch": 2.13, "grad_norm": 0.12098411470651627, "learning_rate": 0.00019676073099681335, "loss": 0.1201, "step": 1202 }, { "epoch": 2.13, "grad_norm": 0.1231808289885521, "learning_rate": 0.00019602004783641054, "loss": 0.1374, "step": 1203 }, { "epoch": 2.13, "grad_norm": 0.14193227887153625, "learning_rate": 0.00019528042135908292, "loss": 0.14, "step": 1204 }, { "epoch": 2.13, "grad_norm": 0.2550235688686371, "learning_rate": 0.0001945418541358911, "loss": 0.1289, "step": 1205 }, { "epoch": 2.13, "grad_norm": 0.05161747708916664, "learning_rate": 0.00019380434873421292, "loss": 0.1208, "step": 1206 }, { "epoch": 2.14, "grad_norm": 0.40697818994522095, "learning_rate": 0.00019306790771773575, "loss": 0.107, "step": 1207 }, { "epoch": 2.14, "grad_norm": 0.09957734495401382, "learning_rate": 0.00019233253364644653, "loss": 0.1258, "step": 1208 }, { "epoch": 2.14, "grad_norm": 0.15506863594055176, "learning_rate": 0.00019159822907662333, "loss": 0.137, "step": 1209 }, { "epoch": 2.14, "grad_norm": 0.11825218796730042, "learning_rate": 0.00019086499656082684, "loss": 0.1299, "step": 1210 }, { "epoch": 2.14, "grad_norm": 0.3029904365539551, "learning_rate": 0.00019013283864789104, "loss": 0.1136, "step": 1211 }, { "epoch": 2.14, "grad_norm": 0.11233114451169968, "learning_rate": 0.00018940175788291407, "loss": 0.125, "step": 1212 }, { "epoch": 2.15, "grad_norm": 0.16029077768325806, "learning_rate": 0.00018867175680725002, "loss": 0.1713, "step": 1213 }, { "epoch": 2.15, "grad_norm": 0.1274372637271881, "learning_rate": 0.0001879428379584995, "loss": 0.0996, "step": 1214 }, { "epoch": 2.15, "grad_norm": 0.1937287449836731, "learning_rate": 0.0001872150038705015, "loss": 0.1246, "step": 1215 }, { "epoch": 2.15, "grad_norm": 0.1734904944896698, "learning_rate": 0.00018648825707332433, "loss": 0.1698, "step": 1216 }, { "epoch": 2.15, "grad_norm": 0.31292834877967834, "learning_rate": 0.0001857626000932562, "loss": 0.1284, "step": 1217 }, { "epoch": 2.15, "grad_norm": 0.11059171706438065, "learning_rate": 0.0001850380354527972, "loss": 0.1039, "step": 1218 }, { "epoch": 2.16, "grad_norm": 0.12312529236078262, "learning_rate": 0.00018431456567065003, "loss": 0.1436, "step": 1219 }, { "epoch": 2.16, "grad_norm": 0.2583165168762207, "learning_rate": 0.0001835921932617119, "loss": 0.113, "step": 1220 }, { "epoch": 2.16, "grad_norm": 0.2463943064212799, "learning_rate": 0.00018287092073706557, "loss": 0.1373, "step": 1221 }, { "epoch": 2.16, "grad_norm": 0.23069016635417938, "learning_rate": 0.0001821507506039693, "loss": 0.1473, "step": 1222 }, { "epoch": 2.16, "grad_norm": 0.057799965143203735, "learning_rate": 0.0001814316853658503, "loss": 0.1089, "step": 1223 }, { "epoch": 2.17, "grad_norm": 0.14288663864135742, "learning_rate": 0.00018071372752229497, "loss": 0.1168, "step": 1224 }, { "epoch": 2.17, "grad_norm": 0.1829814910888672, "learning_rate": 0.00017999687956903954, "loss": 0.1232, "step": 1225 }, { "epoch": 2.17, "grad_norm": 0.07779528200626373, "learning_rate": 0.00017928114399796296, "loss": 0.1319, "step": 1226 }, { "epoch": 2.17, "grad_norm": 0.12610167264938354, "learning_rate": 0.00017856652329707623, "loss": 0.1499, "step": 1227 }, { "epoch": 2.17, "grad_norm": 0.08324690163135529, "learning_rate": 0.00017785301995051588, "loss": 0.1164, "step": 1228 }, { "epoch": 2.17, "grad_norm": 0.16664673388004303, "learning_rate": 0.00017714063643853423, "loss": 0.1102, "step": 1229 }, { "epoch": 2.18, "grad_norm": 0.47252926230430603, "learning_rate": 0.00017642937523749036, "loss": 0.1267, "step": 1230 }, { "epoch": 2.18, "grad_norm": 0.24932174384593964, "learning_rate": 0.00017571923881984236, "loss": 0.0999, "step": 1231 }, { "epoch": 2.18, "grad_norm": 0.1122673973441124, "learning_rate": 0.0001750102296541382, "loss": 0.1149, "step": 1232 }, { "epoch": 2.18, "grad_norm": 0.03139616549015045, "learning_rate": 0.00017430235020500756, "loss": 0.14, "step": 1233 }, { "epoch": 2.18, "grad_norm": 0.1314571052789688, "learning_rate": 0.0001735956029331532, "loss": 0.1316, "step": 1234 }, { "epoch": 2.18, "grad_norm": 0.12405366450548172, "learning_rate": 0.00017288999029534176, "loss": 0.1066, "step": 1235 }, { "epoch": 2.19, "grad_norm": 0.12566903233528137, "learning_rate": 0.00017218551474439586, "loss": 0.1217, "step": 1236 }, { "epoch": 2.19, "grad_norm": 0.136549711227417, "learning_rate": 0.0001714821787291858, "loss": 0.1242, "step": 1237 }, { "epoch": 2.19, "grad_norm": 0.049642208963632584, "learning_rate": 0.00017077998469462009, "loss": 0.1343, "step": 1238 }, { "epoch": 2.19, "grad_norm": 0.2866723835468292, "learning_rate": 0.0001700789350816382, "loss": 0.1365, "step": 1239 }, { "epoch": 2.19, "grad_norm": 0.12442098557949066, "learning_rate": 0.00016937903232720075, "loss": 0.1512, "step": 1240 }, { "epoch": 2.2, "grad_norm": 0.11928176134824753, "learning_rate": 0.00016868027886428194, "loss": 0.147, "step": 1241 }, { "epoch": 2.2, "grad_norm": 0.09463697671890259, "learning_rate": 0.00016798267712186122, "loss": 0.117, "step": 1242 }, { "epoch": 2.2, "grad_norm": 0.06501590460538864, "learning_rate": 0.0001672862295249138, "loss": 0.1508, "step": 1243 }, { "epoch": 2.2, "grad_norm": 0.30164363980293274, "learning_rate": 0.00016659093849440354, "loss": 0.1501, "step": 1244 }, { "epoch": 2.2, "grad_norm": 0.15526002645492554, "learning_rate": 0.00016589680644727345, "loss": 0.1242, "step": 1245 }, { "epoch": 2.2, "grad_norm": 0.07917926460504532, "learning_rate": 0.00016520383579643767, "loss": 0.1314, "step": 1246 }, { "epoch": 2.21, "grad_norm": 0.10616712272167206, "learning_rate": 0.00016451202895077378, "loss": 0.1462, "step": 1247 }, { "epoch": 2.21, "grad_norm": 0.1220119372010231, "learning_rate": 0.0001638213883151129, "loss": 0.1191, "step": 1248 }, { "epoch": 2.21, "grad_norm": 0.10847750306129456, "learning_rate": 0.0001631319162902331, "loss": 0.0983, "step": 1249 }, { "epoch": 2.21, "grad_norm": 0.03508616238832474, "learning_rate": 0.0001624436152728495, "loss": 0.1238, "step": 1250 }, { "epoch": 2.21, "grad_norm": 0.05642473325133324, "learning_rate": 0.0001617564876556067, "loss": 0.1381, "step": 1251 }, { "epoch": 2.21, "grad_norm": 0.12747938930988312, "learning_rate": 0.0001610705358270708, "loss": 0.1165, "step": 1252 }, { "epoch": 2.22, "grad_norm": 0.030638879165053368, "learning_rate": 0.0001603857621717202, "loss": 0.1415, "step": 1253 }, { "epoch": 2.22, "grad_norm": 0.08363982290029526, "learning_rate": 0.00015970216906993818, "loss": 0.1193, "step": 1254 }, { "epoch": 2.22, "grad_norm": 0.04466724768280983, "learning_rate": 0.00015901975889800386, "loss": 0.1214, "step": 1255 }, { "epoch": 2.22, "grad_norm": 0.1267317682504654, "learning_rate": 0.00015833853402808434, "loss": 0.1391, "step": 1256 }, { "epoch": 2.22, "grad_norm": 0.18954621255397797, "learning_rate": 0.00015765849682822686, "loss": 0.1665, "step": 1257 }, { "epoch": 2.23, "grad_norm": 0.10698127746582031, "learning_rate": 0.00015697964966234946, "loss": 0.1019, "step": 1258 }, { "epoch": 2.23, "grad_norm": 0.12424101680517197, "learning_rate": 0.00015630199489023415, "loss": 0.1591, "step": 1259 }, { "epoch": 2.23, "grad_norm": 0.0653344914317131, "learning_rate": 0.0001556255348675174, "loss": 0.1536, "step": 1260 }, { "epoch": 2.23, "grad_norm": 0.04556593671441078, "learning_rate": 0.0001549502719456827, "loss": 0.1502, "step": 1261 }, { "epoch": 2.23, "grad_norm": 0.10966439545154572, "learning_rate": 0.00015427620847205238, "loss": 0.1322, "step": 1262 }, { "epoch": 2.23, "grad_norm": 0.21205949783325195, "learning_rate": 0.00015360334678977933, "loss": 0.1374, "step": 1263 }, { "epoch": 2.24, "grad_norm": 0.3078778088092804, "learning_rate": 0.00015293168923783857, "loss": 0.1303, "step": 1264 }, { "epoch": 2.24, "grad_norm": 0.22372427582740784, "learning_rate": 0.0001522612381510195, "loss": 0.1334, "step": 1265 }, { "epoch": 2.24, "grad_norm": 0.07522010058164597, "learning_rate": 0.00015159199585991744, "loss": 0.1315, "step": 1266 }, { "epoch": 2.24, "grad_norm": 0.10713458061218262, "learning_rate": 0.00015092396469092617, "loss": 0.1181, "step": 1267 }, { "epoch": 2.24, "grad_norm": 0.14708364009857178, "learning_rate": 0.00015025714696622933, "loss": 0.1246, "step": 1268 }, { "epoch": 2.25, "grad_norm": 0.050081461668014526, "learning_rate": 0.00014959154500379212, "loss": 0.1329, "step": 1269 }, { "epoch": 2.25, "grad_norm": 0.15830601751804352, "learning_rate": 0.00014892716111735376, "loss": 0.1372, "step": 1270 }, { "epoch": 2.25, "grad_norm": 0.0707254484295845, "learning_rate": 0.00014826399761641907, "loss": 0.1235, "step": 1271 }, { "epoch": 2.25, "grad_norm": 0.1088636964559555, "learning_rate": 0.00014760205680625083, "loss": 0.1439, "step": 1272 }, { "epoch": 2.25, "grad_norm": 0.07059821486473083, "learning_rate": 0.00014694134098786182, "loss": 0.1359, "step": 1273 }, { "epoch": 2.25, "grad_norm": 0.2451217770576477, "learning_rate": 0.0001462818524580057, "loss": 0.1592, "step": 1274 }, { "epoch": 2.26, "grad_norm": 0.08877355605363846, "learning_rate": 0.00014562359350917055, "loss": 0.1267, "step": 1275 }, { "epoch": 2.26, "grad_norm": 0.09591105580329895, "learning_rate": 0.00014496656642957025, "loss": 0.1461, "step": 1276 }, { "epoch": 2.26, "grad_norm": 0.12581078708171844, "learning_rate": 0.0001443107735031361, "loss": 0.1357, "step": 1277 }, { "epoch": 2.26, "grad_norm": 0.04613172635436058, "learning_rate": 0.00014365621700950987, "loss": 0.12, "step": 1278 }, { "epoch": 2.26, "eval_loss": 0.13804392516613007, "eval_runtime": 14.0836, "eval_samples_per_second": 33.869, "eval_steps_per_second": 8.521, "step": 1278 }, { "epoch": 2.26, "grad_norm": 0.0844263806939125, "learning_rate": 0.0001430028992240344, "loss": 0.1243, "step": 1279 }, { "epoch": 2.26, "grad_norm": 0.22574763000011444, "learning_rate": 0.0001423508224177474, "loss": 0.1461, "step": 1280 }, { "epoch": 2.27, "grad_norm": 0.2293427437543869, "learning_rate": 0.00014169998885737257, "loss": 0.1612, "step": 1281 }, { "epoch": 2.27, "grad_norm": 0.05698062479496002, "learning_rate": 0.00014105040080531161, "loss": 0.1343, "step": 1282 }, { "epoch": 2.27, "grad_norm": 0.13387730717658997, "learning_rate": 0.00014040206051963678, "loss": 0.1132, "step": 1283 }, { "epoch": 2.27, "grad_norm": 0.24633343517780304, "learning_rate": 0.00013975497025408285, "loss": 0.1122, "step": 1284 }, { "epoch": 2.27, "grad_norm": 0.03243758901953697, "learning_rate": 0.00013910913225803945, "loss": 0.1229, "step": 1285 }, { "epoch": 2.28, "grad_norm": 0.10440926253795624, "learning_rate": 0.00013846454877654318, "loss": 0.1347, "step": 1286 }, { "epoch": 2.28, "grad_norm": 0.1835152506828308, "learning_rate": 0.00013782122205026958, "loss": 0.1554, "step": 1287 }, { "epoch": 2.28, "grad_norm": 0.02530871145427227, "learning_rate": 0.0001371791543155253, "loss": 0.1306, "step": 1288 }, { "epoch": 2.28, "grad_norm": 0.02593812346458435, "learning_rate": 0.00013653834780424112, "loss": 0.1425, "step": 1289 }, { "epoch": 2.28, "grad_norm": 0.13932591676712036, "learning_rate": 0.000135898804743963, "loss": 0.1314, "step": 1290 }, { "epoch": 2.28, "grad_norm": 0.06134510040283203, "learning_rate": 0.00013526052735784538, "loss": 0.1215, "step": 1291 }, { "epoch": 2.29, "grad_norm": 0.12832637131214142, "learning_rate": 0.00013462351786464283, "loss": 0.1293, "step": 1292 }, { "epoch": 2.29, "grad_norm": 0.1946830302476883, "learning_rate": 0.00013398777847870236, "loss": 0.1383, "step": 1293 }, { "epoch": 2.29, "grad_norm": 0.07782842963933945, "learning_rate": 0.00013335331140995626, "loss": 0.1397, "step": 1294 }, { "epoch": 2.29, "grad_norm": 0.12014558911323547, "learning_rate": 0.00013272011886391366, "loss": 0.1342, "step": 1295 }, { "epoch": 2.29, "grad_norm": 0.3111260235309601, "learning_rate": 0.00013208820304165365, "loss": 0.1425, "step": 1296 }, { "epoch": 2.29, "grad_norm": 0.10395296663045883, "learning_rate": 0.0001314575661398168, "loss": 0.1355, "step": 1297 }, { "epoch": 2.3, "grad_norm": 0.14676041901111603, "learning_rate": 0.00013082821035059812, "loss": 0.1315, "step": 1298 }, { "epoch": 2.3, "grad_norm": 0.1285468190908432, "learning_rate": 0.00013020013786173946, "loss": 0.1381, "step": 1299 }, { "epoch": 2.3, "grad_norm": 0.1933162659406662, "learning_rate": 0.0001295733508565213, "loss": 0.1389, "step": 1300 }, { "epoch": 2.3, "grad_norm": 0.10214361548423767, "learning_rate": 0.0001289478515137561, "loss": 0.1317, "step": 1301 }, { "epoch": 2.3, "grad_norm": 0.07621518522500992, "learning_rate": 0.0001283236420077798, "loss": 0.1316, "step": 1302 }, { "epoch": 2.31, "grad_norm": 0.19626323878765106, "learning_rate": 0.0001277007245084446, "loss": 0.1365, "step": 1303 }, { "epoch": 2.31, "grad_norm": 0.14391857385635376, "learning_rate": 0.00012707910118111193, "loss": 0.1284, "step": 1304 }, { "epoch": 2.31, "grad_norm": 0.346746563911438, "learning_rate": 0.0001264587741866439, "loss": 0.1354, "step": 1305 }, { "epoch": 2.31, "grad_norm": 0.14903244376182556, "learning_rate": 0.00012583974568139699, "loss": 0.1351, "step": 1306 }, { "epoch": 2.31, "grad_norm": 0.11248177289962769, "learning_rate": 0.00012522201781721336, "loss": 0.1264, "step": 1307 }, { "epoch": 2.31, "grad_norm": 0.16379296779632568, "learning_rate": 0.00012460559274141407, "loss": 0.1302, "step": 1308 }, { "epoch": 2.32, "grad_norm": 0.07437599450349808, "learning_rate": 0.00012399047259679182, "loss": 0.1368, "step": 1309 }, { "epoch": 2.32, "grad_norm": 0.035767342895269394, "learning_rate": 0.00012337665952160266, "loss": 0.1329, "step": 1310 }, { "epoch": 2.32, "grad_norm": 0.1937914788722992, "learning_rate": 0.00012276415564955952, "loss": 0.1353, "step": 1311 }, { "epoch": 2.32, "grad_norm": 0.08204084634780884, "learning_rate": 0.00012215296310982404, "loss": 0.1349, "step": 1312 }, { "epoch": 2.32, "grad_norm": 0.08185072988271713, "learning_rate": 0.00012154308402699932, "loss": 0.1279, "step": 1313 }, { "epoch": 2.32, "grad_norm": 0.12454118579626083, "learning_rate": 0.00012093452052112308, "loss": 0.143, "step": 1314 }, { "epoch": 2.33, "grad_norm": 0.047802336513996124, "learning_rate": 0.0001203272747076598, "loss": 0.1268, "step": 1315 }, { "epoch": 2.33, "grad_norm": 0.1269330382347107, "learning_rate": 0.00011972134869749324, "loss": 0.1307, "step": 1316 }, { "epoch": 2.33, "grad_norm": 0.04310709610581398, "learning_rate": 0.0001191167445969193, "loss": 0.1361, "step": 1317 }, { "epoch": 2.33, "grad_norm": 0.11329284310340881, "learning_rate": 0.00011851346450763878, "loss": 0.1275, "step": 1318 }, { "epoch": 2.33, "grad_norm": 0.19605499505996704, "learning_rate": 0.0001179115105267502, "loss": 0.1276, "step": 1319 }, { "epoch": 2.34, "grad_norm": 0.07777590304613113, "learning_rate": 0.00011731088474674234, "loss": 0.1266, "step": 1320 }, { "epoch": 2.34, "grad_norm": 0.36323878169059753, "learning_rate": 0.00011671158925548624, "loss": 0.1324, "step": 1321 }, { "epoch": 2.34, "grad_norm": 0.10665354877710342, "learning_rate": 0.0001161136261362296, "loss": 0.1274, "step": 1322 }, { "epoch": 2.34, "grad_norm": 0.10488320887088776, "learning_rate": 0.00011551699746758787, "loss": 0.1244, "step": 1323 }, { "epoch": 2.34, "grad_norm": 0.0471271350979805, "learning_rate": 0.00011492170532353813, "loss": 0.1326, "step": 1324 }, { "epoch": 2.34, "grad_norm": 0.09352532774209976, "learning_rate": 0.00011432775177341165, "loss": 0.1407, "step": 1325 }, { "epoch": 2.35, "grad_norm": 0.03449404612183571, "learning_rate": 0.00011373513888188564, "loss": 0.1328, "step": 1326 }, { "epoch": 2.35, "grad_norm": 0.09462067484855652, "learning_rate": 0.00011314386870897792, "loss": 0.141, "step": 1327 }, { "epoch": 2.35, "grad_norm": 0.06903573870658875, "learning_rate": 0.00011255394331003854, "loss": 0.1169, "step": 1328 }, { "epoch": 2.35, "grad_norm": 0.030416639521718025, "learning_rate": 0.00011196536473574276, "loss": 0.1235, "step": 1329 }, { "epoch": 2.35, "grad_norm": 0.2395412176847458, "learning_rate": 0.00011137813503208399, "loss": 0.145, "step": 1330 }, { "epoch": 2.35, "grad_norm": 0.2817445695400238, "learning_rate": 0.00011079225624036688, "loss": 0.1249, "step": 1331 }, { "epoch": 2.36, "grad_norm": 0.17817607522010803, "learning_rate": 0.00011020773039720017, "loss": 0.1458, "step": 1332 }, { "epoch": 2.36, "grad_norm": 0.03026886098086834, "learning_rate": 0.00010962455953448952, "loss": 0.1258, "step": 1333 }, { "epoch": 2.36, "grad_norm": 0.07572410255670547, "learning_rate": 0.0001090427456794304, "loss": 0.1217, "step": 1334 }, { "epoch": 2.36, "grad_norm": 0.06871062517166138, "learning_rate": 0.00010846229085450099, "loss": 0.1402, "step": 1335 }, { "epoch": 2.36, "grad_norm": 0.2113131284713745, "learning_rate": 0.00010788319707745525, "loss": 0.1276, "step": 1336 }, { "epoch": 2.37, "grad_norm": 0.20129992067813873, "learning_rate": 0.00010730546636131621, "loss": 0.1118, "step": 1337 }, { "epoch": 2.37, "grad_norm": 0.05704864114522934, "learning_rate": 0.00010672910071436865, "loss": 0.1203, "step": 1338 }, { "epoch": 2.37, "grad_norm": 0.058348219841718674, "learning_rate": 0.00010615410214015186, "loss": 0.1218, "step": 1339 }, { "epoch": 2.37, "grad_norm": 0.15510503947734833, "learning_rate": 0.00010558047263745297, "loss": 0.1207, "step": 1340 }, { "epoch": 2.37, "grad_norm": 0.2903478741645813, "learning_rate": 0.00010500821420030049, "loss": 0.1317, "step": 1341 }, { "epoch": 2.37, "grad_norm": 0.18940383195877075, "learning_rate": 0.00010443732881795614, "loss": 0.1333, "step": 1342 }, { "epoch": 2.38, "grad_norm": 0.08819776773452759, "learning_rate": 0.0001038678184749095, "loss": 0.1396, "step": 1343 }, { "epoch": 2.38, "grad_norm": 0.09822792559862137, "learning_rate": 0.00010329968515086969, "loss": 0.1326, "step": 1344 }, { "epoch": 2.38, "grad_norm": 0.06949262320995331, "learning_rate": 0.00010273293082075913, "loss": 0.1459, "step": 1345 }, { "epoch": 2.38, "grad_norm": 0.1197618618607521, "learning_rate": 0.00010216755745470701, "loss": 0.1171, "step": 1346 }, { "epoch": 2.38, "grad_norm": 0.12239718437194824, "learning_rate": 0.00010160356701804169, "loss": 0.1496, "step": 1347 }, { "epoch": 2.38, "grad_norm": 0.04563026875257492, "learning_rate": 0.00010104096147128455, "loss": 0.1425, "step": 1348 }, { "epoch": 2.39, "grad_norm": 0.14311982691287994, "learning_rate": 0.00010047974277014266, "loss": 0.1313, "step": 1349 }, { "epoch": 2.39, "grad_norm": 0.05187974497675896, "learning_rate": 9.991991286550207e-05, "loss": 0.1278, "step": 1350 }, { "epoch": 2.39, "grad_norm": 0.14110100269317627, "learning_rate": 9.936147370342164e-05, "loss": 0.1132, "step": 1351 }, { "epoch": 2.39, "grad_norm": 0.05993328616023064, "learning_rate": 9.880442722512518e-05, "loss": 0.1409, "step": 1352 }, { "epoch": 2.39, "grad_norm": 0.12421682476997375, "learning_rate": 9.824877536699584e-05, "loss": 0.1564, "step": 1353 }, { "epoch": 2.4, "grad_norm": 0.11717087775468826, "learning_rate": 9.769452006056856e-05, "loss": 0.1205, "step": 1354 }, { "epoch": 2.4, "grad_norm": 0.13294954597949982, "learning_rate": 9.714166323252349e-05, "loss": 0.1439, "step": 1355 }, { "epoch": 2.4, "grad_norm": 0.03808373585343361, "learning_rate": 9.659020680468e-05, "loss": 0.1139, "step": 1356 }, { "epoch": 2.4, "grad_norm": 0.07248345017433167, "learning_rate": 9.604015269398874e-05, "loss": 0.1314, "step": 1357 }, { "epoch": 2.4, "grad_norm": 0.16012993454933167, "learning_rate": 9.549150281252633e-05, "loss": 0.1408, "step": 1358 }, { "epoch": 2.4, "grad_norm": 0.07186062633991241, "learning_rate": 9.49442590674876e-05, "loss": 0.1515, "step": 1359 }, { "epoch": 2.41, "grad_norm": 0.06986256688833237, "learning_rate": 9.439842336117954e-05, "loss": 0.1054, "step": 1360 }, { "epoch": 2.41, "grad_norm": 0.06326832622289658, "learning_rate": 9.385399759101481e-05, "loss": 0.121, "step": 1361 }, { "epoch": 2.41, "grad_norm": 0.2067011445760727, "learning_rate": 9.331098364950453e-05, "loss": 0.094, "step": 1362 }, { "epoch": 2.41, "grad_norm": 0.03115193173289299, "learning_rate": 9.276938342425245e-05, "loss": 0.1194, "step": 1363 }, { "epoch": 2.41, "grad_norm": 0.050981197506189346, "learning_rate": 9.222919879794772e-05, "loss": 0.1483, "step": 1364 }, { "epoch": 2.41, "grad_norm": 0.15227286517620087, "learning_rate": 9.169043164835867e-05, "loss": 0.1301, "step": 1365 }, { "epoch": 2.42, "grad_norm": 0.18392544984817505, "learning_rate": 9.115308384832638e-05, "loss": 0.1309, "step": 1366 }, { "epoch": 2.42, "grad_norm": 0.09350251406431198, "learning_rate": 9.061715726575825e-05, "loss": 0.1098, "step": 1367 }, { "epoch": 2.42, "grad_norm": 0.19781382381916046, "learning_rate": 9.008265376362079e-05, "loss": 0.0971, "step": 1368 }, { "epoch": 2.42, "grad_norm": 0.06422741711139679, "learning_rate": 8.954957519993401e-05, "loss": 0.116, "step": 1369 }, { "epoch": 2.42, "grad_norm": 0.04665152728557587, "learning_rate": 8.901792342776438e-05, "loss": 0.102, "step": 1370 }, { "epoch": 2.43, "grad_norm": 0.23038771748542786, "learning_rate": 8.848770029521874e-05, "loss": 0.1434, "step": 1371 }, { "epoch": 2.43, "grad_norm": 0.06798720359802246, "learning_rate": 8.795890764543818e-05, "loss": 0.1251, "step": 1372 }, { "epoch": 2.43, "grad_norm": 0.11675478518009186, "learning_rate": 8.74315473165902e-05, "loss": 0.1236, "step": 1373 }, { "epoch": 2.43, "grad_norm": 0.0576016791164875, "learning_rate": 8.690562114186423e-05, "loss": 0.1292, "step": 1374 }, { "epoch": 2.43, "grad_norm": 0.060148756951093674, "learning_rate": 8.638113094946381e-05, "loss": 0.0988, "step": 1375 }, { "epoch": 2.43, "grad_norm": 0.18434777855873108, "learning_rate": 8.585807856260108e-05, "loss": 0.1204, "step": 1376 }, { "epoch": 2.44, "grad_norm": 0.20784330368041992, "learning_rate": 8.533646579949034e-05, "loss": 0.1306, "step": 1377 }, { "epoch": 2.44, "grad_norm": 0.14157481491565704, "learning_rate": 8.481629447334066e-05, "loss": 0.1467, "step": 1378 }, { "epoch": 2.44, "grad_norm": 0.06923433393239975, "learning_rate": 8.429756639235136e-05, "loss": 0.1189, "step": 1379 }, { "epoch": 2.44, "grad_norm": 0.24702174961566925, "learning_rate": 8.37802833597045e-05, "loss": 0.1472, "step": 1380 }, { "epoch": 2.44, "grad_norm": 0.0554979182779789, "learning_rate": 8.326444717355874e-05, "loss": 0.1087, "step": 1381 }, { "epoch": 2.44, "grad_norm": 0.11092767864465714, "learning_rate": 8.275005962704346e-05, "loss": 0.1191, "step": 1382 }, { "epoch": 2.45, "grad_norm": 0.09958731383085251, "learning_rate": 8.223712250825216e-05, "loss": 0.1345, "step": 1383 }, { "epoch": 2.45, "grad_norm": 0.19082792103290558, "learning_rate": 8.172563760023665e-05, "loss": 0.1056, "step": 1384 }, { "epoch": 2.45, "grad_norm": 0.06456390023231506, "learning_rate": 8.121560668100064e-05, "loss": 0.1055, "step": 1385 }, { "epoch": 2.45, "grad_norm": 0.10609104484319687, "learning_rate": 8.070703152349334e-05, "loss": 0.1202, "step": 1386 }, { "epoch": 2.45, "grad_norm": 0.13664673268795013, "learning_rate": 8.019991389560349e-05, "loss": 0.1486, "step": 1387 }, { "epoch": 2.46, "grad_norm": 0.24724584817886353, "learning_rate": 7.969425556015325e-05, "loss": 0.1453, "step": 1388 }, { "epoch": 2.46, "grad_norm": 0.07616293430328369, "learning_rate": 7.919005827489228e-05, "loss": 0.1203, "step": 1389 }, { "epoch": 2.46, "grad_norm": 0.10011781007051468, "learning_rate": 7.868732379249122e-05, "loss": 0.0969, "step": 1390 }, { "epoch": 2.46, "grad_norm": 0.19106784462928772, "learning_rate": 7.818605386053573e-05, "loss": 0.1426, "step": 1391 }, { "epoch": 2.46, "grad_norm": 0.17057418823242188, "learning_rate": 7.768625022152037e-05, "loss": 0.1412, "step": 1392 }, { "epoch": 2.46, "grad_norm": 0.10925137251615524, "learning_rate": 7.718791461284303e-05, "loss": 0.1177, "step": 1393 }, { "epoch": 2.47, "grad_norm": 0.16444699466228485, "learning_rate": 7.669104876679795e-05, "loss": 0.1238, "step": 1394 }, { "epoch": 2.47, "grad_norm": 0.09140460193157196, "learning_rate": 7.619565441057075e-05, "loss": 0.127, "step": 1395 }, { "epoch": 2.47, "grad_norm": 0.2647579312324524, "learning_rate": 7.570173326623153e-05, "loss": 0.1404, "step": 1396 }, { "epoch": 2.47, "grad_norm": 0.11721226572990417, "learning_rate": 7.520928705072938e-05, "loss": 0.1432, "step": 1397 }, { "epoch": 2.47, "grad_norm": 0.08702779561281204, "learning_rate": 7.471831747588653e-05, "loss": 0.0986, "step": 1398 }, { "epoch": 2.48, "grad_norm": 0.09286800026893616, "learning_rate": 7.422882624839178e-05, "loss": 0.1113, "step": 1399 }, { "epoch": 2.48, "grad_norm": 0.10280603915452957, "learning_rate": 7.37408150697953e-05, "loss": 0.1048, "step": 1400 }, { "epoch": 2.48, "grad_norm": 0.20054741203784943, "learning_rate": 7.325428563650222e-05, "loss": 0.1496, "step": 1401 }, { "epoch": 2.48, "grad_norm": 0.21980531513690948, "learning_rate": 7.276923963976667e-05, "loss": 0.1246, "step": 1402 }, { "epoch": 2.48, "grad_norm": 0.16484405100345612, "learning_rate": 7.228567876568665e-05, "loss": 0.1257, "step": 1403 }, { "epoch": 2.48, "grad_norm": 0.219330832362175, "learning_rate": 7.180360469519714e-05, "loss": 0.1329, "step": 1404 }, { "epoch": 2.49, "grad_norm": 0.17407891154289246, "learning_rate": 7.132301910406502e-05, "loss": 0.1585, "step": 1405 }, { "epoch": 2.49, "grad_norm": 0.09703540056943893, "learning_rate": 7.084392366288295e-05, "loss": 0.1161, "step": 1406 }, { "epoch": 2.49, "grad_norm": 0.21986158192157745, "learning_rate": 7.036632003706328e-05, "loss": 0.1408, "step": 1407 }, { "epoch": 2.49, "grad_norm": 0.16698057949543, "learning_rate": 6.989020988683314e-05, "loss": 0.1183, "step": 1408 }, { "epoch": 2.49, "grad_norm": 0.09089485555887222, "learning_rate": 6.941559486722748e-05, "loss": 0.1138, "step": 1409 }, { "epoch": 2.49, "grad_norm": 0.3572080731391907, "learning_rate": 6.894247662808457e-05, "loss": 0.1148, "step": 1410 }, { "epoch": 2.5, "grad_norm": 0.21670278906822205, "learning_rate": 6.847085681403914e-05, "loss": 0.1081, "step": 1411 }, { "epoch": 2.5, "grad_norm": 0.1353166103363037, "learning_rate": 6.800073706451721e-05, "loss": 0.1335, "step": 1412 }, { "epoch": 2.5, "grad_norm": 0.10845667868852615, "learning_rate": 6.753211901373064e-05, "loss": 0.1462, "step": 1413 }, { "epoch": 2.5, "grad_norm": 0.13182678818702698, "learning_rate": 6.706500429067075e-05, "loss": 0.137, "step": 1414 }, { "epoch": 2.5, "grad_norm": 0.11666212975978851, "learning_rate": 6.659939451910341e-05, "loss": 0.1245, "step": 1415 }, { "epoch": 2.51, "grad_norm": 0.19230018556118011, "learning_rate": 6.613529131756285e-05, "loss": 0.1392, "step": 1416 }, { "epoch": 2.51, "grad_norm": 0.12369673699140549, "learning_rate": 6.567269629934613e-05, "loss": 0.152, "step": 1417 }, { "epoch": 2.51, "grad_norm": 0.4312630891799927, "learning_rate": 6.521161107250778e-05, "loss": 0.1539, "step": 1418 }, { "epoch": 2.51, "grad_norm": 0.12075989693403244, "learning_rate": 6.475203723985418e-05, "loss": 0.1429, "step": 1419 }, { "epoch": 2.51, "grad_norm": 0.19320321083068848, "learning_rate": 6.429397639893758e-05, "loss": 0.146, "step": 1420 }, { "epoch": 2.51, "eval_loss": 0.13353538513183594, "eval_runtime": 14.1007, "eval_samples_per_second": 33.828, "eval_steps_per_second": 8.51, "step": 1420 }, { "epoch": 2.51, "grad_norm": 2.2152562141418457, "learning_rate": 6.38374301420509e-05, "loss": 0.1597, "step": 1421 }, { "epoch": 2.52, "grad_norm": 0.04021691530942917, "learning_rate": 6.338240005622209e-05, "loss": 0.1246, "step": 1422 }, { "epoch": 2.52, "grad_norm": 0.06637288630008698, "learning_rate": 6.292888772320882e-05, "loss": 0.1187, "step": 1423 }, { "epoch": 2.52, "grad_norm": 0.16920767724514008, "learning_rate": 6.247689471949291e-05, "loss": 0.1268, "step": 1424 }, { "epoch": 2.52, "grad_norm": 0.22192606329917908, "learning_rate": 6.202642261627411e-05, "loss": 0.1477, "step": 1425 }, { "epoch": 2.52, "grad_norm": 0.08232539147138596, "learning_rate": 6.157747297946608e-05, "loss": 0.1264, "step": 1426 }, { "epoch": 2.52, "grad_norm": 0.08191093057394028, "learning_rate": 6.113004736968953e-05, "loss": 0.1459, "step": 1427 }, { "epoch": 2.53, "grad_norm": 0.12319450080394745, "learning_rate": 6.068414734226774e-05, "loss": 0.105, "step": 1428 }, { "epoch": 2.53, "grad_norm": 0.2650308907032013, "learning_rate": 6.023977444722095e-05, "loss": 0.1232, "step": 1429 }, { "epoch": 2.53, "grad_norm": 0.13269290328025818, "learning_rate": 5.979693022926025e-05, "loss": 0.1118, "step": 1430 }, { "epoch": 2.53, "grad_norm": 0.145817831158638, "learning_rate": 5.935561622778335e-05, "loss": 0.1142, "step": 1431 }, { "epoch": 2.53, "grad_norm": 0.09145599603652954, "learning_rate": 5.891583397686862e-05, "loss": 0.1285, "step": 1432 }, { "epoch": 2.54, "grad_norm": 0.11915401369333267, "learning_rate": 5.8477585005269564e-05, "loss": 0.1172, "step": 1433 }, { "epoch": 2.54, "grad_norm": 0.13061662018299103, "learning_rate": 5.804087083641002e-05, "loss": 0.1401, "step": 1434 }, { "epoch": 2.54, "grad_norm": 0.08747705817222595, "learning_rate": 5.760569298837826e-05, "loss": 0.1315, "step": 1435 }, { "epoch": 2.54, "grad_norm": 0.08562975376844406, "learning_rate": 5.717205297392247e-05, "loss": 0.122, "step": 1436 }, { "epoch": 2.54, "grad_norm": 0.12241534143686295, "learning_rate": 5.673995230044498e-05, "loss": 0.1247, "step": 1437 }, { "epoch": 2.54, "grad_norm": 0.044354431331157684, "learning_rate": 5.6309392469996944e-05, "loss": 0.123, "step": 1438 }, { "epoch": 2.55, "grad_norm": 0.06913258135318756, "learning_rate": 5.5880374979273395e-05, "loss": 0.1275, "step": 1439 }, { "epoch": 2.55, "grad_norm": 0.0650060847401619, "learning_rate": 5.5452901319607894e-05, "loss": 0.1066, "step": 1440 }, { "epoch": 2.55, "grad_norm": 0.14583538472652435, "learning_rate": 5.502697297696746e-05, "loss": 0.157, "step": 1441 }, { "epoch": 2.55, "grad_norm": 0.12845255434513092, "learning_rate": 5.4602591431947514e-05, "loss": 0.1498, "step": 1442 }, { "epoch": 2.55, "grad_norm": 0.08573470264673233, "learning_rate": 5.417975815976628e-05, "loss": 0.1014, "step": 1443 }, { "epoch": 2.55, "grad_norm": 0.21065488457679749, "learning_rate": 5.37584746302599e-05, "loss": 0.168, "step": 1444 }, { "epoch": 2.56, "grad_norm": 0.06427565217018127, "learning_rate": 5.333874230787772e-05, "loss": 0.1166, "step": 1445 }, { "epoch": 2.56, "grad_norm": 0.15018604695796967, "learning_rate": 5.292056265167644e-05, "loss": 0.1257, "step": 1446 }, { "epoch": 2.56, "grad_norm": 0.17715144157409668, "learning_rate": 5.2503937115315816e-05, "loss": 0.1457, "step": 1447 }, { "epoch": 2.56, "grad_norm": 0.19283372163772583, "learning_rate": 5.208886714705291e-05, "loss": 0.1431, "step": 1448 }, { "epoch": 2.56, "grad_norm": 0.1359899789094925, "learning_rate": 5.16753541897374e-05, "loss": 0.1388, "step": 1449 }, { "epoch": 2.57, "grad_norm": 0.09452392160892487, "learning_rate": 5.126339968080695e-05, "loss": 0.1245, "step": 1450 }, { "epoch": 2.57, "grad_norm": 0.043228164315223694, "learning_rate": 5.085300505228124e-05, "loss": 0.1291, "step": 1451 }, { "epoch": 2.57, "grad_norm": 0.04729638248682022, "learning_rate": 5.0444171730758046e-05, "loss": 0.1374, "step": 1452 }, { "epoch": 2.57, "grad_norm": 0.1768418550491333, "learning_rate": 5.00369011374075e-05, "loss": 0.1232, "step": 1453 }, { "epoch": 2.57, "grad_norm": 0.040266744792461395, "learning_rate": 4.9631194687967394e-05, "loss": 0.1337, "step": 1454 }, { "epoch": 2.57, "grad_norm": 0.0584387332201004, "learning_rate": 4.9227053792738616e-05, "loss": 0.151, "step": 1455 }, { "epoch": 2.58, "grad_norm": 0.06620542705059052, "learning_rate": 4.882447985657956e-05, "loss": 0.1306, "step": 1456 }, { "epoch": 2.58, "grad_norm": 0.2833789885044098, "learning_rate": 4.842347427890198e-05, "loss": 0.149, "step": 1457 }, { "epoch": 2.58, "grad_norm": 0.12224625051021576, "learning_rate": 4.8024038453665544e-05, "loss": 0.1403, "step": 1458 }, { "epoch": 2.58, "grad_norm": 0.027846721932291985, "learning_rate": 4.762617376937312e-05, "loss": 0.1287, "step": 1459 }, { "epoch": 2.58, "grad_norm": 0.26311802864074707, "learning_rate": 4.722988160906638e-05, "loss": 0.1335, "step": 1460 }, { "epoch": 2.58, "grad_norm": 0.3691308796405792, "learning_rate": 4.6835163350320176e-05, "loss": 0.145, "step": 1461 }, { "epoch": 2.59, "grad_norm": 0.028389999642968178, "learning_rate": 4.644202036523881e-05, "loss": 0.1248, "step": 1462 }, { "epoch": 2.59, "grad_norm": 0.04470786452293396, "learning_rate": 4.605045402045022e-05, "loss": 0.1381, "step": 1463 }, { "epoch": 2.59, "grad_norm": 0.21815143525600433, "learning_rate": 4.566046567710169e-05, "loss": 0.1258, "step": 1464 }, { "epoch": 2.59, "grad_norm": 0.14745290577411652, "learning_rate": 4.527205669085549e-05, "loss": 0.1403, "step": 1465 }, { "epoch": 2.59, "grad_norm": 0.2701740562915802, "learning_rate": 4.488522841188336e-05, "loss": 0.1228, "step": 1466 }, { "epoch": 2.6, "grad_norm": 0.1660449504852295, "learning_rate": 4.449998218486262e-05, "loss": 0.1529, "step": 1467 }, { "epoch": 2.6, "grad_norm": 0.060599759221076965, "learning_rate": 4.411631934897092e-05, "loss": 0.1211, "step": 1468 }, { "epoch": 2.6, "grad_norm": 0.2801418900489807, "learning_rate": 4.3734241237881666e-05, "loss": 0.137, "step": 1469 }, { "epoch": 2.6, "grad_norm": 0.06915499269962311, "learning_rate": 4.335374917975982e-05, "loss": 0.1358, "step": 1470 }, { "epoch": 2.6, "grad_norm": 0.12740154564380646, "learning_rate": 4.297484449725691e-05, "loss": 0.1347, "step": 1471 }, { "epoch": 2.6, "grad_norm": 0.1669289469718933, "learning_rate": 4.259752850750609e-05, "loss": 0.1301, "step": 1472 }, { "epoch": 2.61, "grad_norm": 0.07437172532081604, "learning_rate": 4.222180252211849e-05, "loss": 0.119, "step": 1473 }, { "epoch": 2.61, "grad_norm": 0.08567313104867935, "learning_rate": 4.184766784717775e-05, "loss": 0.1128, "step": 1474 }, { "epoch": 2.61, "grad_norm": 0.11972495913505554, "learning_rate": 4.147512578323615e-05, "loss": 0.1355, "step": 1475 }, { "epoch": 2.61, "grad_norm": 0.0828404352068901, "learning_rate": 4.110417762530977e-05, "loss": 0.1511, "step": 1476 }, { "epoch": 2.61, "grad_norm": 0.09823042154312134, "learning_rate": 4.073482466287359e-05, "loss": 0.1237, "step": 1477 }, { "epoch": 2.61, "grad_norm": 0.039382707327604294, "learning_rate": 4.036706817985802e-05, "loss": 0.1182, "step": 1478 }, { "epoch": 2.62, "grad_norm": 0.09079232066869736, "learning_rate": 4.0000909454643406e-05, "loss": 0.1207, "step": 1479 }, { "epoch": 2.62, "grad_norm": 0.15438657999038696, "learning_rate": 3.9636349760056425e-05, "loss": 0.1229, "step": 1480 }, { "epoch": 2.62, "grad_norm": 0.09138775616884232, "learning_rate": 3.927339036336486e-05, "loss": 0.1291, "step": 1481 }, { "epoch": 2.62, "grad_norm": 0.06647726148366928, "learning_rate": 3.8912032526273846e-05, "loss": 0.13, "step": 1482 }, { "epoch": 2.62, "grad_norm": 0.13314370810985565, "learning_rate": 3.855227750492118e-05, "loss": 0.1367, "step": 1483 }, { "epoch": 2.63, "grad_norm": 0.09128770977258682, "learning_rate": 3.819412654987314e-05, "loss": 0.1164, "step": 1484 }, { "epoch": 2.63, "grad_norm": 0.03929639607667923, "learning_rate": 3.783758090611983e-05, "loss": 0.1463, "step": 1485 }, { "epoch": 2.63, "grad_norm": 0.06576254218816757, "learning_rate": 3.748264181307109e-05, "loss": 0.1158, "step": 1486 }, { "epoch": 2.63, "grad_norm": 0.1495221108198166, "learning_rate": 3.712931050455204e-05, "loss": 0.1234, "step": 1487 }, { "epoch": 2.63, "grad_norm": 0.1673547774553299, "learning_rate": 3.6777588208799116e-05, "loss": 0.1176, "step": 1488 }, { "epoch": 2.63, "grad_norm": 0.14675819873809814, "learning_rate": 3.6427476148455484e-05, "loss": 0.1317, "step": 1489 }, { "epoch": 2.64, "grad_norm": 0.07739079743623734, "learning_rate": 3.607897554056672e-05, "loss": 0.1235, "step": 1490 }, { "epoch": 2.64, "grad_norm": 0.048171430826187134, "learning_rate": 3.5732087596576866e-05, "loss": 0.1408, "step": 1491 }, { "epoch": 2.64, "grad_norm": 0.04192957654595375, "learning_rate": 3.538681352232403e-05, "loss": 0.1361, "step": 1492 }, { "epoch": 2.64, "grad_norm": 0.0708787590265274, "learning_rate": 3.50431545180363e-05, "loss": 0.1345, "step": 1493 }, { "epoch": 2.64, "grad_norm": 0.047831833362579346, "learning_rate": 3.470111177832758e-05, "loss": 0.1179, "step": 1494 }, { "epoch": 2.64, "grad_norm": 0.07011017948389053, "learning_rate": 3.436068649219326e-05, "loss": 0.1206, "step": 1495 }, { "epoch": 2.65, "grad_norm": 0.20468172430992126, "learning_rate": 3.402187984300614e-05, "loss": 0.1155, "step": 1496 }, { "epoch": 2.65, "grad_norm": 0.5008040070533752, "learning_rate": 3.368469300851262e-05, "loss": 0.1518, "step": 1497 }, { "epoch": 2.65, "grad_norm": 0.12470618635416031, "learning_rate": 3.334912716082811e-05, "loss": 0.1476, "step": 1498 }, { "epoch": 2.65, "grad_norm": 0.04671414569020271, "learning_rate": 3.30151834664334e-05, "loss": 0.1516, "step": 1499 }, { "epoch": 2.65, "grad_norm": 0.06154852360486984, "learning_rate": 3.2682863086170414e-05, "loss": 0.1419, "step": 1500 }, { "epoch": 2.66, "grad_norm": 0.13698704540729523, "learning_rate": 3.235216717523787e-05, "loss": 0.14, "step": 1501 }, { "epoch": 2.66, "grad_norm": 0.25294530391693115, "learning_rate": 3.2023096883188e-05, "loss": 0.1392, "step": 1502 }, { "epoch": 2.66, "grad_norm": 0.0433129258453846, "learning_rate": 3.169565335392183e-05, "loss": 0.1203, "step": 1503 }, { "epoch": 2.66, "grad_norm": 0.10114753991365433, "learning_rate": 3.136983772568569e-05, "loss": 0.1179, "step": 1504 }, { "epoch": 2.66, "grad_norm": 0.058237019926309586, "learning_rate": 3.104565113106689e-05, "loss": 0.1491, "step": 1505 }, { "epoch": 2.66, "grad_norm": 0.07598286867141724, "learning_rate": 3.0723094696990027e-05, "loss": 0.1117, "step": 1506 }, { "epoch": 2.67, "grad_norm": 0.13283759355545044, "learning_rate": 3.040216954471309e-05, "loss": 0.1409, "step": 1507 }, { "epoch": 2.67, "grad_norm": 0.1821421980857849, "learning_rate": 3.0082876789823245e-05, "loss": 0.1257, "step": 1508 }, { "epoch": 2.67, "grad_norm": 0.04798609018325806, "learning_rate": 2.9765217542233438e-05, "loss": 0.1253, "step": 1509 }, { "epoch": 2.67, "grad_norm": 0.3384339213371277, "learning_rate": 2.9449192906178203e-05, "loss": 0.1432, "step": 1510 }, { "epoch": 2.67, "grad_norm": 0.08849462866783142, "learning_rate": 2.9134803980209734e-05, "loss": 0.1336, "step": 1511 }, { "epoch": 2.67, "grad_norm": 0.11560472846031189, "learning_rate": 2.88220518571945e-05, "loss": 0.11, "step": 1512 }, { "epoch": 2.68, "grad_norm": 0.2896404266357422, "learning_rate": 2.8510937624308954e-05, "loss": 0.163, "step": 1513 }, { "epoch": 2.68, "grad_norm": 0.15655553340911865, "learning_rate": 2.8201462363036112e-05, "loss": 0.1396, "step": 1514 }, { "epoch": 2.68, "grad_norm": 0.05299900099635124, "learning_rate": 2.7893627149161717e-05, "loss": 0.115, "step": 1515 }, { "epoch": 2.68, "grad_norm": 0.08437127619981766, "learning_rate": 2.7587433052770115e-05, "loss": 0.1149, "step": 1516 }, { "epoch": 2.68, "grad_norm": 0.18013562262058258, "learning_rate": 2.72828811382414e-05, "loss": 0.1164, "step": 1517 }, { "epoch": 2.69, "grad_norm": 0.09165755659341812, "learning_rate": 2.6979972464246604e-05, "loss": 0.1474, "step": 1518 }, { "epoch": 2.69, "grad_norm": 0.16448521614074707, "learning_rate": 2.667870808374506e-05, "loss": 0.1372, "step": 1519 }, { "epoch": 2.69, "grad_norm": 0.06156858801841736, "learning_rate": 2.6379089043980064e-05, "loss": 0.1421, "step": 1520 }, { "epoch": 2.69, "grad_norm": 0.13896550238132477, "learning_rate": 2.6081116386475313e-05, "loss": 0.1283, "step": 1521 }, { "epoch": 2.69, "grad_norm": 0.12410745024681091, "learning_rate": 2.5784791147031638e-05, "loss": 0.1273, "step": 1522 }, { "epoch": 2.69, "grad_norm": 0.0802113488316536, "learning_rate": 2.5490114355723294e-05, "loss": 0.1134, "step": 1523 }, { "epoch": 2.7, "grad_norm": 0.22037829458713531, "learning_rate": 2.5197087036893774e-05, "loss": 0.1457, "step": 1524 }, { "epoch": 2.7, "grad_norm": 0.09046395123004913, "learning_rate": 2.490571020915322e-05, "loss": 0.1279, "step": 1525 }, { "epoch": 2.7, "grad_norm": 0.08439705520868301, "learning_rate": 2.4615984885374143e-05, "loss": 0.0983, "step": 1526 }, { "epoch": 2.7, "grad_norm": 0.311358243227005, "learning_rate": 2.4327912072688306e-05, "loss": 0.1276, "step": 1527 }, { "epoch": 2.7, "grad_norm": 0.18210773169994354, "learning_rate": 2.404149277248313e-05, "loss": 0.1321, "step": 1528 }, { "epoch": 2.7, "grad_norm": 0.17474836111068726, "learning_rate": 2.3756727980397742e-05, "loss": 0.1602, "step": 1529 }, { "epoch": 2.71, "grad_norm": 0.040003515779972076, "learning_rate": 2.3473618686320474e-05, "loss": 0.1371, "step": 1530 }, { "epoch": 2.71, "grad_norm": 0.25132429599761963, "learning_rate": 2.319216587438455e-05, "loss": 0.1293, "step": 1531 }, { "epoch": 2.71, "grad_norm": 0.07414573431015015, "learning_rate": 2.291237052296513e-05, "loss": 0.1316, "step": 1532 }, { "epoch": 2.71, "grad_norm": 0.050560012459754944, "learning_rate": 2.2634233604675812e-05, "loss": 0.1191, "step": 1533 }, { "epoch": 2.71, "grad_norm": 0.048617489635944366, "learning_rate": 2.2357756086364924e-05, "loss": 0.1106, "step": 1534 }, { "epoch": 2.72, "grad_norm": 0.09126359969377518, "learning_rate": 2.2082938929112838e-05, "loss": 0.1422, "step": 1535 }, { "epoch": 2.72, "grad_norm": 0.09898441284894943, "learning_rate": 2.180978308822812e-05, "loss": 0.1145, "step": 1536 }, { "epoch": 2.72, "grad_norm": 0.2153613418340683, "learning_rate": 2.1538289513244212e-05, "loss": 0.1339, "step": 1537 }, { "epoch": 2.72, "grad_norm": 0.07101116329431534, "learning_rate": 2.126845914791631e-05, "loss": 0.1397, "step": 1538 }, { "epoch": 2.72, "grad_norm": 0.23921048641204834, "learning_rate": 2.100029293021799e-05, "loss": 0.113, "step": 1539 }, { "epoch": 2.72, "grad_norm": 0.16103985905647278, "learning_rate": 2.0733791792338197e-05, "loss": 0.1372, "step": 1540 }, { "epoch": 2.73, "grad_norm": 0.05808824300765991, "learning_rate": 2.046895666067755e-05, "loss": 0.1549, "step": 1541 }, { "epoch": 2.73, "grad_norm": 0.044097110629081726, "learning_rate": 2.0205788455845474e-05, "loss": 0.1337, "step": 1542 }, { "epoch": 2.73, "grad_norm": 0.16746556758880615, "learning_rate": 1.9944288092656858e-05, "loss": 0.1096, "step": 1543 }, { "epoch": 2.73, "grad_norm": 0.3115890920162201, "learning_rate": 1.9684456480128843e-05, "loss": 0.1437, "step": 1544 }, { "epoch": 2.73, "grad_norm": 0.12141338735818863, "learning_rate": 1.942629452147787e-05, "loss": 0.1322, "step": 1545 }, { "epoch": 2.74, "grad_norm": 0.13122229278087616, "learning_rate": 1.9169803114116313e-05, "loss": 0.1057, "step": 1546 }, { "epoch": 2.74, "grad_norm": 0.12674184143543243, "learning_rate": 1.8914983149649513e-05, "loss": 0.1004, "step": 1547 }, { "epoch": 2.74, "grad_norm": 0.18313969671726227, "learning_rate": 1.866183551387235e-05, "loss": 0.1122, "step": 1548 }, { "epoch": 2.74, "grad_norm": 0.12356254458427429, "learning_rate": 1.84103610867668e-05, "loss": 0.1362, "step": 1549 }, { "epoch": 2.74, "grad_norm": 0.09115366637706757, "learning_rate": 1.8160560742498223e-05, "loss": 0.131, "step": 1550 }, { "epoch": 2.74, "grad_norm": 0.03777840733528137, "learning_rate": 1.7912435349412726e-05, "loss": 0.1339, "step": 1551 }, { "epoch": 2.75, "grad_norm": 0.026918258517980576, "learning_rate": 1.7665985770033975e-05, "loss": 0.1414, "step": 1552 }, { "epoch": 2.75, "grad_norm": 0.06117779389023781, "learning_rate": 1.7421212861060133e-05, "loss": 0.1121, "step": 1553 }, { "epoch": 2.75, "grad_norm": 0.05697258189320564, "learning_rate": 1.7178117473361287e-05, "loss": 0.1523, "step": 1554 }, { "epoch": 2.75, "grad_norm": 0.14216284453868866, "learning_rate": 1.693670045197582e-05, "loss": 0.1354, "step": 1555 }, { "epoch": 2.75, "grad_norm": 0.1293654590845108, "learning_rate": 1.669696263610815e-05, "loss": 0.1466, "step": 1556 }, { "epoch": 2.75, "grad_norm": 0.09816037863492966, "learning_rate": 1.6458904859125322e-05, "loss": 0.1344, "step": 1557 }, { "epoch": 2.76, "grad_norm": 0.1260695606470108, "learning_rate": 1.622252794855433e-05, "loss": 0.1464, "step": 1558 }, { "epoch": 2.76, "grad_norm": 0.04835040122270584, "learning_rate": 1.5987832726079343e-05, "loss": 0.1292, "step": 1559 }, { "epoch": 2.76, "grad_norm": 0.1723223179578781, "learning_rate": 1.5754820007538473e-05, "loss": 0.1474, "step": 1560 }, { "epoch": 2.76, "grad_norm": 0.04331492260098457, "learning_rate": 1.5523490602921353e-05, "loss": 0.1513, "step": 1561 }, { "epoch": 2.76, "grad_norm": 0.18255582451820374, "learning_rate": 1.5293845316366185e-05, "loss": 0.1502, "step": 1562 }, { "epoch": 2.76, "eval_loss": 0.13326887786388397, "eval_runtime": 14.1351, "eval_samples_per_second": 33.746, "eval_steps_per_second": 8.489, "step": 1562 }, { "epoch": 2.77, "grad_norm": 0.3445633351802826, "learning_rate": 1.5065884946156682e-05, "loss": 0.1229, "step": 1563 }, { "epoch": 2.77, "grad_norm": 0.0768701359629631, "learning_rate": 1.483961028471975e-05, "loss": 0.1318, "step": 1564 }, { "epoch": 2.77, "grad_norm": 0.09043741226196289, "learning_rate": 1.4615022118622368e-05, "loss": 0.1038, "step": 1565 }, { "epoch": 2.77, "grad_norm": 0.12818877398967743, "learning_rate": 1.4392121228569088e-05, "loss": 0.121, "step": 1566 }, { "epoch": 2.77, "grad_norm": 0.08737614750862122, "learning_rate": 1.4170908389399107e-05, "loss": 0.1169, "step": 1567 }, { "epoch": 2.77, "grad_norm": 0.041610416024923325, "learning_rate": 1.3951384370083697e-05, "loss": 0.139, "step": 1568 }, { "epoch": 2.78, "grad_norm": 0.13382531702518463, "learning_rate": 1.3733549933723666e-05, "loss": 0.1242, "step": 1569 }, { "epoch": 2.78, "grad_norm": 0.14836427569389343, "learning_rate": 1.3517405837546404e-05, "loss": 0.1292, "step": 1570 }, { "epoch": 2.78, "grad_norm": 0.16950459778308868, "learning_rate": 1.3302952832903392e-05, "loss": 0.1338, "step": 1571 }, { "epoch": 2.78, "grad_norm": 0.09254854172468185, "learning_rate": 1.3090191665267814e-05, "loss": 0.1016, "step": 1572 }, { "epoch": 2.78, "grad_norm": 0.04616238549351692, "learning_rate": 1.2879123074231502e-05, "loss": 0.1467, "step": 1573 }, { "epoch": 2.78, "grad_norm": 0.04659713804721832, "learning_rate": 1.2669747793502828e-05, "loss": 0.1246, "step": 1574 }, { "epoch": 2.79, "grad_norm": 0.0829511284828186, "learning_rate": 1.2462066550903816e-05, "loss": 0.153, "step": 1575 }, { "epoch": 2.79, "grad_norm": 0.12231741100549698, "learning_rate": 1.225608006836776e-05, "loss": 0.1125, "step": 1576 }, { "epoch": 2.79, "grad_norm": 0.0418907031416893, "learning_rate": 1.2051789061936713e-05, "loss": 0.1282, "step": 1577 }, { "epoch": 2.79, "grad_norm": 0.03682105615735054, "learning_rate": 1.1849194241759009e-05, "loss": 0.1337, "step": 1578 }, { "epoch": 2.79, "grad_norm": 0.21309414505958557, "learning_rate": 1.1648296312086747e-05, "loss": 0.11, "step": 1579 }, { "epoch": 2.8, "grad_norm": 0.06915529817342758, "learning_rate": 1.1449095971273304e-05, "loss": 0.1595, "step": 1580 }, { "epoch": 2.8, "grad_norm": 0.07684849202632904, "learning_rate": 1.1251593911771052e-05, "loss": 0.1307, "step": 1581 }, { "epoch": 2.8, "grad_norm": 0.07943115383386612, "learning_rate": 1.1055790820128919e-05, "loss": 0.1172, "step": 1582 }, { "epoch": 2.8, "grad_norm": 0.21117864549160004, "learning_rate": 1.0861687376989671e-05, "loss": 0.1199, "step": 1583 }, { "epoch": 2.8, "grad_norm": 0.18909168243408203, "learning_rate": 1.0669284257088185e-05, "loss": 0.1482, "step": 1584 }, { "epoch": 2.8, "grad_norm": 0.027331866323947906, "learning_rate": 1.0478582129248516e-05, "loss": 0.1238, "step": 1585 }, { "epoch": 2.81, "grad_norm": 0.12267359346151352, "learning_rate": 1.0289581656381774e-05, "loss": 0.1284, "step": 1586 }, { "epoch": 2.81, "grad_norm": 0.07290299981832504, "learning_rate": 1.0102283495483977e-05, "loss": 0.1385, "step": 1587 }, { "epoch": 2.81, "grad_norm": 0.10076455026865005, "learning_rate": 9.916688297633647e-06, "loss": 0.1455, "step": 1588 }, { "epoch": 2.81, "grad_norm": 0.2671211361885071, "learning_rate": 9.732796707989377e-06, "loss": 0.16, "step": 1589 }, { "epoch": 2.81, "grad_norm": 0.11416134238243103, "learning_rate": 9.550609365787888e-06, "loss": 0.1575, "step": 1590 }, { "epoch": 2.81, "grad_norm": 0.08602559566497803, "learning_rate": 9.37012690434147e-06, "loss": 0.1292, "step": 1591 }, { "epoch": 2.82, "grad_norm": 0.046941716223955154, "learning_rate": 9.191349951036266e-06, "loss": 0.1017, "step": 1592 }, { "epoch": 2.82, "grad_norm": 0.061204541474580765, "learning_rate": 9.014279127329605e-06, "loss": 0.1385, "step": 1593 }, { "epoch": 2.82, "grad_norm": 0.07410020381212234, "learning_rate": 8.838915048748064e-06, "loss": 0.1474, "step": 1594 }, { "epoch": 2.82, "grad_norm": 0.35706961154937744, "learning_rate": 8.66525832488535e-06, "loss": 0.1596, "step": 1595 }, { "epoch": 2.82, "grad_norm": 0.06470204889774323, "learning_rate": 8.493309559399976e-06, "loss": 0.1628, "step": 1596 }, { "epoch": 2.83, "grad_norm": 0.18308192491531372, "learning_rate": 8.323069350013479e-06, "loss": 0.1176, "step": 1597 }, { "epoch": 2.83, "grad_norm": 0.06461644172668457, "learning_rate": 8.154538288508206e-06, "loss": 0.126, "step": 1598 }, { "epoch": 2.83, "grad_norm": 0.2490461766719818, "learning_rate": 7.987716960725144e-06, "loss": 0.1468, "step": 1599 }, { "epoch": 2.83, "grad_norm": 0.11734851449728012, "learning_rate": 7.822605946561923e-06, "loss": 0.145, "step": 1600 }, { "epoch": 2.83, "grad_norm": 0.039642583578825, "learning_rate": 7.659205819970927e-06, "loss": 0.1268, "step": 1601 }, { "epoch": 2.83, "grad_norm": 0.04151405021548271, "learning_rate": 7.497517148957244e-06, "loss": 0.1301, "step": 1602 }, { "epoch": 2.84, "grad_norm": 0.1388237625360489, "learning_rate": 7.3375404955766665e-06, "loss": 0.1382, "step": 1603 }, { "epoch": 2.84, "grad_norm": 0.26035836338996887, "learning_rate": 7.179276415933633e-06, "loss": 0.1538, "step": 1604 }, { "epoch": 2.84, "grad_norm": 0.22738611698150635, "learning_rate": 7.022725460179458e-06, "loss": 0.1083, "step": 1605 }, { "epoch": 2.84, "grad_norm": 0.2859216630458832, "learning_rate": 6.867888172510439e-06, "loss": 0.1362, "step": 1606 }, { "epoch": 2.84, "grad_norm": 0.22834646701812744, "learning_rate": 6.7147650911658086e-06, "loss": 0.1289, "step": 1607 }, { "epoch": 2.84, "grad_norm": 0.05140649899840355, "learning_rate": 6.5633567484259525e-06, "loss": 0.1191, "step": 1608 }, { "epoch": 2.85, "grad_norm": 0.06531380861997604, "learning_rate": 6.413663670610526e-06, "loss": 0.1122, "step": 1609 }, { "epoch": 2.85, "grad_norm": 0.031297944486141205, "learning_rate": 6.26568637807673e-06, "loss": 0.1274, "step": 1610 }, { "epoch": 2.85, "grad_norm": 0.23136821389198303, "learning_rate": 6.119425385217314e-06, "loss": 0.1178, "step": 1611 }, { "epoch": 2.85, "grad_norm": 0.11101561784744263, "learning_rate": 5.9748812004590255e-06, "loss": 0.1435, "step": 1612 }, { "epoch": 2.85, "grad_norm": 0.12405877560377121, "learning_rate": 5.832054326260605e-06, "loss": 0.1342, "step": 1613 }, { "epoch": 2.86, "grad_norm": 0.02779628150165081, "learning_rate": 5.69094525911118e-06, "loss": 0.1283, "step": 1614 }, { "epoch": 2.86, "grad_norm": 0.1838769018650055, "learning_rate": 5.551554489528432e-06, "loss": 0.1569, "step": 1615 }, { "epoch": 2.86, "grad_norm": 0.06827875226736069, "learning_rate": 5.413882502057155e-06, "loss": 0.1119, "step": 1616 }, { "epoch": 2.86, "grad_norm": 0.07234180718660355, "learning_rate": 5.277929775267143e-06, "loss": 0.1302, "step": 1617 }, { "epoch": 2.86, "grad_norm": 0.23422658443450928, "learning_rate": 5.143696781751972e-06, "loss": 0.1459, "step": 1618 }, { "epoch": 2.86, "grad_norm": 0.10591956228017807, "learning_rate": 5.011183988127055e-06, "loss": 0.1529, "step": 1619 }, { "epoch": 2.87, "grad_norm": 0.04643354192376137, "learning_rate": 4.880391855028088e-06, "loss": 0.1496, "step": 1620 }, { "epoch": 2.87, "grad_norm": 0.2558877766132355, "learning_rate": 4.751320837109552e-06, "loss": 0.1484, "step": 1621 }, { "epoch": 2.87, "grad_norm": 0.14346656203269958, "learning_rate": 4.6239713830429354e-06, "loss": 0.1295, "step": 1622 }, { "epoch": 2.87, "grad_norm": 0.06926041841506958, "learning_rate": 4.498343935515348e-06, "loss": 0.1275, "step": 1623 }, { "epoch": 2.87, "grad_norm": 0.10302523523569107, "learning_rate": 4.374438931228075e-06, "loss": 0.1432, "step": 1624 }, { "epoch": 2.87, "grad_norm": 0.12526661157608032, "learning_rate": 4.252256800894694e-06, "loss": 0.1271, "step": 1625 }, { "epoch": 2.88, "grad_norm": 0.08141893148422241, "learning_rate": 4.131797969239903e-06, "loss": 0.1352, "step": 1626 }, { "epoch": 2.88, "grad_norm": 0.03432145342230797, "learning_rate": 4.013062854998028e-06, "loss": 0.1307, "step": 1627 }, { "epoch": 2.88, "grad_norm": 0.04721994698047638, "learning_rate": 3.896051870911188e-06, "loss": 0.1333, "step": 1628 }, { "epoch": 2.88, "grad_norm": 0.07389276474714279, "learning_rate": 3.7807654237284605e-06, "loss": 0.1245, "step": 1629 }, { "epoch": 2.88, "grad_norm": 0.027808329090476036, "learning_rate": 3.6672039142039425e-06, "loss": 0.1354, "step": 1630 }, { "epoch": 2.89, "grad_norm": 0.18238960206508636, "learning_rate": 3.5553677370957495e-06, "loss": 0.1252, "step": 1631 }, { "epoch": 2.89, "grad_norm": 0.042762529104948044, "learning_rate": 3.445257281164349e-06, "loss": 0.1445, "step": 1632 }, { "epoch": 2.89, "grad_norm": 0.16484946012496948, "learning_rate": 3.3368729291712863e-06, "loss": 0.1459, "step": 1633 }, { "epoch": 2.89, "grad_norm": 0.04074009507894516, "learning_rate": 3.2302150578780165e-06, "loss": 0.1229, "step": 1634 }, { "epoch": 2.89, "grad_norm": 0.03902529180049896, "learning_rate": 3.125284038044407e-06, "loss": 0.1303, "step": 1635 }, { "epoch": 2.89, "grad_norm": 0.0906100794672966, "learning_rate": 3.0220802344275157e-06, "loss": 0.1367, "step": 1636 }, { "epoch": 2.9, "grad_norm": 0.07743262499570847, "learning_rate": 2.9206040057802584e-06, "loss": 0.1423, "step": 1637 }, { "epoch": 2.9, "grad_norm": 0.0563175305724144, "learning_rate": 2.8208557048503556e-06, "loss": 0.122, "step": 1638 }, { "epoch": 2.9, "grad_norm": 0.08874353021383286, "learning_rate": 2.7228356783788876e-06, "loss": 0.1037, "step": 1639 }, { "epoch": 2.9, "grad_norm": 0.05931072309613228, "learning_rate": 2.626544267099129e-06, "loss": 0.1155, "step": 1640 }, { "epoch": 2.9, "grad_norm": 0.03920688480138779, "learning_rate": 2.531981805735606e-06, "loss": 0.1462, "step": 1641 }, { "epoch": 2.9, "grad_norm": 0.09828366339206696, "learning_rate": 2.4391486230024297e-06, "loss": 0.1603, "step": 1642 }, { "epoch": 2.91, "grad_norm": 0.16284847259521484, "learning_rate": 2.3480450416027423e-06, "loss": 0.1222, "step": 1643 }, { "epoch": 2.91, "grad_norm": 0.0588088184595108, "learning_rate": 2.2586713782272172e-06, "loss": 0.1423, "step": 1644 }, { "epoch": 2.91, "grad_norm": 0.21331720054149628, "learning_rate": 2.1710279435530057e-06, "loss": 0.1567, "step": 1645 }, { "epoch": 2.91, "grad_norm": 0.1025921106338501, "learning_rate": 2.0851150422427913e-06, "loss": 0.1297, "step": 1646 }, { "epoch": 2.91, "grad_norm": 0.12087155133485794, "learning_rate": 2.0009329729435146e-06, "loss": 0.1207, "step": 1647 }, { "epoch": 2.92, "grad_norm": 0.12067259848117828, "learning_rate": 1.9184820282855953e-06, "loss": 0.1335, "step": 1648 }, { "epoch": 2.92, "grad_norm": 0.12102551758289337, "learning_rate": 1.8377624948817673e-06, "loss": 0.1483, "step": 1649 }, { "epoch": 2.92, "grad_norm": 0.07149103283882141, "learning_rate": 1.7587746533260785e-06, "loss": 0.1423, "step": 1650 }, { "epoch": 2.92, "grad_norm": 0.19505378603935242, "learning_rate": 1.6815187781928921e-06, "loss": 0.1045, "step": 1651 }, { "epoch": 2.92, "grad_norm": 0.14010155200958252, "learning_rate": 1.6059951380359984e-06, "loss": 0.1282, "step": 1652 }, { "epoch": 2.92, "grad_norm": 0.061695683747529984, "learning_rate": 1.5322039953878374e-06, "loss": 0.1342, "step": 1653 }, { "epoch": 2.93, "grad_norm": 0.24404413998126984, "learning_rate": 1.4601456067580564e-06, "loss": 0.1379, "step": 1654 }, { "epoch": 2.93, "grad_norm": 0.07468795031309128, "learning_rate": 1.3898202226333423e-06, "loss": 0.1219, "step": 1655 }, { "epoch": 2.93, "grad_norm": 0.061813950538635254, "learning_rate": 1.3212280874759231e-06, "loss": 0.1151, "step": 1656 }, { "epoch": 2.93, "grad_norm": 0.1875925213098526, "learning_rate": 1.2543694397230686e-06, "loss": 0.1474, "step": 1657 }, { "epoch": 2.93, "grad_norm": 0.06623123586177826, "learning_rate": 1.1892445117862028e-06, "loss": 0.0881, "step": 1658 }, { "epoch": 2.93, "grad_norm": 0.057737018913030624, "learning_rate": 1.1258535300499583e-06, "loss": 0.1452, "step": 1659 }, { "epoch": 2.94, "grad_norm": 0.03276927024126053, "learning_rate": 1.0641967148716236e-06, "loss": 0.1435, "step": 1660 }, { "epoch": 2.94, "grad_norm": 0.07240304350852966, "learning_rate": 1.004274280580142e-06, "loss": 0.1266, "step": 1661 }, { "epoch": 2.94, "grad_norm": 0.10652832686901093, "learning_rate": 9.460864354755017e-07, "loss": 0.1628, "step": 1662 }, { "epoch": 2.94, "grad_norm": 0.37316522002220154, "learning_rate": 8.896333818280145e-07, "loss": 0.1186, "step": 1663 }, { "epoch": 2.94, "grad_norm": 0.20198196172714233, "learning_rate": 8.349153158774825e-07, "loss": 0.1195, "step": 1664 }, { "epoch": 2.95, "grad_norm": 0.18203914165496826, "learning_rate": 7.8193242783281e-07, "loss": 0.1618, "step": 1665 }, { "epoch": 2.95, "grad_norm": 0.1698591411113739, "learning_rate": 7.306849018708927e-07, "loss": 0.1423, "step": 1666 }, { "epoch": 2.95, "grad_norm": 0.13877595961093903, "learning_rate": 6.811729161363966e-07, "loss": 0.1068, "step": 1667 }, { "epoch": 2.95, "grad_norm": 0.2812526524066925, "learning_rate": 6.333966427409243e-07, "loss": 0.1271, "step": 1668 }, { "epoch": 2.95, "grad_norm": 0.2276443988084793, "learning_rate": 5.873562477624605e-07, "loss": 0.1471, "step": 1669 }, { "epoch": 2.95, "grad_norm": 0.09518153220415115, "learning_rate": 5.430518912448168e-07, "loss": 0.1251, "step": 1670 }, { "epoch": 2.96, "grad_norm": 0.0484699048101902, "learning_rate": 5.004837271970764e-07, "loss": 0.115, "step": 1671 }, { "epoch": 2.96, "grad_norm": 0.09272460639476776, "learning_rate": 4.596519035929281e-07, "loss": 0.1178, "step": 1672 }, { "epoch": 2.96, "grad_norm": 0.1327453851699829, "learning_rate": 4.2055656237038884e-07, "loss": 0.1252, "step": 1673 }, { "epoch": 2.96, "grad_norm": 0.10774451494216919, "learning_rate": 3.83197839431082e-07, "loss": 0.1243, "step": 1674 }, { "epoch": 2.96, "grad_norm": 0.05414144694805145, "learning_rate": 3.475758646400151e-07, "loss": 0.1254, "step": 1675 }, { "epoch": 2.97, "grad_norm": 0.05690954253077507, "learning_rate": 3.1369076182480305e-07, "loss": 0.1227, "step": 1676 }, { "epoch": 2.97, "grad_norm": 0.02989388071000576, "learning_rate": 2.815426487755568e-07, "loss": 0.1144, "step": 1677 }, { "epoch": 2.97, "grad_norm": 0.07636480778455734, "learning_rate": 2.5113163724427293e-07, "loss": 0.1264, "step": 1678 }, { "epoch": 2.97, "grad_norm": 0.14926138520240784, "learning_rate": 2.2245783294444488e-07, "loss": 0.1386, "step": 1679 }, { "epoch": 2.97, "grad_norm": 0.20612908899784088, "learning_rate": 1.9552133555084116e-07, "loss": 0.1107, "step": 1680 }, { "epoch": 2.97, "grad_norm": 0.16899043321609497, "learning_rate": 1.7032223869911656e-07, "loss": 0.1392, "step": 1681 }, { "epoch": 2.98, "grad_norm": 0.03907778859138489, "learning_rate": 1.4686062998525706e-07, "loss": 0.1191, "step": 1682 }, { "epoch": 2.98, "grad_norm": 0.05882280319929123, "learning_rate": 1.2513659096569097e-07, "loss": 0.1223, "step": 1683 }, { "epoch": 2.98, "grad_norm": 0.09647829085588455, "learning_rate": 1.0515019715656716e-07, "loss": 0.1278, "step": 1684 }, { "epoch": 2.98, "grad_norm": 0.15963368117809296, "learning_rate": 8.690151803386614e-08, "loss": 0.1391, "step": 1685 }, { "epoch": 2.98, "grad_norm": 0.13670912384986877, "learning_rate": 7.0390617032845e-08, "loss": 0.1424, "step": 1686 }, { "epoch": 2.98, "grad_norm": 0.1605810821056366, "learning_rate": 5.5617551548148294e-08, "loss": 0.1438, "step": 1687 }, { "epoch": 2.99, "grad_norm": 0.15910910069942474, "learning_rate": 4.258237293325307e-08, "loss": 0.1518, "step": 1688 }, { "epoch": 2.99, "grad_norm": 0.09237740933895111, "learning_rate": 3.1285126500579795e-08, "loss": 0.1212, "step": 1689 }, { "epoch": 2.99, "grad_norm": 0.05058419331908226, "learning_rate": 2.1725851521103845e-08, "loss": 0.1282, "step": 1690 }, { "epoch": 2.99, "grad_norm": 0.03350934758782387, "learning_rate": 1.3904581224410962e-08, "loss": 0.1285, "step": 1691 }, { "epoch": 2.99, "grad_norm": 0.11487533152103424, "learning_rate": 7.821342798530751e-09, "loss": 0.1352, "step": 1692 }, { "epoch": 3.0, "grad_norm": 0.03711562231183052, "learning_rate": 3.4761573897701404e-09, "loss": 0.1145, "step": 1693 }, { "epoch": 3.0, "grad_norm": 0.0804651752114296, "learning_rate": 8.690401026578698e-10, "loss": 0.1144, "step": 1694 }, { "epoch": 3.0, "grad_norm": 0.13091543316841125, "learning_rate": 0.0, "loss": 0.127, "step": 1695 } ], "logging_steps": 1, "max_steps": 1695, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 565, "total_flos": 1.549439947809751e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }