[ { "loss": 30.8234, "grad_norm": 0.5755094885826111, "learning_rate": 0.0009991248796709547, "epoch": 0.0 }, { "loss": 24.1229, "grad_norm": 0.2920963764190674, "learning_rate": 0.0009982497593419095, "epoch": 0.01 }, { "loss": 22.7986, "grad_norm": 0.5106011629104614, "learning_rate": 0.0009973746390128642, "epoch": 0.01 }, { "loss": 21.4924, "grad_norm": 0.9322375059127808, "learning_rate": 0.000996499518683819, "epoch": 0.01 }, { "loss": 20.7911, "grad_norm": 0.8529098629951477, "learning_rate": 0.0009956243983547737, "epoch": 0.01 }, { "loss": 19.338, "grad_norm": 0.776152491569519, "learning_rate": 0.0009947492780257286, "epoch": 0.02 }, { "loss": 19.0175, "grad_norm": 2.11796498298645, "learning_rate": 0.0009938741576966832, "epoch": 0.02 }, { "loss": 18.2997, "grad_norm": 1.3791886568069458, "learning_rate": 0.0009929990373676381, "epoch": 0.02 }, { "loss": 17.2791, "grad_norm": 1.3849037885665894, "learning_rate": 0.0009921239170385928, "epoch": 0.02 }, { "loss": 17.3609, "grad_norm": 1.1861941814422607, "learning_rate": 0.0009912487967095476, "epoch": 0.03 }, { "loss": 17.1215, "grad_norm": 1.494122862815857, "learning_rate": 0.0009903736763805023, "epoch": 0.03 }, { "loss": 16.3944, "grad_norm": 1.5872834920883179, "learning_rate": 0.0009894985560514572, "epoch": 0.03 }, { "loss": 16.0054, "grad_norm": 1.2658979892730713, "learning_rate": 0.0009886234357224118, "epoch": 0.03 }, { "loss": 15.5523, "grad_norm": 0.8640480041503906, "learning_rate": 0.0009877483153933667, "epoch": 0.04 }, { "loss": 16.2465, "grad_norm": 0.8946548700332642, "learning_rate": 0.0009868731950643213, "epoch": 0.04 }, { "loss": 15.0235, "grad_norm": 0.9279372692108154, "learning_rate": 0.0009859980747352762, "epoch": 0.04 }, { "loss": 15.7517, "grad_norm": 0.8807494044303894, "learning_rate": 0.0009851229544062309, "epoch": 0.04 }, { "loss": 14.6884, "grad_norm": 0.683822751045227, "learning_rate": 0.0009842478340771857, "epoch": 0.05 }, { "loss": 14.0949, "grad_norm": 1.1334095001220703, "learning_rate": 0.0009833727137481404, "epoch": 0.05 }, { "loss": 14.3378, "grad_norm": 1.1247657537460327, "learning_rate": 0.0009824975934190953, "epoch": 0.05 }, { "loss": 13.7597, "grad_norm": 0.9332773685455322, "learning_rate": 0.00098162247309005, "epoch": 0.06 }, { "loss": 14.5567, "grad_norm": 0.8742538690567017, "learning_rate": 0.0009807473527610048, "epoch": 0.06 }, { "loss": 14.0188, "grad_norm": 1.5592143535614014, "learning_rate": 0.0009798722324319594, "epoch": 0.06 }, { "loss": 13.9401, "grad_norm": 0.9473065733909607, "learning_rate": 0.0009789971121029143, "epoch": 0.06 }, { "loss": 13.5177, "grad_norm": 0.5469663143157959, "learning_rate": 0.000978121991773869, "epoch": 0.07 }, { "loss": 13.513, "grad_norm": 1.7497597932815552, "learning_rate": 0.0009772468714448236, "epoch": 0.07 }, { "loss": 13.6579, "grad_norm": 0.7552927136421204, "learning_rate": 0.0009763717511157785, "epoch": 0.07 }, { "loss": 13.0899, "grad_norm": 0.5602779984474182, "learning_rate": 0.0009754966307867332, "epoch": 0.07 }, { "loss": 13.8637, "grad_norm": 0.6577705144882202, "learning_rate": 0.000974621510457688, "epoch": 0.08 }, { "loss": 14.2909, "grad_norm": 1.0710817575454712, "learning_rate": 0.0009737463901286428, "epoch": 0.08 }, { "loss": 13.3632, "grad_norm": 0.48803457617759705, "learning_rate": 0.0009728712697995975, "epoch": 0.08 }, { "loss": 13.5002, "grad_norm": 0.9970788359642029, "learning_rate": 0.0009719961494705523, "epoch": 0.08 }, { "loss": 13.6276, "grad_norm": 0.9624769687652588, "learning_rate": 0.000971121029141507, "epoch": 0.09 }, { "loss": 13.7281, "grad_norm": 0.8082631230354309, "learning_rate": 0.0009702459088124618, "epoch": 0.09 }, { "loss": 13.0793, "grad_norm": 0.6732771396636963, "learning_rate": 0.0009693707884834166, "epoch": 0.09 }, { "loss": 12.6621, "grad_norm": 0.8451002240180969, "learning_rate": 0.0009684956681543713, "epoch": 0.09 }, { "loss": 13.2374, "grad_norm": 1.1656385660171509, "learning_rate": 0.0009676205478253261, "epoch": 0.1 }, { "loss": 12.7625, "grad_norm": 0.9667061567306519, "learning_rate": 0.0009667454274962808, "epoch": 0.1 }, { "loss": 13.0046, "grad_norm": 0.9311497807502747, "learning_rate": 0.0009658703071672355, "epoch": 0.1 }, { "loss": 12.9037, "grad_norm": 1.1891040802001953, "learning_rate": 0.0009649951868381903, "epoch": 0.1 }, { "loss": 12.6521, "grad_norm": 1.1127817630767822, "learning_rate": 0.000964120066509145, "epoch": 0.11 }, { "loss": 13.2942, "grad_norm": 0.6665758490562439, "learning_rate": 0.0009632449461800998, "epoch": 0.11 }, { "loss": 12.4443, "grad_norm": 0.8878126740455627, "learning_rate": 0.0009623698258510546, "epoch": 0.11 }, { "loss": 13.0001, "grad_norm": 1.5000464916229248, "learning_rate": 0.0009614947055220093, "epoch": 0.12 }, { "loss": 12.2303, "grad_norm": 1.1078687906265259, "learning_rate": 0.0009606195851929641, "epoch": 0.12 }, { "loss": 12.1915, "grad_norm": 0.8044748306274414, "learning_rate": 0.0009597444648639187, "epoch": 0.12 }, { "loss": 12.7246, "grad_norm": 0.9232500195503235, "learning_rate": 0.0009588693445348735, "epoch": 0.12 }, { "loss": 11.9769, "grad_norm": 0.7413458824157715, "learning_rate": 0.0009579942242058283, "epoch": 0.13 }, { "loss": 12.8006, "grad_norm": 1.1132707595825195, "learning_rate": 0.000957119103876783, "epoch": 0.13 }, { "loss": 12.4323, "grad_norm": 0.7814503312110901, "learning_rate": 0.0009562439835477378, "epoch": 0.13 }, { "loss": 12.3482, "grad_norm": 0.8854762315750122, "learning_rate": 0.0009553688632186925, "epoch": 0.13 }, { "loss": 12.5045, "grad_norm": 0.704131007194519, "learning_rate": 0.0009544937428896473, "epoch": 0.14 }, { "loss": 12.1405, "grad_norm": 0.7020297050476074, "learning_rate": 0.0009536186225606021, "epoch": 0.14 }, { "loss": 11.5427, "grad_norm": 0.398807168006897, "learning_rate": 0.0009527435022315568, "epoch": 0.14 }, { "loss": 12.655, "grad_norm": 1.0002299547195435, "learning_rate": 0.0009518683819025116, "epoch": 0.14 }, { "loss": 11.9656, "grad_norm": 0.7870428562164307, "learning_rate": 0.0009509932615734664, "epoch": 0.15 }, { "loss": 12.4639, "grad_norm": 0.9154604077339172, "learning_rate": 0.0009501181412444211, "epoch": 0.15 }, { "loss": 11.6344, "grad_norm": 1.1896569728851318, "learning_rate": 0.0009492430209153759, "epoch": 0.15 }, { "loss": 12.4516, "grad_norm": 0.8169024586677551, "learning_rate": 0.0009483679005863306, "epoch": 0.15 }, { "loss": 12.1848, "grad_norm": 0.8429264426231384, "learning_rate": 0.0009474927802572854, "epoch": 0.16 }, { "loss": 11.2014, "grad_norm": 0.8499436378479004, "learning_rate": 0.0009466176599282402, "epoch": 0.16 }, { "loss": 12.2217, "grad_norm": 0.8969743251800537, "learning_rate": 0.0009457425395991948, "epoch": 0.16 }, { "loss": 11.7729, "grad_norm": 1.0959218740463257, "learning_rate": 0.0009448674192701496, "epoch": 0.17 }, { "loss": 11.6254, "grad_norm": 1.1692876815795898, "learning_rate": 0.0009439922989411043, "epoch": 0.17 }, { "loss": 11.5698, "grad_norm": 1.9476372003555298, "learning_rate": 0.0009431171786120591, "epoch": 0.17 }, { "loss": 11.4321, "grad_norm": 1.1742662191390991, "learning_rate": 0.0009422420582830139, "epoch": 0.17 }, { "loss": 11.3224, "grad_norm": 0.9839737415313721, "learning_rate": 0.0009413669379539686, "epoch": 0.18 }, { "loss": 11.8269, "grad_norm": 0.9094179272651672, "learning_rate": 0.0009404918176249234, "epoch": 0.18 }, { "loss": 11.8652, "grad_norm": 0.9139958620071411, "learning_rate": 0.0009396166972958782, "epoch": 0.18 }, { "loss": 11.5493, "grad_norm": 0.7938945889472961, "learning_rate": 0.0009387415769668329, "epoch": 0.18 }, { "loss": 11.413, "grad_norm": 0.8102487921714783, "learning_rate": 0.0009378664566377877, "epoch": 0.19 }, { "loss": 11.4015, "grad_norm": 0.5892770290374756, "learning_rate": 0.0009369913363087424, "epoch": 0.19 }, { "loss": 10.8455, "grad_norm": 0.7269143462181091, "learning_rate": 0.0009361162159796972, "epoch": 0.19 }, { "loss": 11.5612, "grad_norm": 0.8169882893562317, "learning_rate": 0.000935241095650652, "epoch": 0.19 }, { "loss": 10.545, "grad_norm": 0.8424365520477295, "learning_rate": 0.0009343659753216067, "epoch": 0.2 }, { "loss": 10.8486, "grad_norm": 0.855518102645874, "learning_rate": 0.0009334908549925615, "epoch": 0.2 }, { "loss": 10.3733, "grad_norm": 1.1463903188705444, "learning_rate": 0.0009326157346635162, "epoch": 0.2 }, { "loss": 10.794, "grad_norm": 0.7493767142295837, "learning_rate": 0.000931740614334471, "epoch": 0.2 }, { "loss": 10.5943, "grad_norm": 0.8767346739768982, "learning_rate": 0.0009308654940054258, "epoch": 0.21 }, { "loss": 11.4169, "grad_norm": 1.0650781393051147, "learning_rate": 0.0009299903736763805, "epoch": 0.21 }, { "loss": 10.8176, "grad_norm": 0.8954362869262695, "learning_rate": 0.0009291152533473353, "epoch": 0.21 }, { "loss": 10.9644, "grad_norm": 0.697245180606842, "learning_rate": 0.0009282401330182901, "epoch": 0.22 }, { "loss": 11.0427, "grad_norm": 1.5471469163894653, "learning_rate": 0.0009273650126892448, "epoch": 0.22 }, { "loss": 10.8293, "grad_norm": 0.7173879146575928, "learning_rate": 0.0009264898923601996, "epoch": 0.22 }, { "loss": 10.744, "grad_norm": 1.1271495819091797, "learning_rate": 0.0009256147720311543, "epoch": 0.22 }, { "loss": 10.3733, "grad_norm": 0.7106486558914185, "learning_rate": 0.0009247396517021091, "epoch": 0.23 }, { "loss": 10.9536, "grad_norm": 1.1200592517852783, "learning_rate": 0.0009238645313730638, "epoch": 0.23 }, { "loss": 10.4749, "grad_norm": 1.0028458833694458, "learning_rate": 0.0009229894110440185, "epoch": 0.23 }, { "loss": 11.4667, "grad_norm": 1.187585711479187, "learning_rate": 0.0009221142907149733, "epoch": 0.23 }, { "loss": 10.3349, "grad_norm": 0.8691514134407043, "learning_rate": 0.000921239170385928, "epoch": 0.24 }, { "loss": 10.6188, "grad_norm": 0.8789599537849426, "learning_rate": 0.0009203640500568828, "epoch": 0.24 }, { "loss": 10.454, "grad_norm": 0.8376362919807434, "learning_rate": 0.0009194889297278376, "epoch": 0.24 }, { "loss": 10.2419, "grad_norm": 1.0760575532913208, "learning_rate": 0.0009186138093987923, "epoch": 0.24 }, { "loss": 10.8593, "grad_norm": 0.709028422832489, "learning_rate": 0.0009177386890697471, "epoch": 0.25 }, { "loss": 11.073, "grad_norm": 1.0934019088745117, "learning_rate": 0.0009168635687407019, "epoch": 0.25 }, { "loss": 10.5596, "grad_norm": 0.7833492159843445, "learning_rate": 0.0009159884484116566, "epoch": 0.25 }, { "loss": 11.2079, "grad_norm": 0.8762934803962708, "learning_rate": 0.0009151133280826114, "epoch": 0.25 }, { "loss": 11.2229, "grad_norm": 0.8059395551681519, "learning_rate": 0.0009142382077535661, "epoch": 0.26 }, { "loss": 10.8706, "grad_norm": 1.0892099142074585, "learning_rate": 0.0009133630874245209, "epoch": 0.26 }, { "loss": 10.9983, "grad_norm": 0.7471132278442383, "learning_rate": 0.0009124879670954757, "epoch": 0.26 }, { "loss": 11.4291, "grad_norm": 0.9766479730606079, "learning_rate": 0.0009116128467664304, "epoch": 0.27 }, { "loss": 10.5895, "grad_norm": 0.7469794154167175, "learning_rate": 0.0009107377264373852, "epoch": 0.27 }, { "loss": 9.9826, "grad_norm": 0.9510082602500916, "learning_rate": 0.00090986260610834, "epoch": 0.27 }, { "loss": 10.1785, "grad_norm": 0.8061089515686035, "learning_rate": 0.0009089874857792947, "epoch": 0.27 }, { "loss": 10.5502, "grad_norm": 0.7467952966690063, "learning_rate": 0.0009081123654502495, "epoch": 0.28 }, { "loss": 10.4848, "grad_norm": 0.9167515635490417, "learning_rate": 0.0009072372451212042, "epoch": 0.28 }, { "loss": 10.7841, "grad_norm": 1.0157630443572998, "learning_rate": 0.000906362124792159, "epoch": 0.28 }, { "loss": 10.6985, "grad_norm": 0.8764671087265015, "learning_rate": 0.0009054870044631138, "epoch": 0.28 }, { "loss": 10.4706, "grad_norm": 0.7716103196144104, "learning_rate": 0.0009046118841340685, "epoch": 0.29 }, { "loss": 10.4371, "grad_norm": 0.83449387550354, "learning_rate": 0.0009037367638050233, "epoch": 0.29 }, { "loss": 10.2414, "grad_norm": 0.785839855670929, "learning_rate": 0.000902861643475978, "epoch": 0.29 }, { "loss": 10.0213, "grad_norm": 0.7405595183372498, "learning_rate": 0.0009019865231469327, "epoch": 0.29 }, { "loss": 10.2501, "grad_norm": 0.929263710975647, "learning_rate": 0.0009011114028178875, "epoch": 0.3 }, { "loss": 10.6749, "grad_norm": 0.9185034036636353, "learning_rate": 0.0009002362824888422, "epoch": 0.3 }, { "loss": 10.4313, "grad_norm": 0.7888991832733154, "learning_rate": 0.000899361162159797, "epoch": 0.3 }, { "loss": 10.4389, "grad_norm": 0.9736090302467346, "learning_rate": 0.0008984860418307517, "epoch": 0.3 }, { "loss": 9.9148, "grad_norm": 0.7677895426750183, "learning_rate": 0.0008976109215017065, "epoch": 0.31 }, { "loss": 9.7635, "grad_norm": 0.9090219736099243, "learning_rate": 0.0008967358011726613, "epoch": 0.31 }, { "loss": 10.0211, "grad_norm": 0.7184523344039917, "learning_rate": 0.000895860680843616, "epoch": 0.31 }, { "loss": 9.9932, "grad_norm": 1.0859735012054443, "learning_rate": 0.0008949855605145708, "epoch": 0.31 }, { "loss": 10.2804, "grad_norm": 1.0252892971038818, "learning_rate": 0.0008941104401855256, "epoch": 0.32 }, { "loss": 10.0543, "grad_norm": 1.1707403659820557, "learning_rate": 0.0008932353198564803, "epoch": 0.32 }, { "loss": 10.6658, "grad_norm": 0.6616178750991821, "learning_rate": 0.0008923601995274351, "epoch": 0.32 }, { "loss": 9.8623, "grad_norm": 1.9947571754455566, "learning_rate": 0.0008914850791983898, "epoch": 0.33 }, { "loss": 10.1607, "grad_norm": 1.3363871574401855, "learning_rate": 0.0008906099588693446, "epoch": 0.33 }, { "loss": 10.1063, "grad_norm": 1.0039112567901611, "learning_rate": 0.0008897348385402994, "epoch": 0.33 }, { "loss": 9.7059, "grad_norm": 1.0225836038589478, "learning_rate": 0.0008888597182112541, "epoch": 0.33 }, { "loss": 10.2506, "grad_norm": 1.1005779504776, "learning_rate": 0.0008879845978822089, "epoch": 0.34 }, { "loss": 10.3011, "grad_norm": 1.1654433012008667, "learning_rate": 0.0008871094775531636, "epoch": 0.34 }, { "loss": 10.088, "grad_norm": 0.9155218601226807, "learning_rate": 0.0008862343572241184, "epoch": 0.34 }, { "loss": 9.8835, "grad_norm": 1.2090198993682861, "learning_rate": 0.0008853592368950732, "epoch": 0.34 }, { "loss": 9.6644, "grad_norm": 1.5198620557785034, "learning_rate": 0.0008844841165660279, "epoch": 0.35 }, { "loss": 9.6799, "grad_norm": 1.0043960809707642, "learning_rate": 0.0008836089962369827, "epoch": 0.35 }, { "loss": 10.0658, "grad_norm": 1.0404608249664307, "learning_rate": 0.0008827338759079375, "epoch": 0.35 }, { "loss": 9.9551, "grad_norm": 1.0412163734436035, "learning_rate": 0.0008818587555788922, "epoch": 0.35 }, { "loss": 9.4082, "grad_norm": 0.9032560586929321, "learning_rate": 0.000880983635249847, "epoch": 0.36 }, { "loss": 10.2566, "grad_norm": 1.2763034105300903, "learning_rate": 0.0008801085149208016, "epoch": 0.36 }, { "loss": 9.8585, "grad_norm": 0.8143719434738159, "learning_rate": 0.0008792333945917563, "epoch": 0.36 }, { "loss": 9.5974, "grad_norm": 1.3916654586791992, "learning_rate": 0.000878358274262711, "epoch": 0.36 }, { "loss": 10.611, "grad_norm": 1.2270894050598145, "learning_rate": 0.0008774831539336658, "epoch": 0.37 }, { "loss": 9.4489, "grad_norm": 1.339573621749878, "learning_rate": 0.0008766080336046206, "epoch": 0.37 }, { "loss": 9.769, "grad_norm": 1.023978352546692, "learning_rate": 0.0008757329132755753, "epoch": 0.37 }, { "loss": 9.7854, "grad_norm": 1.1513617038726807, "learning_rate": 0.0008748577929465301, "epoch": 0.38 }, { "loss": 9.4378, "grad_norm": 0.9918627142906189, "learning_rate": 0.0008739826726174849, "epoch": 0.38 }, { "loss": 9.6902, "grad_norm": 0.9365573525428772, "learning_rate": 0.0008731075522884396, "epoch": 0.38 }, { "loss": 9.5533, "grad_norm": 1.1697934865951538, "learning_rate": 0.0008722324319593944, "epoch": 0.38 }, { "loss": 9.5204, "grad_norm": 1.2257342338562012, "learning_rate": 0.0008713573116303491, "epoch": 0.39 }, { "loss": 9.636, "grad_norm": 1.0158884525299072, "learning_rate": 0.0008704821913013039, "epoch": 0.39 }, { "loss": 9.8914, "grad_norm": 1.4228135347366333, "learning_rate": 0.0008696070709722587, "epoch": 0.39 }, { "loss": 9.3714, "grad_norm": 1.2829135656356812, "learning_rate": 0.0008687319506432134, "epoch": 0.39 }, { "loss": 9.7498, "grad_norm": 1.2624573707580566, "learning_rate": 0.0008678568303141682, "epoch": 0.4 }, { "loss": 9.8928, "grad_norm": 1.3651659488677979, "learning_rate": 0.000866981709985123, "epoch": 0.4 }, { "loss": 10.3697, "grad_norm": 1.1383252143859863, "learning_rate": 0.0008661065896560777, "epoch": 0.4 }, { "loss": 10.1876, "grad_norm": 1.1688463687896729, "learning_rate": 0.0008652314693270325, "epoch": 0.4 }, { "loss": 9.7974, "grad_norm": 1.1377474069595337, "learning_rate": 0.0008643563489979872, "epoch": 0.41 }, { "loss": 9.5742, "grad_norm": 1.0107587575912476, "learning_rate": 0.000863481228668942, "epoch": 0.41 }, { "loss": 9.9821, "grad_norm": 1.3488329648971558, "learning_rate": 0.0008626061083398968, "epoch": 0.41 }, { "loss": 9.3107, "grad_norm": 1.0305010080337524, "learning_rate": 0.0008617309880108515, "epoch": 0.41 }, { "loss": 9.3456, "grad_norm": 0.8658286929130554, "learning_rate": 0.0008608558676818063, "epoch": 0.42 }, { "loss": 9.3709, "grad_norm": 1.1033709049224854, "learning_rate": 0.000859980747352761, "epoch": 0.42 }, { "loss": 9.5077, "grad_norm": 1.1051572561264038, "learning_rate": 0.0008591056270237157, "epoch": 0.42 }, { "loss": 9.1458, "grad_norm": 1.3423538208007812, "learning_rate": 0.0008582305066946705, "epoch": 0.43 }, { "loss": 9.657, "grad_norm": 1.1479153633117676, "learning_rate": 0.0008573553863656252, "epoch": 0.43 }, { "loss": 10.5804, "grad_norm": 1.1615872383117676, "learning_rate": 0.00085648026603658, "epoch": 0.43 }, { "loss": 8.2792, "grad_norm": 1.212221384048462, "learning_rate": 0.0008556051457075347, "epoch": 0.43 }, { "loss": 9.3785, "grad_norm": 1.0849367380142212, "learning_rate": 0.0008547300253784895, "epoch": 0.44 }, { "loss": 9.4097, "grad_norm": 1.119325041770935, "learning_rate": 0.0008538549050494443, "epoch": 0.44 }, { "loss": 9.3308, "grad_norm": 1.3356918096542358, "learning_rate": 0.000852979784720399, "epoch": 0.44 }, { "loss": 9.4548, "grad_norm": 0.9954844117164612, "learning_rate": 0.0008521046643913538, "epoch": 0.44 }, { "loss": 8.9297, "grad_norm": 0.8752724528312683, "learning_rate": 0.0008512295440623086, "epoch": 0.45 }, { "loss": 9.1389, "grad_norm": 1.2811753749847412, "learning_rate": 0.0008503544237332633, "epoch": 0.45 }, { "loss": 9.3155, "grad_norm": 1.253055453300476, "learning_rate": 0.0008494793034042181, "epoch": 0.45 }, { "loss": 9.548, "grad_norm": 1.2081260681152344, "learning_rate": 0.0008486041830751728, "epoch": 0.45 }, { "loss": 9.0236, "grad_norm": 1.3752362728118896, "learning_rate": 0.0008477290627461276, "epoch": 0.46 }, { "loss": 9.0533, "grad_norm": 1.057065725326538, "learning_rate": 0.0008468539424170824, "epoch": 0.46 }, { "loss": 9.0675, "grad_norm": 1.0036309957504272, "learning_rate": 0.0008459788220880371, "epoch": 0.46 }, { "loss": 9.5195, "grad_norm": 1.3881008625030518, "learning_rate": 0.0008451037017589919, "epoch": 0.46 }, { "loss": 9.3519, "grad_norm": 1.4355233907699585, "learning_rate": 0.0008442285814299467, "epoch": 0.47 }, { "loss": 9.6383, "grad_norm": 0.9438649415969849, "learning_rate": 0.0008433534611009014, "epoch": 0.47 }, { "loss": 9.2643, "grad_norm": 0.8599776029586792, "learning_rate": 0.0008424783407718562, "epoch": 0.47 }, { "loss": 8.9869, "grad_norm": 1.1090342998504639, "learning_rate": 0.0008416032204428109, "epoch": 0.48 }, { "loss": 9.2475, "grad_norm": 1.272929310798645, "learning_rate": 0.0008407281001137657, "epoch": 0.48 }, { "loss": 9.5772, "grad_norm": 0.9889743328094482, "learning_rate": 0.0008398529797847205, "epoch": 0.48 }, { "loss": 9.9227, "grad_norm": 1.2748692035675049, "learning_rate": 0.0008389778594556752, "epoch": 0.48 }, { "loss": 9.9915, "grad_norm": 1.4889165163040161, "learning_rate": 0.00083810273912663, "epoch": 0.49 }, { "loss": 9.0012, "grad_norm": 1.2172118425369263, "learning_rate": 0.0008372276187975846, "epoch": 0.49 }, { "loss": 9.2968, "grad_norm": 1.0313849449157715, "learning_rate": 0.0008363524984685394, "epoch": 0.49 }, { "loss": 8.9158, "grad_norm": 1.3325482606887817, "learning_rate": 0.0008354773781394942, "epoch": 0.49 }, { "loss": 9.0097, "grad_norm": 1.5407133102416992, "learning_rate": 0.0008346022578104489, "epoch": 0.5 }, { "loss": 9.0166, "grad_norm": 1.1565685272216797, "learning_rate": 0.0008337271374814037, "epoch": 0.5 }, { "loss": 9.1856, "grad_norm": 1.0405404567718506, "learning_rate": 0.0008328520171523584, "epoch": 0.5 }, { "loss": 9.2405, "grad_norm": 1.465058445930481, "learning_rate": 0.0008319768968233132, "epoch": 0.5 }, { "loss": 8.835, "grad_norm": 0.9321463704109192, "learning_rate": 0.000831101776494268, "epoch": 0.51 }, { "loss": 9.4076, "grad_norm": 1.1780034303665161, "learning_rate": 0.0008302266561652227, "epoch": 0.51 }, { "loss": 9.5994, "grad_norm": 1.488897681236267, "learning_rate": 0.0008293515358361775, "epoch": 0.51 }, { "loss": 8.6378, "grad_norm": 1.0508447885513306, "learning_rate": 0.0008284764155071323, "epoch": 0.51 }, { "loss": 8.7946, "grad_norm": 1.2236040830612183, "learning_rate": 0.000827601295178087, "epoch": 0.52 }, { "loss": 9.4619, "grad_norm": 1.0602221488952637, "learning_rate": 0.0008267261748490418, "epoch": 0.52 }, { "loss": 8.927, "grad_norm": 1.476576328277588, "learning_rate": 0.0008258510545199965, "epoch": 0.52 }, { "loss": 8.766, "grad_norm": 1.2723809480667114, "learning_rate": 0.0008249759341909513, "epoch": 0.52 }, { "loss": 9.1577, "grad_norm": 1.2955093383789062, "learning_rate": 0.0008241008138619061, "epoch": 0.53 }, { "loss": 8.8254, "grad_norm": 1.1421802043914795, "learning_rate": 0.0008232256935328608, "epoch": 0.53 }, { "loss": 9.3559, "grad_norm": 1.2015204429626465, "learning_rate": 0.0008223505732038156, "epoch": 0.53 }, { "loss": 8.7055, "grad_norm": 1.02347993850708, "learning_rate": 0.0008214754528747703, "epoch": 0.54 }, { "loss": 9.1773, "grad_norm": 1.0733789205551147, "learning_rate": 0.0008206003325457251, "epoch": 0.54 }, { "loss": 9.4909, "grad_norm": 1.140329360961914, "learning_rate": 0.0008197252122166799, "epoch": 0.54 }, { "loss": 8.4982, "grad_norm": 0.8933946490287781, "learning_rate": 0.0008188500918876346, "epoch": 0.54 }, { "loss": 9.4497, "grad_norm": 1.3848881721496582, "learning_rate": 0.0008179749715585894, "epoch": 0.55 }, { "loss": 9.5758, "grad_norm": 1.175162672996521, "learning_rate": 0.0008170998512295442, "epoch": 0.55 }, { "loss": 9.5138, "grad_norm": 1.1983882188796997, "learning_rate": 0.0008162247309004989, "epoch": 0.55 }, { "loss": 9.0283, "grad_norm": 0.9055472612380981, "learning_rate": 0.0008153496105714536, "epoch": 0.55 }, { "loss": 9.2822, "grad_norm": 0.8885380029678345, "learning_rate": 0.0008144744902424083, "epoch": 0.56 }, { "loss": 8.9084, "grad_norm": 1.0463942289352417, "learning_rate": 0.0008135993699133631, "epoch": 0.56 }, { "loss": 9.0612, "grad_norm": 1.1517601013183594, "learning_rate": 0.0008127242495843179, "epoch": 0.56 }, { "loss": 9.7954, "grad_norm": 1.6062026023864746, "learning_rate": 0.0008118491292552726, "epoch": 0.56 }, { "loss": 8.823, "grad_norm": 1.079883098602295, "learning_rate": 0.0008109740089262274, "epoch": 0.57 }, { "loss": 8.6287, "grad_norm": 0.8593969345092773, "learning_rate": 0.0008100988885971821, "epoch": 0.57 }, { "loss": 9.046, "grad_norm": 1.5058172941207886, "learning_rate": 0.0008092237682681369, "epoch": 0.57 }, { "loss": 8.4422, "grad_norm": 1.0326484441757202, "learning_rate": 0.0008083486479390917, "epoch": 0.57 }, { "loss": 9.5016, "grad_norm": 0.9177812337875366, "learning_rate": 0.0008074735276100464, "epoch": 0.58 }, { "loss": 8.4734, "grad_norm": 1.1267443895339966, "learning_rate": 0.0008065984072810012, "epoch": 0.58 }, { "loss": 8.5878, "grad_norm": 0.9788813591003418, "learning_rate": 0.000805723286951956, "epoch": 0.58 }, { "loss": 9.1188, "grad_norm": 1.1300309896469116, "learning_rate": 0.0008048481666229107, "epoch": 0.59 }, { "loss": 8.7167, "grad_norm": 0.9951778650283813, "learning_rate": 0.0008039730462938655, "epoch": 0.59 }, { "loss": 9.1088, "grad_norm": 0.9415300488471985, "learning_rate": 0.0008030979259648202, "epoch": 0.59 }, { "loss": 8.4083, "grad_norm": 0.990203320980072, "learning_rate": 0.000802222805635775, "epoch": 0.59 }, { "loss": 8.4926, "grad_norm": 1.0430456399917603, "learning_rate": 0.0008013476853067298, "epoch": 0.6 }, { "loss": 9.3307, "grad_norm": 0.9623116254806519, "learning_rate": 0.0008004725649776845, "epoch": 0.6 }, { "loss": 8.8633, "grad_norm": 1.0354257822036743, "learning_rate": 0.0007995974446486392, "epoch": 0.6 }, { "loss": 8.7932, "grad_norm": 1.1962673664093018, "learning_rate": 0.0007987223243195939, "epoch": 0.6 }, { "loss": 8.4265, "grad_norm": 1.0186195373535156, "learning_rate": 0.0007978472039905487, "epoch": 0.61 }, { "loss": 8.4596, "grad_norm": 1.2448772192001343, "learning_rate": 0.0007969720836615035, "epoch": 0.61 }, { "loss": 9.0019, "grad_norm": 1.11643385887146, "learning_rate": 0.0007960969633324582, "epoch": 0.61 }, { "loss": 8.7469, "grad_norm": 1.9622658491134644, "learning_rate": 0.000795221843003413, "epoch": 0.61 }, { "loss": 8.208, "grad_norm": 0.9547304511070251, "learning_rate": 0.0007943467226743676, "epoch": 0.62 }, { "loss": 8.3751, "grad_norm": 0.8313985466957092, "learning_rate": 0.0007934716023453224, "epoch": 0.62 }, { "loss": 8.6238, "grad_norm": 0.9323874711990356, "learning_rate": 0.0007925964820162772, "epoch": 0.62 }, { "loss": 9.0078, "grad_norm": 1.0662554502487183, "learning_rate": 0.0007917213616872319, "epoch": 0.62 }, { "loss": 8.7407, "grad_norm": 1.197045087814331, "learning_rate": 0.0007908462413581867, "epoch": 0.63 }, { "loss": 8.9698, "grad_norm": 1.0494697093963623, "learning_rate": 0.0007899711210291415, "epoch": 0.63 }, { "loss": 8.56, "grad_norm": 0.9860395789146423, "learning_rate": 0.0007890960007000962, "epoch": 0.63 }, { "loss": 8.624, "grad_norm": 0.8026842474937439, "learning_rate": 0.000788220880371051, "epoch": 0.64 }, { "loss": 9.1911, "grad_norm": 1.0249046087265015, "learning_rate": 0.0007873457600420057, "epoch": 0.64 }, { "loss": 8.552, "grad_norm": 1.3037137985229492, "learning_rate": 0.0007864706397129605, "epoch": 0.64 }, { "loss": 8.6872, "grad_norm": 1.1018158197402954, "learning_rate": 0.0007855955193839153, "epoch": 0.64 }, { "loss": 8.5007, "grad_norm": 0.9974724054336548, "learning_rate": 0.00078472039905487, "epoch": 0.65 }, { "loss": 9.3866, "grad_norm": 1.2537139654159546, "learning_rate": 0.0007838452787258248, "epoch": 0.65 }, { "loss": 8.9869, "grad_norm": 1.2758492231369019, "learning_rate": 0.0007829701583967795, "epoch": 0.65 }, { "loss": 8.266, "grad_norm": 0.9684768915176392, "learning_rate": 0.0007820950380677343, "epoch": 0.65 }, { "loss": 9.0718, "grad_norm": 1.0212547779083252, "learning_rate": 0.0007812199177386891, "epoch": 0.66 }, { "loss": 8.1438, "grad_norm": 1.2493318319320679, "learning_rate": 0.0007803447974096438, "epoch": 0.66 }, { "loss": 8.4132, "grad_norm": 0.8168124556541443, "learning_rate": 0.0007794696770805986, "epoch": 0.66 }, { "loss": 8.4466, "grad_norm": 1.2837003469467163, "learning_rate": 0.0007785945567515534, "epoch": 0.66 }, { "loss": 8.6008, "grad_norm": 1.1589733362197876, "learning_rate": 0.0007777194364225081, "epoch": 0.67 }, { "loss": 8.7002, "grad_norm": 1.036216378211975, "learning_rate": 0.0007768443160934629, "epoch": 0.67 }, { "loss": 8.9616, "grad_norm": 0.9488565921783447, "learning_rate": 0.0007759691957644176, "epoch": 0.67 }, { "loss": 8.9011, "grad_norm": 1.1349655389785767, "learning_rate": 0.0007750940754353724, "epoch": 0.67 }, { "loss": 8.7398, "grad_norm": 1.3466508388519287, "learning_rate": 0.0007742189551063272, "epoch": 0.68 }, { "loss": 8.1787, "grad_norm": 1.1343966722488403, "learning_rate": 0.0007733438347772819, "epoch": 0.68 }, { "loss": 8.4513, "grad_norm": 0.9983484148979187, "learning_rate": 0.0007724687144482366, "epoch": 0.68 }, { "loss": 8.6249, "grad_norm": 1.4816855192184448, "learning_rate": 0.0007715935941191913, "epoch": 0.69 }, { "loss": 8.9094, "grad_norm": 1.0790578126907349, "learning_rate": 0.0007707184737901461, "epoch": 0.69 }, { "loss": 8.0177, "grad_norm": 1.2572119235992432, "learning_rate": 0.0007698433534611009, "epoch": 0.69 }, { "loss": 8.5014, "grad_norm": 1.123079776763916, "learning_rate": 0.0007689682331320556, "epoch": 0.69 }, { "loss": 8.2177, "grad_norm": 0.8789654970169067, "learning_rate": 0.0007680931128030104, "epoch": 0.7 }, { "loss": 8.3753, "grad_norm": 0.9512013792991638, "learning_rate": 0.0007672179924739651, "epoch": 0.7 }, { "loss": 8.5434, "grad_norm": 1.929919719696045, "learning_rate": 0.0007663428721449199, "epoch": 0.7 }, { "loss": 8.5505, "grad_norm": 1.1756147146224976, "learning_rate": 0.0007654677518158747, "epoch": 0.7 }, { "loss": 8.8823, "grad_norm": 1.1833679676055908, "learning_rate": 0.0007645926314868294, "epoch": 0.71 }, { "loss": 8.6715, "grad_norm": 1.4701839685440063, "learning_rate": 0.0007637175111577842, "epoch": 0.71 }, { "loss": 8.7559, "grad_norm": 0.9352959990501404, "learning_rate": 0.0007629299028616435, "epoch": 0.71 }, { "loss": 9.5594, "grad_norm": 1.0391898155212402, "learning_rate": 0.0007620547825325983, "epoch": 0.71 }, { "loss": 8.3431, "grad_norm": 1.0766905546188354, "learning_rate": 0.000761179662203553, "epoch": 0.72 }, { "loss": 8.3928, "grad_norm": 1.10299551486969, "learning_rate": 0.0007603045418745078, "epoch": 0.72 }, { "loss": 8.9913, "grad_norm": 1.1581339836120605, "learning_rate": 0.0007594294215454624, "epoch": 0.72 }, { "loss": 8.5142, "grad_norm": 1.086441993713379, "learning_rate": 0.0007585543012164172, "epoch": 0.72 }, { "loss": 8.7005, "grad_norm": 0.9478667974472046, "learning_rate": 0.000757679180887372, "epoch": 0.73 }, { "loss": 8.608, "grad_norm": 1.0929220914840698, "learning_rate": 0.0007568040605583267, "epoch": 0.73 }, { "loss": 8.1125, "grad_norm": 1.217629313468933, "learning_rate": 0.0007559289402292815, "epoch": 0.73 }, { "loss": 8.4331, "grad_norm": 1.2786823511123657, "learning_rate": 0.0007550538199002362, "epoch": 0.73 }, { "loss": 9.1985, "grad_norm": 1.0184354782104492, "learning_rate": 0.000754178699571191, "epoch": 0.74 }, { "loss": 8.6549, "grad_norm": 0.93660968542099, "learning_rate": 0.0007533035792421458, "epoch": 0.74 }, { "loss": 8.7819, "grad_norm": 1.0092636346817017, "learning_rate": 0.0007524284589131005, "epoch": 0.74 }, { "loss": 8.3759, "grad_norm": 1.2108792066574097, "learning_rate": 0.0007515533385840553, "epoch": 0.75 }, { "loss": 8.4973, "grad_norm": 0.9994498491287231, "learning_rate": 0.00075067821825501, "epoch": 0.75 }, { "loss": 8.3731, "grad_norm": 1.153273344039917, "learning_rate": 0.0007498030979259648, "epoch": 0.75 }, { "loss": 8.4148, "grad_norm": 1.051223874092102, "learning_rate": 0.0007489279775969196, "epoch": 0.75 }, { "loss": 8.6672, "grad_norm": 1.4810237884521484, "learning_rate": 0.0007480528572678743, "epoch": 0.76 }, { "loss": 8.6439, "grad_norm": 1.021606206893921, "learning_rate": 0.0007471777369388291, "epoch": 0.76 }, { "loss": 8.7591, "grad_norm": 0.8680776357650757, "learning_rate": 0.0007463026166097839, "epoch": 0.76 }, { "loss": 9.0187, "grad_norm": 1.0177042484283447, "learning_rate": 0.0007454274962807386, "epoch": 0.76 }, { "loss": 8.9481, "grad_norm": 1.2384392023086548, "learning_rate": 0.0007445523759516934, "epoch": 0.77 }, { "loss": 8.6184, "grad_norm": 1.3748959302902222, "learning_rate": 0.0007436772556226481, "epoch": 0.77 }, { "loss": 8.3906, "grad_norm": 1.042493462562561, "learning_rate": 0.0007428021352936029, "epoch": 0.77 }, { "loss": 9.3308, "grad_norm": 1.0647776126861572, "learning_rate": 0.0007419270149645576, "epoch": 0.77 }, { "loss": 8.332, "grad_norm": 1.2385993003845215, "learning_rate": 0.0007410518946355123, "epoch": 0.78 }, { "loss": 8.3127, "grad_norm": 1.0191227197647095, "learning_rate": 0.0007401767743064671, "epoch": 0.78 }, { "loss": 8.3151, "grad_norm": 0.8735216856002808, "learning_rate": 0.0007393016539774218, "epoch": 0.78 }, { "loss": 8.701, "grad_norm": 1.202993392944336, "learning_rate": 0.0007384265336483766, "epoch": 0.78 }, { "loss": 7.8262, "grad_norm": 0.9682905673980713, "learning_rate": 0.0007375514133193314, "epoch": 0.79 }, { "loss": 8.4729, "grad_norm": 1.2290154695510864, "learning_rate": 0.0007366762929902861, "epoch": 0.79 }, { "loss": 8.9253, "grad_norm": 1.0369175672531128, "learning_rate": 0.0007358011726612409, "epoch": 0.79 }, { "loss": 9.2036, "grad_norm": 1.0748445987701416, "learning_rate": 0.0007349260523321957, "epoch": 0.8 }, { "loss": 8.2364, "grad_norm": 1.147964596748352, "learning_rate": 0.0007340509320031504, "epoch": 0.8 }, { "loss": 9.006, "grad_norm": 1.0363622903823853, "learning_rate": 0.0007331758116741052, "epoch": 0.8 }, { "loss": 8.7969, "grad_norm": 1.2576889991760254, "learning_rate": 0.0007323006913450599, "epoch": 0.8 }, { "loss": 8.4052, "grad_norm": 1.1075588464736938, "learning_rate": 0.0007314255710160147, "epoch": 0.81 }, { "loss": 8.5912, "grad_norm": 1.0697672367095947, "learning_rate": 0.0007305504506869695, "epoch": 0.81 }, { "loss": 8.7837, "grad_norm": 1.0865002870559692, "learning_rate": 0.0007296753303579242, "epoch": 0.81 }, { "loss": 8.0798, "grad_norm": 1.3645957708358765, "learning_rate": 0.000728800210028879, "epoch": 0.81 }, { "loss": 8.2649, "grad_norm": 1.0889688730239868, "learning_rate": 0.0007279250896998337, "epoch": 0.82 }, { "loss": 7.902, "grad_norm": 0.9943633675575256, "learning_rate": 0.0007270499693707885, "epoch": 0.82 }, { "loss": 8.493, "grad_norm": 1.3548861742019653, "learning_rate": 0.0007261748490417433, "epoch": 0.82 }, { "loss": 9.2024, "grad_norm": 1.1603728532791138, "learning_rate": 0.000725299728712698, "epoch": 0.82 }, { "loss": 8.7272, "grad_norm": 1.2872350215911865, "learning_rate": 0.0007244246083836528, "epoch": 0.83 }, { "loss": 8.8292, "grad_norm": 1.0431410074234009, "learning_rate": 0.0007235494880546076, "epoch": 0.83 }, { "loss": 8.0473, "grad_norm": 0.9648978114128113, "learning_rate": 0.0007226743677255623, "epoch": 0.83 }, { "loss": 8.134, "grad_norm": 0.8962783217430115, "learning_rate": 0.0007217992473965171, "epoch": 0.83 }, { "loss": 8.2796, "grad_norm": 0.8879069685935974, "learning_rate": 0.0007209241270674718, "epoch": 0.84 }, { "loss": 8.6275, "grad_norm": 1.0046008825302124, "learning_rate": 0.0007200490067384265, "epoch": 0.84 }, { "loss": 8.2847, "grad_norm": 1.1034067869186401, "learning_rate": 0.0007191738864093813, "epoch": 0.84 }, { "loss": 8.723, "grad_norm": 0.9179050326347351, "learning_rate": 0.000718298766080336, "epoch": 0.85 }, { "loss": 8.2843, "grad_norm": 1.0402296781539917, "learning_rate": 0.0007174236457512908, "epoch": 0.85 }, { "loss": 8.2487, "grad_norm": 1.2751373052597046, "learning_rate": 0.0007165485254222455, "epoch": 0.85 }, { "loss": 8.3491, "grad_norm": 0.8596373200416565, "learning_rate": 0.0007156734050932003, "epoch": 0.85 }, { "loss": 8.4695, "grad_norm": 1.0553058385849, "learning_rate": 0.0007147982847641551, "epoch": 0.86 }, { "loss": 8.74, "grad_norm": 1.0505644083023071, "learning_rate": 0.0007139231644351098, "epoch": 0.86 }, { "loss": 8.3704, "grad_norm": 1.4136569499969482, "learning_rate": 0.0007130480441060646, "epoch": 0.86 }, { "loss": 7.9998, "grad_norm": 0.9397268295288086, "learning_rate": 0.0007121729237770194, "epoch": 0.86 }, { "loss": 8.5978, "grad_norm": 1.1479915380477905, "learning_rate": 0.0007112978034479741, "epoch": 0.87 }, { "loss": 8.6225, "grad_norm": 1.0489866733551025, "learning_rate": 0.0007104226831189289, "epoch": 0.87 }, { "loss": 8.3155, "grad_norm": 0.9371022582054138, "learning_rate": 0.0007095475627898836, "epoch": 0.87 }, { "loss": 8.3844, "grad_norm": 1.1981381177902222, "learning_rate": 0.0007086724424608384, "epoch": 0.87 }, { "loss": 8.5061, "grad_norm": 0.8924277424812317, "learning_rate": 0.0007077973221317932, "epoch": 0.88 }, { "loss": 8.1918, "grad_norm": 1.4077969789505005, "learning_rate": 0.0007069222018027479, "epoch": 0.88 }, { "loss": 8.3377, "grad_norm": 1.1926066875457764, "learning_rate": 0.0007060470814737027, "epoch": 0.88 }, { "loss": 8.4682, "grad_norm": 1.1524171829223633, "learning_rate": 0.0007051719611446574, "epoch": 0.88 }, { "loss": 8.5678, "grad_norm": 1.0660207271575928, "learning_rate": 0.0007042968408156122, "epoch": 0.89 }, { "loss": 7.9908, "grad_norm": 1.1786776781082153, "learning_rate": 0.000703421720486567, "epoch": 0.89 }, { "loss": 9.0339, "grad_norm": 0.9970653057098389, "learning_rate": 0.0007025466001575217, "epoch": 0.89 }, { "loss": 8.6511, "grad_norm": 1.171247124671936, "learning_rate": 0.0007016714798284765, "epoch": 0.9 }, { "loss": 8.0249, "grad_norm": 1.1036537885665894, "learning_rate": 0.0007007963594994313, "epoch": 0.9 }, { "loss": 8.2895, "grad_norm": 1.4363912343978882, "learning_rate": 0.000699921239170386, "epoch": 0.9 }, { "loss": 8.4263, "grad_norm": 1.2977561950683594, "learning_rate": 0.0006990461188413408, "epoch": 0.9 }, { "loss": 8.3236, "grad_norm": 1.2732399702072144, "learning_rate": 0.0006981709985122954, "epoch": 0.91 }, { "loss": 8.0876, "grad_norm": 0.8092446327209473, "learning_rate": 0.0006972958781832502, "epoch": 0.91 }, { "loss": 8.3052, "grad_norm": 1.0607753992080688, "learning_rate": 0.000696420757854205, "epoch": 0.91 }, { "loss": 8.2821, "grad_norm": 1.2833763360977173, "learning_rate": 0.0006955456375251597, "epoch": 0.91 }, { "loss": 8.0437, "grad_norm": 1.2291605472564697, "learning_rate": 0.0006946705171961145, "epoch": 0.92 }, { "loss": 7.9172, "grad_norm": 0.9950680732727051, "learning_rate": 0.0006937953968670692, "epoch": 0.92 }, { "loss": 7.8579, "grad_norm": 1.170876145362854, "learning_rate": 0.000692920276538024, "epoch": 0.92 }, { "loss": 8.7343, "grad_norm": 1.0266340970993042, "learning_rate": 0.0006920451562089788, "epoch": 0.92 }, { "loss": 8.3685, "grad_norm": 1.1194366216659546, "learning_rate": 0.0006911700358799335, "epoch": 0.93 }, { "loss": 8.8983, "grad_norm": 1.130362868309021, "learning_rate": 0.0006902949155508883, "epoch": 0.93 }, { "loss": 8.3624, "grad_norm": 1.2582019567489624, "learning_rate": 0.000689419795221843, "epoch": 0.93 }, { "loss": 8.5332, "grad_norm": 1.0985493659973145, "learning_rate": 0.0006885446748927978, "epoch": 0.93 }, { "loss": 8.263, "grad_norm": 1.0480501651763916, "learning_rate": 0.0006876695545637526, "epoch": 0.94 }, { "loss": 8.1911, "grad_norm": 1.085471510887146, "learning_rate": 0.0006867944342347073, "epoch": 0.94 }, { "loss": 8.6767, "grad_norm": 1.109959602355957, "learning_rate": 0.0006859193139056621, "epoch": 0.94 }, { "loss": 8.1904, "grad_norm": 0.9299295544624329, "learning_rate": 0.0006850441935766169, "epoch": 0.94 }, { "loss": 7.9858, "grad_norm": 1.3819242715835571, "learning_rate": 0.0006841690732475716, "epoch": 0.95 }, { "loss": 8.3134, "grad_norm": 1.499324083328247, "learning_rate": 0.0006832939529185264, "epoch": 0.95 }, { "loss": 8.1389, "grad_norm": 1.0068879127502441, "learning_rate": 0.0006824188325894811, "epoch": 0.95 }, { "loss": 8.0979, "grad_norm": 1.232861876487732, "learning_rate": 0.0006815437122604359, "epoch": 0.96 }, { "loss": 8.1456, "grad_norm": 1.020922064781189, "learning_rate": 0.0006806685919313907, "epoch": 0.96 }, { "loss": 8.1438, "grad_norm": 1.2880629301071167, "learning_rate": 0.0006797934716023453, "epoch": 0.96 }, { "loss": 7.8589, "grad_norm": 1.2720872163772583, "learning_rate": 0.0006789183512733001, "epoch": 0.96 }, { "loss": 8.338, "grad_norm": 1.1569981575012207, "learning_rate": 0.0006780432309442548, "epoch": 0.97 }, { "loss": 7.6167, "grad_norm": 1.0755385160446167, "learning_rate": 0.0006771681106152095, "epoch": 0.97 }, { "loss": 9.1889, "grad_norm": 1.1371173858642578, "learning_rate": 0.0006762929902861643, "epoch": 0.97 }, { "loss": 8.1603, "grad_norm": 1.2543790340423584, "learning_rate": 0.000675417869957119, "epoch": 0.97 }, { "loss": 8.1684, "grad_norm": 1.665987491607666, "learning_rate": 0.0006745427496280738, "epoch": 0.98 }, { "loss": 8.4957, "grad_norm": 1.1479765176773071, "learning_rate": 0.0006736676292990285, "epoch": 0.98 }, { "loss": 7.998, "grad_norm": 1.1416277885437012, "learning_rate": 0.0006727925089699833, "epoch": 0.98 }, { "loss": 8.4458, "grad_norm": 1.2610832452774048, "learning_rate": 0.0006719173886409381, "epoch": 0.98 }, { "loss": 8.2715, "grad_norm": 1.2478748559951782, "learning_rate": 0.0006710422683118928, "epoch": 0.99 }, { "loss": 8.0882, "grad_norm": 0.9021313190460205, "learning_rate": 0.0006701671479828476, "epoch": 0.99 }, { "loss": 8.2404, "grad_norm": 1.0023951530456543, "learning_rate": 0.0006692920276538024, "epoch": 0.99 }, { "loss": 8.681, "grad_norm": 1.3342375755310059, "learning_rate": 0.0006684169073247571, "epoch": 0.99 }, { "loss": 8.024, "grad_norm": 1.0199118852615356, "learning_rate": 0.0006675417869957119, "epoch": 1.0 }, { "loss": 8.3688, "grad_norm": 0.893786609172821, "learning_rate": 0.0006666666666666666, "epoch": 1.0 }, { "loss": 8.0561, "grad_norm": 1.2774296998977661, "learning_rate": 0.0006657915463376214, "epoch": 1.0 }, { "loss": 7.8444, "grad_norm": 1.0824223756790161, "learning_rate": 0.0006649164260085762, "epoch": 1.01 }, { "loss": 8.1771, "grad_norm": 0.869452178478241, "learning_rate": 0.0006640413056795309, "epoch": 1.01 }, { "loss": 7.6838, "grad_norm": 1.1132241487503052, "learning_rate": 0.0006631661853504857, "epoch": 1.01 }, { "loss": 7.9475, "grad_norm": 1.2853749990463257, "learning_rate": 0.0006622910650214405, "epoch": 1.01 }, { "loss": 8.8546, "grad_norm": 1.2339048385620117, "learning_rate": 0.0006614159446923952, "epoch": 1.02 }, { "loss": 8.1339, "grad_norm": 1.2211487293243408, "learning_rate": 0.00066054082436335, "epoch": 1.02 }, { "loss": 7.402, "grad_norm": 1.0966975688934326, "learning_rate": 0.0006596657040343047, "epoch": 1.02 }, { "loss": 8.1777, "grad_norm": 1.0253325700759888, "learning_rate": 0.0006587905837052595, "epoch": 1.02 }, { "loss": 8.2748, "grad_norm": 1.2987836599349976, "learning_rate": 0.0006579154633762143, "epoch": 1.03 }, { "loss": 8.3225, "grad_norm": 0.945371687412262, "learning_rate": 0.000657040343047169, "epoch": 1.03 }, { "loss": 8.4416, "grad_norm": 1.0868079662322998, "learning_rate": 0.0006561652227181238, "epoch": 1.03 }, { "loss": 8.1007, "grad_norm": 1.0190479755401611, "learning_rate": 0.0006552901023890784, "epoch": 1.03 }, { "loss": 8.1317, "grad_norm": 1.0896625518798828, "learning_rate": 0.0006544149820600332, "epoch": 1.04 }, { "loss": 7.7364, "grad_norm": 1.1690502166748047, "learning_rate": 0.000653539861730988, "epoch": 1.04 }, { "loss": 7.8173, "grad_norm": 1.0521645545959473, "learning_rate": 0.0006526647414019427, "epoch": 1.04 }, { "loss": 7.6212, "grad_norm": 1.3057899475097656, "learning_rate": 0.0006517896210728975, "epoch": 1.04 }, { "loss": 8.0228, "grad_norm": 0.968885064125061, "learning_rate": 0.0006509145007438522, "epoch": 1.05 }, { "loss": 7.8535, "grad_norm": 1.1838873624801636, "learning_rate": 0.000650039380414807, "epoch": 1.05 }, { "loss": 8.1991, "grad_norm": 1.0967016220092773, "learning_rate": 0.0006491642600857618, "epoch": 1.05 }, { "loss": 8.1515, "grad_norm": 1.0798629522323608, "learning_rate": 0.0006482891397567165, "epoch": 1.06 }, { "loss": 8.291, "grad_norm": 1.1506596803665161, "learning_rate": 0.0006474140194276713, "epoch": 1.06 }, { "loss": 7.956, "grad_norm": 1.0459505319595337, "learning_rate": 0.0006465388990986261, "epoch": 1.06 }, { "loss": 8.4393, "grad_norm": 1.070776343345642, "learning_rate": 0.0006456637787695808, "epoch": 1.06 }, { "loss": 8.5445, "grad_norm": 1.3064284324645996, "learning_rate": 0.0006447886584405356, "epoch": 1.07 }, { "loss": 8.701, "grad_norm": 1.0707839727401733, "learning_rate": 0.0006439135381114903, "epoch": 1.07 }, { "loss": 7.4342, "grad_norm": 1.123377799987793, "learning_rate": 0.0006430384177824451, "epoch": 1.07 }, { "loss": 8.4883, "grad_norm": 1.7230886220932007, "learning_rate": 0.0006421632974533999, "epoch": 1.07 }, { "loss": 8.5288, "grad_norm": 0.9721227288246155, "learning_rate": 0.0006412881771243546, "epoch": 1.08 }, { "loss": 7.8249, "grad_norm": 1.2729851007461548, "learning_rate": 0.0006404130567953094, "epoch": 1.08 }, { "loss": 8.3277, "grad_norm": 0.9693044424057007, "learning_rate": 0.0006395379364662642, "epoch": 1.08 }, { "loss": 7.8798, "grad_norm": 1.104020118713379, "learning_rate": 0.0006386628161372189, "epoch": 1.08 }, { "loss": 7.899, "grad_norm": 1.0556141138076782, "learning_rate": 0.0006377876958081737, "epoch": 1.09 }, { "loss": 8.6403, "grad_norm": 1.227303147315979, "learning_rate": 0.0006369125754791284, "epoch": 1.09 }, { "loss": 8.7407, "grad_norm": 1.2486103773117065, "learning_rate": 0.0006360374551500832, "epoch": 1.09 }, { "loss": 8.226, "grad_norm": 1.1452488899230957, "learning_rate": 0.000635162334821038, "epoch": 1.09 }, { "loss": 8.5083, "grad_norm": 1.466182827949524, "learning_rate": 0.0006342872144919927, "epoch": 1.1 }, { "loss": 7.8041, "grad_norm": 1.2693302631378174, "learning_rate": 0.0006334120941629474, "epoch": 1.1 }, { "loss": 7.918, "grad_norm": 1.1236190795898438, "learning_rate": 0.0006325369738339021, "epoch": 1.1 }, { "loss": 7.8792, "grad_norm": 0.9166776537895203, "learning_rate": 0.0006316618535048569, "epoch": 1.11 }, { "loss": 8.3714, "grad_norm": 1.2021427154541016, "learning_rate": 0.0006307867331758117, "epoch": 1.11 }, { "loss": 8.5282, "grad_norm": 1.1508140563964844, "learning_rate": 0.0006299116128467664, "epoch": 1.11 }, { "loss": 7.7235, "grad_norm": 1.044027328491211, "learning_rate": 0.0006290364925177212, "epoch": 1.11 }, { "loss": 8.0483, "grad_norm": 1.00051748752594, "learning_rate": 0.000628161372188676, "epoch": 1.12 }, { "loss": 8.0003, "grad_norm": 1.0397716760635376, "learning_rate": 0.0006272862518596307, "epoch": 1.12 }, { "loss": 8.274, "grad_norm": 1.0577192306518555, "learning_rate": 0.0006264111315305855, "epoch": 1.12 }, { "loss": 7.8435, "grad_norm": 1.1829681396484375, "learning_rate": 0.0006255360112015402, "epoch": 1.12 }, { "loss": 8.5019, "grad_norm": 1.9353641271591187, "learning_rate": 0.000624660890872495, "epoch": 1.13 }, { "loss": 8.4582, "grad_norm": 1.237269639968872, "learning_rate": 0.0006237857705434498, "epoch": 1.13 }, { "loss": 8.0735, "grad_norm": 1.1674834489822388, "learning_rate": 0.0006229106502144045, "epoch": 1.13 }, { "loss": 8.3781, "grad_norm": 1.32883620262146, "learning_rate": 0.0006220355298853593, "epoch": 1.13 }, { "loss": 8.723, "grad_norm": 1.3197271823883057, "learning_rate": 0.000621160409556314, "epoch": 1.14 }, { "loss": 8.414, "grad_norm": 1.137764573097229, "learning_rate": 0.0006202852892272688, "epoch": 1.14 }, { "loss": 7.9197, "grad_norm": 1.1574738025665283, "learning_rate": 0.0006194101688982236, "epoch": 1.14 }, { "loss": 8.09, "grad_norm": 1.0444676876068115, "learning_rate": 0.0006185350485691783, "epoch": 1.14 }, { "loss": 7.3329, "grad_norm": 0.8655235767364502, "learning_rate": 0.0006176599282401331, "epoch": 1.15 }, { "loss": 8.4163, "grad_norm": 0.9860300421714783, "learning_rate": 0.0006167848079110879, "epoch": 1.15 }, { "loss": 8.2608, "grad_norm": 1.1680139303207397, "learning_rate": 0.0006159096875820426, "epoch": 1.15 }, { "loss": 7.9283, "grad_norm": 1.545938491821289, "learning_rate": 0.0006150345672529974, "epoch": 1.15 }, { "loss": 8.4113, "grad_norm": 1.2768994569778442, "learning_rate": 0.0006141594469239521, "epoch": 1.16 }, { "loss": 8.2389, "grad_norm": 1.0001721382141113, "learning_rate": 0.0006132843265949069, "epoch": 1.16 }, { "loss": 8.397, "grad_norm": 1.8651808500289917, "learning_rate": 0.0006124092062658617, "epoch": 1.16 }, { "loss": 8.003, "grad_norm": 0.947693407535553, "learning_rate": 0.0006115340859368163, "epoch": 1.17 }, { "loss": 7.5861, "grad_norm": 1.1168384552001953, "learning_rate": 0.0006106589656077711, "epoch": 1.17 }, { "loss": 8.7788, "grad_norm": 1.1341112852096558, "learning_rate": 0.0006097838452787258, "epoch": 1.17 }, { "loss": 7.9428, "grad_norm": 1.2905473709106445, "learning_rate": 0.0006089087249496806, "epoch": 1.17 }, { "loss": 8.6196, "grad_norm": 0.9961435794830322, "learning_rate": 0.0006080336046206354, "epoch": 1.18 }, { "loss": 8.224, "grad_norm": 1.3134316205978394, "learning_rate": 0.0006071584842915901, "epoch": 1.18 }, { "loss": 7.9156, "grad_norm": 1.5898418426513672, "learning_rate": 0.0006062833639625449, "epoch": 1.18 }, { "loss": 8.2147, "grad_norm": 0.99250727891922, "learning_rate": 0.0006054082436334996, "epoch": 1.18 }, { "loss": 7.6957, "grad_norm": 1.2642431259155273, "learning_rate": 0.0006045331233044544, "epoch": 1.19 }, { "loss": 7.7926, "grad_norm": 1.314082384109497, "learning_rate": 0.0006036580029754092, "epoch": 1.19 }, { "loss": 7.9682, "grad_norm": 1.1342573165893555, "learning_rate": 0.0006027828826463639, "epoch": 1.19 }, { "loss": 8.0208, "grad_norm": 1.3015680313110352, "learning_rate": 0.0006019077623173187, "epoch": 1.19 }, { "loss": 8.3608, "grad_norm": 0.9990431666374207, "learning_rate": 0.0006010326419882735, "epoch": 1.2 }, { "loss": 8.2009, "grad_norm": 0.9804344773292542, "learning_rate": 0.0006001575216592282, "epoch": 1.2 }, { "loss": 8.0484, "grad_norm": 1.1591954231262207, "learning_rate": 0.0005992824013301829, "epoch": 1.2 }, { "loss": 8.116, "grad_norm": 1.042474627494812, "learning_rate": 0.0005984072810011376, "epoch": 1.2 }, { "loss": 7.9246, "grad_norm": 1.8579179048538208, "learning_rate": 0.0005975321606720924, "epoch": 1.21 }, { "loss": 7.9183, "grad_norm": 0.8727061748504639, "learning_rate": 0.0005966570403430472, "epoch": 1.21 }, { "loss": 7.675, "grad_norm": 1.0189380645751953, "learning_rate": 0.0005957819200140019, "epoch": 1.21 }, { "loss": 7.6222, "grad_norm": 1.0766206979751587, "learning_rate": 0.0005949067996849567, "epoch": 1.22 }, { "loss": 7.6455, "grad_norm": 1.121745228767395, "learning_rate": 0.0005940316793559114, "epoch": 1.22 }, { "loss": 8.1449, "grad_norm": 1.2497507333755493, "learning_rate": 0.0005931565590268662, "epoch": 1.22 }, { "loss": 8.3586, "grad_norm": 1.301903486251831, "learning_rate": 0.000592281438697821, "epoch": 1.22 }, { "loss": 8.163, "grad_norm": 1.1964079141616821, "learning_rate": 0.0005914063183687757, "epoch": 1.23 }, { "loss": 8.2938, "grad_norm": 1.1423827409744263, "learning_rate": 0.0005905311980397304, "epoch": 1.23 }, { "loss": 8.165, "grad_norm": 1.119884967803955, "learning_rate": 0.0005896560777106851, "epoch": 1.23 }, { "loss": 7.7234, "grad_norm": 1.4375518560409546, "learning_rate": 0.0005887809573816399, "epoch": 1.23 }, { "loss": 8.0758, "grad_norm": 1.1417185068130493, "learning_rate": 0.0005879058370525947, "epoch": 1.24 }, { "loss": 7.9137, "grad_norm": 1.048060417175293, "learning_rate": 0.0005870307167235494, "epoch": 1.24 }, { "loss": 8.4029, "grad_norm": 0.9880658388137817, "learning_rate": 0.0005861555963945042, "epoch": 1.24 }, { "loss": 8.4489, "grad_norm": 1.000611424446106, "learning_rate": 0.000585280476065459, "epoch": 1.24 }, { "loss": 8.2688, "grad_norm": 1.3099920749664307, "learning_rate": 0.0005844053557364137, "epoch": 1.25 }, { "loss": 7.7948, "grad_norm": 0.8548302054405212, "learning_rate": 0.0005835302354073685, "epoch": 1.25 }, { "loss": 8.442, "grad_norm": 1.1732860803604126, "learning_rate": 0.0005826551150783232, "epoch": 1.25 }, { "loss": 7.6346, "grad_norm": 0.803125262260437, "learning_rate": 0.000581779994749278, "epoch": 1.25 }, { "loss": 8.0567, "grad_norm": 1.258419156074524, "learning_rate": 0.0005809048744202328, "epoch": 1.26 }, { "loss": 8.1142, "grad_norm": 1.1331418752670288, "learning_rate": 0.0005800297540911875, "epoch": 1.26 }, { "loss": 8.5457, "grad_norm": 1.5619804859161377, "learning_rate": 0.0005791546337621423, "epoch": 1.26 }, { "loss": 7.9416, "grad_norm": 1.880534052848816, "learning_rate": 0.000578279513433097, "epoch": 1.27 }, { "loss": 7.8216, "grad_norm": 1.2279471158981323, "learning_rate": 0.0005774043931040518, "epoch": 1.27 }, { "loss": 7.8216, "grad_norm": 1.1597974300384521, "learning_rate": 0.0005765292727750066, "epoch": 1.27 }, { "loss": 7.9033, "grad_norm": 1.1710484027862549, "learning_rate": 0.0005756541524459613, "epoch": 1.27 }, { "loss": 7.6036, "grad_norm": 1.0655231475830078, "learning_rate": 0.0005747790321169161, "epoch": 1.28 }, { "loss": 7.5982, "grad_norm": 1.0066710710525513, "learning_rate": 0.0005739039117878709, "epoch": 1.28 }, { "loss": 7.738, "grad_norm": 1.1333460807800293, "learning_rate": 0.0005730287914588256, "epoch": 1.28 }, { "loss": 8.0025, "grad_norm": 1.468841791152954, "learning_rate": 0.0005721536711297804, "epoch": 1.28 }, { "loss": 7.4888, "grad_norm": 1.1363178491592407, "learning_rate": 0.0005712785508007351, "epoch": 1.29 }, { "loss": 7.3176, "grad_norm": 1.1589970588684082, "learning_rate": 0.0005704034304716899, "epoch": 1.29 }, { "loss": 7.6323, "grad_norm": 0.9033693075180054, "learning_rate": 0.0005695283101426447, "epoch": 1.29 }, { "loss": 7.8839, "grad_norm": 1.2384039163589478, "learning_rate": 0.0005686531898135993, "epoch": 1.29 }, { "loss": 7.8408, "grad_norm": 1.3826912641525269, "learning_rate": 0.0005677780694845541, "epoch": 1.3 }, { "loss": 7.4433, "grad_norm": 1.1403487920761108, "learning_rate": 0.0005669029491555088, "epoch": 1.3 }, { "loss": 8.5407, "grad_norm": 1.037423014640808, "learning_rate": 0.0005660278288264636, "epoch": 1.3 }, { "loss": 8.0943, "grad_norm": 1.4421013593673706, "learning_rate": 0.0005651527084974184, "epoch": 1.3 }, { "loss": 7.7771, "grad_norm": 1.2977713346481323, "learning_rate": 0.0005642775881683731, "epoch": 1.31 }, { "loss": 7.54, "grad_norm": 1.049196720123291, "learning_rate": 0.0005634024678393279, "epoch": 1.31 }, { "loss": 7.4699, "grad_norm": 1.0489652156829834, "learning_rate": 0.0005625273475102827, "epoch": 1.31 }, { "loss": 7.9441, "grad_norm": 1.1373968124389648, "learning_rate": 0.0005616522271812374, "epoch": 1.32 }, { "loss": 7.2627, "grad_norm": 1.0570902824401855, "learning_rate": 0.0005607771068521922, "epoch": 1.32 }, { "loss": 7.7472, "grad_norm": 1.0547776222229004, "learning_rate": 0.0005599019865231469, "epoch": 1.32 }, { "loss": 7.8815, "grad_norm": 1.2481534481048584, "learning_rate": 0.0005590268661941017, "epoch": 1.32 }, { "loss": 8.2547, "grad_norm": 1.1728442907333374, "learning_rate": 0.0005581517458650565, "epoch": 1.33 }, { "loss": 7.5035, "grad_norm": 1.0567808151245117, "learning_rate": 0.0005572766255360112, "epoch": 1.33 }, { "loss": 7.9982, "grad_norm": 0.8234537243843079, "learning_rate": 0.000556401505206966, "epoch": 1.33 }, { "loss": 7.5333, "grad_norm": 1.09587824344635, "learning_rate": 0.0005555263848779207, "epoch": 1.33 }, { "loss": 7.768, "grad_norm": 1.3897008895874023, "learning_rate": 0.0005546512645488755, "epoch": 1.34 }, { "loss": 7.7645, "grad_norm": 1.1089082956314087, "learning_rate": 0.0005537761442198303, "epoch": 1.34 }, { "loss": 7.7809, "grad_norm": 1.2678576707839966, "learning_rate": 0.000552901023890785, "epoch": 1.34 }, { "loss": 7.7376, "grad_norm": 1.3946635723114014, "learning_rate": 0.0005520259035617398, "epoch": 1.34 }, { "loss": 8.2773, "grad_norm": 1.3742512464523315, "learning_rate": 0.0005511507832326946, "epoch": 1.35 }, { "loss": 7.7902, "grad_norm": 1.416434645652771, "learning_rate": 0.0005502756629036493, "epoch": 1.35 }, { "loss": 7.6157, "grad_norm": 1.0419012308120728, "learning_rate": 0.0005494005425746041, "epoch": 1.35 }, { "loss": 7.5897, "grad_norm": 1.7180145978927612, "learning_rate": 0.0005485254222455588, "epoch": 1.35 }, { "loss": 8.0068, "grad_norm": 1.6651771068572998, "learning_rate": 0.0005476503019165136, "epoch": 1.36 }, { "loss": 7.4023, "grad_norm": 1.0715596675872803, "learning_rate": 0.0005467751815874683, "epoch": 1.36 }, { "loss": 8.0369, "grad_norm": 1.208898901939392, "learning_rate": 0.000545900061258423, "epoch": 1.36 }, { "loss": 7.6188, "grad_norm": 0.9920070767402649, "learning_rate": 0.0005450249409293778, "epoch": 1.36 }, { "loss": 8.6854, "grad_norm": 1.174086570739746, "learning_rate": 0.0005441498206003325, "epoch": 1.37 }, { "loss": 7.5733, "grad_norm": 1.244912028312683, "learning_rate": 0.0005432747002712873, "epoch": 1.37 }, { "loss": 7.389, "grad_norm": 1.5966273546218872, "learning_rate": 0.0005423995799422421, "epoch": 1.37 }, { "loss": 8.1756, "grad_norm": 1.0320965051651, "learning_rate": 0.0005415244596131968, "epoch": 1.38 }, { "loss": 8.897, "grad_norm": 1.2478450536727905, "learning_rate": 0.0005406493392841516, "epoch": 1.38 }, { "loss": 7.6083, "grad_norm": 1.4347364902496338, "learning_rate": 0.0005397742189551064, "epoch": 1.38 }, { "loss": 7.9916, "grad_norm": 1.1878119707107544, "learning_rate": 0.0005388990986260611, "epoch": 1.38 }, { "loss": 8.1032, "grad_norm": 1.3169543743133545, "learning_rate": 0.0005380239782970159, "epoch": 1.39 }, { "loss": 7.3094, "grad_norm": 1.271192193031311, "learning_rate": 0.0005371488579679706, "epoch": 1.39 }, { "loss": 7.2947, "grad_norm": 1.484824299812317, "learning_rate": 0.0005362737376389254, "epoch": 1.39 }, { "loss": 7.7483, "grad_norm": 1.0237884521484375, "learning_rate": 0.0005353986173098802, "epoch": 1.39 }, { "loss": 7.7284, "grad_norm": 1.141897201538086, "learning_rate": 0.0005345234969808349, "epoch": 1.4 }, { "loss": 7.9684, "grad_norm": 1.2076783180236816, "learning_rate": 0.0005336483766517897, "epoch": 1.4 }, { "loss": 7.4731, "grad_norm": 1.0815685987472534, "learning_rate": 0.0005327732563227444, "epoch": 1.4 }, { "loss": 7.6468, "grad_norm": 1.9115163087844849, "learning_rate": 0.0005318981359936992, "epoch": 1.4 }, { "loss": 8.179, "grad_norm": 1.1872133016586304, "learning_rate": 0.000531023015664654, "epoch": 1.41 }, { "loss": 8.1254, "grad_norm": 1.144726037979126, "learning_rate": 0.0005301478953356087, "epoch": 1.41 }, { "loss": 7.7947, "grad_norm": 1.562495231628418, "learning_rate": 0.0005292727750065635, "epoch": 1.41 }, { "loss": 7.2917, "grad_norm": 1.20420241355896, "learning_rate": 0.0005283976546775183, "epoch": 1.41 }, { "loss": 7.9956, "grad_norm": 1.0302613973617554, "learning_rate": 0.000527522534348473, "epoch": 1.42 }, { "loss": 7.8058, "grad_norm": 1.161452293395996, "learning_rate": 0.0005266474140194278, "epoch": 1.42 }, { "loss": 8.2652, "grad_norm": 1.2876991033554077, "learning_rate": 0.0005257722936903825, "epoch": 1.42 }, { "loss": 8.0375, "grad_norm": 1.1002925634384155, "learning_rate": 0.0005248971733613372, "epoch": 1.43 }, { "loss": 7.82, "grad_norm": 1.0201154947280884, "learning_rate": 0.000524022053032292, "epoch": 1.43 }, { "loss": 8.3203, "grad_norm": 1.1177037954330444, "learning_rate": 0.0005231469327032467, "epoch": 1.43 }, { "loss": 7.9789, "grad_norm": 1.4295682907104492, "learning_rate": 0.0005222718123742015, "epoch": 1.43 }, { "loss": 8.0088, "grad_norm": 1.4420737028121948, "learning_rate": 0.0005213966920451562, "epoch": 1.44 }, { "loss": 7.8298, "grad_norm": 1.1020231246948242, "learning_rate": 0.000520521571716111, "epoch": 1.44 }, { "loss": 7.8801, "grad_norm": 1.4339189529418945, "learning_rate": 0.0005196464513870657, "epoch": 1.44 }, { "loss": 7.6756, "grad_norm": 1.5243607759475708, "learning_rate": 0.0005187713310580204, "epoch": 1.44 }, { "loss": 8.1007, "grad_norm": 0.9880979657173157, "learning_rate": 0.0005178962107289752, "epoch": 1.45 }, { "loss": 7.7396, "grad_norm": 1.1447367668151855, "learning_rate": 0.0005170210903999299, "epoch": 1.45 }, { "loss": 7.8537, "grad_norm": 1.384048342704773, "learning_rate": 0.0005161459700708847, "epoch": 1.45 }, { "loss": 7.8855, "grad_norm": 1.3757721185684204, "learning_rate": 0.0005152708497418395, "epoch": 1.45 }, { "loss": 7.8651, "grad_norm": 1.1160024404525757, "learning_rate": 0.0005143957294127942, "epoch": 1.46 }, { "loss": 7.8378, "grad_norm": 0.9774546027183533, "learning_rate": 0.000513520609083749, "epoch": 1.46 }, { "loss": 7.9251, "grad_norm": 1.5181477069854736, "learning_rate": 0.0005126454887547038, "epoch": 1.46 }, { "loss": 8.6781, "grad_norm": 1.203229308128357, "learning_rate": 0.0005117703684256585, "epoch": 1.46 }, { "loss": 7.6571, "grad_norm": 1.0401496887207031, "learning_rate": 0.0005108952480966133, "epoch": 1.47 }, { "loss": 7.3908, "grad_norm": 1.3228225708007812, "learning_rate": 0.000510020127767568, "epoch": 1.47 }, { "loss": 8.1244, "grad_norm": 1.3072296380996704, "learning_rate": 0.0005091450074385228, "epoch": 1.47 }, { "loss": 7.7535, "grad_norm": 1.9105629920959473, "learning_rate": 0.0005082698871094776, "epoch": 1.48 }, { "loss": 8.2387, "grad_norm": 1.3035160303115845, "learning_rate": 0.0005073947667804323, "epoch": 1.48 }, { "loss": 7.998, "grad_norm": 0.9805745482444763, "learning_rate": 0.0005065196464513871, "epoch": 1.48 }, { "loss": 8.0499, "grad_norm": 1.28218412399292, "learning_rate": 0.0005056445261223418, "epoch": 1.48 }, { "loss": 8.0939, "grad_norm": 1.289697527885437, "learning_rate": 0.0005047694057932966, "epoch": 1.49 }, { "loss": 7.8801, "grad_norm": 1.3982206583023071, "learning_rate": 0.0005038942854642513, "epoch": 1.49 }, { "loss": 7.5012, "grad_norm": 1.1884011030197144, "learning_rate": 0.000503019165135206, "epoch": 1.49 }, { "loss": 7.7792, "grad_norm": 1.2014328241348267, "learning_rate": 0.0005021440448061608, "epoch": 1.49 }, { "loss": 8.3151, "grad_norm": 1.2958098649978638, "learning_rate": 0.0005012689244771155, "epoch": 1.5 }, { "loss": 7.3702, "grad_norm": 1.1195346117019653, "learning_rate": 0.0005003938041480703, "epoch": 1.5 }, { "loss": 8.0952, "grad_norm": 1.2185337543487549, "learning_rate": 0.0004995186838190251, "epoch": 1.5 }, { "loss": 7.6605, "grad_norm": 1.1054099798202515, "learning_rate": 0.0004986435634899798, "epoch": 1.5 }, { "loss": 7.8926, "grad_norm": 1.3183029890060425, "learning_rate": 0.0004977684431609346, "epoch": 1.51 }, { "loss": 7.8356, "grad_norm": 1.3786067962646484, "learning_rate": 0.0004968933228318894, "epoch": 1.51 }, { "loss": 7.7605, "grad_norm": 1.3373888731002808, "learning_rate": 0.0004960182025028441, "epoch": 1.51 }, { "loss": 7.9272, "grad_norm": 1.5524091720581055, "learning_rate": 0.0004951430821737989, "epoch": 1.51 }, { "loss": 8.1264, "grad_norm": 0.927689790725708, "learning_rate": 0.0004942679618447536, "epoch": 1.52 }, { "loss": 8.1456, "grad_norm": 1.4429559707641602, "learning_rate": 0.0004933928415157084, "epoch": 1.52 }, { "loss": 8.5349, "grad_norm": 1.17830228805542, "learning_rate": 0.0004925177211866632, "epoch": 1.52 }, { "loss": 8.4138, "grad_norm": 1.7398778200149536, "learning_rate": 0.0004916426008576179, "epoch": 1.53 }, { "loss": 7.6329, "grad_norm": 1.101945161819458, "learning_rate": 0.0004907674805285727, "epoch": 1.53 }, { "loss": 8.2694, "grad_norm": 1.2424931526184082, "learning_rate": 0.0004898923601995274, "epoch": 1.53 }, { "loss": 7.2639, "grad_norm": 0.8726850748062134, "learning_rate": 0.0004890172398704822, "epoch": 1.53 }, { "loss": 7.5542, "grad_norm": 1.020978331565857, "learning_rate": 0.0004881421195414369, "epoch": 1.54 }, { "loss": 7.3334, "grad_norm": 1.058136224746704, "learning_rate": 0.0004872669992123917, "epoch": 1.54 }, { "loss": 7.6285, "grad_norm": 1.7856310606002808, "learning_rate": 0.00048639187888334644, "epoch": 1.54 }, { "loss": 7.8873, "grad_norm": 1.1540299654006958, "learning_rate": 0.0004855167585543012, "epoch": 1.54 }, { "loss": 7.5676, "grad_norm": 1.4844547510147095, "learning_rate": 0.00048464163822525597, "epoch": 1.55 }, { "loss": 8.0284, "grad_norm": 1.1018364429473877, "learning_rate": 0.00048376651789621073, "epoch": 1.55 }, { "loss": 7.8478, "grad_norm": 1.4421080350875854, "learning_rate": 0.0004828913975671655, "epoch": 1.55 }, { "loss": 8.0614, "grad_norm": 1.322413444519043, "learning_rate": 0.00048201627723812025, "epoch": 1.55 }, { "loss": 7.9015, "grad_norm": 1.1930081844329834, "learning_rate": 0.000481141156909075, "epoch": 1.56 }, { "loss": 7.843, "grad_norm": 1.2846688032150269, "learning_rate": 0.0004802660365800298, "epoch": 1.56 }, { "loss": 7.8268, "grad_norm": 2.0413529872894287, "learning_rate": 0.00047939091625098454, "epoch": 1.56 }, { "loss": 7.3241, "grad_norm": 1.058362364768982, "learning_rate": 0.0004785157959219393, "epoch": 1.56 }, { "loss": 7.8329, "grad_norm": 1.725417971611023, "learning_rate": 0.00047764067559289406, "epoch": 1.57 }, { "loss": 7.6295, "grad_norm": 1.1373404264450073, "learning_rate": 0.00047676555526384877, "epoch": 1.57 }, { "loss": 7.4763, "grad_norm": 1.1107378005981445, "learning_rate": 0.00047589043493480353, "epoch": 1.57 }, { "loss": 7.8846, "grad_norm": 1.2450941801071167, "learning_rate": 0.0004750153146057583, "epoch": 1.57 }, { "loss": 8.4109, "grad_norm": 1.0643541812896729, "learning_rate": 0.00047414019427671305, "epoch": 1.58 }, { "loss": 7.9126, "grad_norm": 1.2940372228622437, "learning_rate": 0.0004732650739476678, "epoch": 1.58 }, { "loss": 7.7132, "grad_norm": 2.6067655086517334, "learning_rate": 0.0004723899536186226, "epoch": 1.58 }, { "loss": 7.5708, "grad_norm": 0.9783037304878235, "learning_rate": 0.00047151483328957734, "epoch": 1.59 }, { "loss": 7.2771, "grad_norm": 1.037582278251648, "learning_rate": 0.0004706397129605321, "epoch": 1.59 }, { "loss": 7.7599, "grad_norm": 1.0178707838058472, "learning_rate": 0.00046976459263148686, "epoch": 1.59 }, { "loss": 7.1538, "grad_norm": 1.558307409286499, "learning_rate": 0.0004688894723024416, "epoch": 1.59 }, { "loss": 7.5229, "grad_norm": 1.1060800552368164, "learning_rate": 0.0004680143519733964, "epoch": 1.6 }, { "loss": 7.5813, "grad_norm": 1.8988709449768066, "learning_rate": 0.00046713923164435115, "epoch": 1.6 }, { "loss": 7.8319, "grad_norm": 1.6066781282424927, "learning_rate": 0.00046626411131530586, "epoch": 1.6 }, { "loss": 7.8222, "grad_norm": 1.4711729288101196, "learning_rate": 0.0004653889909862606, "epoch": 1.6 }, { "loss": 7.6115, "grad_norm": 1.3585811853408813, "learning_rate": 0.0004645138706572154, "epoch": 1.61 }, { "loss": 7.7618, "grad_norm": 1.1487444639205933, "learning_rate": 0.00046363875032817014, "epoch": 1.61 }, { "loss": 7.868, "grad_norm": 1.4386248588562012, "learning_rate": 0.0004627636299991249, "epoch": 1.61 }, { "loss": 7.7931, "grad_norm": 1.0714224576950073, "learning_rate": 0.00046188850967007967, "epoch": 1.61 }, { "loss": 8.2688, "grad_norm": 1.6375863552093506, "learning_rate": 0.00046101338934103443, "epoch": 1.62 }, { "loss": 7.621, "grad_norm": 1.024120807647705, "learning_rate": 0.0004601382690119892, "epoch": 1.62 }, { "loss": 8.2226, "grad_norm": 1.2234493494033813, "learning_rate": 0.0004592631486829439, "epoch": 1.62 }, { "loss": 7.596, "grad_norm": 1.0593066215515137, "learning_rate": 0.00045838802835389866, "epoch": 1.62 }, { "loss": 7.7407, "grad_norm": 1.2529680728912354, "learning_rate": 0.0004575129080248534, "epoch": 1.63 }, { "loss": 7.221, "grad_norm": 1.1312929391860962, "learning_rate": 0.0004566377876958082, "epoch": 1.63 }, { "loss": 7.6136, "grad_norm": 1.4004312753677368, "learning_rate": 0.00045576266736676294, "epoch": 1.63 }, { "loss": 7.9718, "grad_norm": 1.4514151811599731, "learning_rate": 0.00045488754703771765, "epoch": 1.64 }, { "loss": 7.3337, "grad_norm": 1.1595350503921509, "learning_rate": 0.0004540124267086724, "epoch": 1.64 }, { "loss": 7.7414, "grad_norm": 1.1403205394744873, "learning_rate": 0.0004531373063796272, "epoch": 1.64 }, { "loss": 7.7323, "grad_norm": 1.677051305770874, "learning_rate": 0.00045226218605058194, "epoch": 1.64 }, { "loss": 8.0048, "grad_norm": 1.338146686553955, "learning_rate": 0.0004513870657215367, "epoch": 1.65 }, { "loss": 7.9544, "grad_norm": 1.0941588878631592, "learning_rate": 0.00045051194539249146, "epoch": 1.65 }, { "loss": 8.1043, "grad_norm": 1.224746584892273, "learning_rate": 0.0004496368250634462, "epoch": 1.65 }, { "loss": 8.0849, "grad_norm": 1.5772489309310913, "learning_rate": 0.000448761704734401, "epoch": 1.65 }, { "loss": 7.3165, "grad_norm": 1.4434912204742432, "learning_rate": 0.00044788658440535575, "epoch": 1.66 }, { "loss": 7.8826, "grad_norm": 0.9971029162406921, "learning_rate": 0.0004470114640763105, "epoch": 1.66 }, { "loss": 7.7822, "grad_norm": 1.061712384223938, "learning_rate": 0.00044613634374726527, "epoch": 1.66 }, { "loss": 7.8387, "grad_norm": 1.6292518377304077, "learning_rate": 0.00044526122341822003, "epoch": 1.66 }, { "loss": 7.3463, "grad_norm": 1.0507898330688477, "learning_rate": 0.00044438610308917474, "epoch": 1.67 }, { "loss": 7.693, "grad_norm": 1.332474708557129, "learning_rate": 0.0004435109827601295, "epoch": 1.67 }, { "loss": 7.4542, "grad_norm": 1.3393101692199707, "learning_rate": 0.00044263586243108426, "epoch": 1.67 }, { "loss": 7.4236, "grad_norm": 1.4949504137039185, "learning_rate": 0.000441760742102039, "epoch": 1.67 }, { "loss": 7.4087, "grad_norm": 1.3824454545974731, "learning_rate": 0.0004408856217729938, "epoch": 1.68 }, { "loss": 7.3, "grad_norm": 1.3991942405700684, "learning_rate": 0.00044001050144394855, "epoch": 1.68 }, { "loss": 7.1648, "grad_norm": 1.3270092010498047, "learning_rate": 0.0004391353811149033, "epoch": 1.68 }, { "loss": 7.753, "grad_norm": 1.1912864446640015, "learning_rate": 0.00043826026078585807, "epoch": 1.69 }, { "loss": 7.6531, "grad_norm": 1.2112165689468384, "learning_rate": 0.00043738514045681283, "epoch": 1.69 }, { "loss": 8.0168, "grad_norm": 1.0204828977584839, "learning_rate": 0.0004365100201277676, "epoch": 1.69 }, { "loss": 7.8334, "grad_norm": 1.8065035343170166, "learning_rate": 0.00043563489979872236, "epoch": 1.69 }, { "loss": 7.9395, "grad_norm": 1.1826367378234863, "learning_rate": 0.0004347597794696771, "epoch": 1.7 }, { "loss": 8.0071, "grad_norm": 0.9689782857894897, "learning_rate": 0.00043388465914063183, "epoch": 1.7 }, { "loss": 7.5284, "grad_norm": 0.9889323115348816, "learning_rate": 0.0004330095388115866, "epoch": 1.7 }, { "loss": 7.7318, "grad_norm": 1.4257516860961914, "learning_rate": 0.00043213441848254135, "epoch": 1.7 }, { "loss": 7.6343, "grad_norm": 1.623134970664978, "learning_rate": 0.0004312592981534961, "epoch": 1.71 }, { "loss": 7.9645, "grad_norm": 1.2686361074447632, "learning_rate": 0.0004303841778244509, "epoch": 1.71 }, { "loss": 7.5339, "grad_norm": 1.5115247964859009, "learning_rate": 0.00042950905749540564, "epoch": 1.71 }, { "loss": 7.7401, "grad_norm": 1.285506010055542, "learning_rate": 0.0004286339371663604, "epoch": 1.71 }, { "loss": 7.6018, "grad_norm": 1.4150651693344116, "learning_rate": 0.00042775881683731516, "epoch": 1.72 }, { "loss": 7.4015, "grad_norm": 1.485231637954712, "learning_rate": 0.0004268836965082699, "epoch": 1.72 }, { "loss": 8.4429, "grad_norm": 2.1629021167755127, "learning_rate": 0.0004260085761792247, "epoch": 1.72 }, { "loss": 7.8298, "grad_norm": 1.1586624383926392, "learning_rate": 0.00042513345585017945, "epoch": 1.72 }, { "loss": 7.8121, "grad_norm": 1.0134670734405518, "learning_rate": 0.0004242583355211342, "epoch": 1.73 }, { "loss": 7.8337, "grad_norm": 1.257633090019226, "learning_rate": 0.00042338321519208897, "epoch": 1.73 }, { "loss": 7.6701, "grad_norm": 1.212266445159912, "learning_rate": 0.0004225080948630437, "epoch": 1.73 }, { "loss": 7.706, "grad_norm": 1.2191237211227417, "learning_rate": 0.00042163297453399844, "epoch": 1.74 }, { "loss": 7.4639, "grad_norm": 1.476140022277832, "learning_rate": 0.0004207578542049532, "epoch": 1.74 }, { "loss": 7.8126, "grad_norm": 1.0655369758605957, "learning_rate": 0.0004198827338759079, "epoch": 1.74 }, { "loss": 7.4091, "grad_norm": 1.3340696096420288, "learning_rate": 0.00041900761354686267, "epoch": 1.74 }, { "loss": 7.5701, "grad_norm": 1.3290128707885742, "learning_rate": 0.00041813249321781743, "epoch": 1.75 }, { "loss": 7.5513, "grad_norm": 1.1993497610092163, "learning_rate": 0.0004172573728887722, "epoch": 1.75 }, { "loss": 7.5115, "grad_norm": 0.9953559041023254, "learning_rate": 0.00041638225255972696, "epoch": 1.75 }, { "loss": 8.0513, "grad_norm": 1.1929738521575928, "learning_rate": 0.0004155071322306817, "epoch": 1.75 }, { "loss": 7.4431, "grad_norm": 1.0211223363876343, "learning_rate": 0.0004146320119016365, "epoch": 1.76 }, { "loss": 7.4024, "grad_norm": 1.0484708547592163, "learning_rate": 0.00041375689157259124, "epoch": 1.76 }, { "loss": 7.4321, "grad_norm": 1.2012499570846558, "learning_rate": 0.000412881771243546, "epoch": 1.76 }, { "loss": 7.2608, "grad_norm": 0.9850478768348694, "learning_rate": 0.0004120066509145007, "epoch": 1.76 }, { "loss": 7.4744, "grad_norm": 1.1142171621322632, "learning_rate": 0.00041113153058545547, "epoch": 1.77 }, { "loss": 7.8258, "grad_norm": 1.0107368230819702, "learning_rate": 0.00041025641025641023, "epoch": 1.77 }, { "loss": 8.0338, "grad_norm": 1.3827756643295288, "learning_rate": 0.000409381289927365, "epoch": 1.77 }, { "loss": 7.2029, "grad_norm": 1.056078553199768, "learning_rate": 0.00040850616959831976, "epoch": 1.77 }, { "loss": 7.9763, "grad_norm": 1.3796826601028442, "learning_rate": 0.0004076310492692745, "epoch": 1.78 }, { "loss": 7.39, "grad_norm": 1.5586506128311157, "learning_rate": 0.0004067559289402293, "epoch": 1.78 }, { "loss": 7.7479, "grad_norm": 1.3467471599578857, "learning_rate": 0.00040588080861118404, "epoch": 1.78 }, { "loss": 7.5281, "grad_norm": 1.5824648141860962, "learning_rate": 0.0004050056882821388, "epoch": 1.78 }, { "loss": 7.4095, "grad_norm": 1.5600448846817017, "learning_rate": 0.00040413056795309357, "epoch": 1.79 }, { "loss": 7.6296, "grad_norm": 1.4003773927688599, "learning_rate": 0.00040325544762404833, "epoch": 1.79 }, { "loss": 7.2299, "grad_norm": 1.1784484386444092, "learning_rate": 0.0004023803272950031, "epoch": 1.79 }, { "loss": 7.2215, "grad_norm": 1.0865730047225952, "learning_rate": 0.0004015052069659578, "epoch": 1.8 }, { "loss": 7.619, "grad_norm": 1.3708497285842896, "learning_rate": 0.00040063008663691256, "epoch": 1.8 }, { "loss": 7.6305, "grad_norm": 1.3728278875350952, "learning_rate": 0.0003997549663078673, "epoch": 1.8 }, { "loss": 7.4218, "grad_norm": 1.385901689529419, "learning_rate": 0.0003988798459788221, "epoch": 1.8 }, { "loss": 7.7959, "grad_norm": 1.5370672941207886, "learning_rate": 0.00039800472564977685, "epoch": 1.81 }, { "loss": 7.8249, "grad_norm": 1.039469838142395, "learning_rate": 0.0003971296053207316, "epoch": 1.81 }, { "loss": 7.4311, "grad_norm": 1.4947952032089233, "learning_rate": 0.00039625448499168637, "epoch": 1.81 }, { "loss": 7.7797, "grad_norm": 1.2262136936187744, "learning_rate": 0.00039546687669554567, "epoch": 1.81 }, { "loss": 7.8844, "grad_norm": 1.5757509469985962, "learning_rate": 0.00039459175636650043, "epoch": 1.82 }, { "loss": 7.8737, "grad_norm": 1.2183258533477783, "learning_rate": 0.0003937166360374552, "epoch": 1.82 }, { "loss": 7.3515, "grad_norm": 1.3697617053985596, "learning_rate": 0.00039284151570840995, "epoch": 1.82 }, { "loss": 7.1169, "grad_norm": 1.3007692098617554, "learning_rate": 0.00039196639537936466, "epoch": 1.82 }, { "loss": 7.2926, "grad_norm": 1.3538720607757568, "learning_rate": 0.0003910912750503194, "epoch": 1.83 }, { "loss": 7.8445, "grad_norm": 1.4245976209640503, "learning_rate": 0.0003902161547212742, "epoch": 1.83 }, { "loss": 7.456, "grad_norm": 1.323899269104004, "learning_rate": 0.00038934103439222894, "epoch": 1.83 }, { "loss": 7.6163, "grad_norm": 1.2635420560836792, "learning_rate": 0.0003884659140631837, "epoch": 1.83 }, { "loss": 7.5885, "grad_norm": 1.4714936017990112, "learning_rate": 0.00038759079373413847, "epoch": 1.84 }, { "loss": 7.8382, "grad_norm": 1.1696442365646362, "learning_rate": 0.00038671567340509323, "epoch": 1.84 }, { "loss": 7.4885, "grad_norm": 1.3797491788864136, "learning_rate": 0.000385840553076048, "epoch": 1.84 }, { "loss": 7.4614, "grad_norm": 1.0410481691360474, "learning_rate": 0.00038496543274700275, "epoch": 1.85 }, { "loss": 6.9584, "grad_norm": 1.7356559038162231, "learning_rate": 0.0003840903124179575, "epoch": 1.85 }, { "loss": 7.8161, "grad_norm": 1.326489806175232, "learning_rate": 0.0003832151920889123, "epoch": 1.85 }, { "loss": 7.6985, "grad_norm": 1.3822075128555298, "learning_rate": 0.00038234007175986704, "epoch": 1.85 }, { "loss": 7.9532, "grad_norm": 1.2612171173095703, "learning_rate": 0.00038146495143082175, "epoch": 1.86 }, { "loss": 7.3309, "grad_norm": 1.8743207454681396, "learning_rate": 0.0003805898311017765, "epoch": 1.86 }, { "loss": 7.8573, "grad_norm": 1.515641212463379, "learning_rate": 0.0003797147107727312, "epoch": 1.86 }, { "loss": 8.0815, "grad_norm": 1.970818281173706, "learning_rate": 0.000378839590443686, "epoch": 1.86 }, { "loss": 7.7197, "grad_norm": 1.6418136358261108, "learning_rate": 0.00037796447011464074, "epoch": 1.87 }, { "loss": 7.6527, "grad_norm": 1.3693944215774536, "learning_rate": 0.0003770893497855955, "epoch": 1.87 }, { "loss": 7.7717, "grad_norm": 1.311493992805481, "learning_rate": 0.00037621422945655026, "epoch": 1.87 }, { "loss": 8.0735, "grad_norm": 1.593992829322815, "learning_rate": 0.000375339109127505, "epoch": 1.87 }, { "loss": 7.4285, "grad_norm": 1.212729573249817, "learning_rate": 0.0003744639887984598, "epoch": 1.88 }, { "loss": 7.7873, "grad_norm": 1.1326895952224731, "learning_rate": 0.00037358886846941455, "epoch": 1.88 }, { "loss": 7.3515, "grad_norm": 1.3937299251556396, "learning_rate": 0.0003727137481403693, "epoch": 1.88 }, { "loss": 7.353, "grad_norm": 1.5152568817138672, "learning_rate": 0.00037183862781132407, "epoch": 1.88 }, { "loss": 7.8015, "grad_norm": 1.207973599433899, "learning_rate": 0.0003709635074822788, "epoch": 1.89 }, { "loss": 7.6713, "grad_norm": 1.003139615058899, "learning_rate": 0.00037008838715323354, "epoch": 1.89 }, { "loss": 7.9247, "grad_norm": 1.1870025396347046, "learning_rate": 0.0003692132668241883, "epoch": 1.89 }, { "loss": 7.4496, "grad_norm": 1.237275242805481, "learning_rate": 0.00036833814649514307, "epoch": 1.9 }, { "loss": 7.2638, "grad_norm": 1.7287304401397705, "learning_rate": 0.00036746302616609783, "epoch": 1.9 }, { "loss": 7.7464, "grad_norm": 1.5875813961029053, "learning_rate": 0.0003665879058370526, "epoch": 1.9 }, { "loss": 7.683, "grad_norm": 1.7219480276107788, "learning_rate": 0.00036571278550800735, "epoch": 1.9 }, { "loss": 7.6059, "grad_norm": 1.3815206289291382, "learning_rate": 0.0003648376651789621, "epoch": 1.91 }, { "loss": 7.3258, "grad_norm": 1.1902978420257568, "learning_rate": 0.0003639625448499169, "epoch": 1.91 }, { "loss": 7.4436, "grad_norm": 1.6532816886901855, "learning_rate": 0.00036308742452087164, "epoch": 1.91 }, { "loss": 7.438, "grad_norm": 1.1358212232589722, "learning_rate": 0.0003622123041918264, "epoch": 1.91 }, { "loss": 7.9777, "grad_norm": 1.3459230661392212, "learning_rate": 0.00036133718386278116, "epoch": 1.92 }, { "loss": 7.9087, "grad_norm": 1.0352368354797363, "learning_rate": 0.0003604620635337359, "epoch": 1.92 }, { "loss": 7.5855, "grad_norm": 1.2582918405532837, "learning_rate": 0.00035958694320469063, "epoch": 1.92 }, { "loss": 7.4576, "grad_norm": 1.1787996292114258, "learning_rate": 0.0003587118228756454, "epoch": 1.92 }, { "loss": 7.2572, "grad_norm": 1.2917609214782715, "learning_rate": 0.00035783670254660015, "epoch": 1.93 }, { "loss": 7.6433, "grad_norm": 1.1689330339431763, "learning_rate": 0.0003569615822175549, "epoch": 1.93 }, { "loss": 7.6579, "grad_norm": 1.2844352722167969, "learning_rate": 0.0003560864618885097, "epoch": 1.93 }, { "loss": 7.5178, "grad_norm": 1.498838186264038, "learning_rate": 0.00035521134155946444, "epoch": 1.93 }, { "loss": 7.0155, "grad_norm": 1.3718552589416504, "learning_rate": 0.0003543362212304192, "epoch": 1.94 }, { "loss": 7.7558, "grad_norm": 1.2343835830688477, "learning_rate": 0.00035346110090137396, "epoch": 1.94 }, { "loss": 7.4386, "grad_norm": 1.307979702949524, "learning_rate": 0.0003525859805723287, "epoch": 1.94 }, { "loss": 7.4287, "grad_norm": 1.46335768699646, "learning_rate": 0.0003517108602432835, "epoch": 1.95 }, { "loss": 7.0541, "grad_norm": 1.4892301559448242, "learning_rate": 0.00035083573991423825, "epoch": 1.95 }, { "loss": 7.6458, "grad_norm": 1.3297821283340454, "learning_rate": 0.000349960619585193, "epoch": 1.95 }, { "loss": 7.3704, "grad_norm": 1.9190036058425903, "learning_rate": 0.0003490854992561477, "epoch": 1.95 }, { "loss": 7.9292, "grad_norm": 1.1013009548187256, "learning_rate": 0.0003482103789271025, "epoch": 1.96 }, { "loss": 7.8039, "grad_norm": 1.284121036529541, "learning_rate": 0.00034733525859805724, "epoch": 1.96 }, { "loss": 7.7188, "grad_norm": 1.118995189666748, "learning_rate": 0.000346460138269012, "epoch": 1.96 }, { "loss": 7.3617, "grad_norm": 1.5446746349334717, "learning_rate": 0.00034558501793996676, "epoch": 1.96 }, { "loss": 7.5614, "grad_norm": 1.254835844039917, "learning_rate": 0.0003447098976109215, "epoch": 1.97 }, { "loss": 7.9923, "grad_norm": 2.215224266052246, "learning_rate": 0.0003438347772818763, "epoch": 1.97 }, { "loss": 7.6609, "grad_norm": 1.2917975187301636, "learning_rate": 0.00034295965695283105, "epoch": 1.97 }, { "loss": 6.9695, "grad_norm": 1.3251945972442627, "learning_rate": 0.0003420845366237858, "epoch": 1.97 }, { "loss": 7.4109, "grad_norm": 1.5397628545761108, "learning_rate": 0.0003412094162947406, "epoch": 1.98 }, { "loss": 7.3063, "grad_norm": 1.1789202690124512, "learning_rate": 0.00034033429596569534, "epoch": 1.98 }, { "loss": 7.8137, "grad_norm": 1.6068191528320312, "learning_rate": 0.00033945917563665004, "epoch": 1.98 }, { "loss": 7.5466, "grad_norm": 1.2397950887680054, "learning_rate": 0.00033858405530760475, "epoch": 1.98 }, { "loss": 7.9522, "grad_norm": 1.5175119638442993, "learning_rate": 0.0003377089349785595, "epoch": 1.99 }, { "loss": 7.6781, "grad_norm": 1.315258502960205, "learning_rate": 0.0003368338146495143, "epoch": 1.99 }, { "loss": 7.7292, "grad_norm": 2.664515256881714, "learning_rate": 0.00033595869432046904, "epoch": 1.99 }, { "loss": 8.1965, "grad_norm": 1.405129313468933, "learning_rate": 0.0003350835739914238, "epoch": 1.99 }, { "loss": 7.4133, "grad_norm": 1.0774602890014648, "learning_rate": 0.00033420845366237856, "epoch": 2.0 }, { "loss": 8.1777, "grad_norm": 1.75553560256958, "learning_rate": 0.0003333333333333333, "epoch": 2.0 }, { "loss": 7.5693, "grad_norm": 1.857081651687622, "learning_rate": 0.0003324582130042881, "epoch": 2.0 }, { "loss": 7.4888, "grad_norm": 1.0721529722213745, "learning_rate": 0.00033158309267524285, "epoch": 2.01 }, { "loss": 7.1311, "grad_norm": 1.0766797065734863, "learning_rate": 0.0003307079723461976, "epoch": 2.01 }, { "loss": 7.5107, "grad_norm": 1.4615150690078735, "learning_rate": 0.00032983285201715237, "epoch": 2.01 }, { "loss": 7.5258, "grad_norm": 1.4252068996429443, "learning_rate": 0.00032895773168810713, "epoch": 2.01 }, { "loss": 7.6049, "grad_norm": 1.2926585674285889, "learning_rate": 0.0003280826113590619, "epoch": 2.02 }, { "loss": 7.2436, "grad_norm": 1.6630724668502808, "learning_rate": 0.0003272074910300166, "epoch": 2.02 }, { "loss": 6.951, "grad_norm": 1.2705895900726318, "learning_rate": 0.00032633237070097136, "epoch": 2.02 }, { "loss": 7.4782, "grad_norm": 1.6801918745040894, "learning_rate": 0.0003254572503719261, "epoch": 2.02 }, { "loss": 7.7247, "grad_norm": 1.2789455652236938, "learning_rate": 0.0003245821300428809, "epoch": 2.03 }, { "loss": 7.65, "grad_norm": 1.0772324800491333, "learning_rate": 0.00032370700971383565, "epoch": 2.03 }, { "loss": 7.3484, "grad_norm": 1.218855857849121, "learning_rate": 0.0003228318893847904, "epoch": 2.03 }, { "loss": 7.7201, "grad_norm": 1.7484831809997559, "learning_rate": 0.00032195676905574517, "epoch": 2.03 }, { "loss": 7.606, "grad_norm": 1.4081809520721436, "learning_rate": 0.00032108164872669993, "epoch": 2.04 }, { "loss": 7.4735, "grad_norm": 1.2214211225509644, "learning_rate": 0.0003202065283976547, "epoch": 2.04 }, { "loss": 7.3052, "grad_norm": 2.243197441101074, "learning_rate": 0.00031933140806860946, "epoch": 2.04 }, { "loss": 7.2611, "grad_norm": 1.0560696125030518, "learning_rate": 0.0003184562877395642, "epoch": 2.04 }, { "loss": 7.3347, "grad_norm": 1.3903985023498535, "learning_rate": 0.000317581167410519, "epoch": 2.05 }, { "loss": 7.4106, "grad_norm": 1.285888910293579, "learning_rate": 0.0003167060470814737, "epoch": 2.05 }, { "loss": 7.3237, "grad_norm": 1.6455745697021484, "learning_rate": 0.00031583092675242845, "epoch": 2.05 }, { "loss": 7.4445, "grad_norm": 1.3552714586257935, "learning_rate": 0.0003149558064233832, "epoch": 2.06 }, { "loss": 7.3175, "grad_norm": 1.4250375032424927, "learning_rate": 0.000314080686094338, "epoch": 2.06 }, { "loss": 7.5334, "grad_norm": 1.8445017337799072, "learning_rate": 0.00031320556576529274, "epoch": 2.06 }, { "loss": 7.7627, "grad_norm": 1.1116868257522583, "learning_rate": 0.0003123304454362475, "epoch": 2.06 }, { "loss": 7.5347, "grad_norm": 1.1636768579483032, "learning_rate": 0.00031145532510720226, "epoch": 2.07 }, { "loss": 7.6581, "grad_norm": 1.4612860679626465, "learning_rate": 0.000310580204778157, "epoch": 2.07 }, { "loss": 7.6164, "grad_norm": 1.4403191804885864, "learning_rate": 0.0003097050844491118, "epoch": 2.07 }, { "loss": 7.3776, "grad_norm": 1.366955041885376, "learning_rate": 0.00030882996412006655, "epoch": 2.07 }, { "loss": 7.556, "grad_norm": 1.4476971626281738, "learning_rate": 0.0003079548437910213, "epoch": 2.08 }, { "loss": 7.6019, "grad_norm": 1.4753084182739258, "learning_rate": 0.00030707972346197607, "epoch": 2.08 }, { "loss": 7.8493, "grad_norm": 1.2335758209228516, "learning_rate": 0.00030620460313293083, "epoch": 2.08 }, { "loss": 7.9252, "grad_norm": 1.3958989381790161, "learning_rate": 0.00030532948280388554, "epoch": 2.08 }, { "loss": 7.2945, "grad_norm": 1.4621672630310059, "learning_rate": 0.0003044543624748403, "epoch": 2.09 }, { "loss": 7.3977, "grad_norm": 1.428195834159851, "learning_rate": 0.00030357924214579506, "epoch": 2.09 }, { "loss": 7.74, "grad_norm": 1.363600492477417, "learning_rate": 0.0003027041218167498, "epoch": 2.09 }, { "loss": 7.4894, "grad_norm": 1.2117736339569092, "learning_rate": 0.0003018290014877046, "epoch": 2.09 }, { "loss": 7.5678, "grad_norm": 1.9844530820846558, "learning_rate": 0.00030095388115865935, "epoch": 2.1 }, { "loss": 7.6681, "grad_norm": 1.3558523654937744, "learning_rate": 0.0003000787608296141, "epoch": 2.1 }, { "loss": 7.9793, "grad_norm": 1.3802049160003662, "learning_rate": 0.0002992036405005688, "epoch": 2.1 }, { "loss": 8.1848, "grad_norm": 1.845702886581421, "learning_rate": 0.0002983285201715236, "epoch": 2.11 }, { "loss": 7.2184, "grad_norm": 1.4479707479476929, "learning_rate": 0.00029745339984247834, "epoch": 2.11 }, { "loss": 7.4373, "grad_norm": 1.9233028888702393, "learning_rate": 0.0002965782795134331, "epoch": 2.11 }, { "loss": 7.2478, "grad_norm": 1.3621513843536377, "learning_rate": 0.00029570315918438786, "epoch": 2.11 }, { "loss": 7.5867, "grad_norm": 1.449763298034668, "learning_rate": 0.00029482803885534257, "epoch": 2.12 }, { "loss": 7.2909, "grad_norm": 1.543834924697876, "learning_rate": 0.00029395291852629733, "epoch": 2.12 }, { "loss": 7.5481, "grad_norm": 1.2582162618637085, "learning_rate": 0.0002930777981972521, "epoch": 2.12 }, { "loss": 7.2092, "grad_norm": 1.25532865524292, "learning_rate": 0.00029220267786820686, "epoch": 2.12 }, { "loss": 7.5117, "grad_norm": 1.4368300437927246, "learning_rate": 0.0002913275575391616, "epoch": 2.13 }, { "loss": 7.8661, "grad_norm": 1.4054632186889648, "learning_rate": 0.0002904524372101164, "epoch": 2.13 }, { "loss": 7.7641, "grad_norm": 1.4426825046539307, "learning_rate": 0.00028957731688107114, "epoch": 2.13 }, { "loss": 6.9808, "grad_norm": 1.6069836616516113, "learning_rate": 0.0002887021965520259, "epoch": 2.13 }, { "loss": 8.0412, "grad_norm": 1.603289246559143, "learning_rate": 0.00028782707622298067, "epoch": 2.14 }, { "loss": 7.7541, "grad_norm": 1.2069703340530396, "learning_rate": 0.00028695195589393543, "epoch": 2.14 }, { "loss": 7.5413, "grad_norm": 1.2976186275482178, "learning_rate": 0.0002860768355648902, "epoch": 2.14 }, { "loss": 7.6833, "grad_norm": 1.4646226167678833, "learning_rate": 0.00028520171523584495, "epoch": 2.14 }, { "loss": 7.3603, "grad_norm": 1.3783011436462402, "learning_rate": 0.00028432659490679966, "epoch": 2.15 }, { "loss": 7.1131, "grad_norm": 1.1677837371826172, "learning_rate": 0.0002834514745777544, "epoch": 2.15 }, { "loss": 7.8353, "grad_norm": 1.5966696739196777, "learning_rate": 0.0002825763542487092, "epoch": 2.15 }, { "loss": 7.651, "grad_norm": 1.3074275255203247, "learning_rate": 0.00028170123391966394, "epoch": 2.16 }, { "loss": 6.8535, "grad_norm": 1.2238943576812744, "learning_rate": 0.0002808261135906187, "epoch": 2.16 }, { "loss": 7.1677, "grad_norm": 1.2107079029083252, "learning_rate": 0.00027995099326157347, "epoch": 2.16 }, { "loss": 7.1232, "grad_norm": 1.482686996459961, "learning_rate": 0.00027907587293252823, "epoch": 2.16 }, { "loss": 7.6958, "grad_norm": 1.9235337972640991, "learning_rate": 0.000278200752603483, "epoch": 2.17 }, { "loss": 7.5763, "grad_norm": 1.0629470348358154, "learning_rate": 0.00027732563227443775, "epoch": 2.17 }, { "loss": 7.417, "grad_norm": 1.4404977560043335, "learning_rate": 0.0002764505119453925, "epoch": 2.17 }, { "loss": 7.4457, "grad_norm": 1.6266590356826782, "learning_rate": 0.0002755753916163473, "epoch": 2.17 }, { "loss": 7.6768, "grad_norm": 1.4418647289276123, "learning_rate": 0.00027470027128730204, "epoch": 2.18 }, { "loss": 7.7301, "grad_norm": 1.7269823551177979, "learning_rate": 0.0002738251509582568, "epoch": 2.18 }, { "loss": 7.1704, "grad_norm": 1.9527968168258667, "learning_rate": 0.0002729500306292115, "epoch": 2.18 }, { "loss": 8.0284, "grad_norm": 1.1195765733718872, "learning_rate": 0.00027207491030016627, "epoch": 2.18 }, { "loss": 7.876, "grad_norm": 1.381032109260559, "learning_rate": 0.00027119978997112103, "epoch": 2.19 }, { "loss": 7.4609, "grad_norm": 2.2558112144470215, "learning_rate": 0.0002703246696420758, "epoch": 2.19 }, { "loss": 7.524, "grad_norm": 1.0892398357391357, "learning_rate": 0.00026944954931303056, "epoch": 2.19 }, { "loss": 7.1756, "grad_norm": 1.432793140411377, "learning_rate": 0.0002685744289839853, "epoch": 2.19 }, { "loss": 7.4677, "grad_norm": 2.4381473064422607, "learning_rate": 0.0002676993086549401, "epoch": 2.2 }, { "loss": 7.2004, "grad_norm": 1.0947704315185547, "learning_rate": 0.00026682418832589484, "epoch": 2.2 }, { "loss": 7.6084, "grad_norm": 1.1396403312683105, "learning_rate": 0.0002659490679968496, "epoch": 2.2 }, { "loss": 7.4592, "grad_norm": 1.7132469415664673, "learning_rate": 0.00026507394766780437, "epoch": 2.2 }, { "loss": 7.6666, "grad_norm": 1.507416844367981, "learning_rate": 0.00026419882733875913, "epoch": 2.21 }, { "loss": 7.9483, "grad_norm": 1.997502326965332, "learning_rate": 0.0002633237070097139, "epoch": 2.21 }, { "loss": 6.8979, "grad_norm": 1.180274486541748, "learning_rate": 0.0002624485866806686, "epoch": 2.21 }, { "loss": 7.5387, "grad_norm": 1.4130629301071167, "learning_rate": 0.00026157346635162336, "epoch": 2.22 }, { "loss": 7.7374, "grad_norm": 1.9466407299041748, "learning_rate": 0.0002606983460225781, "epoch": 2.22 }, { "loss": 7.2489, "grad_norm": 1.2844946384429932, "learning_rate": 0.00025982322569353283, "epoch": 2.22 }, { "loss": 7.2583, "grad_norm": 1.4728493690490723, "learning_rate": 0.0002589481053644876, "epoch": 2.22 }, { "loss": 7.1689, "grad_norm": 1.505767583847046, "learning_rate": 0.00025807298503544235, "epoch": 2.23 }, { "loss": 7.3824, "grad_norm": 1.164609432220459, "learning_rate": 0.0002571978647063971, "epoch": 2.23 }, { "loss": 8.208, "grad_norm": 1.3337666988372803, "learning_rate": 0.0002563227443773519, "epoch": 2.23 }, { "loss": 7.1503, "grad_norm": 1.2840052843093872, "learning_rate": 0.00025544762404830664, "epoch": 2.23 }, { "loss": 7.7838, "grad_norm": 1.6767994165420532, "learning_rate": 0.0002545725037192614, "epoch": 2.24 }, { "loss": 7.4818, "grad_norm": 1.2790688276290894, "learning_rate": 0.00025369738339021616, "epoch": 2.24 }, { "loss": 7.1404, "grad_norm": 1.9306037425994873, "learning_rate": 0.0002528222630611709, "epoch": 2.24 }, { "loss": 6.9151, "grad_norm": 1.0568101406097412, "learning_rate": 0.00025194714273212563, "epoch": 2.24 }, { "loss": 7.5813, "grad_norm": 1.8494940996170044, "learning_rate": 0.0002510720224030804, "epoch": 2.25 }, { "loss": 7.1433, "grad_norm": 1.2321641445159912, "learning_rate": 0.00025019690207403515, "epoch": 2.25 }, { "loss": 7.0211, "grad_norm": 1.5231260061264038, "learning_rate": 0.0002493217817449899, "epoch": 2.25 }, { "loss": 7.5108, "grad_norm": 1.6787548065185547, "learning_rate": 0.0002484466614159447, "epoch": 2.25 }, { "loss": 7.5859, "grad_norm": 1.8862128257751465, "learning_rate": 0.00024757154108689944, "epoch": 2.26 }, { "loss": 7.0871, "grad_norm": 1.5295615196228027, "learning_rate": 0.0002466964207578542, "epoch": 2.26 }, { "loss": 7.2151, "grad_norm": 1.6439179182052612, "learning_rate": 0.00024582130042880896, "epoch": 2.26 }, { "loss": 7.851, "grad_norm": 1.5902001857757568, "learning_rate": 0.0002449461800997637, "epoch": 2.27 }, { "loss": 7.695, "grad_norm": 1.447240948677063, "learning_rate": 0.00024407105977071846, "epoch": 2.27 }, { "loss": 7.218, "grad_norm": 1.7448298931121826, "learning_rate": 0.00024319593944167322, "epoch": 2.27 }, { "loss": 7.4559, "grad_norm": 1.7815390825271606, "learning_rate": 0.00024232081911262798, "epoch": 2.27 }, { "loss": 7.5519, "grad_norm": 1.746805191040039, "learning_rate": 0.00024144569878358275, "epoch": 2.28 }, { "loss": 7.4818, "grad_norm": 1.771155834197998, "learning_rate": 0.0002405705784545375, "epoch": 2.28 }, { "loss": 7.8775, "grad_norm": 1.2886364459991455, "learning_rate": 0.00023969545812549227, "epoch": 2.28 }, { "loss": 7.0862, "grad_norm": 1.3562748432159424, "learning_rate": 0.00023882033779644703, "epoch": 2.28 }, { "loss": 7.4458, "grad_norm": 1.5549288988113403, "learning_rate": 0.00023794521746740177, "epoch": 2.29 }, { "loss": 7.5017, "grad_norm": 1.3231199979782104, "learning_rate": 0.00023707009713835653, "epoch": 2.29 }, { "loss": 6.9317, "grad_norm": 1.0973995923995972, "learning_rate": 0.0002361949768093113, "epoch": 2.29 }, { "loss": 7.2512, "grad_norm": 1.161665916442871, "learning_rate": 0.00023531985648026605, "epoch": 2.29 }, { "loss": 7.3376, "grad_norm": 1.1249802112579346, "learning_rate": 0.0002344447361512208, "epoch": 2.3 }, { "loss": 7.6856, "grad_norm": 1.4549752473831177, "learning_rate": 0.00023356961582217557, "epoch": 2.3 }, { "loss": 7.6518, "grad_norm": 1.2443310022354126, "learning_rate": 0.0002326944954931303, "epoch": 2.3 }, { "loss": 7.9287, "grad_norm": 1.2414274215698242, "learning_rate": 0.00023181937516408507, "epoch": 2.3 }, { "loss": 7.4844, "grad_norm": 1.250632882118225, "learning_rate": 0.00023094425483503983, "epoch": 2.31 }, { "loss": 6.9439, "grad_norm": 1.5678353309631348, "learning_rate": 0.0002300691345059946, "epoch": 2.31 }, { "loss": 7.2214, "grad_norm": 1.2777363061904907, "learning_rate": 0.00022919401417694933, "epoch": 2.31 }, { "loss": 7.6909, "grad_norm": 1.1702243089675903, "learning_rate": 0.0002283188938479041, "epoch": 2.32 }, { "loss": 7.843, "grad_norm": 1.1647387742996216, "learning_rate": 0.00022744377351885883, "epoch": 2.32 }, { "loss": 7.5598, "grad_norm": 1.5888360738754272, "learning_rate": 0.0002265686531898136, "epoch": 2.32 }, { "loss": 7.4084, "grad_norm": 1.2132010459899902, "learning_rate": 0.00022569353286076835, "epoch": 2.32 }, { "loss": 8.0077, "grad_norm": 1.3676106929779053, "learning_rate": 0.0002248184125317231, "epoch": 2.33 }, { "loss": 7.4475, "grad_norm": 1.4785172939300537, "learning_rate": 0.00022394329220267787, "epoch": 2.33 }, { "loss": 7.4934, "grad_norm": 1.6854803562164307, "learning_rate": 0.00022306817187363264, "epoch": 2.33 }, { "loss": 7.5371, "grad_norm": 1.3336540460586548, "learning_rate": 0.00022219305154458737, "epoch": 2.33 }, { "loss": 7.091, "grad_norm": 1.5374839305877686, "learning_rate": 0.00022131793121554213, "epoch": 2.34 }, { "loss": 7.5715, "grad_norm": 1.259857177734375, "learning_rate": 0.0002204428108864969, "epoch": 2.34 }, { "loss": 7.5012, "grad_norm": 1.435889482498169, "learning_rate": 0.00021956769055745166, "epoch": 2.34 }, { "loss": 7.5925, "grad_norm": 1.6067544221878052, "learning_rate": 0.00021869257022840642, "epoch": 2.34 }, { "loss": 7.2756, "grad_norm": 1.2057377099990845, "learning_rate": 0.00021781744989936118, "epoch": 2.35 }, { "loss": 7.0737, "grad_norm": 1.0249065160751343, "learning_rate": 0.00021694232957031591, "epoch": 2.35 }, { "loss": 7.2857, "grad_norm": 1.1336891651153564, "learning_rate": 0.00021606720924127068, "epoch": 2.35 }, { "loss": 7.0709, "grad_norm": 1.1853156089782715, "learning_rate": 0.00021519208891222544, "epoch": 2.35 }, { "loss": 6.9118, "grad_norm": 1.4682341814041138, "learning_rate": 0.0002143169685831802, "epoch": 2.36 }, { "loss": 7.3363, "grad_norm": 1.3039721250534058, "learning_rate": 0.00021344184825413496, "epoch": 2.36 }, { "loss": 7.2827, "grad_norm": 1.28932785987854, "learning_rate": 0.00021256672792508972, "epoch": 2.36 }, { "loss": 7.6069, "grad_norm": 1.7343271970748901, "learning_rate": 0.00021169160759604448, "epoch": 2.37 }, { "loss": 7.3543, "grad_norm": 1.9730132818222046, "learning_rate": 0.00021081648726699922, "epoch": 2.37 }, { "loss": 7.3351, "grad_norm": 2.070822238922119, "learning_rate": 0.00020994136693795395, "epoch": 2.37 }, { "loss": 7.3199, "grad_norm": 1.1327873468399048, "learning_rate": 0.00020906624660890872, "epoch": 2.37 }, { "loss": 7.4058, "grad_norm": 1.3796617984771729, "learning_rate": 0.00020819112627986348, "epoch": 2.38 }, { "loss": 7.3027, "grad_norm": 1.8397942781448364, "learning_rate": 0.00020731600595081824, "epoch": 2.38 }, { "loss": 7.6354, "grad_norm": 1.4503923654556274, "learning_rate": 0.0002065283976546775, "epoch": 2.38 }, { "loss": 7.2284, "grad_norm": 1.550950527191162, "learning_rate": 0.00020565327732563227, "epoch": 2.38 }, { "loss": 7.3061, "grad_norm": 1.5306216478347778, "learning_rate": 0.00020477815699658703, "epoch": 2.39 }, { "loss": 7.3337, "grad_norm": 1.269167184829712, "learning_rate": 0.0002039030366675418, "epoch": 2.39 }, { "loss": 7.7686, "grad_norm": 1.600019931793213, "learning_rate": 0.00020302791633849656, "epoch": 2.39 }, { "loss": 7.35, "grad_norm": 1.5773662328720093, "learning_rate": 0.0002021527960094513, "epoch": 2.39 }, { "loss": 7.3691, "grad_norm": 1.547160029411316, "learning_rate": 0.00020127767568040605, "epoch": 2.4 }, { "loss": 7.4863, "grad_norm": 1.4968856573104858, "learning_rate": 0.00020040255535136081, "epoch": 2.4 }, { "loss": 7.9482, "grad_norm": 1.2087891101837158, "learning_rate": 0.00019952743502231558, "epoch": 2.4 }, { "loss": 7.0255, "grad_norm": 1.290597677230835, "learning_rate": 0.00019865231469327034, "epoch": 2.4 }, { "loss": 7.178, "grad_norm": 1.5743247270584106, "learning_rate": 0.0001977771943642251, "epoch": 2.41 }, { "loss": 7.6474, "grad_norm": 1.5197412967681885, "learning_rate": 0.00019690207403517984, "epoch": 2.41 }, { "loss": 7.3527, "grad_norm": 1.4716495275497437, "learning_rate": 0.0001960269537061346, "epoch": 2.41 }, { "loss": 7.6313, "grad_norm": 1.9746785163879395, "learning_rate": 0.00019515183337708936, "epoch": 2.41 }, { "loss": 7.6972, "grad_norm": 1.2683417797088623, "learning_rate": 0.00019427671304804412, "epoch": 2.42 }, { "loss": 7.1378, "grad_norm": 1.1373748779296875, "learning_rate": 0.00019340159271899888, "epoch": 2.42 }, { "loss": 7.0196, "grad_norm": 1.4191349744796753, "learning_rate": 0.00019252647238995364, "epoch": 2.42 }, { "loss": 6.9102, "grad_norm": 1.6580002307891846, "learning_rate": 0.00019165135206090838, "epoch": 2.43 }, { "loss": 7.5105, "grad_norm": 1.2877469062805176, "learning_rate": 0.00019077623173186314, "epoch": 2.43 }, { "loss": 8.0212, "grad_norm": 1.2933236360549927, "learning_rate": 0.00018990111140281788, "epoch": 2.43 }, { "loss": 7.2108, "grad_norm": 1.6515684127807617, "learning_rate": 0.00018902599107377264, "epoch": 2.43 }, { "loss": 7.2944, "grad_norm": 1.443547010421753, "learning_rate": 0.0001881508707447274, "epoch": 2.44 }, { "loss": 6.9623, "grad_norm": 1.5022013187408447, "learning_rate": 0.00018727575041568216, "epoch": 2.44 }, { "loss": 7.5751, "grad_norm": 1.639228343963623, "learning_rate": 0.0001864006300866369, "epoch": 2.44 }, { "loss": 7.6183, "grad_norm": 1.3685816526412964, "learning_rate": 0.00018552550975759166, "epoch": 2.44 }, { "loss": 7.7862, "grad_norm": 1.4008909463882446, "learning_rate": 0.00018465038942854642, "epoch": 2.45 }, { "loss": 7.3036, "grad_norm": 1.4068384170532227, "learning_rate": 0.00018377526909950118, "epoch": 2.45 }, { "loss": 7.3222, "grad_norm": 1.4874199628829956, "learning_rate": 0.00018290014877045594, "epoch": 2.45 }, { "loss": 7.4538, "grad_norm": 2.161606788635254, "learning_rate": 0.0001820250284414107, "epoch": 2.45 }, { "loss": 7.099, "grad_norm": 1.4761602878570557, "learning_rate": 0.00018114990811236544, "epoch": 2.46 }, { "loss": 7.6725, "grad_norm": 1.3598577976226807, "learning_rate": 0.0001802747877833202, "epoch": 2.46 }, { "loss": 7.4651, "grad_norm": 1.352389931678772, "learning_rate": 0.00017939966745427496, "epoch": 2.46 }, { "loss": 7.0266, "grad_norm": 1.302270770072937, "learning_rate": 0.00017852454712522973, "epoch": 2.46 }, { "loss": 7.4879, "grad_norm": 1.2166621685028076, "learning_rate": 0.0001776494267961845, "epoch": 2.47 }, { "loss": 6.7354, "grad_norm": 1.4442105293273926, "learning_rate": 0.00017677430646713925, "epoch": 2.47 }, { "loss": 7.1184, "grad_norm": 1.6301904916763306, "learning_rate": 0.000175899186138094, "epoch": 2.47 }, { "loss": 7.4326, "grad_norm": 1.2478090524673462, "learning_rate": 0.00017502406580904875, "epoch": 2.48 }, { "loss": 7.6185, "grad_norm": 1.2676613330841064, "learning_rate": 0.0001741489454800035, "epoch": 2.48 }, { "loss": 7.439, "grad_norm": 1.4324458837509155, "learning_rate": 0.00017327382515095827, "epoch": 2.48 }, { "loss": 7.7999, "grad_norm": 1.634446382522583, "learning_rate": 0.00017239870482191303, "epoch": 2.48 }, { "loss": 7.3043, "grad_norm": 1.2877479791641235, "learning_rate": 0.0001715235844928678, "epoch": 2.49 }, { "loss": 7.054, "grad_norm": 1.7003803253173828, "learning_rate": 0.00017064846416382255, "epoch": 2.49 }, { "loss": 7.1568, "grad_norm": 1.8888310194015503, "learning_rate": 0.00016977334383477726, "epoch": 2.49 }, { "loss": 7.3495, "grad_norm": 1.2593083381652832, "learning_rate": 0.00016889822350573202, "epoch": 2.49 }, { "loss": 7.4716, "grad_norm": 1.4410508871078491, "learning_rate": 0.00016802310317668679, "epoch": 2.5 }, { "loss": 7.5133, "grad_norm": 1.20904541015625, "learning_rate": 0.00016714798284764155, "epoch": 2.5 }, { "loss": 7.3222, "grad_norm": 1.4503611326217651, "learning_rate": 0.0001662728625185963, "epoch": 2.5 }, { "loss": 7.6387, "grad_norm": 1.3705183267593384, "learning_rate": 0.00016539774218955107, "epoch": 2.5 }, { "loss": 7.0609, "grad_norm": 1.2106906175613403, "learning_rate": 0.0001645226218605058, "epoch": 2.51 }, { "loss": 7.342, "grad_norm": 1.5564229488372803, "learning_rate": 0.00016364750153146057, "epoch": 2.51 }, { "loss": 7.8121, "grad_norm": 1.6493812799453735, "learning_rate": 0.00016277238120241533, "epoch": 2.51 }, { "loss": 7.3909, "grad_norm": 1.9025623798370361, "learning_rate": 0.0001618972608733701, "epoch": 2.51 }, { "loss": 7.0106, "grad_norm": 1.2934685945510864, "learning_rate": 0.00016102214054432485, "epoch": 2.52 }, { "loss": 7.5199, "grad_norm": 1.2549662590026855, "learning_rate": 0.00016014702021527962, "epoch": 2.52 }, { "loss": 7.3509, "grad_norm": 1.2111480236053467, "learning_rate": 0.00015927189988623435, "epoch": 2.52 }, { "loss": 7.5281, "grad_norm": 2.2498984336853027, "learning_rate": 0.0001583967795571891, "epoch": 2.53 }, { "loss": 7.5218, "grad_norm": 1.4710973501205444, "learning_rate": 0.00015752165922814387, "epoch": 2.53 }, { "loss": 7.1575, "grad_norm": 1.4040391445159912, "learning_rate": 0.00015664653889909864, "epoch": 2.53 }, { "loss": 7.3097, "grad_norm": 2.3657708168029785, "learning_rate": 0.0001557714185700534, "epoch": 2.53 }, { "loss": 7.3235, "grad_norm": 1.8456711769104004, "learning_rate": 0.00015489629824100816, "epoch": 2.54 }, { "loss": 7.1772, "grad_norm": 1.3032398223876953, "learning_rate": 0.0001540211779119629, "epoch": 2.54 }, { "loss": 7.331, "grad_norm": 1.2472988367080688, "learning_rate": 0.00015314605758291766, "epoch": 2.54 }, { "loss": 6.9758, "grad_norm": 1.1861238479614258, "learning_rate": 0.00015227093725387242, "epoch": 2.54 }, { "loss": 7.357, "grad_norm": 1.2937425374984741, "learning_rate": 0.00015139581692482718, "epoch": 2.55 }, { "loss": 7.6132, "grad_norm": 1.5241109132766724, "learning_rate": 0.00015052069659578194, "epoch": 2.55 }, { "loss": 7.1769, "grad_norm": 1.2426915168762207, "learning_rate": 0.00014964557626673668, "epoch": 2.55 }, { "loss": 7.2242, "grad_norm": 1.5336363315582275, "learning_rate": 0.0001487704559376914, "epoch": 2.55 }, { "loss": 8.0839, "grad_norm": 1.6944379806518555, "learning_rate": 0.00014789533560864617, "epoch": 2.56 }, { "loss": 7.2667, "grad_norm": 1.6602429151535034, "learning_rate": 0.00014702021527960093, "epoch": 2.56 }, { "loss": 7.4821, "grad_norm": 1.331986665725708, "learning_rate": 0.0001461450949505557, "epoch": 2.56 }, { "loss": 7.4808, "grad_norm": 1.4923409223556519, "learning_rate": 0.00014526997462151046, "epoch": 2.56 }, { "loss": 7.3579, "grad_norm": 1.5323739051818848, "learning_rate": 0.00014439485429246522, "epoch": 2.57 }, { "loss": 7.1833, "grad_norm": 1.0281411409378052, "learning_rate": 0.00014351973396341998, "epoch": 2.57 }, { "loss": 7.521, "grad_norm": 1.777385950088501, "learning_rate": 0.00014264461363437472, "epoch": 2.57 }, { "loss": 7.5531, "grad_norm": 1.7528423070907593, "learning_rate": 0.00014176949330532948, "epoch": 2.58 }, { "loss": 7.3295, "grad_norm": 1.665503740310669, "learning_rate": 0.00014089437297628424, "epoch": 2.58 }, { "loss": 6.9815, "grad_norm": 1.4323763847351074, "learning_rate": 0.000140019252647239, "epoch": 2.58 }, { "loss": 7.7957, "grad_norm": 1.2623038291931152, "learning_rate": 0.00013914413231819376, "epoch": 2.58 }, { "loss": 7.2667, "grad_norm": 1.3770829439163208, "learning_rate": 0.00013826901198914853, "epoch": 2.59 }, { "loss": 7.2641, "grad_norm": 1.495597243309021, "learning_rate": 0.00013739389166010326, "epoch": 2.59 }, { "loss": 7.6276, "grad_norm": 1.0396783351898193, "learning_rate": 0.00013651877133105802, "epoch": 2.59 }, { "loss": 7.4811, "grad_norm": 1.5590603351593018, "learning_rate": 0.00013564365100201278, "epoch": 2.59 }, { "loss": 6.9941, "grad_norm": 1.266262173652649, "learning_rate": 0.00013476853067296755, "epoch": 2.6 }, { "loss": 7.0138, "grad_norm": 1.3331608772277832, "learning_rate": 0.0001338934103439223, "epoch": 2.6 }, { "loss": 7.6792, "grad_norm": 1.54330575466156, "learning_rate": 0.00013301829001487707, "epoch": 2.6 }, { "loss": 7.5151, "grad_norm": 1.266360878944397, "learning_rate": 0.0001321431696858318, "epoch": 2.6 }, { "loss": 7.6357, "grad_norm": 1.1992617845535278, "learning_rate": 0.00013126804935678657, "epoch": 2.61 }, { "loss": 7.6848, "grad_norm": 1.6269259452819824, "learning_rate": 0.00013039292902774133, "epoch": 2.61 }, { "loss": 7.3941, "grad_norm": 1.4221471548080444, "learning_rate": 0.00012951780869869606, "epoch": 2.61 }, { "loss": 7.5638, "grad_norm": 1.31778085231781, "learning_rate": 0.00012864268836965082, "epoch": 2.61 }, { "loss": 7.3716, "grad_norm": 1.4217979907989502, "learning_rate": 0.00012776756804060559, "epoch": 2.62 }, { "loss": 7.7403, "grad_norm": 1.549012541770935, "learning_rate": 0.00012689244771156032, "epoch": 2.62 }, { "loss": 7.5079, "grad_norm": 1.7808821201324463, "learning_rate": 0.00012601732738251508, "epoch": 2.62 }, { "loss": 7.338, "grad_norm": 1.6030139923095703, "learning_rate": 0.00012514220705346984, "epoch": 2.62 }, { "loss": 7.2113, "grad_norm": 1.688103437423706, "learning_rate": 0.0001242670867244246, "epoch": 2.63 }, { "loss": 7.5297, "grad_norm": 1.4482861757278442, "learning_rate": 0.00012339196639537937, "epoch": 2.63 }, { "loss": 7.6226, "grad_norm": 1.481149435043335, "learning_rate": 0.00012251684606633413, "epoch": 2.63 }, { "loss": 7.1199, "grad_norm": 1.5914816856384277, "learning_rate": 0.00012164172573728888, "epoch": 2.64 }, { "loss": 7.5294, "grad_norm": 1.6436686515808105, "learning_rate": 0.00012076660540824364, "epoch": 2.64 }, { "loss": 7.7319, "grad_norm": 1.422884225845337, "learning_rate": 0.00011989148507919839, "epoch": 2.64 }, { "loss": 7.5878, "grad_norm": 1.2468681335449219, "learning_rate": 0.00011901636475015315, "epoch": 2.64 }, { "loss": 7.4093, "grad_norm": 1.6080206632614136, "learning_rate": 0.00011814124442110791, "epoch": 2.65 }, { "loss": 6.927, "grad_norm": 1.2568819522857666, "learning_rate": 0.00011726612409206266, "epoch": 2.65 }, { "loss": 7.524, "grad_norm": 1.4558569192886353, "learning_rate": 0.00011639100376301742, "epoch": 2.65 }, { "loss": 6.7721, "grad_norm": 1.3554805517196655, "learning_rate": 0.00011551588343397218, "epoch": 2.65 }, { "loss": 7.5129, "grad_norm": 2.061342239379883, "learning_rate": 0.00011464076310492692, "epoch": 2.66 }, { "loss": 7.271, "grad_norm": 1.7581554651260376, "learning_rate": 0.00011376564277588168, "epoch": 2.66 }, { "loss": 7.4605, "grad_norm": 1.3818498849868774, "learning_rate": 0.00011289052244683644, "epoch": 2.66 }, { "loss": 7.2747, "grad_norm": 1.4640157222747803, "learning_rate": 0.00011201540211779119, "epoch": 2.66 }, { "loss": 7.4137, "grad_norm": 1.628440499305725, "learning_rate": 0.00011114028178874595, "epoch": 2.67 }, { "loss": 7.1947, "grad_norm": 2.1291253566741943, "learning_rate": 0.00011026516145970071, "epoch": 2.67 }, { "loss": 7.3972, "grad_norm": 1.53203284740448, "learning_rate": 0.00010939004113065546, "epoch": 2.67 }, { "loss": 7.1343, "grad_norm": 1.7009447813034058, "learning_rate": 0.00010851492080161022, "epoch": 2.67 }, { "loss": 7.4999, "grad_norm": 1.981833815574646, "learning_rate": 0.00010763980047256499, "epoch": 2.68 }, { "loss": 7.0649, "grad_norm": 1.4151135683059692, "learning_rate": 0.00010676468014351973, "epoch": 2.68 }, { "loss": 7.4975, "grad_norm": 1.8214997053146362, "learning_rate": 0.0001058895598144745, "epoch": 2.68 }, { "loss": 7.1928, "grad_norm": 1.475014328956604, "learning_rate": 0.00010501443948542926, "epoch": 2.69 }, { "loss": 6.7309, "grad_norm": 1.500470757484436, "learning_rate": 0.00010413931915638399, "epoch": 2.69 }, { "loss": 7.2154, "grad_norm": 1.0923032760620117, "learning_rate": 0.00010326419882733875, "epoch": 2.69 }, { "loss": 7.4584, "grad_norm": 1.476189136505127, "learning_rate": 0.00010238907849829352, "epoch": 2.69 }, { "loss": 7.5696, "grad_norm": 1.3299099206924438, "learning_rate": 0.00010151395816924828, "epoch": 2.7 }, { "loss": 7.4462, "grad_norm": 1.248026967048645, "learning_rate": 0.00010063883784020303, "epoch": 2.7 }, { "loss": 7.057, "grad_norm": 1.5154845714569092, "learning_rate": 9.976371751115779e-05, "epoch": 2.7 }, { "loss": 7.4942, "grad_norm": 1.504868745803833, "learning_rate": 9.888859718211255e-05, "epoch": 2.7 }, { "loss": 7.7042, "grad_norm": 1.2087482213974, "learning_rate": 9.80134768530673e-05, "epoch": 2.71 }, { "loss": 7.7138, "grad_norm": 2.066254138946533, "learning_rate": 9.713835652402206e-05, "epoch": 2.71 }, { "loss": 7.4746, "grad_norm": 1.2078548669815063, "learning_rate": 9.626323619497682e-05, "epoch": 2.71 }, { "loss": 7.5682, "grad_norm": 1.2530779838562012, "learning_rate": 9.538811586593157e-05, "epoch": 2.71 }, { "loss": 7.4491, "grad_norm": 1.5170719623565674, "learning_rate": 9.451299553688632e-05, "epoch": 2.72 }, { "loss": 7.2938, "grad_norm": 1.2933870553970337, "learning_rate": 9.363787520784108e-05, "epoch": 2.72 }, { "loss": 7.1455, "grad_norm": 1.212755799293518, "learning_rate": 9.276275487879583e-05, "epoch": 2.72 }, { "loss": 7.3702, "grad_norm": 1.4118942022323608, "learning_rate": 9.188763454975059e-05, "epoch": 2.72 }, { "loss": 7.1194, "grad_norm": 1.575276494026184, "learning_rate": 9.101251422070535e-05, "epoch": 2.73 }, { "loss": 7.046, "grad_norm": 1.3244752883911133, "learning_rate": 9.01373938916601e-05, "epoch": 2.73 }, { "loss": 6.875, "grad_norm": 1.369280219078064, "learning_rate": 8.926227356261486e-05, "epoch": 2.73 }, { "loss": 7.4045, "grad_norm": 1.3210042715072632, "learning_rate": 8.838715323356962e-05, "epoch": 2.74 }, { "loss": 7.5159, "grad_norm": 1.4352552890777588, "learning_rate": 8.751203290452437e-05, "epoch": 2.74 }, { "loss": 7.2315, "grad_norm": 1.4860197305679321, "learning_rate": 8.663691257547913e-05, "epoch": 2.74 }, { "loss": 6.8597, "grad_norm": 1.2331523895263672, "learning_rate": 8.57617922464339e-05, "epoch": 2.74 }, { "loss": 7.3485, "grad_norm": 1.2187525033950806, "learning_rate": 8.488667191738863e-05, "epoch": 2.75 }, { "loss": 7.388, "grad_norm": 1.1800241470336914, "learning_rate": 8.401155158834339e-05, "epoch": 2.75 }, { "loss": 6.9186, "grad_norm": 1.3542723655700684, "learning_rate": 8.313643125929815e-05, "epoch": 2.75 }, { "loss": 6.9582, "grad_norm": 1.3839143514633179, "learning_rate": 8.22613109302529e-05, "epoch": 2.75 }, { "loss": 7.4176, "grad_norm": 1.4546840190887451, "learning_rate": 8.138619060120766e-05, "epoch": 2.76 }, { "loss": 7.2731, "grad_norm": 1.3623560667037964, "learning_rate": 8.051107027216243e-05, "epoch": 2.76 }, { "loss": 7.1633, "grad_norm": 1.9331005811691284, "learning_rate": 7.963594994311717e-05, "epoch": 2.76 }, { "loss": 6.8972, "grad_norm": 1.2791029214859009, "learning_rate": 7.876082961407194e-05, "epoch": 2.76 }, { "loss": 7.1043, "grad_norm": 1.6202424764633179, "learning_rate": 7.78857092850267e-05, "epoch": 2.77 }, { "loss": 7.0727, "grad_norm": 1.0835381746292114, "learning_rate": 7.701058895598145e-05, "epoch": 2.77 }, { "loss": 7.0958, "grad_norm": 1.2778371572494507, "learning_rate": 7.613546862693621e-05, "epoch": 2.77 }, { "loss": 7.2219, "grad_norm": 1.9295389652252197, "learning_rate": 7.526034829789097e-05, "epoch": 2.77 }, { "loss": 7.0189, "grad_norm": 1.9394477605819702, "learning_rate": 7.43852279688457e-05, "epoch": 2.78 }, { "loss": 7.0144, "grad_norm": 1.4238934516906738, "learning_rate": 7.351010763980047e-05, "epoch": 2.78 }, { "loss": 7.2353, "grad_norm": 1.350537657737732, "learning_rate": 7.263498731075523e-05, "epoch": 2.78 }, { "loss": 6.7353, "grad_norm": 1.3214153051376343, "learning_rate": 7.175986698170999e-05, "epoch": 2.79 }, { "loss": 7.4143, "grad_norm": 2.469216823577881, "learning_rate": 7.088474665266474e-05, "epoch": 2.79 }, { "loss": 7.4276, "grad_norm": 1.414184808731079, "learning_rate": 7.00096263236195e-05, "epoch": 2.79 }, { "loss": 6.9842, "grad_norm": 1.4708011150360107, "learning_rate": 6.913450599457426e-05, "epoch": 2.79 }, { "loss": 7.572, "grad_norm": 1.449560284614563, "learning_rate": 6.825938566552901e-05, "epoch": 2.8 }, { "loss": 7.3449, "grad_norm": 1.1261264085769653, "learning_rate": 6.738426533648377e-05, "epoch": 2.8 }, { "loss": 7.1776, "grad_norm": 1.5502110719680786, "learning_rate": 6.650914500743853e-05, "epoch": 2.8 }, { "loss": 7.0565, "grad_norm": 1.3916562795639038, "learning_rate": 6.563402467839328e-05, "epoch": 2.8 }, { "loss": 7.0882, "grad_norm": 1.361229658126831, "learning_rate": 6.475890434934803e-05, "epoch": 2.81 }, { "loss": 6.981, "grad_norm": 1.6100305318832397, "learning_rate": 6.388378402030279e-05, "epoch": 2.81 }, { "loss": 7.2502, "grad_norm": 1.5449306964874268, "learning_rate": 6.300866369125754e-05, "epoch": 2.81 }, { "loss": 7.4208, "grad_norm": 1.3188410997390747, "learning_rate": 6.21335433622123e-05, "epoch": 2.81 }, { "loss": 7.2957, "grad_norm": 1.543289303779602, "learning_rate": 6.125842303316706e-05, "epoch": 2.82 }, { "loss": 7.0319, "grad_norm": 1.1590594053268433, "learning_rate": 6.038330270412182e-05, "epoch": 2.82 }, { "loss": 7.23, "grad_norm": 1.1623939275741577, "learning_rate": 5.9508182375076575e-05, "epoch": 2.82 }, { "loss": 7.1254, "grad_norm": 1.6204333305358887, "learning_rate": 5.863306204603133e-05, "epoch": 2.82 }, { "loss": 7.4319, "grad_norm": 1.5845638513565063, "learning_rate": 5.775794171698609e-05, "epoch": 2.83 }, { "loss": 7.4574, "grad_norm": 1.3281787633895874, "learning_rate": 5.688282138794084e-05, "epoch": 2.83 }, { "loss": 6.8629, "grad_norm": 1.6502999067306519, "learning_rate": 5.6007701058895595e-05, "epoch": 2.83 }, { "loss": 7.1493, "grad_norm": 1.7768168449401855, "learning_rate": 5.513258072985036e-05, "epoch": 2.83 }, { "loss": 7.1971, "grad_norm": 1.1763763427734375, "learning_rate": 5.425746040080511e-05, "epoch": 2.84 }, { "loss": 7.4182, "grad_norm": 1.4033911228179932, "learning_rate": 5.338234007175987e-05, "epoch": 2.84 }, { "loss": 6.8175, "grad_norm": 1.5407586097717285, "learning_rate": 5.250721974271463e-05, "epoch": 2.84 }, { "loss": 7.5091, "grad_norm": 1.5829062461853027, "learning_rate": 5.163209941366938e-05, "epoch": 2.85 }, { "loss": 7.0728, "grad_norm": 1.3185957670211792, "learning_rate": 5.075697908462414e-05, "epoch": 2.85 }, { "loss": 7.1931, "grad_norm": 1.1996837854385376, "learning_rate": 4.9881858755578894e-05, "epoch": 2.85 }, { "loss": 7.2327, "grad_norm": 1.6188883781433105, "learning_rate": 4.900673842653365e-05, "epoch": 2.85 }, { "loss": 7.2432, "grad_norm": 1.7829197645187378, "learning_rate": 4.813161809748841e-05, "epoch": 2.86 }, { "loss": 6.8231, "grad_norm": 1.3998175859451294, "learning_rate": 4.725649776844316e-05, "epoch": 2.86 }, { "loss": 7.5838, "grad_norm": 1.6664845943450928, "learning_rate": 4.6381377439397914e-05, "epoch": 2.86 }, { "loss": 7.3804, "grad_norm": 1.2328096628189087, "learning_rate": 4.5506257110352676e-05, "epoch": 2.86 }, { "loss": 7.1497, "grad_norm": 1.5543657541275024, "learning_rate": 4.463113678130743e-05, "epoch": 2.87 }, { "loss": 7.5067, "grad_norm": 2.0711114406585693, "learning_rate": 4.3756016452262186e-05, "epoch": 2.87 }, { "loss": 7.1481, "grad_norm": 2.340829372406006, "learning_rate": 4.288089612321695e-05, "epoch": 2.87 }, { "loss": 7.2767, "grad_norm": 1.3014119863510132, "learning_rate": 4.2005775794171696e-05, "epoch": 2.87 }, { "loss": 7.2583, "grad_norm": 1.186070442199707, "learning_rate": 4.113065546512645e-05, "epoch": 2.88 }, { "loss": 7.7179, "grad_norm": 1.4286901950836182, "learning_rate": 4.025553513608121e-05, "epoch": 2.88 }, { "loss": 6.9271, "grad_norm": 1.561988115310669, "learning_rate": 3.938041480703597e-05, "epoch": 2.88 }, { "loss": 6.9378, "grad_norm": 1.2756584882736206, "learning_rate": 3.8505294477990723e-05, "epoch": 2.88 }, { "loss": 7.8091, "grad_norm": 1.5452569723129272, "learning_rate": 3.7630174148945485e-05, "epoch": 2.89 }, { "loss": 6.7905, "grad_norm": 1.2616968154907227, "learning_rate": 3.6755053819900234e-05, "epoch": 2.89 }, { "loss": 7.3958, "grad_norm": 1.1684807538986206, "learning_rate": 3.5879933490854995e-05, "epoch": 2.89 }, { "loss": 6.9238, "grad_norm": 1.351366639137268, "learning_rate": 3.500481316180975e-05, "epoch": 2.9 }, { "loss": 7.4026, "grad_norm": 1.2473573684692383, "learning_rate": 3.4129692832764505e-05, "epoch": 2.9 }, { "loss": 7.4247, "grad_norm": 1.5123474597930908, "learning_rate": 3.325457250371927e-05, "epoch": 2.9 }, { "loss": 7.0967, "grad_norm": 1.1452938318252563, "learning_rate": 3.2379452174674016e-05, "epoch": 2.9 }, { "loss": 7.0357, "grad_norm": 1.1505627632141113, "learning_rate": 3.150433184562877e-05, "epoch": 2.91 }, { "loss": 7.4973, "grad_norm": 1.438091516494751, "learning_rate": 3.062921151658353e-05, "epoch": 2.91 }, { "loss": 7.4715, "grad_norm": 1.1489310264587402, "learning_rate": 2.9754091187538288e-05, "epoch": 2.91 }, { "loss": 7.0076, "grad_norm": 1.3423534631729126, "learning_rate": 2.8878970858493046e-05, "epoch": 2.91 }, { "loss": 7.0935, "grad_norm": 1.2484374046325684, "learning_rate": 2.8003850529447798e-05, "epoch": 2.92 }, { "loss": 7.1792, "grad_norm": 1.310231328010559, "learning_rate": 2.7128730200402556e-05, "epoch": 2.92 }, { "loss": 7.3469, "grad_norm": 1.417974591255188, "learning_rate": 2.6253609871357314e-05, "epoch": 2.92 }, { "loss": 7.2473, "grad_norm": 1.3878840208053589, "learning_rate": 2.537848954231207e-05, "epoch": 2.92 }, { "loss": 7.1321, "grad_norm": 1.6403028964996338, "learning_rate": 2.459088124617135e-05, "epoch": 2.93 }, { "loss": 7.6076, "grad_norm": 1.2110294103622437, "learning_rate": 2.3715760917126104e-05, "epoch": 2.93 }, { "loss": 7.3466, "grad_norm": 1.203755497932434, "learning_rate": 2.2840640588080863e-05, "epoch": 2.93 }, { "loss": 7.4367, "grad_norm": 1.2081892490386963, "learning_rate": 2.1965520259035618e-05, "epoch": 2.93 }, { "loss": 7.6191, "grad_norm": 1.2515225410461426, "learning_rate": 2.1090399929990373e-05, "epoch": 2.94 }, { "loss": 7.2915, "grad_norm": 1.2461618185043335, "learning_rate": 2.021527960094513e-05, "epoch": 2.94 }, { "loss": 7.0825, "grad_norm": 1.3424855470657349, "learning_rate": 1.9340159271899886e-05, "epoch": 2.94 }, { "loss": 7.6924, "grad_norm": 1.2109103202819824, "learning_rate": 1.846503894285464e-05, "epoch": 2.95 }, { "loss": 7.531, "grad_norm": 1.2161798477172852, "learning_rate": 1.75899186138094e-05, "epoch": 2.95 }, { "loss": 7.1992, "grad_norm": 1.347778081893921, "learning_rate": 1.6714798284764158e-05, "epoch": 2.95 }, { "loss": 7.7785, "grad_norm": 1.2869161367416382, "learning_rate": 1.583967795571891e-05, "epoch": 2.95 }, { "loss": 7.6703, "grad_norm": 1.1452679634094238, "learning_rate": 1.4964557626673668e-05, "epoch": 2.96 }, { "loss": 7.3311, "grad_norm": 1.7757437229156494, "learning_rate": 1.4089437297628423e-05, "epoch": 2.96 }, { "loss": 7.4272, "grad_norm": 1.2730258703231812, "learning_rate": 1.3214316968583182e-05, "epoch": 2.96 }, { "loss": 6.8195, "grad_norm": 1.0826276540756226, "learning_rate": 1.2339196639537937e-05, "epoch": 2.96 }, { "loss": 7.1219, "grad_norm": 1.3847414255142212, "learning_rate": 1.1464076310492692e-05, "epoch": 2.97 }, { "loss": 7.5912, "grad_norm": 1.4612926244735718, "learning_rate": 1.0588955981447449e-05, "epoch": 2.97 }, { "loss": 6.9373, "grad_norm": 1.5692036151885986, "learning_rate": 9.713835652402205e-06, "epoch": 2.97 }, { "loss": 7.7104, "grad_norm": 1.4740134477615356, "learning_rate": 8.838715323356962e-06, "epoch": 2.97 }, { "loss": 7.1918, "grad_norm": 1.026573657989502, "learning_rate": 7.963594994311717e-06, "epoch": 2.98 }, { "loss": 6.8717, "grad_norm": 1.1959487199783325, "learning_rate": 7.088474665266474e-06, "epoch": 2.98 }, { "loss": 7.4154, "grad_norm": 1.1354584693908691, "learning_rate": 6.213354336221231e-06, "epoch": 2.98 }, { "loss": 7.1622, "grad_norm": 1.3372441530227661, "learning_rate": 5.338234007175987e-06, "epoch": 2.98 }, { "loss": 6.9564, "grad_norm": 1.1713366508483887, "learning_rate": 4.463113678130743e-06, "epoch": 2.99 }, { "loss": 7.462, "grad_norm": 1.8238294124603271, "learning_rate": 3.587993349085499e-06, "epoch": 2.99 }, { "loss": 7.5493, "grad_norm": 1.3313993215560913, "learning_rate": 2.7128730200402555e-06, "epoch": 2.99 }, { "loss": 7.2399, "grad_norm": 1.1780248880386353, "learning_rate": 1.8377526909950118e-06, "epoch": 3.0 }, { "loss": 6.879, "grad_norm": 1.2703826427459717, "learning_rate": 9.626323619497682e-07, "epoch": 3.0 }, { "train_runtime": 104781.7564, "train_samples_per_second": 3.49, "train_steps_per_second": 0.109, "train_loss": 8.437174775609405, "epoch": 3.0 } ]