{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9973915878708834, "eval_steps": 500, "global_step": 2452, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016302575806977503, "grad_norm": 0.7840946912765503, "learning_rate": 0.0, "loss": 0.4647, "step": 1 }, { "epoch": 0.0032605151613955006, "grad_norm": 0.30409395694732666, "learning_rate": 2.1533827903669654e-05, "loss": 0.4264, "step": 2 }, { "epoch": 0.00489077274209325, "grad_norm": 0.2784154713153839, "learning_rate": 3.413030972429927e-05, "loss": 0.3859, "step": 3 }, { "epoch": 0.006521030322791001, "grad_norm": 0.2332989126443863, "learning_rate": 4.306765580733931e-05, "loss": 0.3903, "step": 4 }, { "epoch": 0.008151287903488751, "grad_norm": 0.18156054615974426, "learning_rate": 5e-05, "loss": 0.4113, "step": 5 }, { "epoch": 0.0097815454841865, "grad_norm": 0.1587788164615631, "learning_rate": 5.5664137627968925e-05, "loss": 0.3769, "step": 6 }, { "epoch": 0.011411803064884252, "grad_norm": 0.13205036520957947, "learning_rate": 6.0453097756108376e-05, "loss": 0.356, "step": 7 }, { "epoch": 0.013042060645582002, "grad_norm": 0.131794735789299, "learning_rate": 6.460148371100896e-05, "loss": 0.3509, "step": 8 }, { "epoch": 0.014672318226279752, "grad_norm": 0.13253478705883026, "learning_rate": 6.826061944859854e-05, "loss": 0.3566, "step": 9 }, { "epoch": 0.016302575806977502, "grad_norm": 0.1417822539806366, "learning_rate": 7.153382790366967e-05, "loss": 0.3328, "step": 10 }, { "epoch": 0.01793283338767525, "grad_norm": 0.1301659494638443, "learning_rate": 7.449480512024892e-05, "loss": 0.3558, "step": 11 }, { "epoch": 0.019563090968373, "grad_norm": 0.1542751044034958, "learning_rate": 7.719796553163858e-05, "loss": 0.3789, "step": 12 }, { "epoch": 0.021193348549070755, "grad_norm": 0.11910240352153778, "learning_rate": 7.968463205835412e-05, "loss": 0.3458, "step": 13 }, { "epoch": 0.022823606129768505, "grad_norm": 0.128227099776268, "learning_rate": 8.198692565977803e-05, "loss": 0.3414, "step": 14 }, { "epoch": 0.024453863710466255, "grad_norm": 0.1177757978439331, "learning_rate": 8.413030972429928e-05, "loss": 0.3145, "step": 15 }, { "epoch": 0.026084121291164004, "grad_norm": 0.1591070592403412, "learning_rate": 8.613531161467861e-05, "loss": 0.335, "step": 16 }, { "epoch": 0.027714378871861754, "grad_norm": 0.1489201784133911, "learning_rate": 8.80187213861294e-05, "loss": 0.2942, "step": 17 }, { "epoch": 0.029344636452559504, "grad_norm": 0.1476200520992279, "learning_rate": 8.979444735226819e-05, "loss": 0.3173, "step": 18 }, { "epoch": 0.030974894033257254, "grad_norm": 0.20016862452030182, "learning_rate": 9.147414002175752e-05, "loss": 0.3336, "step": 19 }, { "epoch": 0.032605151613955004, "grad_norm": 0.15757253766059875, "learning_rate": 9.306765580733931e-05, "loss": 0.324, "step": 20 }, { "epoch": 0.034235409194652754, "grad_norm": 0.19339101016521454, "learning_rate": 9.458340748040766e-05, "loss": 0.3186, "step": 21 }, { "epoch": 0.0358656667753505, "grad_norm": 0.1541537493467331, "learning_rate": 9.602863302391859e-05, "loss": 0.3142, "step": 22 }, { "epoch": 0.03749592435604825, "grad_norm": 0.1751069873571396, "learning_rate": 9.740960467331899e-05, "loss": 0.3163, "step": 23 }, { "epoch": 0.039126181936746, "grad_norm": 0.13823041319847107, "learning_rate": 9.873179343530825e-05, "loss": 0.3033, "step": 24 }, { "epoch": 0.04075643951744375, "grad_norm": 0.15399403870105743, "learning_rate": 0.0001, "loss": 0.2893, "step": 25 }, { "epoch": 0.04238669709814151, "grad_norm": 0.16494381427764893, "learning_rate": 0.0001, "loss": 0.3078, "step": 26 }, { "epoch": 0.04401695467883926, "grad_norm": 0.14704033732414246, "learning_rate": 0.0001, "loss": 0.2921, "step": 27 }, { "epoch": 0.04564721225953701, "grad_norm": 0.15865178406238556, "learning_rate": 0.0001, "loss": 0.317, "step": 28 }, { "epoch": 0.04727746984023476, "grad_norm": 0.12749037146568298, "learning_rate": 0.0001, "loss": 0.2851, "step": 29 }, { "epoch": 0.04890772742093251, "grad_norm": 0.12464679777622223, "learning_rate": 0.0001, "loss": 0.2865, "step": 30 }, { "epoch": 0.05053798500163026, "grad_norm": 0.14522074162960052, "learning_rate": 0.0001, "loss": 0.2914, "step": 31 }, { "epoch": 0.05216824258232801, "grad_norm": 0.11201729625463486, "learning_rate": 0.0001, "loss": 0.2773, "step": 32 }, { "epoch": 0.05379850016302576, "grad_norm": 0.1620592176914215, "learning_rate": 0.0001, "loss": 0.2885, "step": 33 }, { "epoch": 0.05542875774372351, "grad_norm": 0.12515400350093842, "learning_rate": 0.0001, "loss": 0.2874, "step": 34 }, { "epoch": 0.05705901532442126, "grad_norm": 0.11235728859901428, "learning_rate": 0.0001, "loss": 0.2552, "step": 35 }, { "epoch": 0.05868927290511901, "grad_norm": 0.1376715451478958, "learning_rate": 0.0001, "loss": 0.2738, "step": 36 }, { "epoch": 0.06031953048581676, "grad_norm": 0.12501214444637299, "learning_rate": 0.0001, "loss": 0.2698, "step": 37 }, { "epoch": 0.06194978806651451, "grad_norm": 0.16538746654987335, "learning_rate": 0.0001, "loss": 0.2846, "step": 38 }, { "epoch": 0.06358004564721226, "grad_norm": 0.11548929661512375, "learning_rate": 0.0001, "loss": 0.2848, "step": 39 }, { "epoch": 0.06521030322791001, "grad_norm": 0.10166307538747787, "learning_rate": 0.0001, "loss": 0.2578, "step": 40 }, { "epoch": 0.06684056080860776, "grad_norm": 0.10707216709852219, "learning_rate": 0.0001, "loss": 0.2724, "step": 41 }, { "epoch": 0.06847081838930551, "grad_norm": 0.11877011507749557, "learning_rate": 0.0001, "loss": 0.2896, "step": 42 }, { "epoch": 0.07010107597000326, "grad_norm": 0.10695178061723709, "learning_rate": 0.0001, "loss": 0.2517, "step": 43 }, { "epoch": 0.071731333550701, "grad_norm": 0.1269809901714325, "learning_rate": 0.0001, "loss": 0.2928, "step": 44 }, { "epoch": 0.07336159113139876, "grad_norm": 0.1300528198480606, "learning_rate": 0.0001, "loss": 0.2843, "step": 45 }, { "epoch": 0.0749918487120965, "grad_norm": 0.10712793469429016, "learning_rate": 0.0001, "loss": 0.252, "step": 46 }, { "epoch": 0.07662210629279426, "grad_norm": 0.11702849715948105, "learning_rate": 0.0001, "loss": 0.2711, "step": 47 }, { "epoch": 0.078252363873492, "grad_norm": 0.11782944947481155, "learning_rate": 0.0001, "loss": 0.2752, "step": 48 }, { "epoch": 0.07988262145418976, "grad_norm": 0.12026762962341309, "learning_rate": 0.0001, "loss": 0.2669, "step": 49 }, { "epoch": 0.0815128790348875, "grad_norm": 0.11663699150085449, "learning_rate": 0.0001, "loss": 0.2618, "step": 50 }, { "epoch": 0.08314313661558526, "grad_norm": 0.1347968578338623, "learning_rate": 0.0001, "loss": 0.2622, "step": 51 }, { "epoch": 0.08477339419628302, "grad_norm": 0.11729194968938828, "learning_rate": 0.0001, "loss": 0.2718, "step": 52 }, { "epoch": 0.08640365177698077, "grad_norm": 0.17483912408351898, "learning_rate": 0.0001, "loss": 0.2824, "step": 53 }, { "epoch": 0.08803390935767852, "grad_norm": 0.13471879065036774, "learning_rate": 0.0001, "loss": 0.2767, "step": 54 }, { "epoch": 0.08966416693837627, "grad_norm": 0.14546330273151398, "learning_rate": 0.0001, "loss": 0.2742, "step": 55 }, { "epoch": 0.09129442451907402, "grad_norm": 0.1298438459634781, "learning_rate": 0.0001, "loss": 0.2542, "step": 56 }, { "epoch": 0.09292468209977177, "grad_norm": 0.1432414948940277, "learning_rate": 0.0001, "loss": 0.2737, "step": 57 }, { "epoch": 0.09455493968046952, "grad_norm": 0.12804590165615082, "learning_rate": 0.0001, "loss": 0.2737, "step": 58 }, { "epoch": 0.09618519726116727, "grad_norm": 0.1591406762599945, "learning_rate": 0.0001, "loss": 0.2662, "step": 59 }, { "epoch": 0.09781545484186502, "grad_norm": 0.11162862926721573, "learning_rate": 0.0001, "loss": 0.2519, "step": 60 }, { "epoch": 0.09944571242256277, "grad_norm": 0.13492637872695923, "learning_rate": 0.0001, "loss": 0.2567, "step": 61 }, { "epoch": 0.10107597000326052, "grad_norm": 0.14795710146427155, "learning_rate": 0.0001, "loss": 0.2624, "step": 62 }, { "epoch": 0.10270622758395827, "grad_norm": 0.18151932954788208, "learning_rate": 0.0001, "loss": 0.2661, "step": 63 }, { "epoch": 0.10433648516465602, "grad_norm": 0.1397087723016739, "learning_rate": 0.0001, "loss": 0.2668, "step": 64 }, { "epoch": 0.10596674274535377, "grad_norm": 0.15592671930789948, "learning_rate": 0.0001, "loss": 0.2624, "step": 65 }, { "epoch": 0.10759700032605152, "grad_norm": 0.13322919607162476, "learning_rate": 0.0001, "loss": 0.2588, "step": 66 }, { "epoch": 0.10922725790674927, "grad_norm": 0.14810779690742493, "learning_rate": 0.0001, "loss": 0.2603, "step": 67 }, { "epoch": 0.11085751548744702, "grad_norm": 0.15098345279693604, "learning_rate": 0.0001, "loss": 0.2648, "step": 68 }, { "epoch": 0.11248777306814477, "grad_norm": 0.13700629770755768, "learning_rate": 0.0001, "loss": 0.2743, "step": 69 }, { "epoch": 0.11411803064884252, "grad_norm": 0.1113063395023346, "learning_rate": 0.0001, "loss": 0.2355, "step": 70 }, { "epoch": 0.11574828822954027, "grad_norm": 0.13374823331832886, "learning_rate": 0.0001, "loss": 0.2743, "step": 71 }, { "epoch": 0.11737854581023802, "grad_norm": 0.12639856338500977, "learning_rate": 0.0001, "loss": 0.2543, "step": 72 }, { "epoch": 0.11900880339093577, "grad_norm": 0.10666421055793762, "learning_rate": 0.0001, "loss": 0.2579, "step": 73 }, { "epoch": 0.12063906097163352, "grad_norm": 0.1311688870191574, "learning_rate": 0.0001, "loss": 0.2582, "step": 74 }, { "epoch": 0.12226931855233127, "grad_norm": 0.10417995601892471, "learning_rate": 0.0001, "loss": 0.2607, "step": 75 }, { "epoch": 0.12389957613302902, "grad_norm": 0.12071429938077927, "learning_rate": 0.0001, "loss": 0.2383, "step": 76 }, { "epoch": 0.12552983371372678, "grad_norm": 0.10600121319293976, "learning_rate": 0.0001, "loss": 0.2463, "step": 77 }, { "epoch": 0.12716009129442452, "grad_norm": 0.12528207898139954, "learning_rate": 0.0001, "loss": 0.262, "step": 78 }, { "epoch": 0.12879034887512228, "grad_norm": 0.10099775344133377, "learning_rate": 0.0001, "loss": 0.2524, "step": 79 }, { "epoch": 0.13042060645582002, "grad_norm": 0.12773346900939941, "learning_rate": 0.0001, "loss": 0.2572, "step": 80 }, { "epoch": 0.13205086403651778, "grad_norm": 0.09972451627254486, "learning_rate": 0.0001, "loss": 0.2503, "step": 81 }, { "epoch": 0.13368112161721551, "grad_norm": 0.1333281397819519, "learning_rate": 0.0001, "loss": 0.2469, "step": 82 }, { "epoch": 0.13531137919791328, "grad_norm": 0.12170375138521194, "learning_rate": 0.0001, "loss": 0.2603, "step": 83 }, { "epoch": 0.13694163677861101, "grad_norm": 0.13836190104484558, "learning_rate": 0.0001, "loss": 0.2736, "step": 84 }, { "epoch": 0.13857189435930878, "grad_norm": 0.11757717281579971, "learning_rate": 0.0001, "loss": 0.2738, "step": 85 }, { "epoch": 0.14020215194000651, "grad_norm": 0.1507899910211563, "learning_rate": 0.0001, "loss": 0.2455, "step": 86 }, { "epoch": 0.14183240952070428, "grad_norm": 0.12341590225696564, "learning_rate": 0.0001, "loss": 0.2572, "step": 87 }, { "epoch": 0.143462667101402, "grad_norm": 0.1795368492603302, "learning_rate": 0.0001, "loss": 0.2788, "step": 88 }, { "epoch": 0.14509292468209978, "grad_norm": 0.10201910138130188, "learning_rate": 0.0001, "loss": 0.2324, "step": 89 }, { "epoch": 0.1467231822627975, "grad_norm": 0.12101958692073822, "learning_rate": 0.0001, "loss": 0.2427, "step": 90 }, { "epoch": 0.14835343984349528, "grad_norm": 0.18389801681041718, "learning_rate": 0.0001, "loss": 0.2653, "step": 91 }, { "epoch": 0.149983697424193, "grad_norm": 0.13933244347572327, "learning_rate": 0.0001, "loss": 0.2488, "step": 92 }, { "epoch": 0.15161395500489078, "grad_norm": 0.18606036901474, "learning_rate": 0.0001, "loss": 0.2618, "step": 93 }, { "epoch": 0.1532442125855885, "grad_norm": 0.14273059368133545, "learning_rate": 0.0001, "loss": 0.2437, "step": 94 }, { "epoch": 0.15487447016628628, "grad_norm": 0.10715391486883163, "learning_rate": 0.0001, "loss": 0.2532, "step": 95 }, { "epoch": 0.156504727746984, "grad_norm": 0.14086727797985077, "learning_rate": 0.0001, "loss": 0.2512, "step": 96 }, { "epoch": 0.15813498532768178, "grad_norm": 0.12368986010551453, "learning_rate": 0.0001, "loss": 0.2468, "step": 97 }, { "epoch": 0.1597652429083795, "grad_norm": 0.14032916724681854, "learning_rate": 0.0001, "loss": 0.2706, "step": 98 }, { "epoch": 0.16139550048907728, "grad_norm": 0.12382455170154572, "learning_rate": 0.0001, "loss": 0.2432, "step": 99 }, { "epoch": 0.163025758069775, "grad_norm": 0.11922313272953033, "learning_rate": 0.0001, "loss": 0.2584, "step": 100 }, { "epoch": 0.16465601565047278, "grad_norm": 0.12890221178531647, "learning_rate": 0.0001, "loss": 0.2503, "step": 101 }, { "epoch": 0.1662862732311705, "grad_norm": 0.1167726069688797, "learning_rate": 0.0001, "loss": 0.2513, "step": 102 }, { "epoch": 0.16791653081186828, "grad_norm": 0.1356429159641266, "learning_rate": 0.0001, "loss": 0.2774, "step": 103 }, { "epoch": 0.16954678839256604, "grad_norm": 0.13073065876960754, "learning_rate": 0.0001, "loss": 0.2497, "step": 104 }, { "epoch": 0.17117704597326378, "grad_norm": 0.1371566206216812, "learning_rate": 0.0001, "loss": 0.2606, "step": 105 }, { "epoch": 0.17280730355396154, "grad_norm": 0.13844291865825653, "learning_rate": 0.0001, "loss": 0.2461, "step": 106 }, { "epoch": 0.17443756113465927, "grad_norm": 0.12421681731939316, "learning_rate": 0.0001, "loss": 0.2353, "step": 107 }, { "epoch": 0.17606781871535704, "grad_norm": 0.11806869506835938, "learning_rate": 0.0001, "loss": 0.2458, "step": 108 }, { "epoch": 0.17769807629605477, "grad_norm": 0.13172969222068787, "learning_rate": 0.0001, "loss": 0.2596, "step": 109 }, { "epoch": 0.17932833387675254, "grad_norm": 0.10319314897060394, "learning_rate": 0.0001, "loss": 0.2294, "step": 110 }, { "epoch": 0.18095859145745027, "grad_norm": 0.1400890052318573, "learning_rate": 0.0001, "loss": 0.243, "step": 111 }, { "epoch": 0.18258884903814804, "grad_norm": 0.11478696018457413, "learning_rate": 0.0001, "loss": 0.2586, "step": 112 }, { "epoch": 0.18421910661884577, "grad_norm": 0.11043928563594818, "learning_rate": 0.0001, "loss": 0.2481, "step": 113 }, { "epoch": 0.18584936419954354, "grad_norm": 0.12820887565612793, "learning_rate": 0.0001, "loss": 0.2627, "step": 114 }, { "epoch": 0.18747962178024127, "grad_norm": 0.14865200221538544, "learning_rate": 0.0001, "loss": 0.2664, "step": 115 }, { "epoch": 0.18910987936093904, "grad_norm": 0.11971151828765869, "learning_rate": 0.0001, "loss": 0.2462, "step": 116 }, { "epoch": 0.19074013694163677, "grad_norm": 0.11976753175258636, "learning_rate": 0.0001, "loss": 0.2364, "step": 117 }, { "epoch": 0.19237039452233454, "grad_norm": 0.11274496465921402, "learning_rate": 0.0001, "loss": 0.2444, "step": 118 }, { "epoch": 0.19400065210303227, "grad_norm": 0.13468791544437408, "learning_rate": 0.0001, "loss": 0.2449, "step": 119 }, { "epoch": 0.19563090968373004, "grad_norm": 0.14791305363178253, "learning_rate": 0.0001, "loss": 0.2687, "step": 120 }, { "epoch": 0.19726116726442777, "grad_norm": 0.11851610988378525, "learning_rate": 0.0001, "loss": 0.2395, "step": 121 }, { "epoch": 0.19889142484512554, "grad_norm": 0.1273575872182846, "learning_rate": 0.0001, "loss": 0.2421, "step": 122 }, { "epoch": 0.20052168242582327, "grad_norm": 0.12378577888011932, "learning_rate": 0.0001, "loss": 0.2394, "step": 123 }, { "epoch": 0.20215194000652104, "grad_norm": 0.14497146010398865, "learning_rate": 0.0001, "loss": 0.2377, "step": 124 }, { "epoch": 0.20378219758721877, "grad_norm": 0.15099285542964935, "learning_rate": 0.0001, "loss": 0.2437, "step": 125 }, { "epoch": 0.20541245516791654, "grad_norm": 0.1162499412894249, "learning_rate": 0.0001, "loss": 0.2463, "step": 126 }, { "epoch": 0.20704271274861427, "grad_norm": 0.13596996665000916, "learning_rate": 0.0001, "loss": 0.2251, "step": 127 }, { "epoch": 0.20867297032931204, "grad_norm": 0.1420516073703766, "learning_rate": 0.0001, "loss": 0.2616, "step": 128 }, { "epoch": 0.21030322791000977, "grad_norm": 0.1711655855178833, "learning_rate": 0.0001, "loss": 0.2599, "step": 129 }, { "epoch": 0.21193348549070753, "grad_norm": 0.12194457650184631, "learning_rate": 0.0001, "loss": 0.2441, "step": 130 }, { "epoch": 0.21356374307140527, "grad_norm": 0.11060874909162521, "learning_rate": 0.0001, "loss": 0.2207, "step": 131 }, { "epoch": 0.21519400065210303, "grad_norm": 0.1700860857963562, "learning_rate": 0.0001, "loss": 0.2587, "step": 132 }, { "epoch": 0.21682425823280077, "grad_norm": 0.12098715454339981, "learning_rate": 0.0001, "loss": 0.2407, "step": 133 }, { "epoch": 0.21845451581349853, "grad_norm": 0.1312795877456665, "learning_rate": 0.0001, "loss": 0.2505, "step": 134 }, { "epoch": 0.22008477339419627, "grad_norm": 0.12589812278747559, "learning_rate": 0.0001, "loss": 0.2416, "step": 135 }, { "epoch": 0.22171503097489403, "grad_norm": 0.11886783689260483, "learning_rate": 0.0001, "loss": 0.2589, "step": 136 }, { "epoch": 0.22334528855559177, "grad_norm": 0.12084617465734482, "learning_rate": 0.0001, "loss": 0.2443, "step": 137 }, { "epoch": 0.22497554613628953, "grad_norm": 0.12979595363140106, "learning_rate": 0.0001, "loss": 0.2441, "step": 138 }, { "epoch": 0.2266058037169873, "grad_norm": 0.1361086517572403, "learning_rate": 0.0001, "loss": 0.2304, "step": 139 }, { "epoch": 0.22823606129768503, "grad_norm": 0.13962669670581818, "learning_rate": 0.0001, "loss": 0.2511, "step": 140 }, { "epoch": 0.2298663188783828, "grad_norm": 0.13188579678535461, "learning_rate": 0.0001, "loss": 0.2478, "step": 141 }, { "epoch": 0.23149657645908053, "grad_norm": 0.14598271250724792, "learning_rate": 0.0001, "loss": 0.2516, "step": 142 }, { "epoch": 0.2331268340397783, "grad_norm": 0.14357416331768036, "learning_rate": 0.0001, "loss": 0.2414, "step": 143 }, { "epoch": 0.23475709162047603, "grad_norm": 0.11369388550519943, "learning_rate": 0.0001, "loss": 0.2331, "step": 144 }, { "epoch": 0.2363873492011738, "grad_norm": 0.14625732600688934, "learning_rate": 0.0001, "loss": 0.2177, "step": 145 }, { "epoch": 0.23801760678187153, "grad_norm": 0.14222382009029388, "learning_rate": 0.0001, "loss": 0.2541, "step": 146 }, { "epoch": 0.2396478643625693, "grad_norm": 0.11487408727407455, "learning_rate": 0.0001, "loss": 0.224, "step": 147 }, { "epoch": 0.24127812194326703, "grad_norm": 0.1486762911081314, "learning_rate": 0.0001, "loss": 0.2235, "step": 148 }, { "epoch": 0.2429083795239648, "grad_norm": 0.1313285082578659, "learning_rate": 0.0001, "loss": 0.2558, "step": 149 }, { "epoch": 0.24453863710466253, "grad_norm": 0.10049161314964294, "learning_rate": 0.0001, "loss": 0.2342, "step": 150 }, { "epoch": 0.2461688946853603, "grad_norm": 0.13103991746902466, "learning_rate": 0.0001, "loss": 0.2472, "step": 151 }, { "epoch": 0.24779915226605803, "grad_norm": 0.1439107060432434, "learning_rate": 0.0001, "loss": 0.2536, "step": 152 }, { "epoch": 0.2494294098467558, "grad_norm": 0.11082714796066284, "learning_rate": 0.0001, "loss": 0.2245, "step": 153 }, { "epoch": 0.25105966742745356, "grad_norm": 0.15587423741817474, "learning_rate": 0.0001, "loss": 0.2539, "step": 154 }, { "epoch": 0.2526899250081513, "grad_norm": 0.13212022185325623, "learning_rate": 0.0001, "loss": 0.2488, "step": 155 }, { "epoch": 0.25432018258884903, "grad_norm": 0.1342269331216812, "learning_rate": 0.0001, "loss": 0.2414, "step": 156 }, { "epoch": 0.25595044016954677, "grad_norm": 0.14503146708011627, "learning_rate": 0.0001, "loss": 0.2464, "step": 157 }, { "epoch": 0.25758069775024456, "grad_norm": 0.1345171481370926, "learning_rate": 0.0001, "loss": 0.2528, "step": 158 }, { "epoch": 0.2592109553309423, "grad_norm": 0.12593773007392883, "learning_rate": 0.0001, "loss": 0.2293, "step": 159 }, { "epoch": 0.26084121291164003, "grad_norm": 0.11615213751792908, "learning_rate": 0.0001, "loss": 0.2103, "step": 160 }, { "epoch": 0.26247147049233777, "grad_norm": 0.12266945838928223, "learning_rate": 0.0001, "loss": 0.2338, "step": 161 }, { "epoch": 0.26410172807303556, "grad_norm": 0.13719286024570465, "learning_rate": 0.0001, "loss": 0.2526, "step": 162 }, { "epoch": 0.2657319856537333, "grad_norm": 0.12018352746963501, "learning_rate": 0.0001, "loss": 0.2344, "step": 163 }, { "epoch": 0.26736224323443103, "grad_norm": 0.12734843790531158, "learning_rate": 0.0001, "loss": 0.2228, "step": 164 }, { "epoch": 0.26899250081512877, "grad_norm": 0.18533261120319366, "learning_rate": 0.0001, "loss": 0.2616, "step": 165 }, { "epoch": 0.27062275839582656, "grad_norm": 0.12489785999059677, "learning_rate": 0.0001, "loss": 0.2481, "step": 166 }, { "epoch": 0.2722530159765243, "grad_norm": 0.11786579340696335, "learning_rate": 0.0001, "loss": 0.2459, "step": 167 }, { "epoch": 0.27388327355722203, "grad_norm": 0.11804036796092987, "learning_rate": 0.0001, "loss": 0.2507, "step": 168 }, { "epoch": 0.27551353113791976, "grad_norm": 0.12300978600978851, "learning_rate": 0.0001, "loss": 0.2402, "step": 169 }, { "epoch": 0.27714378871861756, "grad_norm": 0.123172327876091, "learning_rate": 0.0001, "loss": 0.2393, "step": 170 }, { "epoch": 0.2787740462993153, "grad_norm": 0.12156080454587936, "learning_rate": 0.0001, "loss": 0.2366, "step": 171 }, { "epoch": 0.28040430388001303, "grad_norm": 0.12459408491849899, "learning_rate": 0.0001, "loss": 0.2449, "step": 172 }, { "epoch": 0.2820345614607108, "grad_norm": 0.10773272812366486, "learning_rate": 0.0001, "loss": 0.2327, "step": 173 }, { "epoch": 0.28366481904140856, "grad_norm": 0.11818479001522064, "learning_rate": 0.0001, "loss": 0.2429, "step": 174 }, { "epoch": 0.2852950766221063, "grad_norm": 0.11246544122695923, "learning_rate": 0.0001, "loss": 0.2264, "step": 175 }, { "epoch": 0.286925334202804, "grad_norm": 0.1404242068529129, "learning_rate": 0.0001, "loss": 0.2523, "step": 176 }, { "epoch": 0.2885555917835018, "grad_norm": 0.18125034868717194, "learning_rate": 0.0001, "loss": 0.2378, "step": 177 }, { "epoch": 0.29018584936419956, "grad_norm": 0.14246758818626404, "learning_rate": 0.0001, "loss": 0.2454, "step": 178 }, { "epoch": 0.2918161069448973, "grad_norm": 0.13600490987300873, "learning_rate": 0.0001, "loss": 0.2473, "step": 179 }, { "epoch": 0.293446364525595, "grad_norm": 0.12414231896400452, "learning_rate": 0.0001, "loss": 0.2438, "step": 180 }, { "epoch": 0.2950766221062928, "grad_norm": 0.12273810058832169, "learning_rate": 0.0001, "loss": 0.2311, "step": 181 }, { "epoch": 0.29670687968699055, "grad_norm": 0.104936882853508, "learning_rate": 0.0001, "loss": 0.2223, "step": 182 }, { "epoch": 0.2983371372676883, "grad_norm": 0.12966670095920563, "learning_rate": 0.0001, "loss": 0.2269, "step": 183 }, { "epoch": 0.299967394848386, "grad_norm": 0.11775387078523636, "learning_rate": 0.0001, "loss": 0.2434, "step": 184 }, { "epoch": 0.3015976524290838, "grad_norm": 0.16570591926574707, "learning_rate": 0.0001, "loss": 0.2497, "step": 185 }, { "epoch": 0.30322791000978155, "grad_norm": 0.1248738244175911, "learning_rate": 0.0001, "loss": 0.2348, "step": 186 }, { "epoch": 0.3048581675904793, "grad_norm": 0.17375342547893524, "learning_rate": 0.0001, "loss": 0.2368, "step": 187 }, { "epoch": 0.306488425171177, "grad_norm": 0.1663849800825119, "learning_rate": 0.0001, "loss": 0.2289, "step": 188 }, { "epoch": 0.3081186827518748, "grad_norm": 0.10794688761234283, "learning_rate": 0.0001, "loss": 0.2214, "step": 189 }, { "epoch": 0.30974894033257255, "grad_norm": 0.14207707345485687, "learning_rate": 0.0001, "loss": 0.2278, "step": 190 }, { "epoch": 0.3113791979132703, "grad_norm": 0.14883771538734436, "learning_rate": 0.0001, "loss": 0.2543, "step": 191 }, { "epoch": 0.313009455493968, "grad_norm": 0.1482788771390915, "learning_rate": 0.0001, "loss": 0.2361, "step": 192 }, { "epoch": 0.3146397130746658, "grad_norm": 0.16753090918064117, "learning_rate": 0.0001, "loss": 0.2441, "step": 193 }, { "epoch": 0.31626997065536355, "grad_norm": 0.10236582159996033, "learning_rate": 0.0001, "loss": 0.2091, "step": 194 }, { "epoch": 0.3179002282360613, "grad_norm": 0.11371749639511108, "learning_rate": 0.0001, "loss": 0.2303, "step": 195 }, { "epoch": 0.319530485816759, "grad_norm": 0.12658603489398956, "learning_rate": 0.0001, "loss": 0.2361, "step": 196 }, { "epoch": 0.3211607433974568, "grad_norm": 0.11970578879117966, "learning_rate": 0.0001, "loss": 0.2361, "step": 197 }, { "epoch": 0.32279100097815455, "grad_norm": 0.13204766809940338, "learning_rate": 0.0001, "loss": 0.2475, "step": 198 }, { "epoch": 0.3244212585588523, "grad_norm": 0.11528757214546204, "learning_rate": 0.0001, "loss": 0.2395, "step": 199 }, { "epoch": 0.32605151613955, "grad_norm": 0.11398864537477493, "learning_rate": 0.0001, "loss": 0.2294, "step": 200 }, { "epoch": 0.3276817737202478, "grad_norm": 0.13498589396476746, "learning_rate": 0.0001, "loss": 0.2461, "step": 201 }, { "epoch": 0.32931203130094555, "grad_norm": 0.10650527477264404, "learning_rate": 0.0001, "loss": 0.2279, "step": 202 }, { "epoch": 0.3309422888816433, "grad_norm": 0.10871734470129013, "learning_rate": 0.0001, "loss": 0.2379, "step": 203 }, { "epoch": 0.332572546462341, "grad_norm": 0.13913440704345703, "learning_rate": 0.0001, "loss": 0.2374, "step": 204 }, { "epoch": 0.3342028040430388, "grad_norm": 0.1410413384437561, "learning_rate": 0.0001, "loss": 0.2389, "step": 205 }, { "epoch": 0.33583306162373655, "grad_norm": 0.1187388077378273, "learning_rate": 0.0001, "loss": 0.2173, "step": 206 }, { "epoch": 0.3374633192044343, "grad_norm": 0.10804294794797897, "learning_rate": 0.0001, "loss": 0.2312, "step": 207 }, { "epoch": 0.3390935767851321, "grad_norm": 0.11228567361831665, "learning_rate": 0.0001, "loss": 0.2287, "step": 208 }, { "epoch": 0.3407238343658298, "grad_norm": 0.12240911275148392, "learning_rate": 0.0001, "loss": 0.2264, "step": 209 }, { "epoch": 0.34235409194652755, "grad_norm": 0.11001206934452057, "learning_rate": 0.0001, "loss": 0.2465, "step": 210 }, { "epoch": 0.3439843495272253, "grad_norm": 0.11600527912378311, "learning_rate": 0.0001, "loss": 0.2259, "step": 211 }, { "epoch": 0.3456146071079231, "grad_norm": 0.11184750497341156, "learning_rate": 0.0001, "loss": 0.2134, "step": 212 }, { "epoch": 0.3472448646886208, "grad_norm": 0.1267496645450592, "learning_rate": 0.0001, "loss": 0.2471, "step": 213 }, { "epoch": 0.34887512226931855, "grad_norm": 0.12638495862483978, "learning_rate": 0.0001, "loss": 0.2486, "step": 214 }, { "epoch": 0.3505053798500163, "grad_norm": 0.11810494214296341, "learning_rate": 0.0001, "loss": 0.2296, "step": 215 }, { "epoch": 0.3521356374307141, "grad_norm": 0.1319112330675125, "learning_rate": 0.0001, "loss": 0.223, "step": 216 }, { "epoch": 0.3537658950114118, "grad_norm": 0.13196417689323425, "learning_rate": 0.0001, "loss": 0.2273, "step": 217 }, { "epoch": 0.35539615259210955, "grad_norm": 0.13283449411392212, "learning_rate": 0.0001, "loss": 0.2395, "step": 218 }, { "epoch": 0.3570264101728073, "grad_norm": 0.11351513117551804, "learning_rate": 0.0001, "loss": 0.2305, "step": 219 }, { "epoch": 0.3586566677535051, "grad_norm": 0.11622969061136246, "learning_rate": 0.0001, "loss": 0.2267, "step": 220 }, { "epoch": 0.3602869253342028, "grad_norm": 0.12463167309761047, "learning_rate": 0.0001, "loss": 0.2345, "step": 221 }, { "epoch": 0.36191718291490055, "grad_norm": 0.12457557767629623, "learning_rate": 0.0001, "loss": 0.2394, "step": 222 }, { "epoch": 0.3635474404955983, "grad_norm": 0.11849746108055115, "learning_rate": 0.0001, "loss": 0.2183, "step": 223 }, { "epoch": 0.3651776980762961, "grad_norm": 0.13964387774467468, "learning_rate": 0.0001, "loss": 0.2577, "step": 224 }, { "epoch": 0.3668079556569938, "grad_norm": 0.11555647104978561, "learning_rate": 0.0001, "loss": 0.2154, "step": 225 }, { "epoch": 0.36843821323769155, "grad_norm": 0.09998490661382675, "learning_rate": 0.0001, "loss": 0.2122, "step": 226 }, { "epoch": 0.3700684708183893, "grad_norm": 0.12696896493434906, "learning_rate": 0.0001, "loss": 0.2353, "step": 227 }, { "epoch": 0.3716987283990871, "grad_norm": 0.118056520819664, "learning_rate": 0.0001, "loss": 0.2229, "step": 228 }, { "epoch": 0.3733289859797848, "grad_norm": 0.11977580189704895, "learning_rate": 0.0001, "loss": 0.2306, "step": 229 }, { "epoch": 0.37495924356048255, "grad_norm": 0.13963200151920319, "learning_rate": 0.0001, "loss": 0.238, "step": 230 }, { "epoch": 0.3765895011411803, "grad_norm": 0.1165039986371994, "learning_rate": 0.0001, "loss": 0.2298, "step": 231 }, { "epoch": 0.3782197587218781, "grad_norm": 0.15564818680286407, "learning_rate": 0.0001, "loss": 0.2177, "step": 232 }, { "epoch": 0.3798500163025758, "grad_norm": 0.12317235767841339, "learning_rate": 0.0001, "loss": 0.2229, "step": 233 }, { "epoch": 0.38148027388327355, "grad_norm": 0.12770213186740875, "learning_rate": 0.0001, "loss": 0.239, "step": 234 }, { "epoch": 0.3831105314639713, "grad_norm": 0.13377554714679718, "learning_rate": 0.0001, "loss": 0.2194, "step": 235 }, { "epoch": 0.3847407890446691, "grad_norm": 0.2012566477060318, "learning_rate": 0.0001, "loss": 0.2432, "step": 236 }, { "epoch": 0.3863710466253668, "grad_norm": 0.14003917574882507, "learning_rate": 0.0001, "loss": 0.2331, "step": 237 }, { "epoch": 0.38800130420606455, "grad_norm": 0.13453909754753113, "learning_rate": 0.0001, "loss": 0.2255, "step": 238 }, { "epoch": 0.3896315617867623, "grad_norm": 0.12276874482631683, "learning_rate": 0.0001, "loss": 0.2256, "step": 239 }, { "epoch": 0.3912618193674601, "grad_norm": 0.12277834117412567, "learning_rate": 0.0001, "loss": 0.2126, "step": 240 }, { "epoch": 0.3928920769481578, "grad_norm": 0.10638459771871567, "learning_rate": 0.0001, "loss": 0.2303, "step": 241 }, { "epoch": 0.39452233452885554, "grad_norm": 0.13198097050189972, "learning_rate": 0.0001, "loss": 0.223, "step": 242 }, { "epoch": 0.39615259210955334, "grad_norm": 0.11877847462892532, "learning_rate": 0.0001, "loss": 0.252, "step": 243 }, { "epoch": 0.3977828496902511, "grad_norm": 0.1367732435464859, "learning_rate": 0.0001, "loss": 0.2388, "step": 244 }, { "epoch": 0.3994131072709488, "grad_norm": 0.10984091460704803, "learning_rate": 0.0001, "loss": 0.2127, "step": 245 }, { "epoch": 0.40104336485164654, "grad_norm": 0.12004056572914124, "learning_rate": 0.0001, "loss": 0.2145, "step": 246 }, { "epoch": 0.40267362243234434, "grad_norm": 0.1057971715927124, "learning_rate": 0.0001, "loss": 0.2316, "step": 247 }, { "epoch": 0.40430388001304207, "grad_norm": 0.11223854869604111, "learning_rate": 0.0001, "loss": 0.237, "step": 248 }, { "epoch": 0.4059341375937398, "grad_norm": 0.11464496701955795, "learning_rate": 0.0001, "loss": 0.2384, "step": 249 }, { "epoch": 0.40756439517443754, "grad_norm": 0.12331773340702057, "learning_rate": 0.0001, "loss": 0.2375, "step": 250 }, { "epoch": 0.40919465275513534, "grad_norm": 0.1102685034275055, "learning_rate": 0.0001, "loss": 0.2361, "step": 251 }, { "epoch": 0.41082491033583307, "grad_norm": 0.12823829054832458, "learning_rate": 0.0001, "loss": 0.23, "step": 252 }, { "epoch": 0.4124551679165308, "grad_norm": 0.12393566220998764, "learning_rate": 0.0001, "loss": 0.2442, "step": 253 }, { "epoch": 0.41408542549722854, "grad_norm": 0.10847245156764984, "learning_rate": 0.0001, "loss": 0.2253, "step": 254 }, { "epoch": 0.41571568307792633, "grad_norm": 0.10520011931657791, "learning_rate": 0.0001, "loss": 0.238, "step": 255 }, { "epoch": 0.41734594065862407, "grad_norm": 0.10316835343837738, "learning_rate": 0.0001, "loss": 0.2147, "step": 256 }, { "epoch": 0.4189761982393218, "grad_norm": 0.09753026813268661, "learning_rate": 0.0001, "loss": 0.2212, "step": 257 }, { "epoch": 0.42060645582001954, "grad_norm": 0.12001580744981766, "learning_rate": 0.0001, "loss": 0.2347, "step": 258 }, { "epoch": 0.42223671340071733, "grad_norm": 0.1057935580611229, "learning_rate": 0.0001, "loss": 0.2184, "step": 259 }, { "epoch": 0.42386697098141507, "grad_norm": 0.12323079258203506, "learning_rate": 0.0001, "loss": 0.2318, "step": 260 }, { "epoch": 0.4254972285621128, "grad_norm": 0.11835148930549622, "learning_rate": 0.0001, "loss": 0.2373, "step": 261 }, { "epoch": 0.42712748614281054, "grad_norm": 0.11595102399587631, "learning_rate": 0.0001, "loss": 0.2171, "step": 262 }, { "epoch": 0.42875774372350833, "grad_norm": 0.11034155637025833, "learning_rate": 0.0001, "loss": 0.2209, "step": 263 }, { "epoch": 0.43038800130420607, "grad_norm": 0.10659411549568176, "learning_rate": 0.0001, "loss": 0.2228, "step": 264 }, { "epoch": 0.4320182588849038, "grad_norm": 0.15346385538578033, "learning_rate": 0.0001, "loss": 0.234, "step": 265 }, { "epoch": 0.43364851646560154, "grad_norm": 0.11137716472148895, "learning_rate": 0.0001, "loss": 0.2297, "step": 266 }, { "epoch": 0.43527877404629933, "grad_norm": 0.12295756489038467, "learning_rate": 0.0001, "loss": 0.24, "step": 267 }, { "epoch": 0.43690903162699707, "grad_norm": 0.13455668091773987, "learning_rate": 0.0001, "loss": 0.2379, "step": 268 }, { "epoch": 0.4385392892076948, "grad_norm": 0.12878896296024323, "learning_rate": 0.0001, "loss": 0.2406, "step": 269 }, { "epoch": 0.44016954678839254, "grad_norm": 0.112666055560112, "learning_rate": 0.0001, "loss": 0.208, "step": 270 }, { "epoch": 0.44179980436909033, "grad_norm": 0.13783974945545197, "learning_rate": 0.0001, "loss": 0.2497, "step": 271 }, { "epoch": 0.44343006194978807, "grad_norm": 0.10861429572105408, "learning_rate": 0.0001, "loss": 0.217, "step": 272 }, { "epoch": 0.4450603195304858, "grad_norm": 0.16053490340709686, "learning_rate": 0.0001, "loss": 0.2504, "step": 273 }, { "epoch": 0.44669057711118354, "grad_norm": 0.12352034449577332, "learning_rate": 0.0001, "loss": 0.2389, "step": 274 }, { "epoch": 0.44832083469188133, "grad_norm": 0.12012365460395813, "learning_rate": 0.0001, "loss": 0.2213, "step": 275 }, { "epoch": 0.44995109227257907, "grad_norm": 0.10671349614858627, "learning_rate": 0.0001, "loss": 0.2286, "step": 276 }, { "epoch": 0.4515813498532768, "grad_norm": 0.12723585963249207, "learning_rate": 0.0001, "loss": 0.2204, "step": 277 }, { "epoch": 0.4532116074339746, "grad_norm": 0.13079780340194702, "learning_rate": 0.0001, "loss": 0.2383, "step": 278 }, { "epoch": 0.45484186501467233, "grad_norm": 0.13152571022510529, "learning_rate": 0.0001, "loss": 0.2296, "step": 279 }, { "epoch": 0.45647212259537007, "grad_norm": 0.17980937659740448, "learning_rate": 0.0001, "loss": 0.2227, "step": 280 }, { "epoch": 0.4581023801760678, "grad_norm": 0.1296575367450714, "learning_rate": 0.0001, "loss": 0.233, "step": 281 }, { "epoch": 0.4597326377567656, "grad_norm": 0.14362689852714539, "learning_rate": 0.0001, "loss": 0.2299, "step": 282 }, { "epoch": 0.46136289533746333, "grad_norm": 0.13908450305461884, "learning_rate": 0.0001, "loss": 0.2261, "step": 283 }, { "epoch": 0.46299315291816107, "grad_norm": 0.13326814770698547, "learning_rate": 0.0001, "loss": 0.2254, "step": 284 }, { "epoch": 0.4646234104988588, "grad_norm": 0.1157006099820137, "learning_rate": 0.0001, "loss": 0.2255, "step": 285 }, { "epoch": 0.4662536680795566, "grad_norm": 0.14651785790920258, "learning_rate": 0.0001, "loss": 0.2243, "step": 286 }, { "epoch": 0.46788392566025433, "grad_norm": 0.12821364402770996, "learning_rate": 0.0001, "loss": 0.2324, "step": 287 }, { "epoch": 0.46951418324095207, "grad_norm": 0.13196085393428802, "learning_rate": 0.0001, "loss": 0.2286, "step": 288 }, { "epoch": 0.4711444408216498, "grad_norm": 0.1328345388174057, "learning_rate": 0.0001, "loss": 0.2309, "step": 289 }, { "epoch": 0.4727746984023476, "grad_norm": 0.12376872450113297, "learning_rate": 0.0001, "loss": 0.2434, "step": 290 }, { "epoch": 0.47440495598304533, "grad_norm": 0.11294668167829514, "learning_rate": 0.0001, "loss": 0.1987, "step": 291 }, { "epoch": 0.47603521356374306, "grad_norm": 0.11866224557161331, "learning_rate": 0.0001, "loss": 0.2086, "step": 292 }, { "epoch": 0.4776654711444408, "grad_norm": 0.11296320706605911, "learning_rate": 0.0001, "loss": 0.2217, "step": 293 }, { "epoch": 0.4792957287251386, "grad_norm": 0.1112779900431633, "learning_rate": 0.0001, "loss": 0.2112, "step": 294 }, { "epoch": 0.48092598630583633, "grad_norm": 0.11955477297306061, "learning_rate": 0.0001, "loss": 0.2377, "step": 295 }, { "epoch": 0.48255624388653406, "grad_norm": 0.11908381432294846, "learning_rate": 0.0001, "loss": 0.2353, "step": 296 }, { "epoch": 0.4841865014672318, "grad_norm": 0.10947412997484207, "learning_rate": 0.0001, "loss": 0.2351, "step": 297 }, { "epoch": 0.4858167590479296, "grad_norm": 0.19641439616680145, "learning_rate": 0.0001, "loss": 0.2293, "step": 298 }, { "epoch": 0.4874470166286273, "grad_norm": 0.11824516952037811, "learning_rate": 0.0001, "loss": 0.2332, "step": 299 }, { "epoch": 0.48907727420932506, "grad_norm": 0.11680760979652405, "learning_rate": 0.0001, "loss": 0.2469, "step": 300 }, { "epoch": 0.4907075317900228, "grad_norm": 0.12153971940279007, "learning_rate": 0.0001, "loss": 0.2228, "step": 301 }, { "epoch": 0.4923377893707206, "grad_norm": 0.12106509506702423, "learning_rate": 0.0001, "loss": 0.2375, "step": 302 }, { "epoch": 0.4939680469514183, "grad_norm": 0.1114591732621193, "learning_rate": 0.0001, "loss": 0.2175, "step": 303 }, { "epoch": 0.49559830453211606, "grad_norm": 0.10288353264331818, "learning_rate": 0.0001, "loss": 0.2033, "step": 304 }, { "epoch": 0.4972285621128138, "grad_norm": 0.12334591895341873, "learning_rate": 0.0001, "loss": 0.2355, "step": 305 }, { "epoch": 0.4988588196935116, "grad_norm": 0.12344373017549515, "learning_rate": 0.0001, "loss": 0.2278, "step": 306 }, { "epoch": 0.5004890772742093, "grad_norm": 0.11695542931556702, "learning_rate": 0.0001, "loss": 0.2083, "step": 307 }, { "epoch": 0.5021193348549071, "grad_norm": 0.1250382661819458, "learning_rate": 0.0001, "loss": 0.2299, "step": 308 }, { "epoch": 0.5037495924356048, "grad_norm": 0.10904502868652344, "learning_rate": 0.0001, "loss": 0.2178, "step": 309 }, { "epoch": 0.5053798500163026, "grad_norm": 0.12705446779727936, "learning_rate": 0.0001, "loss": 0.2278, "step": 310 }, { "epoch": 0.5070101075970004, "grad_norm": 0.12775245308876038, "learning_rate": 0.0001, "loss": 0.2344, "step": 311 }, { "epoch": 0.5086403651776981, "grad_norm": 0.1119925007224083, "learning_rate": 0.0001, "loss": 0.2176, "step": 312 }, { "epoch": 0.5102706227583959, "grad_norm": 0.13966351747512817, "learning_rate": 0.0001, "loss": 0.2242, "step": 313 }, { "epoch": 0.5119008803390935, "grad_norm": 0.11344596743583679, "learning_rate": 0.0001, "loss": 0.233, "step": 314 }, { "epoch": 0.5135311379197913, "grad_norm": 0.12142585963010788, "learning_rate": 0.0001, "loss": 0.2279, "step": 315 }, { "epoch": 0.5151613955004891, "grad_norm": 0.11385513842105865, "learning_rate": 0.0001, "loss": 0.2197, "step": 316 }, { "epoch": 0.5167916530811868, "grad_norm": 0.10977032035589218, "learning_rate": 0.0001, "loss": 0.2142, "step": 317 }, { "epoch": 0.5184219106618846, "grad_norm": 0.11549077928066254, "learning_rate": 0.0001, "loss": 0.2039, "step": 318 }, { "epoch": 0.5200521682425824, "grad_norm": 0.13420367240905762, "learning_rate": 0.0001, "loss": 0.2214, "step": 319 }, { "epoch": 0.5216824258232801, "grad_norm": 0.12192299962043762, "learning_rate": 0.0001, "loss": 0.2201, "step": 320 }, { "epoch": 0.5233126834039779, "grad_norm": 0.1398283839225769, "learning_rate": 0.0001, "loss": 0.2213, "step": 321 }, { "epoch": 0.5249429409846755, "grad_norm": 0.13691936433315277, "learning_rate": 0.0001, "loss": 0.2457, "step": 322 }, { "epoch": 0.5265731985653733, "grad_norm": 0.14428359270095825, "learning_rate": 0.0001, "loss": 0.235, "step": 323 }, { "epoch": 0.5282034561460711, "grad_norm": 0.10904989391565323, "learning_rate": 0.0001, "loss": 0.2124, "step": 324 }, { "epoch": 0.5298337137267688, "grad_norm": 0.12136954814195633, "learning_rate": 0.0001, "loss": 0.2157, "step": 325 }, { "epoch": 0.5314639713074666, "grad_norm": 0.11056441813707352, "learning_rate": 0.0001, "loss": 0.2304, "step": 326 }, { "epoch": 0.5330942288881644, "grad_norm": 0.1466270238161087, "learning_rate": 0.0001, "loss": 0.2202, "step": 327 }, { "epoch": 0.5347244864688621, "grad_norm": 0.11299032717943192, "learning_rate": 0.0001, "loss": 0.2241, "step": 328 }, { "epoch": 0.5363547440495599, "grad_norm": 0.1448090374469757, "learning_rate": 0.0001, "loss": 0.2382, "step": 329 }, { "epoch": 0.5379850016302575, "grad_norm": 0.146869957447052, "learning_rate": 0.0001, "loss": 0.2184, "step": 330 }, { "epoch": 0.5396152592109553, "grad_norm": 0.12791307270526886, "learning_rate": 0.0001, "loss": 0.221, "step": 331 }, { "epoch": 0.5412455167916531, "grad_norm": 0.13846737146377563, "learning_rate": 0.0001, "loss": 0.2247, "step": 332 }, { "epoch": 0.5428757743723508, "grad_norm": 0.12737876176834106, "learning_rate": 0.0001, "loss": 0.2114, "step": 333 }, { "epoch": 0.5445060319530486, "grad_norm": 0.11661113798618317, "learning_rate": 0.0001, "loss": 0.2291, "step": 334 }, { "epoch": 0.5461362895337464, "grad_norm": 0.11759155988693237, "learning_rate": 0.0001, "loss": 0.2153, "step": 335 }, { "epoch": 0.5477665471144441, "grad_norm": 0.11843594163656235, "learning_rate": 0.0001, "loss": 0.2205, "step": 336 }, { "epoch": 0.5493968046951418, "grad_norm": 0.12133964151144028, "learning_rate": 0.0001, "loss": 0.2369, "step": 337 }, { "epoch": 0.5510270622758395, "grad_norm": 0.1370978206396103, "learning_rate": 0.0001, "loss": 0.2233, "step": 338 }, { "epoch": 0.5526573198565373, "grad_norm": 0.14518651366233826, "learning_rate": 0.0001, "loss": 0.2257, "step": 339 }, { "epoch": 0.5542875774372351, "grad_norm": 0.12819088995456696, "learning_rate": 0.0001, "loss": 0.2387, "step": 340 }, { "epoch": 0.5559178350179328, "grad_norm": 0.102646104991436, "learning_rate": 0.0001, "loss": 0.2262, "step": 341 }, { "epoch": 0.5575480925986306, "grad_norm": 0.11202642321586609, "learning_rate": 0.0001, "loss": 0.24, "step": 342 }, { "epoch": 0.5591783501793284, "grad_norm": 0.12525996565818787, "learning_rate": 0.0001, "loss": 0.2284, "step": 343 }, { "epoch": 0.5608086077600261, "grad_norm": 0.09891889244318008, "learning_rate": 0.0001, "loss": 0.218, "step": 344 }, { "epoch": 0.5624388653407238, "grad_norm": 0.11816040426492691, "learning_rate": 0.0001, "loss": 0.2204, "step": 345 }, { "epoch": 0.5640691229214216, "grad_norm": 0.10832750797271729, "learning_rate": 0.0001, "loss": 0.2202, "step": 346 }, { "epoch": 0.5656993805021193, "grad_norm": 0.10662806034088135, "learning_rate": 0.0001, "loss": 0.2239, "step": 347 }, { "epoch": 0.5673296380828171, "grad_norm": 0.11352869868278503, "learning_rate": 0.0001, "loss": 0.2217, "step": 348 }, { "epoch": 0.5689598956635148, "grad_norm": 0.10960965603590012, "learning_rate": 0.0001, "loss": 0.2142, "step": 349 }, { "epoch": 0.5705901532442126, "grad_norm": 0.1158093810081482, "learning_rate": 0.0001, "loss": 0.2313, "step": 350 }, { "epoch": 0.5722204108249104, "grad_norm": 0.09648110717535019, "learning_rate": 0.0001, "loss": 0.2136, "step": 351 }, { "epoch": 0.573850668405608, "grad_norm": 0.10889653116464615, "learning_rate": 0.0001, "loss": 0.2079, "step": 352 }, { "epoch": 0.5754809259863058, "grad_norm": 0.12388119101524353, "learning_rate": 0.0001, "loss": 0.2307, "step": 353 }, { "epoch": 0.5771111835670036, "grad_norm": 0.11496174335479736, "learning_rate": 0.0001, "loss": 0.2232, "step": 354 }, { "epoch": 0.5787414411477013, "grad_norm": 0.1000954732298851, "learning_rate": 0.0001, "loss": 0.2018, "step": 355 }, { "epoch": 0.5803716987283991, "grad_norm": 0.11189039051532745, "learning_rate": 0.0001, "loss": 0.2044, "step": 356 }, { "epoch": 0.5820019563090968, "grad_norm": 0.12651148438453674, "learning_rate": 0.0001, "loss": 0.2152, "step": 357 }, { "epoch": 0.5836322138897946, "grad_norm": 0.12214814871549606, "learning_rate": 0.0001, "loss": 0.2174, "step": 358 }, { "epoch": 0.5852624714704924, "grad_norm": 0.11027688533067703, "learning_rate": 0.0001, "loss": 0.219, "step": 359 }, { "epoch": 0.58689272905119, "grad_norm": 0.12585967779159546, "learning_rate": 0.0001, "loss": 0.2198, "step": 360 }, { "epoch": 0.5885229866318878, "grad_norm": 0.11782504618167877, "learning_rate": 0.0001, "loss": 0.2107, "step": 361 }, { "epoch": 0.5901532442125856, "grad_norm": 0.12490631639957428, "learning_rate": 0.0001, "loss": 0.2342, "step": 362 }, { "epoch": 0.5917835017932833, "grad_norm": 0.10052972286939621, "learning_rate": 0.0001, "loss": 0.2077, "step": 363 }, { "epoch": 0.5934137593739811, "grad_norm": 0.11178889870643616, "learning_rate": 0.0001, "loss": 0.2152, "step": 364 }, { "epoch": 0.5950440169546788, "grad_norm": 0.10644172877073288, "learning_rate": 0.0001, "loss": 0.1977, "step": 365 }, { "epoch": 0.5966742745353766, "grad_norm": 0.1278517097234726, "learning_rate": 0.0001, "loss": 0.2059, "step": 366 }, { "epoch": 0.5983045321160744, "grad_norm": 0.15825654566287994, "learning_rate": 0.0001, "loss": 0.2269, "step": 367 }, { "epoch": 0.599934789696772, "grad_norm": 0.11004846543073654, "learning_rate": 0.0001, "loss": 0.2184, "step": 368 }, { "epoch": 0.6015650472774698, "grad_norm": 0.10813707113265991, "learning_rate": 0.0001, "loss": 0.2287, "step": 369 }, { "epoch": 0.6031953048581676, "grad_norm": 0.10354539006948471, "learning_rate": 0.0001, "loss": 0.2154, "step": 370 }, { "epoch": 0.6048255624388653, "grad_norm": 0.11675768345594406, "learning_rate": 0.0001, "loss": 0.2341, "step": 371 }, { "epoch": 0.6064558200195631, "grad_norm": 0.12332635372877121, "learning_rate": 0.0001, "loss": 0.2268, "step": 372 }, { "epoch": 0.6080860776002608, "grad_norm": 0.11673764884471893, "learning_rate": 0.0001, "loss": 0.2371, "step": 373 }, { "epoch": 0.6097163351809586, "grad_norm": 0.11986733227968216, "learning_rate": 0.0001, "loss": 0.2218, "step": 374 }, { "epoch": 0.6113465927616564, "grad_norm": 0.12927494943141937, "learning_rate": 0.0001, "loss": 0.2271, "step": 375 }, { "epoch": 0.612976850342354, "grad_norm": 0.0948595330119133, "learning_rate": 0.0001, "loss": 0.197, "step": 376 }, { "epoch": 0.6146071079230518, "grad_norm": 0.12510277330875397, "learning_rate": 0.0001, "loss": 0.226, "step": 377 }, { "epoch": 0.6162373655037496, "grad_norm": 0.10695146024227142, "learning_rate": 0.0001, "loss": 0.2146, "step": 378 }, { "epoch": 0.6178676230844473, "grad_norm": 0.1367439478635788, "learning_rate": 0.0001, "loss": 0.2397, "step": 379 }, { "epoch": 0.6194978806651451, "grad_norm": 0.1275574266910553, "learning_rate": 0.0001, "loss": 0.2028, "step": 380 }, { "epoch": 0.6211281382458429, "grad_norm": 0.11883818358182907, "learning_rate": 0.0001, "loss": 0.2277, "step": 381 }, { "epoch": 0.6227583958265406, "grad_norm": 0.12322292476892471, "learning_rate": 0.0001, "loss": 0.2318, "step": 382 }, { "epoch": 0.6243886534072384, "grad_norm": 0.11404566466808319, "learning_rate": 0.0001, "loss": 0.23, "step": 383 }, { "epoch": 0.626018910987936, "grad_norm": 0.1219937726855278, "learning_rate": 0.0001, "loss": 0.2208, "step": 384 }, { "epoch": 0.6276491685686338, "grad_norm": 0.11882983148097992, "learning_rate": 0.0001, "loss": 0.221, "step": 385 }, { "epoch": 0.6292794261493316, "grad_norm": 0.1069592535495758, "learning_rate": 0.0001, "loss": 0.2239, "step": 386 }, { "epoch": 0.6309096837300293, "grad_norm": 0.12435311079025269, "learning_rate": 0.0001, "loss": 0.2252, "step": 387 }, { "epoch": 0.6325399413107271, "grad_norm": 0.11291278898715973, "learning_rate": 0.0001, "loss": 0.2187, "step": 388 }, { "epoch": 0.6341701988914249, "grad_norm": 0.14565667510032654, "learning_rate": 0.0001, "loss": 0.2238, "step": 389 }, { "epoch": 0.6358004564721226, "grad_norm": 0.11719778180122375, "learning_rate": 0.0001, "loss": 0.2263, "step": 390 }, { "epoch": 0.6374307140528204, "grad_norm": 0.11908719688653946, "learning_rate": 0.0001, "loss": 0.2056, "step": 391 }, { "epoch": 0.639060971633518, "grad_norm": 0.14423371851444244, "learning_rate": 0.0001, "loss": 0.2201, "step": 392 }, { "epoch": 0.6406912292142158, "grad_norm": 0.1208101361989975, "learning_rate": 0.0001, "loss": 0.2229, "step": 393 }, { "epoch": 0.6423214867949136, "grad_norm": 0.13141198456287384, "learning_rate": 0.0001, "loss": 0.2265, "step": 394 }, { "epoch": 0.6439517443756113, "grad_norm": 0.1494779735803604, "learning_rate": 0.0001, "loss": 0.2154, "step": 395 }, { "epoch": 0.6455820019563091, "grad_norm": 0.11805243790149689, "learning_rate": 0.0001, "loss": 0.2012, "step": 396 }, { "epoch": 0.6472122595370069, "grad_norm": 0.13480523228645325, "learning_rate": 0.0001, "loss": 0.2172, "step": 397 }, { "epoch": 0.6488425171177046, "grad_norm": 0.11627724766731262, "learning_rate": 0.0001, "loss": 0.2377, "step": 398 }, { "epoch": 0.6504727746984024, "grad_norm": 0.1052049919962883, "learning_rate": 0.0001, "loss": 0.2203, "step": 399 }, { "epoch": 0.6521030322791, "grad_norm": 0.12532803416252136, "learning_rate": 0.0001, "loss": 0.224, "step": 400 }, { "epoch": 0.6537332898597978, "grad_norm": 0.11664199829101562, "learning_rate": 0.0001, "loss": 0.2354, "step": 401 }, { "epoch": 0.6553635474404956, "grad_norm": 0.10853651911020279, "learning_rate": 0.0001, "loss": 0.2097, "step": 402 }, { "epoch": 0.6569938050211933, "grad_norm": 0.10371599346399307, "learning_rate": 0.0001, "loss": 0.2058, "step": 403 }, { "epoch": 0.6586240626018911, "grad_norm": 0.1397281438112259, "learning_rate": 0.0001, "loss": 0.2259, "step": 404 }, { "epoch": 0.6602543201825889, "grad_norm": 0.11628320068120956, "learning_rate": 0.0001, "loss": 0.2183, "step": 405 }, { "epoch": 0.6618845777632866, "grad_norm": 0.13096722960472107, "learning_rate": 0.0001, "loss": 0.2219, "step": 406 }, { "epoch": 0.6635148353439844, "grad_norm": 0.11224295198917389, "learning_rate": 0.0001, "loss": 0.2249, "step": 407 }, { "epoch": 0.665145092924682, "grad_norm": 0.1743100881576538, "learning_rate": 0.0001, "loss": 0.2356, "step": 408 }, { "epoch": 0.6667753505053798, "grad_norm": 0.1063716933131218, "learning_rate": 0.0001, "loss": 0.2049, "step": 409 }, { "epoch": 0.6684056080860776, "grad_norm": 0.12047860771417618, "learning_rate": 0.0001, "loss": 0.2176, "step": 410 }, { "epoch": 0.6700358656667753, "grad_norm": 0.1088520810008049, "learning_rate": 0.0001, "loss": 0.2267, "step": 411 }, { "epoch": 0.6716661232474731, "grad_norm": 0.12248346954584122, "learning_rate": 0.0001, "loss": 0.2098, "step": 412 }, { "epoch": 0.6732963808281709, "grad_norm": 0.13586483895778656, "learning_rate": 0.0001, "loss": 0.222, "step": 413 }, { "epoch": 0.6749266384088686, "grad_norm": 0.1342141330242157, "learning_rate": 0.0001, "loss": 0.2126, "step": 414 }, { "epoch": 0.6765568959895664, "grad_norm": 0.11067284643650055, "learning_rate": 0.0001, "loss": 0.206, "step": 415 }, { "epoch": 0.6781871535702642, "grad_norm": 0.14228413999080658, "learning_rate": 0.0001, "loss": 0.2252, "step": 416 }, { "epoch": 0.6798174111509618, "grad_norm": 0.12066151201725006, "learning_rate": 0.0001, "loss": 0.2163, "step": 417 }, { "epoch": 0.6814476687316596, "grad_norm": 0.15157178044319153, "learning_rate": 0.0001, "loss": 0.2243, "step": 418 }, { "epoch": 0.6830779263123573, "grad_norm": 0.12287676334381104, "learning_rate": 0.0001, "loss": 0.2185, "step": 419 }, { "epoch": 0.6847081838930551, "grad_norm": 0.11439267545938492, "learning_rate": 0.0001, "loss": 0.2002, "step": 420 }, { "epoch": 0.6863384414737529, "grad_norm": 0.10508660227060318, "learning_rate": 0.0001, "loss": 0.2143, "step": 421 }, { "epoch": 0.6879686990544506, "grad_norm": 0.10114973783493042, "learning_rate": 0.0001, "loss": 0.2089, "step": 422 }, { "epoch": 0.6895989566351484, "grad_norm": 0.11443648487329483, "learning_rate": 0.0001, "loss": 0.2205, "step": 423 }, { "epoch": 0.6912292142158462, "grad_norm": 0.1153668612241745, "learning_rate": 0.0001, "loss": 0.2263, "step": 424 }, { "epoch": 0.6928594717965438, "grad_norm": 0.10309400409460068, "learning_rate": 0.0001, "loss": 0.2197, "step": 425 }, { "epoch": 0.6944897293772416, "grad_norm": 0.10628131031990051, "learning_rate": 0.0001, "loss": 0.2073, "step": 426 }, { "epoch": 0.6961199869579393, "grad_norm": 0.1151309385895729, "learning_rate": 0.0001, "loss": 0.22, "step": 427 }, { "epoch": 0.6977502445386371, "grad_norm": 0.13086111843585968, "learning_rate": 0.0001, "loss": 0.2192, "step": 428 }, { "epoch": 0.6993805021193349, "grad_norm": 0.1018524318933487, "learning_rate": 0.0001, "loss": 0.2168, "step": 429 }, { "epoch": 0.7010107597000326, "grad_norm": 0.1204724982380867, "learning_rate": 0.0001, "loss": 0.2226, "step": 430 }, { "epoch": 0.7026410172807304, "grad_norm": 0.11043181270360947, "learning_rate": 0.0001, "loss": 0.2207, "step": 431 }, { "epoch": 0.7042712748614282, "grad_norm": 0.10462189465761185, "learning_rate": 0.0001, "loss": 0.2173, "step": 432 }, { "epoch": 0.7059015324421258, "grad_norm": 0.11756114661693573, "learning_rate": 0.0001, "loss": 0.2165, "step": 433 }, { "epoch": 0.7075317900228236, "grad_norm": 0.11371811479330063, "learning_rate": 0.0001, "loss": 0.217, "step": 434 }, { "epoch": 0.7091620476035213, "grad_norm": 0.11309879273176193, "learning_rate": 0.0001, "loss": 0.2046, "step": 435 }, { "epoch": 0.7107923051842191, "grad_norm": 0.11104720830917358, "learning_rate": 0.0001, "loss": 0.1995, "step": 436 }, { "epoch": 0.7124225627649169, "grad_norm": 0.10949276387691498, "learning_rate": 0.0001, "loss": 0.21, "step": 437 }, { "epoch": 0.7140528203456146, "grad_norm": 0.13036222755908966, "learning_rate": 0.0001, "loss": 0.2187, "step": 438 }, { "epoch": 0.7156830779263124, "grad_norm": 0.11818300932645798, "learning_rate": 0.0001, "loss": 0.2164, "step": 439 }, { "epoch": 0.7173133355070102, "grad_norm": 0.11692320555448532, "learning_rate": 0.0001, "loss": 0.2113, "step": 440 }, { "epoch": 0.7189435930877078, "grad_norm": 0.11360203474760056, "learning_rate": 0.0001, "loss": 0.2244, "step": 441 }, { "epoch": 0.7205738506684056, "grad_norm": 0.11834365874528885, "learning_rate": 0.0001, "loss": 0.2303, "step": 442 }, { "epoch": 0.7222041082491033, "grad_norm": 0.13222841918468475, "learning_rate": 0.0001, "loss": 0.2192, "step": 443 }, { "epoch": 0.7238343658298011, "grad_norm": 0.12007411569356918, "learning_rate": 0.0001, "loss": 0.2158, "step": 444 }, { "epoch": 0.7254646234104989, "grad_norm": 0.11357498168945312, "learning_rate": 0.0001, "loss": 0.2222, "step": 445 }, { "epoch": 0.7270948809911966, "grad_norm": 0.14090843498706818, "learning_rate": 0.0001, "loss": 0.2173, "step": 446 }, { "epoch": 0.7287251385718944, "grad_norm": 0.13058516383171082, "learning_rate": 0.0001, "loss": 0.2263, "step": 447 }, { "epoch": 0.7303553961525922, "grad_norm": 0.12427127361297607, "learning_rate": 0.0001, "loss": 0.2235, "step": 448 }, { "epoch": 0.7319856537332898, "grad_norm": 0.10180344432592392, "learning_rate": 0.0001, "loss": 0.2056, "step": 449 }, { "epoch": 0.7336159113139876, "grad_norm": 0.11706260591745377, "learning_rate": 0.0001, "loss": 0.2178, "step": 450 }, { "epoch": 0.7352461688946854, "grad_norm": 0.14062075316905975, "learning_rate": 0.0001, "loss": 0.2296, "step": 451 }, { "epoch": 0.7368764264753831, "grad_norm": 0.1115693598985672, "learning_rate": 0.0001, "loss": 0.2348, "step": 452 }, { "epoch": 0.7385066840560809, "grad_norm": 0.1121508777141571, "learning_rate": 0.0001, "loss": 0.217, "step": 453 }, { "epoch": 0.7401369416367786, "grad_norm": 0.12056824564933777, "learning_rate": 0.0001, "loss": 0.2225, "step": 454 }, { "epoch": 0.7417671992174764, "grad_norm": 0.12506930530071259, "learning_rate": 0.0001, "loss": 0.2215, "step": 455 }, { "epoch": 0.7433974567981741, "grad_norm": 0.1259409487247467, "learning_rate": 0.0001, "loss": 0.2015, "step": 456 }, { "epoch": 0.7450277143788718, "grad_norm": 0.13509246706962585, "learning_rate": 0.0001, "loss": 0.2271, "step": 457 }, { "epoch": 0.7466579719595696, "grad_norm": 0.13348042964935303, "learning_rate": 0.0001, "loss": 0.2227, "step": 458 }, { "epoch": 0.7482882295402674, "grad_norm": 0.12885409593582153, "learning_rate": 0.0001, "loss": 0.2144, "step": 459 }, { "epoch": 0.7499184871209651, "grad_norm": 0.13527405261993408, "learning_rate": 0.0001, "loss": 0.2349, "step": 460 }, { "epoch": 0.7515487447016629, "grad_norm": 0.15392392873764038, "learning_rate": 0.0001, "loss": 0.2258, "step": 461 }, { "epoch": 0.7531790022823606, "grad_norm": 0.11966606229543686, "learning_rate": 0.0001, "loss": 0.2179, "step": 462 }, { "epoch": 0.7548092598630584, "grad_norm": 0.10815181583166122, "learning_rate": 0.0001, "loss": 0.2222, "step": 463 }, { "epoch": 0.7564395174437561, "grad_norm": 0.11875923722982407, "learning_rate": 0.0001, "loss": 0.2169, "step": 464 }, { "epoch": 0.7580697750244538, "grad_norm": 0.11258353292942047, "learning_rate": 0.0001, "loss": 0.2024, "step": 465 }, { "epoch": 0.7597000326051516, "grad_norm": 0.1222681850194931, "learning_rate": 0.0001, "loss": 0.2277, "step": 466 }, { "epoch": 0.7613302901858494, "grad_norm": 0.11641412228345871, "learning_rate": 0.0001, "loss": 0.2178, "step": 467 }, { "epoch": 0.7629605477665471, "grad_norm": 0.12195785343647003, "learning_rate": 0.0001, "loss": 0.2077, "step": 468 }, { "epoch": 0.7645908053472449, "grad_norm": 0.10462360829114914, "learning_rate": 0.0001, "loss": 0.2092, "step": 469 }, { "epoch": 0.7662210629279426, "grad_norm": 0.11324596405029297, "learning_rate": 0.0001, "loss": 0.2238, "step": 470 }, { "epoch": 0.7678513205086404, "grad_norm": 0.12687797844409943, "learning_rate": 0.0001, "loss": 0.231, "step": 471 }, { "epoch": 0.7694815780893381, "grad_norm": 0.12114600092172623, "learning_rate": 0.0001, "loss": 0.2217, "step": 472 }, { "epoch": 0.7711118356700358, "grad_norm": 0.12222952395677567, "learning_rate": 0.0001, "loss": 0.2238, "step": 473 }, { "epoch": 0.7727420932507336, "grad_norm": 0.11903360486030579, "learning_rate": 0.0001, "loss": 0.2079, "step": 474 }, { "epoch": 0.7743723508314314, "grad_norm": 0.12019330263137817, "learning_rate": 0.0001, "loss": 0.2074, "step": 475 }, { "epoch": 0.7760026084121291, "grad_norm": 0.1122792437672615, "learning_rate": 0.0001, "loss": 0.2086, "step": 476 }, { "epoch": 0.7776328659928269, "grad_norm": 0.11570253223180771, "learning_rate": 0.0001, "loss": 0.2172, "step": 477 }, { "epoch": 0.7792631235735246, "grad_norm": 0.14799444377422333, "learning_rate": 0.0001, "loss": 0.2163, "step": 478 }, { "epoch": 0.7808933811542224, "grad_norm": 0.09483097493648529, "learning_rate": 0.0001, "loss": 0.1999, "step": 479 }, { "epoch": 0.7825236387349201, "grad_norm": 0.11984442174434662, "learning_rate": 0.0001, "loss": 0.2017, "step": 480 }, { "epoch": 0.7841538963156178, "grad_norm": 0.11621296405792236, "learning_rate": 0.0001, "loss": 0.2149, "step": 481 }, { "epoch": 0.7857841538963156, "grad_norm": 0.0937175378203392, "learning_rate": 0.0001, "loss": 0.2032, "step": 482 }, { "epoch": 0.7874144114770134, "grad_norm": 0.12285979092121124, "learning_rate": 0.0001, "loss": 0.2083, "step": 483 }, { "epoch": 0.7890446690577111, "grad_norm": 0.13726738095283508, "learning_rate": 0.0001, "loss": 0.2189, "step": 484 }, { "epoch": 0.7906749266384089, "grad_norm": 0.1135246530175209, "learning_rate": 0.0001, "loss": 0.2057, "step": 485 }, { "epoch": 0.7923051842191067, "grad_norm": 0.11096853017807007, "learning_rate": 0.0001, "loss": 0.1984, "step": 486 }, { "epoch": 0.7939354417998044, "grad_norm": 0.10587968677282333, "learning_rate": 0.0001, "loss": 0.2213, "step": 487 }, { "epoch": 0.7955656993805021, "grad_norm": 0.11208285391330719, "learning_rate": 0.0001, "loss": 0.2104, "step": 488 }, { "epoch": 0.7971959569611998, "grad_norm": 0.12265545129776001, "learning_rate": 0.0001, "loss": 0.1976, "step": 489 }, { "epoch": 0.7988262145418976, "grad_norm": 0.09317664057016373, "learning_rate": 0.0001, "loss": 0.1927, "step": 490 }, { "epoch": 0.8004564721225954, "grad_norm": 0.10856229066848755, "learning_rate": 0.0001, "loss": 0.2024, "step": 491 }, { "epoch": 0.8020867297032931, "grad_norm": 0.09949705749750137, "learning_rate": 0.0001, "loss": 0.2015, "step": 492 }, { "epoch": 0.8037169872839909, "grad_norm": 0.13871848583221436, "learning_rate": 0.0001, "loss": 0.2161, "step": 493 }, { "epoch": 0.8053472448646887, "grad_norm": 0.11129366606473923, "learning_rate": 0.0001, "loss": 0.2064, "step": 494 }, { "epoch": 0.8069775024453864, "grad_norm": 0.12229236960411072, "learning_rate": 0.0001, "loss": 0.2145, "step": 495 }, { "epoch": 0.8086077600260841, "grad_norm": 0.12729904055595398, "learning_rate": 0.0001, "loss": 0.1889, "step": 496 }, { "epoch": 0.8102380176067818, "grad_norm": 0.13496865332126617, "learning_rate": 0.0001, "loss": 0.2275, "step": 497 }, { "epoch": 0.8118682751874796, "grad_norm": 0.11651238799095154, "learning_rate": 0.0001, "loss": 0.2113, "step": 498 }, { "epoch": 0.8134985327681774, "grad_norm": 0.14498771727085114, "learning_rate": 0.0001, "loss": 0.2324, "step": 499 }, { "epoch": 0.8151287903488751, "grad_norm": 0.13545864820480347, "learning_rate": 0.0001, "loss": 0.1865, "step": 500 }, { "epoch": 0.8151287903488751, "eval_loss": 0.21210059523582458, "eval_runtime": 469.38, "eval_samples_per_second": 4.02, "eval_steps_per_second": 1.006, "step": 500 }, { "epoch": 0.8167590479295729, "grad_norm": 0.1212705671787262, "learning_rate": 0.0001, "loss": 0.2263, "step": 501 }, { "epoch": 0.8183893055102707, "grad_norm": 0.12040545791387558, "learning_rate": 0.0001, "loss": 0.2149, "step": 502 }, { "epoch": 0.8200195630909684, "grad_norm": 0.10691879689693451, "learning_rate": 0.0001, "loss": 0.1914, "step": 503 }, { "epoch": 0.8216498206716661, "grad_norm": 0.11082441359758377, "learning_rate": 0.0001, "loss": 0.1967, "step": 504 }, { "epoch": 0.8232800782523638, "grad_norm": 0.12334942817687988, "learning_rate": 0.0001, "loss": 0.202, "step": 505 }, { "epoch": 0.8249103358330616, "grad_norm": 0.12807224690914154, "learning_rate": 0.0001, "loss": 0.1993, "step": 506 }, { "epoch": 0.8265405934137594, "grad_norm": 0.11735547333955765, "learning_rate": 0.0001, "loss": 0.23, "step": 507 }, { "epoch": 0.8281708509944571, "grad_norm": 0.12395918369293213, "learning_rate": 0.0001, "loss": 0.2266, "step": 508 }, { "epoch": 0.8298011085751549, "grad_norm": 0.1415017992258072, "learning_rate": 0.0001, "loss": 0.2122, "step": 509 }, { "epoch": 0.8314313661558527, "grad_norm": 0.11149226874113083, "learning_rate": 0.0001, "loss": 0.2073, "step": 510 }, { "epoch": 0.8330616237365503, "grad_norm": 0.12386713922023773, "learning_rate": 0.0001, "loss": 0.2291, "step": 511 }, { "epoch": 0.8346918813172481, "grad_norm": 0.10768789052963257, "learning_rate": 0.0001, "loss": 0.2047, "step": 512 }, { "epoch": 0.8363221388979458, "grad_norm": 0.11108648031949997, "learning_rate": 0.0001, "loss": 0.2126, "step": 513 }, { "epoch": 0.8379523964786436, "grad_norm": 0.11573906987905502, "learning_rate": 0.0001, "loss": 0.2045, "step": 514 }, { "epoch": 0.8395826540593414, "grad_norm": 0.09793173521757126, "learning_rate": 0.0001, "loss": 0.1954, "step": 515 }, { "epoch": 0.8412129116400391, "grad_norm": 0.12155526131391525, "learning_rate": 0.0001, "loss": 0.2151, "step": 516 }, { "epoch": 0.8428431692207369, "grad_norm": 0.1365749090909958, "learning_rate": 0.0001, "loss": 0.2231, "step": 517 }, { "epoch": 0.8444734268014347, "grad_norm": 0.10344486683607101, "learning_rate": 0.0001, "loss": 0.1829, "step": 518 }, { "epoch": 0.8461036843821323, "grad_norm": 0.11081143468618393, "learning_rate": 0.0001, "loss": 0.2193, "step": 519 }, { "epoch": 0.8477339419628301, "grad_norm": 0.11781128495931625, "learning_rate": 0.0001, "loss": 0.2051, "step": 520 }, { "epoch": 0.8493641995435279, "grad_norm": 0.1616438925266266, "learning_rate": 0.0001, "loss": 0.2099, "step": 521 }, { "epoch": 0.8509944571242256, "grad_norm": 0.12249930948019028, "learning_rate": 0.0001, "loss": 0.2336, "step": 522 }, { "epoch": 0.8526247147049234, "grad_norm": 0.11882970482110977, "learning_rate": 0.0001, "loss": 0.2219, "step": 523 }, { "epoch": 0.8542549722856211, "grad_norm": 0.10900752991437912, "learning_rate": 0.0001, "loss": 0.2135, "step": 524 }, { "epoch": 0.8558852298663189, "grad_norm": 0.11639473587274551, "learning_rate": 0.0001, "loss": 0.2128, "step": 525 }, { "epoch": 0.8575154874470167, "grad_norm": 0.1093585193157196, "learning_rate": 0.0001, "loss": 0.2132, "step": 526 }, { "epoch": 0.8591457450277143, "grad_norm": 0.1086277961730957, "learning_rate": 0.0001, "loss": 0.228, "step": 527 }, { "epoch": 0.8607760026084121, "grad_norm": 0.10523232817649841, "learning_rate": 0.0001, "loss": 0.1957, "step": 528 }, { "epoch": 0.8624062601891099, "grad_norm": 0.10368810594081879, "learning_rate": 0.0001, "loss": 0.2233, "step": 529 }, { "epoch": 0.8640365177698076, "grad_norm": 0.09829546511173248, "learning_rate": 0.0001, "loss": 0.2051, "step": 530 }, { "epoch": 0.8656667753505054, "grad_norm": 0.12013105303049088, "learning_rate": 0.0001, "loss": 0.223, "step": 531 }, { "epoch": 0.8672970329312031, "grad_norm": 0.13377144932746887, "learning_rate": 0.0001, "loss": 0.2159, "step": 532 }, { "epoch": 0.8689272905119009, "grad_norm": 0.11236433684825897, "learning_rate": 0.0001, "loss": 0.2165, "step": 533 }, { "epoch": 0.8705575480925987, "grad_norm": 0.11153242737054825, "learning_rate": 0.0001, "loss": 0.2063, "step": 534 }, { "epoch": 0.8721878056732963, "grad_norm": 0.13448600471019745, "learning_rate": 0.0001, "loss": 0.2289, "step": 535 }, { "epoch": 0.8738180632539941, "grad_norm": 0.11363071203231812, "learning_rate": 0.0001, "loss": 0.2285, "step": 536 }, { "epoch": 0.8754483208346919, "grad_norm": 0.13538241386413574, "learning_rate": 0.0001, "loss": 0.2052, "step": 537 }, { "epoch": 0.8770785784153896, "grad_norm": 0.11804122477769852, "learning_rate": 0.0001, "loss": 0.1973, "step": 538 }, { "epoch": 0.8787088359960874, "grad_norm": 0.13027773797512054, "learning_rate": 0.0001, "loss": 0.2136, "step": 539 }, { "epoch": 0.8803390935767851, "grad_norm": 0.10185594856739044, "learning_rate": 0.0001, "loss": 0.2155, "step": 540 }, { "epoch": 0.8819693511574829, "grad_norm": 0.14260874688625336, "learning_rate": 0.0001, "loss": 0.2201, "step": 541 }, { "epoch": 0.8835996087381807, "grad_norm": 0.11737281084060669, "learning_rate": 0.0001, "loss": 0.2172, "step": 542 }, { "epoch": 0.8852298663188783, "grad_norm": 0.12430301308631897, "learning_rate": 0.0001, "loss": 0.2108, "step": 543 }, { "epoch": 0.8868601238995761, "grad_norm": 0.1229623332619667, "learning_rate": 0.0001, "loss": 0.214, "step": 544 }, { "epoch": 0.8884903814802739, "grad_norm": 0.1039818823337555, "learning_rate": 0.0001, "loss": 0.2094, "step": 545 }, { "epoch": 0.8901206390609716, "grad_norm": 0.11727850139141083, "learning_rate": 0.0001, "loss": 0.1965, "step": 546 }, { "epoch": 0.8917508966416694, "grad_norm": 0.11485180258750916, "learning_rate": 0.0001, "loss": 0.2127, "step": 547 }, { "epoch": 0.8933811542223671, "grad_norm": 0.13651061058044434, "learning_rate": 0.0001, "loss": 0.224, "step": 548 }, { "epoch": 0.8950114118030649, "grad_norm": 0.11545626074075699, "learning_rate": 0.0001, "loss": 0.219, "step": 549 }, { "epoch": 0.8966416693837627, "grad_norm": 0.10819267481565475, "learning_rate": 0.0001, "loss": 0.1873, "step": 550 }, { "epoch": 0.8982719269644603, "grad_norm": 0.12021040916442871, "learning_rate": 0.0001, "loss": 0.2109, "step": 551 }, { "epoch": 0.8999021845451581, "grad_norm": 0.10751428455114365, "learning_rate": 0.0001, "loss": 0.1989, "step": 552 }, { "epoch": 0.9015324421258559, "grad_norm": 0.11271633207798004, "learning_rate": 0.0001, "loss": 0.2074, "step": 553 }, { "epoch": 0.9031626997065536, "grad_norm": 0.11875735968351364, "learning_rate": 0.0001, "loss": 0.2087, "step": 554 }, { "epoch": 0.9047929572872514, "grad_norm": 0.1122482642531395, "learning_rate": 0.0001, "loss": 0.2114, "step": 555 }, { "epoch": 0.9064232148679492, "grad_norm": 0.13074877858161926, "learning_rate": 0.0001, "loss": 0.2083, "step": 556 }, { "epoch": 0.9080534724486469, "grad_norm": 0.10370821505784988, "learning_rate": 0.0001, "loss": 0.1987, "step": 557 }, { "epoch": 0.9096837300293447, "grad_norm": 0.11413192003965378, "learning_rate": 0.0001, "loss": 0.1996, "step": 558 }, { "epoch": 0.9113139876100423, "grad_norm": 0.11211336404085159, "learning_rate": 0.0001, "loss": 0.2042, "step": 559 }, { "epoch": 0.9129442451907401, "grad_norm": 0.11445408314466476, "learning_rate": 0.0001, "loss": 0.2132, "step": 560 }, { "epoch": 0.9145745027714379, "grad_norm": 0.11431146413087845, "learning_rate": 0.0001, "loss": 0.2208, "step": 561 }, { "epoch": 0.9162047603521356, "grad_norm": 0.12157010287046432, "learning_rate": 0.0001, "loss": 0.1936, "step": 562 }, { "epoch": 0.9178350179328334, "grad_norm": 0.10648233443498611, "learning_rate": 0.0001, "loss": 0.2021, "step": 563 }, { "epoch": 0.9194652755135312, "grad_norm": 0.1113019660115242, "learning_rate": 0.0001, "loss": 0.1996, "step": 564 }, { "epoch": 0.9210955330942289, "grad_norm": 0.1145205944776535, "learning_rate": 0.0001, "loss": 0.2106, "step": 565 }, { "epoch": 0.9227257906749267, "grad_norm": 0.11848907917737961, "learning_rate": 0.0001, "loss": 0.2165, "step": 566 }, { "epoch": 0.9243560482556243, "grad_norm": 0.10608423501253128, "learning_rate": 0.0001, "loss": 0.19, "step": 567 }, { "epoch": 0.9259863058363221, "grad_norm": 0.1123075932264328, "learning_rate": 0.0001, "loss": 0.2138, "step": 568 }, { "epoch": 0.9276165634170199, "grad_norm": 0.11272522807121277, "learning_rate": 0.0001, "loss": 0.2124, "step": 569 }, { "epoch": 0.9292468209977176, "grad_norm": 0.13107183575630188, "learning_rate": 0.0001, "loss": 0.1916, "step": 570 }, { "epoch": 0.9308770785784154, "grad_norm": 0.11995590478181839, "learning_rate": 0.0001, "loss": 0.2188, "step": 571 }, { "epoch": 0.9325073361591132, "grad_norm": 0.5125681161880493, "learning_rate": 0.0001, "loss": 0.2093, "step": 572 }, { "epoch": 0.9341375937398109, "grad_norm": 0.15056641399860382, "learning_rate": 0.0001, "loss": 0.216, "step": 573 }, { "epoch": 0.9357678513205087, "grad_norm": 0.13608971238136292, "learning_rate": 0.0001, "loss": 0.2072, "step": 574 }, { "epoch": 0.9373981089012063, "grad_norm": 0.11466903239488602, "learning_rate": 0.0001, "loss": 0.2098, "step": 575 }, { "epoch": 0.9390283664819041, "grad_norm": 0.12261076271533966, "learning_rate": 0.0001, "loss": 0.1975, "step": 576 }, { "epoch": 0.9406586240626019, "grad_norm": 0.11684715747833252, "learning_rate": 0.0001, "loss": 0.2048, "step": 577 }, { "epoch": 0.9422888816432996, "grad_norm": 0.11564768105745316, "learning_rate": 0.0001, "loss": 0.2082, "step": 578 }, { "epoch": 0.9439191392239974, "grad_norm": 0.10801440477371216, "learning_rate": 0.0001, "loss": 0.2089, "step": 579 }, { "epoch": 0.9455493968046952, "grad_norm": 0.11598918586969376, "learning_rate": 0.0001, "loss": 0.204, "step": 580 }, { "epoch": 0.9471796543853929, "grad_norm": 0.12915532290935516, "learning_rate": 0.0001, "loss": 0.2011, "step": 581 }, { "epoch": 0.9488099119660907, "grad_norm": 0.14291346073150635, "learning_rate": 0.0001, "loss": 0.2236, "step": 582 }, { "epoch": 0.9504401695467883, "grad_norm": 0.11026699841022491, "learning_rate": 0.0001, "loss": 0.2129, "step": 583 }, { "epoch": 0.9520704271274861, "grad_norm": 0.14368709921836853, "learning_rate": 0.0001, "loss": 0.2064, "step": 584 }, { "epoch": 0.9537006847081839, "grad_norm": 0.10485952347517014, "learning_rate": 0.0001, "loss": 0.2016, "step": 585 }, { "epoch": 0.9553309422888816, "grad_norm": 0.1238325983285904, "learning_rate": 0.0001, "loss": 0.2199, "step": 586 }, { "epoch": 0.9569611998695794, "grad_norm": 0.11727755516767502, "learning_rate": 0.0001, "loss": 0.2069, "step": 587 }, { "epoch": 0.9585914574502772, "grad_norm": 0.12809327244758606, "learning_rate": 0.0001, "loss": 0.2109, "step": 588 }, { "epoch": 0.9602217150309749, "grad_norm": 0.1122494712471962, "learning_rate": 0.0001, "loss": 0.2053, "step": 589 }, { "epoch": 0.9618519726116727, "grad_norm": 0.11698026210069656, "learning_rate": 0.0001, "loss": 0.2147, "step": 590 }, { "epoch": 0.9634822301923704, "grad_norm": 0.15164814889431, "learning_rate": 0.0001, "loss": 0.2159, "step": 591 }, { "epoch": 0.9651124877730681, "grad_norm": 0.18676526844501495, "learning_rate": 0.0001, "loss": 0.216, "step": 592 }, { "epoch": 0.9667427453537659, "grad_norm": 0.13141092658042908, "learning_rate": 0.0001, "loss": 0.2148, "step": 593 }, { "epoch": 0.9683730029344636, "grad_norm": 0.1369103491306305, "learning_rate": 0.0001, "loss": 0.2081, "step": 594 }, { "epoch": 0.9700032605151614, "grad_norm": 0.127171128988266, "learning_rate": 0.0001, "loss": 0.2147, "step": 595 }, { "epoch": 0.9716335180958592, "grad_norm": 0.11275147646665573, "learning_rate": 0.0001, "loss": 0.2078, "step": 596 }, { "epoch": 0.9732637756765569, "grad_norm": 0.12019902467727661, "learning_rate": 0.0001, "loss": 0.2128, "step": 597 }, { "epoch": 0.9748940332572547, "grad_norm": 0.11609843373298645, "learning_rate": 0.0001, "loss": 0.2217, "step": 598 }, { "epoch": 0.9765242908379524, "grad_norm": 0.10565144568681717, "learning_rate": 0.0001, "loss": 0.2131, "step": 599 }, { "epoch": 0.9781545484186501, "grad_norm": 0.1125059723854065, "learning_rate": 0.0001, "loss": 0.2077, "step": 600 }, { "epoch": 0.9797848059993479, "grad_norm": 0.10678024590015411, "learning_rate": 0.0001, "loss": 0.2137, "step": 601 }, { "epoch": 0.9814150635800456, "grad_norm": 0.11686601489782333, "learning_rate": 0.0001, "loss": 0.2093, "step": 602 }, { "epoch": 0.9830453211607434, "grad_norm": 0.16189296543598175, "learning_rate": 0.0001, "loss": 0.202, "step": 603 }, { "epoch": 0.9846755787414412, "grad_norm": 0.10957061499357224, "learning_rate": 0.0001, "loss": 0.2212, "step": 604 }, { "epoch": 0.9863058363221389, "grad_norm": 0.1792004108428955, "learning_rate": 0.0001, "loss": 0.2027, "step": 605 }, { "epoch": 0.9879360939028367, "grad_norm": 0.13383732736110687, "learning_rate": 0.0001, "loss": 0.2186, "step": 606 }, { "epoch": 0.9895663514835344, "grad_norm": 0.1217399537563324, "learning_rate": 0.0001, "loss": 0.2169, "step": 607 }, { "epoch": 0.9911966090642321, "grad_norm": 0.11006692051887512, "learning_rate": 0.0001, "loss": 0.2061, "step": 608 }, { "epoch": 0.9928268666449299, "grad_norm": 0.1142544150352478, "learning_rate": 0.0001, "loss": 0.2202, "step": 609 }, { "epoch": 0.9944571242256276, "grad_norm": 0.10690510272979736, "learning_rate": 0.0001, "loss": 0.2125, "step": 610 }, { "epoch": 0.9960873818063254, "grad_norm": 0.11827582865953445, "learning_rate": 0.0001, "loss": 0.2109, "step": 611 }, { "epoch": 0.9977176393870232, "grad_norm": 0.1035865843296051, "learning_rate": 0.0001, "loss": 0.2035, "step": 612 }, { "epoch": 0.9993478969677209, "grad_norm": 0.10805841535329819, "learning_rate": 0.0001, "loss": 0.2071, "step": 613 }, { "epoch": 1.0009781545484187, "grad_norm": 0.11635322868824005, "learning_rate": 0.0001, "loss": 0.2148, "step": 614 }, { "epoch": 1.0026084121291163, "grad_norm": 0.1252707839012146, "learning_rate": 0.0001, "loss": 0.201, "step": 615 }, { "epoch": 1.0042386697098142, "grad_norm": 0.11579035967588425, "learning_rate": 0.0001, "loss": 0.2082, "step": 616 }, { "epoch": 1.005868927290512, "grad_norm": 0.11776672303676605, "learning_rate": 0.0001, "loss": 0.2036, "step": 617 }, { "epoch": 1.0074991848712096, "grad_norm": 0.12210860103368759, "learning_rate": 0.0001, "loss": 0.1978, "step": 618 }, { "epoch": 1.0091294424519075, "grad_norm": 0.11382795125246048, "learning_rate": 0.0001, "loss": 0.2096, "step": 619 }, { "epoch": 1.0107597000326052, "grad_norm": 0.12547257542610168, "learning_rate": 0.0001, "loss": 0.2095, "step": 620 }, { "epoch": 1.0123899576133029, "grad_norm": 0.10857869684696198, "learning_rate": 0.0001, "loss": 0.1986, "step": 621 }, { "epoch": 1.0140202151940008, "grad_norm": 0.11652690917253494, "learning_rate": 0.0001, "loss": 0.2101, "step": 622 }, { "epoch": 1.0156504727746984, "grad_norm": 0.13851098716259003, "learning_rate": 0.0001, "loss": 0.2037, "step": 623 }, { "epoch": 1.0172807303553961, "grad_norm": 0.10633445531129837, "learning_rate": 0.0001, "loss": 0.1827, "step": 624 }, { "epoch": 1.0189109879360938, "grad_norm": 0.13127733767032623, "learning_rate": 0.0001, "loss": 0.2161, "step": 625 }, { "epoch": 1.0205412455167917, "grad_norm": 0.14254391193389893, "learning_rate": 0.0001, "loss": 0.1945, "step": 626 }, { "epoch": 1.0221715030974894, "grad_norm": 0.12574298679828644, "learning_rate": 0.0001, "loss": 0.212, "step": 627 }, { "epoch": 1.023801760678187, "grad_norm": 0.1356106996536255, "learning_rate": 0.0001, "loss": 0.2258, "step": 628 }, { "epoch": 1.025432018258885, "grad_norm": 0.11071506142616272, "learning_rate": 0.0001, "loss": 0.2125, "step": 629 }, { "epoch": 1.0270622758395827, "grad_norm": 0.12080221623182297, "learning_rate": 0.0001, "loss": 0.1954, "step": 630 }, { "epoch": 1.0286925334202803, "grad_norm": 0.11128834635019302, "learning_rate": 0.0001, "loss": 0.1884, "step": 631 }, { "epoch": 1.0303227910009782, "grad_norm": 0.13292762637138367, "learning_rate": 0.0001, "loss": 0.2087, "step": 632 }, { "epoch": 1.031953048581676, "grad_norm": 0.11642225086688995, "learning_rate": 0.0001, "loss": 0.2161, "step": 633 }, { "epoch": 1.0335833061623736, "grad_norm": 0.12899143993854523, "learning_rate": 0.0001, "loss": 0.201, "step": 634 }, { "epoch": 1.0352135637430715, "grad_norm": 0.11403586715459824, "learning_rate": 0.0001, "loss": 0.1926, "step": 635 }, { "epoch": 1.0368438213237692, "grad_norm": 0.13663539290428162, "learning_rate": 0.0001, "loss": 0.1902, "step": 636 }, { "epoch": 1.0384740789044669, "grad_norm": 0.11796414107084274, "learning_rate": 0.0001, "loss": 0.1918, "step": 637 }, { "epoch": 1.0401043364851648, "grad_norm": 0.11379235237836838, "learning_rate": 0.0001, "loss": 0.1748, "step": 638 }, { "epoch": 1.0417345940658624, "grad_norm": 0.11975998431444168, "learning_rate": 0.0001, "loss": 0.1937, "step": 639 }, { "epoch": 1.0433648516465601, "grad_norm": 0.12381618469953537, "learning_rate": 0.0001, "loss": 0.202, "step": 640 }, { "epoch": 1.0449951092272578, "grad_norm": 0.14013221859931946, "learning_rate": 0.0001, "loss": 0.2016, "step": 641 }, { "epoch": 1.0466253668079557, "grad_norm": 0.14774538576602936, "learning_rate": 0.0001, "loss": 0.2068, "step": 642 }, { "epoch": 1.0482556243886534, "grad_norm": 0.10894253849983215, "learning_rate": 0.0001, "loss": 0.1881, "step": 643 }, { "epoch": 1.049885881969351, "grad_norm": 0.12854108214378357, "learning_rate": 0.0001, "loss": 0.2095, "step": 644 }, { "epoch": 1.051516139550049, "grad_norm": 0.1026046872138977, "learning_rate": 0.0001, "loss": 0.1977, "step": 645 }, { "epoch": 1.0531463971307466, "grad_norm": 0.11042814701795578, "learning_rate": 0.0001, "loss": 0.1953, "step": 646 }, { "epoch": 1.0547766547114443, "grad_norm": 0.11693835258483887, "learning_rate": 0.0001, "loss": 0.2152, "step": 647 }, { "epoch": 1.0564069122921422, "grad_norm": 0.11914242058992386, "learning_rate": 0.0001, "loss": 0.1962, "step": 648 }, { "epoch": 1.05803716987284, "grad_norm": 0.11633922904729843, "learning_rate": 0.0001, "loss": 0.1924, "step": 649 }, { "epoch": 1.0596674274535376, "grad_norm": 0.10438397526741028, "learning_rate": 0.0001, "loss": 0.1942, "step": 650 }, { "epoch": 1.0612976850342355, "grad_norm": 0.14384230971336365, "learning_rate": 0.0001, "loss": 0.2175, "step": 651 }, { "epoch": 1.0629279426149332, "grad_norm": 0.13774406909942627, "learning_rate": 0.0001, "loss": 0.208, "step": 652 }, { "epoch": 1.0645582001956309, "grad_norm": 0.1285228431224823, "learning_rate": 0.0001, "loss": 0.1938, "step": 653 }, { "epoch": 1.0661884577763288, "grad_norm": 0.1321713626384735, "learning_rate": 0.0001, "loss": 0.1978, "step": 654 }, { "epoch": 1.0678187153570264, "grad_norm": 0.13438454270362854, "learning_rate": 0.0001, "loss": 0.2087, "step": 655 }, { "epoch": 1.0694489729377241, "grad_norm": 0.12169782817363739, "learning_rate": 0.0001, "loss": 0.1867, "step": 656 }, { "epoch": 1.0710792305184218, "grad_norm": 0.12447503954172134, "learning_rate": 0.0001, "loss": 0.2029, "step": 657 }, { "epoch": 1.0727094880991197, "grad_norm": 0.135478213429451, "learning_rate": 0.0001, "loss": 0.2053, "step": 658 }, { "epoch": 1.0743397456798174, "grad_norm": 0.11875299364328384, "learning_rate": 0.0001, "loss": 0.2127, "step": 659 }, { "epoch": 1.075970003260515, "grad_norm": 0.13131263852119446, "learning_rate": 0.0001, "loss": 0.1888, "step": 660 }, { "epoch": 1.077600260841213, "grad_norm": 0.10770665854215622, "learning_rate": 0.0001, "loss": 0.189, "step": 661 }, { "epoch": 1.0792305184219106, "grad_norm": 0.11619222909212112, "learning_rate": 0.0001, "loss": 0.2015, "step": 662 }, { "epoch": 1.0808607760026083, "grad_norm": 0.10690273344516754, "learning_rate": 0.0001, "loss": 0.1916, "step": 663 }, { "epoch": 1.0824910335833062, "grad_norm": 0.13410069048404694, "learning_rate": 0.0001, "loss": 0.1988, "step": 664 }, { "epoch": 1.084121291164004, "grad_norm": 0.12140218168497086, "learning_rate": 0.0001, "loss": 0.1968, "step": 665 }, { "epoch": 1.0857515487447016, "grad_norm": 0.13700392842292786, "learning_rate": 0.0001, "loss": 0.2007, "step": 666 }, { "epoch": 1.0873818063253995, "grad_norm": 0.12108683586120605, "learning_rate": 0.0001, "loss": 0.2057, "step": 667 }, { "epoch": 1.0890120639060972, "grad_norm": 0.24911844730377197, "learning_rate": 0.0001, "loss": 0.1774, "step": 668 }, { "epoch": 1.0906423214867949, "grad_norm": 0.1707676649093628, "learning_rate": 0.0001, "loss": 0.2091, "step": 669 }, { "epoch": 1.0922725790674928, "grad_norm": 0.1396542340517044, "learning_rate": 0.0001, "loss": 0.206, "step": 670 }, { "epoch": 1.0939028366481904, "grad_norm": 0.12876304984092712, "learning_rate": 0.0001, "loss": 0.1981, "step": 671 }, { "epoch": 1.0955330942288881, "grad_norm": 0.16617383062839508, "learning_rate": 0.0001, "loss": 0.2049, "step": 672 }, { "epoch": 1.097163351809586, "grad_norm": 0.1533372402191162, "learning_rate": 0.0001, "loss": 0.2143, "step": 673 }, { "epoch": 1.0987936093902837, "grad_norm": 0.16476184129714966, "learning_rate": 0.0001, "loss": 0.1948, "step": 674 }, { "epoch": 1.1004238669709814, "grad_norm": 0.27153271436691284, "learning_rate": 0.0001, "loss": 0.1948, "step": 675 }, { "epoch": 1.102054124551679, "grad_norm": 0.14181676506996155, "learning_rate": 0.0001, "loss": 0.2063, "step": 676 }, { "epoch": 1.103684382132377, "grad_norm": 0.1463928073644638, "learning_rate": 0.0001, "loss": 0.1861, "step": 677 }, { "epoch": 1.1053146397130746, "grad_norm": 0.12665878236293793, "learning_rate": 0.0001, "loss": 0.201, "step": 678 }, { "epoch": 1.1069448972937723, "grad_norm": 0.12541510164737701, "learning_rate": 0.0001, "loss": 0.2077, "step": 679 }, { "epoch": 1.1085751548744702, "grad_norm": 0.11624184995889664, "learning_rate": 0.0001, "loss": 0.1958, "step": 680 }, { "epoch": 1.110205412455168, "grad_norm": 0.11761149019002914, "learning_rate": 0.0001, "loss": 0.1957, "step": 681 }, { "epoch": 1.1118356700358656, "grad_norm": 0.12375003099441528, "learning_rate": 0.0001, "loss": 0.2028, "step": 682 }, { "epoch": 1.1134659276165635, "grad_norm": 0.12303782999515533, "learning_rate": 0.0001, "loss": 0.2031, "step": 683 }, { "epoch": 1.1150961851972612, "grad_norm": 0.11100725084543228, "learning_rate": 0.0001, "loss": 0.2101, "step": 684 }, { "epoch": 1.1167264427779588, "grad_norm": 0.09348457306623459, "learning_rate": 0.0001, "loss": 0.1749, "step": 685 }, { "epoch": 1.1183567003586568, "grad_norm": 0.1131131500005722, "learning_rate": 0.0001, "loss": 0.2038, "step": 686 }, { "epoch": 1.1199869579393544, "grad_norm": 0.15573875606060028, "learning_rate": 0.0001, "loss": 0.2093, "step": 687 }, { "epoch": 1.1216172155200521, "grad_norm": 0.24191588163375854, "learning_rate": 0.0001, "loss": 0.1905, "step": 688 }, { "epoch": 1.12324747310075, "grad_norm": 0.14230898022651672, "learning_rate": 0.0001, "loss": 0.2029, "step": 689 }, { "epoch": 1.1248777306814477, "grad_norm": 0.1414661705493927, "learning_rate": 0.0001, "loss": 0.2048, "step": 690 }, { "epoch": 1.1265079882621454, "grad_norm": 0.12486743181943893, "learning_rate": 0.0001, "loss": 0.1719, "step": 691 }, { "epoch": 1.1281382458428433, "grad_norm": 0.12714742124080658, "learning_rate": 0.0001, "loss": 0.1963, "step": 692 }, { "epoch": 1.129768503423541, "grad_norm": 0.12481556832790375, "learning_rate": 0.0001, "loss": 0.185, "step": 693 }, { "epoch": 1.1313987610042386, "grad_norm": 0.10976168513298035, "learning_rate": 0.0001, "loss": 0.2018, "step": 694 }, { "epoch": 1.1330290185849363, "grad_norm": 0.11793534457683563, "learning_rate": 0.0001, "loss": 0.1985, "step": 695 }, { "epoch": 1.1346592761656342, "grad_norm": 0.22251418232917786, "learning_rate": 0.0001, "loss": 0.2008, "step": 696 }, { "epoch": 1.136289533746332, "grad_norm": 0.14538496732711792, "learning_rate": 0.0001, "loss": 0.192, "step": 697 }, { "epoch": 1.1379197913270296, "grad_norm": 0.1503850817680359, "learning_rate": 0.0001, "loss": 0.2001, "step": 698 }, { "epoch": 1.1395500489077275, "grad_norm": 0.1038767620921135, "learning_rate": 0.0001, "loss": 0.1924, "step": 699 }, { "epoch": 1.1411803064884252, "grad_norm": 0.12644357979297638, "learning_rate": 0.0001, "loss": 0.1931, "step": 700 }, { "epoch": 1.1428105640691228, "grad_norm": 0.13702809810638428, "learning_rate": 0.0001, "loss": 0.1887, "step": 701 }, { "epoch": 1.1444408216498207, "grad_norm": 0.10647303611040115, "learning_rate": 0.0001, "loss": 0.1907, "step": 702 }, { "epoch": 1.1460710792305184, "grad_norm": 0.12528447806835175, "learning_rate": 0.0001, "loss": 0.2013, "step": 703 }, { "epoch": 1.147701336811216, "grad_norm": 0.21072790026664734, "learning_rate": 0.0001, "loss": 0.1979, "step": 704 }, { "epoch": 1.149331594391914, "grad_norm": 0.14460940659046173, "learning_rate": 0.0001, "loss": 0.1974, "step": 705 }, { "epoch": 1.1509618519726117, "grad_norm": 0.14423388242721558, "learning_rate": 0.0001, "loss": 0.2138, "step": 706 }, { "epoch": 1.1525921095533094, "grad_norm": 0.13311851024627686, "learning_rate": 0.0001, "loss": 0.197, "step": 707 }, { "epoch": 1.154222367134007, "grad_norm": 0.2122778594493866, "learning_rate": 0.0001, "loss": 0.1845, "step": 708 }, { "epoch": 1.155852624714705, "grad_norm": 0.12857376039028168, "learning_rate": 0.0001, "loss": 0.1888, "step": 709 }, { "epoch": 1.1574828822954026, "grad_norm": 0.1168324202299118, "learning_rate": 0.0001, "loss": 0.1844, "step": 710 }, { "epoch": 1.1591131398761005, "grad_norm": 0.1203528344631195, "learning_rate": 0.0001, "loss": 0.2015, "step": 711 }, { "epoch": 1.1607433974567982, "grad_norm": 0.10224917531013489, "learning_rate": 0.0001, "loss": 0.1806, "step": 712 }, { "epoch": 1.162373655037496, "grad_norm": 0.13524509966373444, "learning_rate": 0.0001, "loss": 0.1919, "step": 713 }, { "epoch": 1.1640039126181936, "grad_norm": 0.20357482135295868, "learning_rate": 0.0001, "loss": 0.1938, "step": 714 }, { "epoch": 1.1656341701988915, "grad_norm": 0.11235658079385757, "learning_rate": 0.0001, "loss": 0.169, "step": 715 }, { "epoch": 1.1672644277795892, "grad_norm": 0.12972696125507355, "learning_rate": 0.0001, "loss": 0.1819, "step": 716 }, { "epoch": 1.1688946853602868, "grad_norm": 0.128658264875412, "learning_rate": 0.0001, "loss": 0.2, "step": 717 }, { "epoch": 1.1705249429409847, "grad_norm": 0.13908959925174713, "learning_rate": 0.0001, "loss": 0.2102, "step": 718 }, { "epoch": 1.1721552005216824, "grad_norm": 0.12704800069332123, "learning_rate": 0.0001, "loss": 0.1957, "step": 719 }, { "epoch": 1.17378545810238, "grad_norm": 0.11856772005558014, "learning_rate": 0.0001, "loss": 0.1963, "step": 720 }, { "epoch": 1.175415715683078, "grad_norm": 0.11944851279258728, "learning_rate": 0.0001, "loss": 0.203, "step": 721 }, { "epoch": 1.1770459732637757, "grad_norm": 0.12153194844722748, "learning_rate": 0.0001, "loss": 0.2094, "step": 722 }, { "epoch": 1.1786762308444734, "grad_norm": 0.09558034688234329, "learning_rate": 0.0001, "loss": 0.1688, "step": 723 }, { "epoch": 1.1803064884251713, "grad_norm": 0.1317276656627655, "learning_rate": 0.0001, "loss": 0.2023, "step": 724 }, { "epoch": 1.181936746005869, "grad_norm": 0.10799970477819443, "learning_rate": 0.0001, "loss": 0.201, "step": 725 }, { "epoch": 1.1835670035865666, "grad_norm": 0.11460768431425095, "learning_rate": 0.0001, "loss": 0.1917, "step": 726 }, { "epoch": 1.1851972611672643, "grad_norm": 0.10214045643806458, "learning_rate": 0.0001, "loss": 0.1861, "step": 727 }, { "epoch": 1.1868275187479622, "grad_norm": 0.11441674083471298, "learning_rate": 0.0001, "loss": 0.2121, "step": 728 }, { "epoch": 1.18845777632866, "grad_norm": 0.1123800054192543, "learning_rate": 0.0001, "loss": 0.2038, "step": 729 }, { "epoch": 1.1900880339093578, "grad_norm": 0.11834810674190521, "learning_rate": 0.0001, "loss": 0.1871, "step": 730 }, { "epoch": 1.1917182914900555, "grad_norm": 0.12345534563064575, "learning_rate": 0.0001, "loss": 0.2092, "step": 731 }, { "epoch": 1.1933485490707532, "grad_norm": 0.11334647983312607, "learning_rate": 0.0001, "loss": 0.1852, "step": 732 }, { "epoch": 1.1949788066514508, "grad_norm": 0.13123269379138947, "learning_rate": 0.0001, "loss": 0.2042, "step": 733 }, { "epoch": 1.1966090642321487, "grad_norm": 0.14825505018234253, "learning_rate": 0.0001, "loss": 0.1988, "step": 734 }, { "epoch": 1.1982393218128464, "grad_norm": 0.17332376539707184, "learning_rate": 0.0001, "loss": 0.2023, "step": 735 }, { "epoch": 1.199869579393544, "grad_norm": 0.1413118690252304, "learning_rate": 0.0001, "loss": 0.2003, "step": 736 }, { "epoch": 1.201499836974242, "grad_norm": 0.14043155312538147, "learning_rate": 0.0001, "loss": 0.1924, "step": 737 }, { "epoch": 1.2031300945549397, "grad_norm": 0.11321795731782913, "learning_rate": 0.0001, "loss": 0.1867, "step": 738 }, { "epoch": 1.2047603521356374, "grad_norm": 0.11889634281396866, "learning_rate": 0.0001, "loss": 0.1978, "step": 739 }, { "epoch": 1.2063906097163353, "grad_norm": 0.14727137982845306, "learning_rate": 0.0001, "loss": 0.2097, "step": 740 }, { "epoch": 1.208020867297033, "grad_norm": 0.1467553675174713, "learning_rate": 0.0001, "loss": 0.1998, "step": 741 }, { "epoch": 1.2096511248777306, "grad_norm": 0.12245435267686844, "learning_rate": 0.0001, "loss": 0.1971, "step": 742 }, { "epoch": 1.2112813824584285, "grad_norm": 0.12549766898155212, "learning_rate": 0.0001, "loss": 0.1981, "step": 743 }, { "epoch": 1.2129116400391262, "grad_norm": 0.11142990738153458, "learning_rate": 0.0001, "loss": 0.2054, "step": 744 }, { "epoch": 1.214541897619824, "grad_norm": 0.11524354666471481, "learning_rate": 0.0001, "loss": 0.1892, "step": 745 }, { "epoch": 1.2161721552005216, "grad_norm": 0.15625351667404175, "learning_rate": 0.0001, "loss": 0.1998, "step": 746 }, { "epoch": 1.2178024127812195, "grad_norm": 0.10953667014837265, "learning_rate": 0.0001, "loss": 0.1994, "step": 747 }, { "epoch": 1.2194326703619172, "grad_norm": 0.13718314468860626, "learning_rate": 0.0001, "loss": 0.1971, "step": 748 }, { "epoch": 1.2210629279426148, "grad_norm": 0.13286104798316956, "learning_rate": 0.0001, "loss": 0.2169, "step": 749 }, { "epoch": 1.2226931855233127, "grad_norm": 0.122381292283535, "learning_rate": 0.0001, "loss": 0.1844, "step": 750 }, { "epoch": 1.2243234431040104, "grad_norm": 0.13149864971637726, "learning_rate": 0.0001, "loss": 0.1999, "step": 751 }, { "epoch": 1.225953700684708, "grad_norm": 0.129011869430542, "learning_rate": 0.0001, "loss": 0.2067, "step": 752 }, { "epoch": 1.227583958265406, "grad_norm": 0.12609954178333282, "learning_rate": 0.0001, "loss": 0.1957, "step": 753 }, { "epoch": 1.2292142158461037, "grad_norm": 0.12401840835809708, "learning_rate": 0.0001, "loss": 0.2102, "step": 754 }, { "epoch": 1.2308444734268014, "grad_norm": 0.1191437765955925, "learning_rate": 0.0001, "loss": 0.1977, "step": 755 }, { "epoch": 1.2324747310074993, "grad_norm": 0.11774501204490662, "learning_rate": 0.0001, "loss": 0.1844, "step": 756 }, { "epoch": 1.234104988588197, "grad_norm": 0.09502781182527542, "learning_rate": 0.0001, "loss": 0.1859, "step": 757 }, { "epoch": 1.2357352461688946, "grad_norm": 0.12842731177806854, "learning_rate": 0.0001, "loss": 0.1856, "step": 758 }, { "epoch": 1.2373655037495925, "grad_norm": 0.1238941103219986, "learning_rate": 0.0001, "loss": 0.2036, "step": 759 }, { "epoch": 1.2389957613302902, "grad_norm": 0.120622418820858, "learning_rate": 0.0001, "loss": 0.1904, "step": 760 }, { "epoch": 1.240626018910988, "grad_norm": 0.16394123435020447, "learning_rate": 0.0001, "loss": 0.2024, "step": 761 }, { "epoch": 1.2422562764916858, "grad_norm": 0.13155362010002136, "learning_rate": 0.0001, "loss": 0.1941, "step": 762 }, { "epoch": 1.2438865340723835, "grad_norm": 0.11317794770002365, "learning_rate": 0.0001, "loss": 0.1994, "step": 763 }, { "epoch": 1.2455167916530812, "grad_norm": 0.1305047571659088, "learning_rate": 0.0001, "loss": 0.2106, "step": 764 }, { "epoch": 1.2471470492337788, "grad_norm": 0.12853112816810608, "learning_rate": 0.0001, "loss": 0.1993, "step": 765 }, { "epoch": 1.2487773068144767, "grad_norm": 0.12205260246992111, "learning_rate": 0.0001, "loss": 0.195, "step": 766 }, { "epoch": 1.2504075643951744, "grad_norm": 0.12035010755062103, "learning_rate": 0.0001, "loss": 0.2054, "step": 767 }, { "epoch": 1.2520378219758723, "grad_norm": 0.12386268377304077, "learning_rate": 0.0001, "loss": 0.2047, "step": 768 }, { "epoch": 1.25366807955657, "grad_norm": 0.11850553750991821, "learning_rate": 0.0001, "loss": 0.2104, "step": 769 }, { "epoch": 1.2552983371372677, "grad_norm": 0.12291909754276276, "learning_rate": 0.0001, "loss": 0.2073, "step": 770 }, { "epoch": 1.2569285947179654, "grad_norm": 0.11463002860546112, "learning_rate": 0.0001, "loss": 0.1904, "step": 771 }, { "epoch": 1.2585588522986633, "grad_norm": 0.11392191052436829, "learning_rate": 0.0001, "loss": 0.1974, "step": 772 }, { "epoch": 1.260189109879361, "grad_norm": 0.11817040294408798, "learning_rate": 0.0001, "loss": 0.1994, "step": 773 }, { "epoch": 1.2618193674600586, "grad_norm": 0.11032837629318237, "learning_rate": 0.0001, "loss": 0.204, "step": 774 }, { "epoch": 1.2634496250407565, "grad_norm": 0.12903887033462524, "learning_rate": 0.0001, "loss": 0.2054, "step": 775 }, { "epoch": 1.2650798826214542, "grad_norm": 0.10510163009166718, "learning_rate": 0.0001, "loss": 0.1917, "step": 776 }, { "epoch": 1.266710140202152, "grad_norm": 0.10939998179674149, "learning_rate": 0.0001, "loss": 0.1831, "step": 777 }, { "epoch": 1.2683403977828496, "grad_norm": 0.13864178955554962, "learning_rate": 0.0001, "loss": 0.1995, "step": 778 }, { "epoch": 1.2699706553635475, "grad_norm": 0.13182048499584198, "learning_rate": 0.0001, "loss": 0.1982, "step": 779 }, { "epoch": 1.2716009129442452, "grad_norm": 0.13241499662399292, "learning_rate": 0.0001, "loss": 0.2119, "step": 780 }, { "epoch": 1.273231170524943, "grad_norm": 0.10950156301259995, "learning_rate": 0.0001, "loss": 0.1905, "step": 781 }, { "epoch": 1.2748614281056407, "grad_norm": 0.1152586042881012, "learning_rate": 0.0001, "loss": 0.2138, "step": 782 }, { "epoch": 1.2764916856863384, "grad_norm": 0.09927522391080856, "learning_rate": 0.0001, "loss": 0.1856, "step": 783 }, { "epoch": 1.278121943267036, "grad_norm": 0.12837468087673187, "learning_rate": 0.0001, "loss": 0.2013, "step": 784 }, { "epoch": 1.279752200847734, "grad_norm": 0.11714199930429459, "learning_rate": 0.0001, "loss": 0.1913, "step": 785 }, { "epoch": 1.2813824584284317, "grad_norm": 0.1406959891319275, "learning_rate": 0.0001, "loss": 0.204, "step": 786 }, { "epoch": 1.2830127160091294, "grad_norm": 0.11594724655151367, "learning_rate": 0.0001, "loss": 0.1758, "step": 787 }, { "epoch": 1.2846429735898273, "grad_norm": 0.1368754804134369, "learning_rate": 0.0001, "loss": 0.2067, "step": 788 }, { "epoch": 1.286273231170525, "grad_norm": 0.14420491456985474, "learning_rate": 0.0001, "loss": 0.1757, "step": 789 }, { "epoch": 1.2879034887512226, "grad_norm": 0.11989482492208481, "learning_rate": 0.0001, "loss": 0.1968, "step": 790 }, { "epoch": 1.2895337463319203, "grad_norm": 0.09911071509122849, "learning_rate": 0.0001, "loss": 0.1907, "step": 791 }, { "epoch": 1.2911640039126182, "grad_norm": 0.10733790695667267, "learning_rate": 0.0001, "loss": 0.1819, "step": 792 }, { "epoch": 1.2927942614933159, "grad_norm": 0.12739041447639465, "learning_rate": 0.0001, "loss": 0.2038, "step": 793 }, { "epoch": 1.2944245190740138, "grad_norm": 0.12702591717243195, "learning_rate": 0.0001, "loss": 0.1899, "step": 794 }, { "epoch": 1.2960547766547115, "grad_norm": 0.10774042457342148, "learning_rate": 0.0001, "loss": 0.2061, "step": 795 }, { "epoch": 1.2976850342354092, "grad_norm": 0.14348004758358002, "learning_rate": 0.0001, "loss": 0.2007, "step": 796 }, { "epoch": 1.2993152918161068, "grad_norm": 0.11154497414827347, "learning_rate": 0.0001, "loss": 0.1888, "step": 797 }, { "epoch": 1.3009455493968047, "grad_norm": 0.11274070292711258, "learning_rate": 0.0001, "loss": 0.199, "step": 798 }, { "epoch": 1.3025758069775024, "grad_norm": 0.12772290408611298, "learning_rate": 0.0001, "loss": 0.1994, "step": 799 }, { "epoch": 1.3042060645582003, "grad_norm": 0.138117715716362, "learning_rate": 0.0001, "loss": 0.2096, "step": 800 }, { "epoch": 1.305836322138898, "grad_norm": 0.11759902536869049, "learning_rate": 0.0001, "loss": 0.1935, "step": 801 }, { "epoch": 1.3074665797195957, "grad_norm": 0.1468537598848343, "learning_rate": 0.0001, "loss": 0.1878, "step": 802 }, { "epoch": 1.3090968373002934, "grad_norm": 0.19811546802520752, "learning_rate": 0.0001, "loss": 0.1843, "step": 803 }, { "epoch": 1.3107270948809913, "grad_norm": 0.11782370507717133, "learning_rate": 0.0001, "loss": 0.1935, "step": 804 }, { "epoch": 1.312357352461689, "grad_norm": 0.14963723719120026, "learning_rate": 0.0001, "loss": 0.2077, "step": 805 }, { "epoch": 1.3139876100423866, "grad_norm": 0.12052904069423676, "learning_rate": 0.0001, "loss": 0.1899, "step": 806 }, { "epoch": 1.3156178676230845, "grad_norm": 0.13625605404376984, "learning_rate": 0.0001, "loss": 0.2156, "step": 807 }, { "epoch": 1.3172481252037822, "grad_norm": 0.13118813931941986, "learning_rate": 0.0001, "loss": 0.2115, "step": 808 }, { "epoch": 1.3188783827844799, "grad_norm": 0.12743224203586578, "learning_rate": 0.0001, "loss": 0.2083, "step": 809 }, { "epoch": 1.3205086403651776, "grad_norm": 0.10934583097696304, "learning_rate": 0.0001, "loss": 0.1803, "step": 810 }, { "epoch": 1.3221388979458755, "grad_norm": 0.12755925953388214, "learning_rate": 0.0001, "loss": 0.195, "step": 811 }, { "epoch": 1.3237691555265731, "grad_norm": 0.13710075616836548, "learning_rate": 0.0001, "loss": 0.1967, "step": 812 }, { "epoch": 1.325399413107271, "grad_norm": 0.11187458783388138, "learning_rate": 0.0001, "loss": 0.1986, "step": 813 }, { "epoch": 1.3270296706879687, "grad_norm": 0.12023862451314926, "learning_rate": 0.0001, "loss": 0.2037, "step": 814 }, { "epoch": 1.3286599282686664, "grad_norm": 0.13946884870529175, "learning_rate": 0.0001, "loss": 0.2077, "step": 815 }, { "epoch": 1.330290185849364, "grad_norm": 0.1312728077173233, "learning_rate": 0.0001, "loss": 0.2021, "step": 816 }, { "epoch": 1.331920443430062, "grad_norm": 0.12684758007526398, "learning_rate": 0.0001, "loss": 0.2044, "step": 817 }, { "epoch": 1.3335507010107597, "grad_norm": 0.11161263287067413, "learning_rate": 0.0001, "loss": 0.1781, "step": 818 }, { "epoch": 1.3351809585914576, "grad_norm": 0.11830423027276993, "learning_rate": 0.0001, "loss": 0.2042, "step": 819 }, { "epoch": 1.3368112161721553, "grad_norm": 0.13050754368305206, "learning_rate": 0.0001, "loss": 0.2053, "step": 820 }, { "epoch": 1.338441473752853, "grad_norm": 0.11719314008951187, "learning_rate": 0.0001, "loss": 0.1935, "step": 821 }, { "epoch": 1.3400717313335506, "grad_norm": 0.1417597383260727, "learning_rate": 0.0001, "loss": 0.1915, "step": 822 }, { "epoch": 1.3417019889142485, "grad_norm": 0.10985542833805084, "learning_rate": 0.0001, "loss": 0.2057, "step": 823 }, { "epoch": 1.3433322464949462, "grad_norm": 0.1182430163025856, "learning_rate": 0.0001, "loss": 0.2075, "step": 824 }, { "epoch": 1.3449625040756439, "grad_norm": 0.11384329944849014, "learning_rate": 0.0001, "loss": 0.1973, "step": 825 }, { "epoch": 1.3465927616563418, "grad_norm": 0.13239991664886475, "learning_rate": 0.0001, "loss": 0.2035, "step": 826 }, { "epoch": 1.3482230192370395, "grad_norm": 0.11426941305398941, "learning_rate": 0.0001, "loss": 0.196, "step": 827 }, { "epoch": 1.3498532768177371, "grad_norm": 0.11955778300762177, "learning_rate": 0.0001, "loss": 0.2019, "step": 828 }, { "epoch": 1.3514835343984348, "grad_norm": 0.11563610285520554, "learning_rate": 0.0001, "loss": 0.2093, "step": 829 }, { "epoch": 1.3531137919791327, "grad_norm": 0.09637649357318878, "learning_rate": 0.0001, "loss": 0.1875, "step": 830 }, { "epoch": 1.3547440495598304, "grad_norm": 0.12103250622749329, "learning_rate": 0.0001, "loss": 0.2025, "step": 831 }, { "epoch": 1.3563743071405283, "grad_norm": 0.11947387456893921, "learning_rate": 0.0001, "loss": 0.1826, "step": 832 }, { "epoch": 1.358004564721226, "grad_norm": 0.12026092410087585, "learning_rate": 0.0001, "loss": 0.2012, "step": 833 }, { "epoch": 1.3596348223019237, "grad_norm": 0.12217531353235245, "learning_rate": 0.0001, "loss": 0.2122, "step": 834 }, { "epoch": 1.3612650798826214, "grad_norm": 0.11990203708410263, "learning_rate": 0.0001, "loss": 0.1959, "step": 835 }, { "epoch": 1.3628953374633193, "grad_norm": 0.12255218625068665, "learning_rate": 0.0001, "loss": 0.2043, "step": 836 }, { "epoch": 1.364525595044017, "grad_norm": 0.13003982603549957, "learning_rate": 0.0001, "loss": 0.1988, "step": 837 }, { "epoch": 1.3661558526247148, "grad_norm": 0.12093237787485123, "learning_rate": 0.0001, "loss": 0.197, "step": 838 }, { "epoch": 1.3677861102054125, "grad_norm": 0.11946821212768555, "learning_rate": 0.0001, "loss": 0.2038, "step": 839 }, { "epoch": 1.3694163677861102, "grad_norm": 0.18368889391422272, "learning_rate": 0.0001, "loss": 0.1706, "step": 840 }, { "epoch": 1.3710466253668079, "grad_norm": 0.1539110541343689, "learning_rate": 0.0001, "loss": 0.1916, "step": 841 }, { "epoch": 1.3726768829475058, "grad_norm": 0.11917300522327423, "learning_rate": 0.0001, "loss": 0.1994, "step": 842 }, { "epoch": 1.3743071405282035, "grad_norm": 0.1315378099679947, "learning_rate": 0.0001, "loss": 0.1937, "step": 843 }, { "epoch": 1.3759373981089011, "grad_norm": 0.13452459871768951, "learning_rate": 0.0001, "loss": 0.2124, "step": 844 }, { "epoch": 1.377567655689599, "grad_norm": 0.11259663850069046, "learning_rate": 0.0001, "loss": 0.1719, "step": 845 }, { "epoch": 1.3791979132702967, "grad_norm": 0.12540358304977417, "learning_rate": 0.0001, "loss": 0.1837, "step": 846 }, { "epoch": 1.3808281708509944, "grad_norm": 0.1402936577796936, "learning_rate": 0.0001, "loss": 0.2006, "step": 847 }, { "epoch": 1.382458428431692, "grad_norm": 0.13874374330043793, "learning_rate": 0.0001, "loss": 0.1884, "step": 848 }, { "epoch": 1.38408868601239, "grad_norm": 0.12956838309764862, "learning_rate": 0.0001, "loss": 0.1886, "step": 849 }, { "epoch": 1.3857189435930877, "grad_norm": 0.12310940027236938, "learning_rate": 0.0001, "loss": 0.1776, "step": 850 }, { "epoch": 1.3873492011737856, "grad_norm": 0.13739284873008728, "learning_rate": 0.0001, "loss": 0.207, "step": 851 }, { "epoch": 1.3889794587544833, "grad_norm": 0.1304449588060379, "learning_rate": 0.0001, "loss": 0.1869, "step": 852 }, { "epoch": 1.390609716335181, "grad_norm": 0.12641535699367523, "learning_rate": 0.0001, "loss": 0.1935, "step": 853 }, { "epoch": 1.3922399739158786, "grad_norm": 0.11408405750989914, "learning_rate": 0.0001, "loss": 0.1856, "step": 854 }, { "epoch": 1.3938702314965765, "grad_norm": 0.1254139244556427, "learning_rate": 0.0001, "loss": 0.2097, "step": 855 }, { "epoch": 1.3955004890772742, "grad_norm": 0.1231856718659401, "learning_rate": 0.0001, "loss": 0.1793, "step": 856 }, { "epoch": 1.3971307466579719, "grad_norm": 0.1396886110305786, "learning_rate": 0.0001, "loss": 0.2226, "step": 857 }, { "epoch": 1.3987610042386698, "grad_norm": 0.12817426025867462, "learning_rate": 0.0001, "loss": 0.2079, "step": 858 }, { "epoch": 1.4003912618193675, "grad_norm": 0.13148726522922516, "learning_rate": 0.0001, "loss": 0.1935, "step": 859 }, { "epoch": 1.4020215194000651, "grad_norm": 0.13093724846839905, "learning_rate": 0.0001, "loss": 0.2066, "step": 860 }, { "epoch": 1.4036517769807628, "grad_norm": 0.12283019721508026, "learning_rate": 0.0001, "loss": 0.186, "step": 861 }, { "epoch": 1.4052820345614607, "grad_norm": 0.1363324522972107, "learning_rate": 0.0001, "loss": 0.2062, "step": 862 }, { "epoch": 1.4069122921421584, "grad_norm": 0.14044034481048584, "learning_rate": 0.0001, "loss": 0.2002, "step": 863 }, { "epoch": 1.4085425497228563, "grad_norm": 0.11587396264076233, "learning_rate": 0.0001, "loss": 0.2011, "step": 864 }, { "epoch": 1.410172807303554, "grad_norm": 0.12819702923297882, "learning_rate": 0.0001, "loss": 0.1917, "step": 865 }, { "epoch": 1.4118030648842517, "grad_norm": 0.11986611038446426, "learning_rate": 0.0001, "loss": 0.1796, "step": 866 }, { "epoch": 1.4134333224649493, "grad_norm": 0.12252791970968246, "learning_rate": 0.0001, "loss": 0.2095, "step": 867 }, { "epoch": 1.4150635800456473, "grad_norm": 0.12779594957828522, "learning_rate": 0.0001, "loss": 0.1798, "step": 868 }, { "epoch": 1.416693837626345, "grad_norm": 0.12142222374677658, "learning_rate": 0.0001, "loss": 0.1921, "step": 869 }, { "epoch": 1.4183240952070428, "grad_norm": 0.12713171541690826, "learning_rate": 0.0001, "loss": 0.2122, "step": 870 }, { "epoch": 1.4199543527877405, "grad_norm": 0.11739431321620941, "learning_rate": 0.0001, "loss": 0.1981, "step": 871 }, { "epoch": 1.4215846103684382, "grad_norm": 0.11351068317890167, "learning_rate": 0.0001, "loss": 0.19, "step": 872 }, { "epoch": 1.4232148679491359, "grad_norm": 0.12316130101680756, "learning_rate": 0.0001, "loss": 0.1886, "step": 873 }, { "epoch": 1.4248451255298338, "grad_norm": 0.11452283710241318, "learning_rate": 0.0001, "loss": 0.1902, "step": 874 }, { "epoch": 1.4264753831105315, "grad_norm": 0.12895885109901428, "learning_rate": 0.0001, "loss": 0.1974, "step": 875 }, { "epoch": 1.4281056406912291, "grad_norm": 0.11376369744539261, "learning_rate": 0.0001, "loss": 0.1972, "step": 876 }, { "epoch": 1.429735898271927, "grad_norm": 0.0964648500084877, "learning_rate": 0.0001, "loss": 0.1836, "step": 877 }, { "epoch": 1.4313661558526247, "grad_norm": 0.12030114978551865, "learning_rate": 0.0001, "loss": 0.2071, "step": 878 }, { "epoch": 1.4329964134333224, "grad_norm": 0.1270144283771515, "learning_rate": 0.0001, "loss": 0.1906, "step": 879 }, { "epoch": 1.43462667101402, "grad_norm": 0.11537966132164001, "learning_rate": 0.0001, "loss": 0.1902, "step": 880 }, { "epoch": 1.436256928594718, "grad_norm": 0.10482136160135269, "learning_rate": 0.0001, "loss": 0.1764, "step": 881 }, { "epoch": 1.4378871861754157, "grad_norm": 0.10979556292295456, "learning_rate": 0.0001, "loss": 0.1939, "step": 882 }, { "epoch": 1.4395174437561136, "grad_norm": 0.1133527085185051, "learning_rate": 0.0001, "loss": 0.1985, "step": 883 }, { "epoch": 1.4411477013368112, "grad_norm": 0.12253648787736893, "learning_rate": 0.0001, "loss": 0.1861, "step": 884 }, { "epoch": 1.442777958917509, "grad_norm": 0.1556876003742218, "learning_rate": 0.0001, "loss": 0.1907, "step": 885 }, { "epoch": 1.4444082164982066, "grad_norm": 0.13067957758903503, "learning_rate": 0.0001, "loss": 0.2008, "step": 886 }, { "epoch": 1.4460384740789045, "grad_norm": 0.13614705204963684, "learning_rate": 0.0001, "loss": 0.1905, "step": 887 }, { "epoch": 1.4476687316596022, "grad_norm": 0.13412481546401978, "learning_rate": 0.0001, "loss": 0.1951, "step": 888 }, { "epoch": 1.4492989892403, "grad_norm": 0.12189559638500214, "learning_rate": 0.0001, "loss": 0.2039, "step": 889 }, { "epoch": 1.4509292468209978, "grad_norm": 0.10976207256317139, "learning_rate": 0.0001, "loss": 0.1708, "step": 890 }, { "epoch": 1.4525595044016955, "grad_norm": 0.13595980405807495, "learning_rate": 0.0001, "loss": 0.1826, "step": 891 }, { "epoch": 1.4541897619823931, "grad_norm": 0.1207745224237442, "learning_rate": 0.0001, "loss": 0.1843, "step": 892 }, { "epoch": 1.455820019563091, "grad_norm": 0.12685881555080414, "learning_rate": 0.0001, "loss": 0.2092, "step": 893 }, { "epoch": 1.4574502771437887, "grad_norm": 0.1244385838508606, "learning_rate": 0.0001, "loss": 0.2037, "step": 894 }, { "epoch": 1.4590805347244864, "grad_norm": 0.11779557913541794, "learning_rate": 0.0001, "loss": 0.2037, "step": 895 }, { "epoch": 1.4607107923051843, "grad_norm": 0.13534864783287048, "learning_rate": 0.0001, "loss": 0.1915, "step": 896 }, { "epoch": 1.462341049885882, "grad_norm": 0.1300112009048462, "learning_rate": 0.0001, "loss": 0.204, "step": 897 }, { "epoch": 1.4639713074665797, "grad_norm": 0.11177927255630493, "learning_rate": 0.0001, "loss": 0.1899, "step": 898 }, { "epoch": 1.4656015650472773, "grad_norm": 0.12038631737232208, "learning_rate": 0.0001, "loss": 0.1749, "step": 899 }, { "epoch": 1.4672318226279752, "grad_norm": 0.1431179791688919, "learning_rate": 0.0001, "loss": 0.2004, "step": 900 }, { "epoch": 1.468862080208673, "grad_norm": 0.11475679278373718, "learning_rate": 0.0001, "loss": 0.1849, "step": 901 }, { "epoch": 1.4704923377893708, "grad_norm": 0.12329886853694916, "learning_rate": 0.0001, "loss": 0.191, "step": 902 }, { "epoch": 1.4721225953700685, "grad_norm": 0.12444540858268738, "learning_rate": 0.0001, "loss": 0.1914, "step": 903 }, { "epoch": 1.4737528529507662, "grad_norm": 0.1236625537276268, "learning_rate": 0.0001, "loss": 0.1744, "step": 904 }, { "epoch": 1.4753831105314639, "grad_norm": 0.11803305894136429, "learning_rate": 0.0001, "loss": 0.1939, "step": 905 }, { "epoch": 1.4770133681121618, "grad_norm": 0.11509499698877335, "learning_rate": 0.0001, "loss": 0.1774, "step": 906 }, { "epoch": 1.4786436256928595, "grad_norm": 0.13276603817939758, "learning_rate": 0.0001, "loss": 0.1932, "step": 907 }, { "epoch": 1.4802738832735574, "grad_norm": 0.14081943035125732, "learning_rate": 0.0001, "loss": 0.1793, "step": 908 }, { "epoch": 1.481904140854255, "grad_norm": 0.15057384967803955, "learning_rate": 0.0001, "loss": 0.1945, "step": 909 }, { "epoch": 1.4835343984349527, "grad_norm": 0.15684540569782257, "learning_rate": 0.0001, "loss": 0.2001, "step": 910 }, { "epoch": 1.4851646560156504, "grad_norm": 0.15486294031143188, "learning_rate": 0.0001, "loss": 0.1888, "step": 911 }, { "epoch": 1.4867949135963483, "grad_norm": 0.13155148923397064, "learning_rate": 0.0001, "loss": 0.1966, "step": 912 }, { "epoch": 1.488425171177046, "grad_norm": 0.11298996210098267, "learning_rate": 0.0001, "loss": 0.1944, "step": 913 }, { "epoch": 1.4900554287577437, "grad_norm": 0.1128094419836998, "learning_rate": 0.0001, "loss": 0.1948, "step": 914 }, { "epoch": 1.4916856863384416, "grad_norm": 0.12357712537050247, "learning_rate": 0.0001, "loss": 0.1822, "step": 915 }, { "epoch": 1.4933159439191392, "grad_norm": 0.10895108431577682, "learning_rate": 0.0001, "loss": 0.193, "step": 916 }, { "epoch": 1.494946201499837, "grad_norm": 0.10755927115678787, "learning_rate": 0.0001, "loss": 0.1935, "step": 917 }, { "epoch": 1.4965764590805346, "grad_norm": 0.17024989426136017, "learning_rate": 0.0001, "loss": 0.196, "step": 918 }, { "epoch": 1.4982067166612325, "grad_norm": 0.1299123913049698, "learning_rate": 0.0001, "loss": 0.2118, "step": 919 }, { "epoch": 1.4998369742419302, "grad_norm": 0.12259229272603989, "learning_rate": 0.0001, "loss": 0.2014, "step": 920 }, { "epoch": 1.501467231822628, "grad_norm": 0.12206512689590454, "learning_rate": 0.0001, "loss": 0.1906, "step": 921 }, { "epoch": 1.5030974894033258, "grad_norm": 0.10726633667945862, "learning_rate": 0.0001, "loss": 0.1752, "step": 922 }, { "epoch": 1.5047277469840235, "grad_norm": 0.14176525175571442, "learning_rate": 0.0001, "loss": 0.211, "step": 923 }, { "epoch": 1.5063580045647211, "grad_norm": 0.11247315257787704, "learning_rate": 0.0001, "loss": 0.1785, "step": 924 }, { "epoch": 1.5079882621454188, "grad_norm": 0.12529021501541138, "learning_rate": 0.0001, "loss": 0.1911, "step": 925 }, { "epoch": 1.5096185197261167, "grad_norm": 0.15964820981025696, "learning_rate": 0.0001, "loss": 0.1881, "step": 926 }, { "epoch": 1.5112487773068146, "grad_norm": 0.14119292795658112, "learning_rate": 0.0001, "loss": 0.1852, "step": 927 }, { "epoch": 1.5128790348875123, "grad_norm": 0.14880679547786713, "learning_rate": 0.0001, "loss": 0.2091, "step": 928 }, { "epoch": 1.51450929246821, "grad_norm": 0.11705358326435089, "learning_rate": 0.0001, "loss": 0.1885, "step": 929 }, { "epoch": 1.5161395500489077, "grad_norm": 0.12521898746490479, "learning_rate": 0.0001, "loss": 0.1932, "step": 930 }, { "epoch": 1.5177698076296053, "grad_norm": 0.1378403753042221, "learning_rate": 0.0001, "loss": 0.2104, "step": 931 }, { "epoch": 1.5194000652103032, "grad_norm": 0.10180335491895676, "learning_rate": 0.0001, "loss": 0.1844, "step": 932 }, { "epoch": 1.5210303227910011, "grad_norm": 0.12105914950370789, "learning_rate": 0.0001, "loss": 0.207, "step": 933 }, { "epoch": 1.5226605803716988, "grad_norm": 0.12631486356258392, "learning_rate": 0.0001, "loss": 0.2082, "step": 934 }, { "epoch": 1.5242908379523965, "grad_norm": 0.12594158947467804, "learning_rate": 0.0001, "loss": 0.1866, "step": 935 }, { "epoch": 1.5259210955330942, "grad_norm": 0.10104666650295258, "learning_rate": 0.0001, "loss": 0.1938, "step": 936 }, { "epoch": 1.5275513531137919, "grad_norm": 0.13341638445854187, "learning_rate": 0.0001, "loss": 0.2042, "step": 937 }, { "epoch": 1.5291816106944898, "grad_norm": 0.15109512209892273, "learning_rate": 0.0001, "loss": 0.1878, "step": 938 }, { "epoch": 1.5308118682751874, "grad_norm": 0.12633870542049408, "learning_rate": 0.0001, "loss": 0.21, "step": 939 }, { "epoch": 1.5324421258558854, "grad_norm": 0.1257101148366928, "learning_rate": 0.0001, "loss": 0.1948, "step": 940 }, { "epoch": 1.534072383436583, "grad_norm": 0.1757507622241974, "learning_rate": 0.0001, "loss": 0.2001, "step": 941 }, { "epoch": 1.5357026410172807, "grad_norm": 0.12703821063041687, "learning_rate": 0.0001, "loss": 0.1873, "step": 942 }, { "epoch": 1.5373328985979784, "grad_norm": 0.11847993731498718, "learning_rate": 0.0001, "loss": 0.1884, "step": 943 }, { "epoch": 1.538963156178676, "grad_norm": 0.12398132681846619, "learning_rate": 0.0001, "loss": 0.1769, "step": 944 }, { "epoch": 1.540593413759374, "grad_norm": 0.13036128878593445, "learning_rate": 0.0001, "loss": 0.1866, "step": 945 }, { "epoch": 1.5422236713400719, "grad_norm": 0.10606612265110016, "learning_rate": 0.0001, "loss": 0.1666, "step": 946 }, { "epoch": 1.5438539289207696, "grad_norm": 0.12699076533317566, "learning_rate": 0.0001, "loss": 0.1923, "step": 947 }, { "epoch": 1.5454841865014672, "grad_norm": 0.13484394550323486, "learning_rate": 0.0001, "loss": 0.191, "step": 948 }, { "epoch": 1.547114444082165, "grad_norm": 0.11915871500968933, "learning_rate": 0.0001, "loss": 0.1805, "step": 949 }, { "epoch": 1.5487447016628626, "grad_norm": 0.09939049929380417, "learning_rate": 0.0001, "loss": 0.188, "step": 950 }, { "epoch": 1.5503749592435605, "grad_norm": 0.11767233163118362, "learning_rate": 0.0001, "loss": 0.1904, "step": 951 }, { "epoch": 1.5520052168242582, "grad_norm": 0.12781041860580444, "learning_rate": 0.0001, "loss": 0.1905, "step": 952 }, { "epoch": 1.553635474404956, "grad_norm": 0.1346462368965149, "learning_rate": 0.0001, "loss": 0.1924, "step": 953 }, { "epoch": 1.5552657319856538, "grad_norm": 0.12352897226810455, "learning_rate": 0.0001, "loss": 0.1902, "step": 954 }, { "epoch": 1.5568959895663514, "grad_norm": 0.12448864430189133, "learning_rate": 0.0001, "loss": 0.2005, "step": 955 }, { "epoch": 1.5585262471470491, "grad_norm": 0.11041487753391266, "learning_rate": 0.0001, "loss": 0.1889, "step": 956 }, { "epoch": 1.560156504727747, "grad_norm": 0.11932788044214249, "learning_rate": 0.0001, "loss": 0.177, "step": 957 }, { "epoch": 1.5617867623084447, "grad_norm": 0.11703822016716003, "learning_rate": 0.0001, "loss": 0.1866, "step": 958 }, { "epoch": 1.5634170198891426, "grad_norm": 0.11386875063180923, "learning_rate": 0.0001, "loss": 0.1945, "step": 959 }, { "epoch": 1.5650472774698403, "grad_norm": 0.1266459971666336, "learning_rate": 0.0001, "loss": 0.1847, "step": 960 }, { "epoch": 1.566677535050538, "grad_norm": 0.10677938908338547, "learning_rate": 0.0001, "loss": 0.1955, "step": 961 }, { "epoch": 1.5683077926312357, "grad_norm": 0.13318541646003723, "learning_rate": 0.0001, "loss": 0.197, "step": 962 }, { "epoch": 1.5699380502119333, "grad_norm": 0.11351260542869568, "learning_rate": 0.0001, "loss": 0.1935, "step": 963 }, { "epoch": 1.5715683077926312, "grad_norm": 0.10484421998262405, "learning_rate": 0.0001, "loss": 0.1985, "step": 964 }, { "epoch": 1.5731985653733291, "grad_norm": 0.12113457173109055, "learning_rate": 0.0001, "loss": 0.1852, "step": 965 }, { "epoch": 1.5748288229540268, "grad_norm": 0.11888866871595383, "learning_rate": 0.0001, "loss": 0.1857, "step": 966 }, { "epoch": 1.5764590805347245, "grad_norm": 0.1369660496711731, "learning_rate": 0.0001, "loss": 0.2087, "step": 967 }, { "epoch": 1.5780893381154222, "grad_norm": 0.11224696040153503, "learning_rate": 0.0001, "loss": 0.1967, "step": 968 }, { "epoch": 1.5797195956961199, "grad_norm": 0.12822385132312775, "learning_rate": 0.0001, "loss": 0.1937, "step": 969 }, { "epoch": 1.5813498532768178, "grad_norm": 0.13289442658424377, "learning_rate": 0.0001, "loss": 0.2012, "step": 970 }, { "epoch": 1.5829801108575154, "grad_norm": 0.10120034962892532, "learning_rate": 0.0001, "loss": 0.1809, "step": 971 }, { "epoch": 1.5846103684382133, "grad_norm": 0.1271483451128006, "learning_rate": 0.0001, "loss": 0.1933, "step": 972 }, { "epoch": 1.586240626018911, "grad_norm": 0.12362337112426758, "learning_rate": 0.0001, "loss": 0.1888, "step": 973 }, { "epoch": 1.5878708835996087, "grad_norm": 0.12856562435626984, "learning_rate": 0.0001, "loss": 0.2038, "step": 974 }, { "epoch": 1.5895011411803064, "grad_norm": 0.10737952589988708, "learning_rate": 0.0001, "loss": 0.1892, "step": 975 }, { "epoch": 1.5911313987610043, "grad_norm": 0.1198209896683693, "learning_rate": 0.0001, "loss": 0.1995, "step": 976 }, { "epoch": 1.592761656341702, "grad_norm": 0.1190398558974266, "learning_rate": 0.0001, "loss": 0.1832, "step": 977 }, { "epoch": 1.5943919139223999, "grad_norm": 0.1241382509469986, "learning_rate": 0.0001, "loss": 0.1949, "step": 978 }, { "epoch": 1.5960221715030976, "grad_norm": 0.10942984372377396, "learning_rate": 0.0001, "loss": 0.187, "step": 979 }, { "epoch": 1.5976524290837952, "grad_norm": 0.16293250024318695, "learning_rate": 0.0001, "loss": 0.1952, "step": 980 }, { "epoch": 1.599282686664493, "grad_norm": 0.1106327474117279, "learning_rate": 0.0001, "loss": 0.1839, "step": 981 }, { "epoch": 1.6009129442451906, "grad_norm": 0.15175805985927582, "learning_rate": 0.0001, "loss": 0.206, "step": 982 }, { "epoch": 1.6025432018258885, "grad_norm": 0.13483884930610657, "learning_rate": 0.0001, "loss": 0.195, "step": 983 }, { "epoch": 1.6041734594065864, "grad_norm": 0.1230054423213005, "learning_rate": 0.0001, "loss": 0.1756, "step": 984 }, { "epoch": 1.605803716987284, "grad_norm": 0.12960274517536163, "learning_rate": 0.0001, "loss": 0.1969, "step": 985 }, { "epoch": 1.6074339745679818, "grad_norm": 0.12781794369220734, "learning_rate": 0.0001, "loss": 0.1961, "step": 986 }, { "epoch": 1.6090642321486794, "grad_norm": 0.12113626301288605, "learning_rate": 0.0001, "loss": 0.1854, "step": 987 }, { "epoch": 1.6106944897293771, "grad_norm": 0.10890072584152222, "learning_rate": 0.0001, "loss": 0.1975, "step": 988 }, { "epoch": 1.612324747310075, "grad_norm": 0.12309688329696655, "learning_rate": 0.0001, "loss": 0.1866, "step": 989 }, { "epoch": 1.6139550048907727, "grad_norm": 0.12740392982959747, "learning_rate": 0.0001, "loss": 0.1993, "step": 990 }, { "epoch": 1.6155852624714706, "grad_norm": 0.12641726434230804, "learning_rate": 0.0001, "loss": 0.2091, "step": 991 }, { "epoch": 1.6172155200521683, "grad_norm": 0.13405801355838776, "learning_rate": 0.0001, "loss": 0.2103, "step": 992 }, { "epoch": 1.618845777632866, "grad_norm": 0.10853724181652069, "learning_rate": 0.0001, "loss": 0.2, "step": 993 }, { "epoch": 1.6204760352135636, "grad_norm": 0.13972024619579315, "learning_rate": 0.0001, "loss": 0.1889, "step": 994 }, { "epoch": 1.6221062927942613, "grad_norm": 0.11462700366973877, "learning_rate": 0.0001, "loss": 0.1884, "step": 995 }, { "epoch": 1.6237365503749592, "grad_norm": 0.11051095277070999, "learning_rate": 0.0001, "loss": 0.1897, "step": 996 }, { "epoch": 1.6253668079556571, "grad_norm": 0.11667358130216599, "learning_rate": 0.0001, "loss": 0.1997, "step": 997 }, { "epoch": 1.6269970655363548, "grad_norm": 0.11986871063709259, "learning_rate": 0.0001, "loss": 0.1857, "step": 998 }, { "epoch": 1.6286273231170525, "grad_norm": 0.10935333371162415, "learning_rate": 0.0001, "loss": 0.2016, "step": 999 }, { "epoch": 1.6302575806977502, "grad_norm": 0.12035983800888062, "learning_rate": 0.0001, "loss": 0.2016, "step": 1000 }, { "epoch": 1.6302575806977502, "eval_loss": 0.2019711285829544, "eval_runtime": 461.1407, "eval_samples_per_second": 4.092, "eval_steps_per_second": 1.024, "step": 1000 }, { "epoch": 1.6318878382784479, "grad_norm": 0.165378138422966, "learning_rate": 0.0001, "loss": 0.1832, "step": 1001 }, { "epoch": 1.6335180958591458, "grad_norm": 0.12353488057851791, "learning_rate": 0.0001, "loss": 0.1881, "step": 1002 }, { "epoch": 1.6351483534398437, "grad_norm": 0.13291586935520172, "learning_rate": 0.0001, "loss": 0.1882, "step": 1003 }, { "epoch": 1.6367786110205413, "grad_norm": 0.1183796152472496, "learning_rate": 0.0001, "loss": 0.1861, "step": 1004 }, { "epoch": 1.638408868601239, "grad_norm": 0.14914032816886902, "learning_rate": 0.0001, "loss": 0.1938, "step": 1005 }, { "epoch": 1.6400391261819367, "grad_norm": 0.13322560489177704, "learning_rate": 0.0001, "loss": 0.193, "step": 1006 }, { "epoch": 1.6416693837626344, "grad_norm": 0.14066238701343536, "learning_rate": 0.0001, "loss": 0.2088, "step": 1007 }, { "epoch": 1.6432996413433323, "grad_norm": 0.11830136179924011, "learning_rate": 0.0001, "loss": 0.1775, "step": 1008 }, { "epoch": 1.64492989892403, "grad_norm": 0.11095772683620453, "learning_rate": 0.0001, "loss": 0.1888, "step": 1009 }, { "epoch": 1.6465601565047279, "grad_norm": 0.12288574874401093, "learning_rate": 0.0001, "loss": 0.1918, "step": 1010 }, { "epoch": 1.6481904140854255, "grad_norm": 0.11359456181526184, "learning_rate": 0.0001, "loss": 0.1736, "step": 1011 }, { "epoch": 1.6498206716661232, "grad_norm": 0.11324212700128555, "learning_rate": 0.0001, "loss": 0.1974, "step": 1012 }, { "epoch": 1.651450929246821, "grad_norm": 0.11443588137626648, "learning_rate": 0.0001, "loss": 0.2122, "step": 1013 }, { "epoch": 1.6530811868275186, "grad_norm": 0.14412713050842285, "learning_rate": 0.0001, "loss": 0.1851, "step": 1014 }, { "epoch": 1.6547114444082165, "grad_norm": 0.10094897449016571, "learning_rate": 0.0001, "loss": 0.1884, "step": 1015 }, { "epoch": 1.6563417019889144, "grad_norm": 0.12090327590703964, "learning_rate": 0.0001, "loss": 0.1856, "step": 1016 }, { "epoch": 1.657971959569612, "grad_norm": 0.13659027218818665, "learning_rate": 0.0001, "loss": 0.1977, "step": 1017 }, { "epoch": 1.6596022171503098, "grad_norm": 0.12800638377666473, "learning_rate": 0.0001, "loss": 0.1859, "step": 1018 }, { "epoch": 1.6612324747310074, "grad_norm": 0.11905695497989655, "learning_rate": 0.0001, "loss": 0.2036, "step": 1019 }, { "epoch": 1.6628627323117051, "grad_norm": 0.1223512515425682, "learning_rate": 0.0001, "loss": 0.1902, "step": 1020 }, { "epoch": 1.664492989892403, "grad_norm": 0.12449689954519272, "learning_rate": 0.0001, "loss": 0.1916, "step": 1021 }, { "epoch": 1.6661232474731007, "grad_norm": 0.11523964256048203, "learning_rate": 0.0001, "loss": 0.1986, "step": 1022 }, { "epoch": 1.6677535050537986, "grad_norm": 0.1144275814294815, "learning_rate": 0.0001, "loss": 0.1925, "step": 1023 }, { "epoch": 1.6693837626344963, "grad_norm": 0.11656132340431213, "learning_rate": 0.0001, "loss": 0.1766, "step": 1024 }, { "epoch": 1.671014020215194, "grad_norm": 0.10800933092832565, "learning_rate": 0.0001, "loss": 0.1867, "step": 1025 }, { "epoch": 1.6726442777958916, "grad_norm": 0.12445061653852463, "learning_rate": 0.0001, "loss": 0.201, "step": 1026 }, { "epoch": 1.6742745353765895, "grad_norm": 0.11647329479455948, "learning_rate": 0.0001, "loss": 0.1942, "step": 1027 }, { "epoch": 1.6759047929572872, "grad_norm": 0.11205076426267624, "learning_rate": 0.0001, "loss": 0.1746, "step": 1028 }, { "epoch": 1.6775350505379851, "grad_norm": 0.12594562768936157, "learning_rate": 0.0001, "loss": 0.1758, "step": 1029 }, { "epoch": 1.6791653081186828, "grad_norm": 0.13459977507591248, "learning_rate": 0.0001, "loss": 0.2025, "step": 1030 }, { "epoch": 1.6807955656993805, "grad_norm": 0.12494435161352158, "learning_rate": 0.0001, "loss": 0.2046, "step": 1031 }, { "epoch": 1.6824258232800782, "grad_norm": 0.11778414994478226, "learning_rate": 0.0001, "loss": 0.1946, "step": 1032 }, { "epoch": 1.6840560808607759, "grad_norm": 0.10984843969345093, "learning_rate": 0.0001, "loss": 0.1893, "step": 1033 }, { "epoch": 1.6856863384414738, "grad_norm": 0.6984158754348755, "learning_rate": 0.0001, "loss": 0.2081, "step": 1034 }, { "epoch": 1.6873165960221717, "grad_norm": 0.10825484246015549, "learning_rate": 0.0001, "loss": 0.1846, "step": 1035 }, { "epoch": 1.6889468536028693, "grad_norm": 0.1061546802520752, "learning_rate": 0.0001, "loss": 0.1866, "step": 1036 }, { "epoch": 1.690577111183567, "grad_norm": 0.11936734616756439, "learning_rate": 0.0001, "loss": 0.1968, "step": 1037 }, { "epoch": 1.6922073687642647, "grad_norm": 0.13179874420166016, "learning_rate": 0.0001, "loss": 0.1872, "step": 1038 }, { "epoch": 1.6938376263449624, "grad_norm": 0.10697422921657562, "learning_rate": 0.0001, "loss": 0.1811, "step": 1039 }, { "epoch": 1.6954678839256603, "grad_norm": 0.1438966691493988, "learning_rate": 0.0001, "loss": 0.1971, "step": 1040 }, { "epoch": 1.697098141506358, "grad_norm": 0.10862574726343155, "learning_rate": 0.0001, "loss": 0.1928, "step": 1041 }, { "epoch": 1.6987283990870559, "grad_norm": 0.10851150006055832, "learning_rate": 0.0001, "loss": 0.1918, "step": 1042 }, { "epoch": 1.7003586566677535, "grad_norm": 0.11880330741405487, "learning_rate": 0.0001, "loss": 0.1802, "step": 1043 }, { "epoch": 1.7019889142484512, "grad_norm": 0.12955494225025177, "learning_rate": 0.0001, "loss": 0.1981, "step": 1044 }, { "epoch": 1.703619171829149, "grad_norm": 0.1307879388332367, "learning_rate": 0.0001, "loss": 0.1968, "step": 1045 }, { "epoch": 1.7052494294098468, "grad_norm": 0.12933744490146637, "learning_rate": 0.0001, "loss": 0.1888, "step": 1046 }, { "epoch": 1.7068796869905445, "grad_norm": 0.11336127668619156, "learning_rate": 0.0001, "loss": 0.1807, "step": 1047 }, { "epoch": 1.7085099445712424, "grad_norm": 0.1487550288438797, "learning_rate": 0.0001, "loss": 0.201, "step": 1048 }, { "epoch": 1.71014020215194, "grad_norm": 0.1337575614452362, "learning_rate": 0.0001, "loss": 0.1934, "step": 1049 }, { "epoch": 1.7117704597326378, "grad_norm": 0.13716711103916168, "learning_rate": 0.0001, "loss": 0.1967, "step": 1050 }, { "epoch": 1.7134007173133354, "grad_norm": 0.14090760052204132, "learning_rate": 0.0001, "loss": 0.1993, "step": 1051 }, { "epoch": 1.7150309748940331, "grad_norm": 0.11731863021850586, "learning_rate": 0.0001, "loss": 0.1855, "step": 1052 }, { "epoch": 1.716661232474731, "grad_norm": 0.13154546916484833, "learning_rate": 0.0001, "loss": 0.1865, "step": 1053 }, { "epoch": 1.718291490055429, "grad_norm": 0.11974471062421799, "learning_rate": 0.0001, "loss": 0.1874, "step": 1054 }, { "epoch": 1.7199217476361266, "grad_norm": 0.11386546492576599, "learning_rate": 0.0001, "loss": 0.188, "step": 1055 }, { "epoch": 1.7215520052168243, "grad_norm": 0.09487878531217575, "learning_rate": 0.0001, "loss": 0.1891, "step": 1056 }, { "epoch": 1.723182262797522, "grad_norm": 0.13436681032180786, "learning_rate": 0.0001, "loss": 0.2091, "step": 1057 }, { "epoch": 1.7248125203782196, "grad_norm": 0.11488918960094452, "learning_rate": 0.0001, "loss": 0.1799, "step": 1058 }, { "epoch": 1.7264427779589175, "grad_norm": 0.1182805746793747, "learning_rate": 0.0001, "loss": 0.1904, "step": 1059 }, { "epoch": 1.7280730355396152, "grad_norm": 0.11342515796422958, "learning_rate": 0.0001, "loss": 0.1964, "step": 1060 }, { "epoch": 1.7297032931203131, "grad_norm": 0.14482362568378448, "learning_rate": 0.0001, "loss": 0.2023, "step": 1061 }, { "epoch": 1.7313335507010108, "grad_norm": 0.12669838964939117, "learning_rate": 0.0001, "loss": 0.1899, "step": 1062 }, { "epoch": 1.7329638082817085, "grad_norm": 0.12242146581411362, "learning_rate": 0.0001, "loss": 0.1797, "step": 1063 }, { "epoch": 1.7345940658624062, "grad_norm": 0.11942725628614426, "learning_rate": 0.0001, "loss": 0.1975, "step": 1064 }, { "epoch": 1.7362243234431038, "grad_norm": 0.14859893918037415, "learning_rate": 0.0001, "loss": 0.1817, "step": 1065 }, { "epoch": 1.7378545810238017, "grad_norm": 0.13270403444766998, "learning_rate": 0.0001, "loss": 0.1967, "step": 1066 }, { "epoch": 1.7394848386044997, "grad_norm": 0.13862530887126923, "learning_rate": 0.0001, "loss": 0.1901, "step": 1067 }, { "epoch": 1.7411150961851973, "grad_norm": 0.13995380699634552, "learning_rate": 0.0001, "loss": 0.1928, "step": 1068 }, { "epoch": 1.742745353765895, "grad_norm": 0.11337511986494064, "learning_rate": 0.0001, "loss": 0.1764, "step": 1069 }, { "epoch": 1.7443756113465927, "grad_norm": 0.12498489022254944, "learning_rate": 0.0001, "loss": 0.1957, "step": 1070 }, { "epoch": 1.7460058689272904, "grad_norm": 0.127224400639534, "learning_rate": 0.0001, "loss": 0.1939, "step": 1071 }, { "epoch": 1.7476361265079883, "grad_norm": 0.11792348325252533, "learning_rate": 0.0001, "loss": 0.1785, "step": 1072 }, { "epoch": 1.7492663840886862, "grad_norm": 0.10551140457391739, "learning_rate": 0.0001, "loss": 0.1702, "step": 1073 }, { "epoch": 1.7508966416693839, "grad_norm": 0.11569567769765854, "learning_rate": 0.0001, "loss": 0.1842, "step": 1074 }, { "epoch": 1.7525268992500815, "grad_norm": 0.12134891003370285, "learning_rate": 0.0001, "loss": 0.1904, "step": 1075 }, { "epoch": 1.7541571568307792, "grad_norm": 0.10986746102571487, "learning_rate": 0.0001, "loss": 0.1781, "step": 1076 }, { "epoch": 1.755787414411477, "grad_norm": 0.12267926335334778, "learning_rate": 0.0001, "loss": 0.185, "step": 1077 }, { "epoch": 1.7574176719921748, "grad_norm": 0.11747080087661743, "learning_rate": 0.0001, "loss": 0.1897, "step": 1078 }, { "epoch": 1.7590479295728725, "grad_norm": 0.12816251814365387, "learning_rate": 0.0001, "loss": 0.1737, "step": 1079 }, { "epoch": 1.7606781871535704, "grad_norm": 0.11122124642133713, "learning_rate": 0.0001, "loss": 0.173, "step": 1080 }, { "epoch": 1.762308444734268, "grad_norm": 0.18602591753005981, "learning_rate": 0.0001, "loss": 0.1831, "step": 1081 }, { "epoch": 1.7639387023149657, "grad_norm": 0.11141426861286163, "learning_rate": 0.0001, "loss": 0.1743, "step": 1082 }, { "epoch": 1.7655689598956634, "grad_norm": 0.13159684836864471, "learning_rate": 0.0001, "loss": 0.2036, "step": 1083 }, { "epoch": 1.767199217476361, "grad_norm": 0.12319090962409973, "learning_rate": 0.0001, "loss": 0.1964, "step": 1084 }, { "epoch": 1.768829475057059, "grad_norm": 0.12106230854988098, "learning_rate": 0.0001, "loss": 0.1945, "step": 1085 }, { "epoch": 1.770459732637757, "grad_norm": 0.10471806675195694, "learning_rate": 0.0001, "loss": 0.19, "step": 1086 }, { "epoch": 1.7720899902184546, "grad_norm": 0.12956561148166656, "learning_rate": 0.0001, "loss": 0.1873, "step": 1087 }, { "epoch": 1.7737202477991523, "grad_norm": 0.12020842730998993, "learning_rate": 0.0001, "loss": 0.1887, "step": 1088 }, { "epoch": 1.77535050537985, "grad_norm": 0.12040827423334122, "learning_rate": 0.0001, "loss": 0.194, "step": 1089 }, { "epoch": 1.7769807629605476, "grad_norm": 0.11612360924482346, "learning_rate": 0.0001, "loss": 0.1844, "step": 1090 }, { "epoch": 1.7786110205412455, "grad_norm": 0.11711908876895905, "learning_rate": 0.0001, "loss": 0.1898, "step": 1091 }, { "epoch": 1.7802412781219432, "grad_norm": 0.13765960931777954, "learning_rate": 0.0001, "loss": 0.1974, "step": 1092 }, { "epoch": 1.7818715357026411, "grad_norm": 0.12794670462608337, "learning_rate": 0.0001, "loss": 0.1918, "step": 1093 }, { "epoch": 1.7835017932833388, "grad_norm": 0.12181553989648819, "learning_rate": 0.0001, "loss": 0.1964, "step": 1094 }, { "epoch": 1.7851320508640365, "grad_norm": 0.12774871289730072, "learning_rate": 0.0001, "loss": 0.2011, "step": 1095 }, { "epoch": 1.7867623084447342, "grad_norm": 0.11980225145816803, "learning_rate": 0.0001, "loss": 0.1861, "step": 1096 }, { "epoch": 1.788392566025432, "grad_norm": 0.1250666081905365, "learning_rate": 0.0001, "loss": 0.1937, "step": 1097 }, { "epoch": 1.7900228236061297, "grad_norm": 0.13532552123069763, "learning_rate": 0.0001, "loss": 0.1936, "step": 1098 }, { "epoch": 1.7916530811868276, "grad_norm": 0.1301998496055603, "learning_rate": 0.0001, "loss": 0.1826, "step": 1099 }, { "epoch": 1.7932833387675253, "grad_norm": 0.12082832306623459, "learning_rate": 0.0001, "loss": 0.1864, "step": 1100 }, { "epoch": 1.794913596348223, "grad_norm": 0.12959355115890503, "learning_rate": 0.0001, "loss": 0.1964, "step": 1101 }, { "epoch": 1.7965438539289207, "grad_norm": 0.1250586211681366, "learning_rate": 0.0001, "loss": 0.2007, "step": 1102 }, { "epoch": 1.7981741115096184, "grad_norm": 0.1253538429737091, "learning_rate": 0.0001, "loss": 0.1903, "step": 1103 }, { "epoch": 1.7998043690903163, "grad_norm": 0.09879365563392639, "learning_rate": 0.0001, "loss": 0.1746, "step": 1104 }, { "epoch": 1.8014346266710142, "grad_norm": 0.10257220268249512, "learning_rate": 0.0001, "loss": 0.1745, "step": 1105 }, { "epoch": 1.8030648842517119, "grad_norm": 0.10447902977466583, "learning_rate": 0.0001, "loss": 0.19, "step": 1106 }, { "epoch": 1.8046951418324095, "grad_norm": 0.10268338769674301, "learning_rate": 0.0001, "loss": 0.188, "step": 1107 }, { "epoch": 1.8063253994131072, "grad_norm": 0.12003128230571747, "learning_rate": 0.0001, "loss": 0.1872, "step": 1108 }, { "epoch": 1.807955656993805, "grad_norm": 0.10243313759565353, "learning_rate": 0.0001, "loss": 0.1698, "step": 1109 }, { "epoch": 1.8095859145745028, "grad_norm": 0.11695826798677444, "learning_rate": 0.0001, "loss": 0.1812, "step": 1110 }, { "epoch": 1.8112161721552005, "grad_norm": 0.10703260451555252, "learning_rate": 0.0001, "loss": 0.1811, "step": 1111 }, { "epoch": 1.8128464297358984, "grad_norm": 0.13467635214328766, "learning_rate": 0.0001, "loss": 0.1881, "step": 1112 }, { "epoch": 1.814476687316596, "grad_norm": 0.11541393399238586, "learning_rate": 0.0001, "loss": 0.192, "step": 1113 }, { "epoch": 1.8161069448972937, "grad_norm": 0.1386149823665619, "learning_rate": 0.0001, "loss": 0.1798, "step": 1114 }, { "epoch": 1.8177372024779914, "grad_norm": 0.12145870178937912, "learning_rate": 0.0001, "loss": 0.1962, "step": 1115 }, { "epoch": 1.8193674600586893, "grad_norm": 0.10783080011606216, "learning_rate": 0.0001, "loss": 0.171, "step": 1116 }, { "epoch": 1.820997717639387, "grad_norm": 0.10322029888629913, "learning_rate": 0.0001, "loss": 0.1836, "step": 1117 }, { "epoch": 1.822627975220085, "grad_norm": 0.11130011826753616, "learning_rate": 0.0001, "loss": 0.1765, "step": 1118 }, { "epoch": 1.8242582328007826, "grad_norm": 0.09618648141622543, "learning_rate": 0.0001, "loss": 0.1716, "step": 1119 }, { "epoch": 1.8258884903814803, "grad_norm": 0.11504556983709335, "learning_rate": 0.0001, "loss": 0.1936, "step": 1120 }, { "epoch": 1.827518747962178, "grad_norm": 0.11279092729091644, "learning_rate": 0.0001, "loss": 0.1879, "step": 1121 }, { "epoch": 1.8291490055428756, "grad_norm": 0.14116674661636353, "learning_rate": 0.0001, "loss": 0.1906, "step": 1122 }, { "epoch": 1.8307792631235735, "grad_norm": 0.11786951869726181, "learning_rate": 0.0001, "loss": 0.1874, "step": 1123 }, { "epoch": 1.8324095207042714, "grad_norm": 0.10708369314670563, "learning_rate": 0.0001, "loss": 0.195, "step": 1124 }, { "epoch": 1.8340397782849691, "grad_norm": 0.18581970036029816, "learning_rate": 0.0001, "loss": 0.182, "step": 1125 }, { "epoch": 1.8356700358656668, "grad_norm": 0.11519227176904678, "learning_rate": 0.0001, "loss": 0.1799, "step": 1126 }, { "epoch": 1.8373002934463645, "grad_norm": 0.11876104772090912, "learning_rate": 0.0001, "loss": 0.1799, "step": 1127 }, { "epoch": 1.8389305510270622, "grad_norm": 0.10485244542360306, "learning_rate": 0.0001, "loss": 0.1813, "step": 1128 }, { "epoch": 1.84056080860776, "grad_norm": 0.12142175436019897, "learning_rate": 0.0001, "loss": 0.1902, "step": 1129 }, { "epoch": 1.8421910661884577, "grad_norm": 0.12452993541955948, "learning_rate": 0.0001, "loss": 0.1889, "step": 1130 }, { "epoch": 1.8438213237691556, "grad_norm": 0.12156898528337479, "learning_rate": 0.0001, "loss": 0.1854, "step": 1131 }, { "epoch": 1.8454515813498533, "grad_norm": 0.12071850895881653, "learning_rate": 0.0001, "loss": 0.1949, "step": 1132 }, { "epoch": 1.847081838930551, "grad_norm": 0.12315471470355988, "learning_rate": 0.0001, "loss": 0.1965, "step": 1133 }, { "epoch": 1.8487120965112487, "grad_norm": 0.11038068681955338, "learning_rate": 0.0001, "loss": 0.1868, "step": 1134 }, { "epoch": 1.8503423540919464, "grad_norm": 0.10914380848407745, "learning_rate": 0.0001, "loss": 0.2023, "step": 1135 }, { "epoch": 1.8519726116726443, "grad_norm": 0.12893681228160858, "learning_rate": 0.0001, "loss": 0.1774, "step": 1136 }, { "epoch": 1.8536028692533422, "grad_norm": 0.10367900878190994, "learning_rate": 0.0001, "loss": 0.1905, "step": 1137 }, { "epoch": 1.8552331268340398, "grad_norm": 0.11435539275407791, "learning_rate": 0.0001, "loss": 0.1843, "step": 1138 }, { "epoch": 1.8568633844147375, "grad_norm": 0.11469294130802155, "learning_rate": 0.0001, "loss": 0.1936, "step": 1139 }, { "epoch": 1.8584936419954352, "grad_norm": 0.1339050680398941, "learning_rate": 0.0001, "loss": 0.193, "step": 1140 }, { "epoch": 1.8601238995761329, "grad_norm": 0.10412506759166718, "learning_rate": 0.0001, "loss": 0.1824, "step": 1141 }, { "epoch": 1.8617541571568308, "grad_norm": 0.10916057229042053, "learning_rate": 0.0001, "loss": 0.1879, "step": 1142 }, { "epoch": 1.8633844147375287, "grad_norm": 0.11035683751106262, "learning_rate": 0.0001, "loss": 0.1862, "step": 1143 }, { "epoch": 1.8650146723182264, "grad_norm": 0.13064561784267426, "learning_rate": 0.0001, "loss": 0.1969, "step": 1144 }, { "epoch": 1.866644929898924, "grad_norm": 0.11412317305803299, "learning_rate": 0.0001, "loss": 0.1915, "step": 1145 }, { "epoch": 1.8682751874796217, "grad_norm": 0.11612699925899506, "learning_rate": 0.0001, "loss": 0.2052, "step": 1146 }, { "epoch": 1.8699054450603194, "grad_norm": 0.1117459088563919, "learning_rate": 0.0001, "loss": 0.1743, "step": 1147 }, { "epoch": 1.8715357026410173, "grad_norm": 0.1192610114812851, "learning_rate": 0.0001, "loss": 0.1743, "step": 1148 }, { "epoch": 1.873165960221715, "grad_norm": 0.12196903675794601, "learning_rate": 0.0001, "loss": 0.2023, "step": 1149 }, { "epoch": 1.874796217802413, "grad_norm": 0.11657217144966125, "learning_rate": 0.0001, "loss": 0.1713, "step": 1150 }, { "epoch": 1.8764264753831106, "grad_norm": 0.14465437829494476, "learning_rate": 0.0001, "loss": 0.1914, "step": 1151 }, { "epoch": 1.8780567329638083, "grad_norm": 0.11741483211517334, "learning_rate": 0.0001, "loss": 0.1881, "step": 1152 }, { "epoch": 1.879686990544506, "grad_norm": 0.12043508142232895, "learning_rate": 0.0001, "loss": 0.1929, "step": 1153 }, { "epoch": 1.8813172481252036, "grad_norm": 0.14909182488918304, "learning_rate": 0.0001, "loss": 0.2239, "step": 1154 }, { "epoch": 1.8829475057059015, "grad_norm": 0.12423070520162582, "learning_rate": 0.0001, "loss": 0.1812, "step": 1155 }, { "epoch": 1.8845777632865994, "grad_norm": 0.11938756704330444, "learning_rate": 0.0001, "loss": 0.1976, "step": 1156 }, { "epoch": 1.886208020867297, "grad_norm": 0.10287628322839737, "learning_rate": 0.0001, "loss": 0.1921, "step": 1157 }, { "epoch": 1.8878382784479948, "grad_norm": 0.09794101864099503, "learning_rate": 0.0001, "loss": 0.1767, "step": 1158 }, { "epoch": 1.8894685360286925, "grad_norm": 0.12463726103305817, "learning_rate": 0.0001, "loss": 0.1909, "step": 1159 }, { "epoch": 1.8910987936093901, "grad_norm": 0.11366378515958786, "learning_rate": 0.0001, "loss": 0.1954, "step": 1160 }, { "epoch": 1.892729051190088, "grad_norm": 0.1246451884508133, "learning_rate": 0.0001, "loss": 0.1932, "step": 1161 }, { "epoch": 1.8943593087707857, "grad_norm": 0.13116668164730072, "learning_rate": 0.0001, "loss": 0.1821, "step": 1162 }, { "epoch": 1.8959895663514836, "grad_norm": 0.104774110019207, "learning_rate": 0.0001, "loss": 0.1801, "step": 1163 }, { "epoch": 1.8976198239321813, "grad_norm": 0.10643444210290909, "learning_rate": 0.0001, "loss": 0.1718, "step": 1164 }, { "epoch": 1.899250081512879, "grad_norm": 0.11924157291650772, "learning_rate": 0.0001, "loss": 0.1747, "step": 1165 }, { "epoch": 1.9008803390935767, "grad_norm": 0.1133570596575737, "learning_rate": 0.0001, "loss": 0.1831, "step": 1166 }, { "epoch": 1.9025105966742746, "grad_norm": 0.11677032709121704, "learning_rate": 0.0001, "loss": 0.1876, "step": 1167 }, { "epoch": 1.9041408542549723, "grad_norm": 0.11180409044027328, "learning_rate": 0.0001, "loss": 0.1787, "step": 1168 }, { "epoch": 1.9057711118356702, "grad_norm": 0.10403387248516083, "learning_rate": 0.0001, "loss": 0.1816, "step": 1169 }, { "epoch": 1.9074013694163678, "grad_norm": 0.1255159080028534, "learning_rate": 0.0001, "loss": 0.2094, "step": 1170 }, { "epoch": 1.9090316269970655, "grad_norm": 0.1264365166425705, "learning_rate": 0.0001, "loss": 0.1873, "step": 1171 }, { "epoch": 1.9106618845777632, "grad_norm": 0.12751294672489166, "learning_rate": 0.0001, "loss": 0.2049, "step": 1172 }, { "epoch": 1.9122921421584609, "grad_norm": 0.10625582933425903, "learning_rate": 0.0001, "loss": 0.1731, "step": 1173 }, { "epoch": 1.9139223997391588, "grad_norm": 0.11829791963100433, "learning_rate": 0.0001, "loss": 0.1863, "step": 1174 }, { "epoch": 1.9155526573198567, "grad_norm": 0.11165602505207062, "learning_rate": 0.0001, "loss": 0.1838, "step": 1175 }, { "epoch": 1.9171829149005544, "grad_norm": 0.14051920175552368, "learning_rate": 0.0001, "loss": 0.2053, "step": 1176 }, { "epoch": 1.918813172481252, "grad_norm": 0.11515091359615326, "learning_rate": 0.0001, "loss": 0.1793, "step": 1177 }, { "epoch": 1.9204434300619497, "grad_norm": 0.14358852803707123, "learning_rate": 0.0001, "loss": 0.1786, "step": 1178 }, { "epoch": 1.9220736876426474, "grad_norm": 0.1013508066534996, "learning_rate": 0.0001, "loss": 0.1724, "step": 1179 }, { "epoch": 1.9237039452233453, "grad_norm": 0.12144607305526733, "learning_rate": 0.0001, "loss": 0.1867, "step": 1180 }, { "epoch": 1.925334202804043, "grad_norm": 0.1413160115480423, "learning_rate": 0.0001, "loss": 0.1849, "step": 1181 }, { "epoch": 1.926964460384741, "grad_norm": 0.14951057732105255, "learning_rate": 0.0001, "loss": 0.204, "step": 1182 }, { "epoch": 1.9285947179654386, "grad_norm": 0.11757660657167435, "learning_rate": 0.0001, "loss": 0.1837, "step": 1183 }, { "epoch": 1.9302249755461363, "grad_norm": 0.1191357746720314, "learning_rate": 0.0001, "loss": 0.1951, "step": 1184 }, { "epoch": 1.931855233126834, "grad_norm": 0.12074853479862213, "learning_rate": 0.0001, "loss": 0.1815, "step": 1185 }, { "epoch": 1.9334854907075318, "grad_norm": 0.11659263074398041, "learning_rate": 0.0001, "loss": 0.1773, "step": 1186 }, { "epoch": 1.9351157482882295, "grad_norm": 0.10916323959827423, "learning_rate": 0.0001, "loss": 0.1749, "step": 1187 }, { "epoch": 1.9367460058689274, "grad_norm": 0.10679040104150772, "learning_rate": 0.0001, "loss": 0.1731, "step": 1188 }, { "epoch": 1.938376263449625, "grad_norm": 0.13911648094654083, "learning_rate": 0.0001, "loss": 0.1896, "step": 1189 }, { "epoch": 1.9400065210303228, "grad_norm": 0.09244771301746368, "learning_rate": 0.0001, "loss": 0.1823, "step": 1190 }, { "epoch": 1.9416367786110205, "grad_norm": 0.14823158085346222, "learning_rate": 0.0001, "loss": 0.1907, "step": 1191 }, { "epoch": 1.9432670361917181, "grad_norm": 0.1267862170934677, "learning_rate": 0.0001, "loss": 0.1908, "step": 1192 }, { "epoch": 1.944897293772416, "grad_norm": 0.12872754037380219, "learning_rate": 0.0001, "loss": 0.1937, "step": 1193 }, { "epoch": 1.946527551353114, "grad_norm": 0.15068793296813965, "learning_rate": 0.0001, "loss": 0.1919, "step": 1194 }, { "epoch": 1.9481578089338116, "grad_norm": 0.1112789437174797, "learning_rate": 0.0001, "loss": 0.1941, "step": 1195 }, { "epoch": 1.9497880665145093, "grad_norm": 0.12267830222845078, "learning_rate": 0.0001, "loss": 0.1812, "step": 1196 }, { "epoch": 1.951418324095207, "grad_norm": 0.13730347156524658, "learning_rate": 0.0001, "loss": 0.1858, "step": 1197 }, { "epoch": 1.9530485816759047, "grad_norm": 0.11384674906730652, "learning_rate": 0.0001, "loss": 0.1929, "step": 1198 }, { "epoch": 1.9546788392566026, "grad_norm": 0.15709474682807922, "learning_rate": 0.0001, "loss": 0.178, "step": 1199 }, { "epoch": 1.9563090968373003, "grad_norm": 0.15006817877292633, "learning_rate": 0.0001, "loss": 0.1886, "step": 1200 }, { "epoch": 1.9579393544179982, "grad_norm": 0.11046171188354492, "learning_rate": 0.0001, "loss": 0.1863, "step": 1201 }, { "epoch": 1.9595696119986958, "grad_norm": 0.10871347039937973, "learning_rate": 0.0001, "loss": 0.194, "step": 1202 }, { "epoch": 1.9611998695793935, "grad_norm": 0.12221578508615494, "learning_rate": 0.0001, "loss": 0.1885, "step": 1203 }, { "epoch": 1.9628301271600912, "grad_norm": 0.11434303224086761, "learning_rate": 0.0001, "loss": 0.1964, "step": 1204 }, { "epoch": 1.9644603847407889, "grad_norm": 0.12363463640213013, "learning_rate": 0.0001, "loss": 0.1999, "step": 1205 }, { "epoch": 1.9660906423214868, "grad_norm": 0.1334676444530487, "learning_rate": 0.0001, "loss": 0.185, "step": 1206 }, { "epoch": 1.9677208999021847, "grad_norm": 0.1140584945678711, "learning_rate": 0.0001, "loss": 0.1865, "step": 1207 }, { "epoch": 1.9693511574828824, "grad_norm": 0.11625649034976959, "learning_rate": 0.0001, "loss": 0.212, "step": 1208 }, { "epoch": 1.97098141506358, "grad_norm": 0.10492118448019028, "learning_rate": 0.0001, "loss": 0.1825, "step": 1209 }, { "epoch": 1.9726116726442777, "grad_norm": 0.11723576486110687, "learning_rate": 0.0001, "loss": 0.1995, "step": 1210 }, { "epoch": 1.9742419302249754, "grad_norm": 0.10941202193498611, "learning_rate": 0.0001, "loss": 0.1745, "step": 1211 }, { "epoch": 1.9758721878056733, "grad_norm": 0.11242581903934479, "learning_rate": 0.0001, "loss": 0.19, "step": 1212 }, { "epoch": 1.9775024453863712, "grad_norm": 0.1318778693675995, "learning_rate": 0.0001, "loss": 0.1958, "step": 1213 }, { "epoch": 1.979132702967069, "grad_norm": 0.12168639153242111, "learning_rate": 0.0001, "loss": 0.1882, "step": 1214 }, { "epoch": 1.9807629605477666, "grad_norm": 0.1154153048992157, "learning_rate": 0.0001, "loss": 0.2033, "step": 1215 }, { "epoch": 1.9823932181284643, "grad_norm": 0.12487414479255676, "learning_rate": 0.0001, "loss": 0.1739, "step": 1216 }, { "epoch": 1.984023475709162, "grad_norm": 0.14309844374656677, "learning_rate": 0.0001, "loss": 0.199, "step": 1217 }, { "epoch": 1.9856537332898598, "grad_norm": 0.127132847905159, "learning_rate": 0.0001, "loss": 0.1799, "step": 1218 }, { "epoch": 1.9872839908705575, "grad_norm": 0.1264181286096573, "learning_rate": 0.0001, "loss": 0.1764, "step": 1219 }, { "epoch": 1.9889142484512554, "grad_norm": 0.12706303596496582, "learning_rate": 0.0001, "loss": 0.1789, "step": 1220 }, { "epoch": 1.990544506031953, "grad_norm": 0.12314368784427643, "learning_rate": 0.0001, "loss": 0.1945, "step": 1221 }, { "epoch": 1.9921747636126508, "grad_norm": 0.11701434850692749, "learning_rate": 0.0001, "loss": 0.2002, "step": 1222 }, { "epoch": 1.9938050211933485, "grad_norm": 0.1571284830570221, "learning_rate": 0.0001, "loss": 0.1791, "step": 1223 }, { "epoch": 1.9954352787740461, "grad_norm": 0.12066090106964111, "learning_rate": 0.0001, "loss": 0.179, "step": 1224 }, { "epoch": 1.997065536354744, "grad_norm": 0.1215023398399353, "learning_rate": 0.0001, "loss": 0.2005, "step": 1225 }, { "epoch": 1.998695793935442, "grad_norm": 0.11120277643203735, "learning_rate": 0.0001, "loss": 0.1825, "step": 1226 }, { "epoch": 2.0003260515161396, "grad_norm": 0.14585649967193604, "learning_rate": 0.0001, "loss": 0.1874, "step": 1227 }, { "epoch": 2.0019563090968373, "grad_norm": 0.10910741984844208, "learning_rate": 0.0001, "loss": 0.1836, "step": 1228 }, { "epoch": 2.003586566677535, "grad_norm": 0.1540917009115219, "learning_rate": 0.0001, "loss": 0.1783, "step": 1229 }, { "epoch": 2.0052168242582327, "grad_norm": 0.12682808935642242, "learning_rate": 0.0001, "loss": 0.1778, "step": 1230 }, { "epoch": 2.0068470818389303, "grad_norm": 0.12131527066230774, "learning_rate": 0.0001, "loss": 0.1861, "step": 1231 }, { "epoch": 2.0084773394196285, "grad_norm": 0.13246504962444305, "learning_rate": 0.0001, "loss": 0.1748, "step": 1232 }, { "epoch": 2.010107597000326, "grad_norm": 0.13079266250133514, "learning_rate": 0.0001, "loss": 0.174, "step": 1233 }, { "epoch": 2.011737854581024, "grad_norm": 0.13317003846168518, "learning_rate": 0.0001, "loss": 0.1713, "step": 1234 }, { "epoch": 2.0133681121617215, "grad_norm": 0.15676195919513702, "learning_rate": 0.0001, "loss": 0.1738, "step": 1235 }, { "epoch": 2.014998369742419, "grad_norm": 0.12478923052549362, "learning_rate": 0.0001, "loss": 0.1747, "step": 1236 }, { "epoch": 2.016628627323117, "grad_norm": 0.15440379083156586, "learning_rate": 0.0001, "loss": 0.1951, "step": 1237 }, { "epoch": 2.018258884903815, "grad_norm": 0.13177958130836487, "learning_rate": 0.0001, "loss": 0.1666, "step": 1238 }, { "epoch": 2.0198891424845127, "grad_norm": 0.1117987260222435, "learning_rate": 0.0001, "loss": 0.1666, "step": 1239 }, { "epoch": 2.0215194000652104, "grad_norm": 0.15712931752204895, "learning_rate": 0.0001, "loss": 0.1824, "step": 1240 }, { "epoch": 2.023149657645908, "grad_norm": 0.11659948527812958, "learning_rate": 0.0001, "loss": 0.1689, "step": 1241 }, { "epoch": 2.0247799152266057, "grad_norm": 0.08816540241241455, "learning_rate": 0.0001, "loss": 0.1647, "step": 1242 }, { "epoch": 2.0264101728073034, "grad_norm": 0.13327382504940033, "learning_rate": 0.0001, "loss": 0.1814, "step": 1243 }, { "epoch": 2.0280404303880015, "grad_norm": 0.11885062605142593, "learning_rate": 0.0001, "loss": 0.1717, "step": 1244 }, { "epoch": 2.029670687968699, "grad_norm": 0.15437649190425873, "learning_rate": 0.0001, "loss": 0.1726, "step": 1245 }, { "epoch": 2.031300945549397, "grad_norm": 0.13453203439712524, "learning_rate": 0.0001, "loss": 0.1823, "step": 1246 }, { "epoch": 2.0329312031300946, "grad_norm": 0.12537498772144318, "learning_rate": 0.0001, "loss": 0.1842, "step": 1247 }, { "epoch": 2.0345614607107922, "grad_norm": 0.12894268333911896, "learning_rate": 0.0001, "loss": 0.1654, "step": 1248 }, { "epoch": 2.03619171829149, "grad_norm": 0.1385597288608551, "learning_rate": 0.0001, "loss": 0.1763, "step": 1249 }, { "epoch": 2.0378219758721876, "grad_norm": 0.15800078213214874, "learning_rate": 0.0001, "loss": 0.1856, "step": 1250 }, { "epoch": 2.0394522334528857, "grad_norm": 0.09816145896911621, "learning_rate": 0.0001, "loss": 0.1565, "step": 1251 }, { "epoch": 2.0410824910335834, "grad_norm": 0.1402331292629242, "learning_rate": 0.0001, "loss": 0.1753, "step": 1252 }, { "epoch": 2.042712748614281, "grad_norm": 0.13870537281036377, "learning_rate": 0.0001, "loss": 0.1861, "step": 1253 }, { "epoch": 2.0443430061949788, "grad_norm": 0.1497325599193573, "learning_rate": 0.0001, "loss": 0.1896, "step": 1254 }, { "epoch": 2.0459732637756765, "grad_norm": 0.1422789841890335, "learning_rate": 0.0001, "loss": 0.1776, "step": 1255 }, { "epoch": 2.047603521356374, "grad_norm": 0.12478403002023697, "learning_rate": 0.0001, "loss": 0.1802, "step": 1256 }, { "epoch": 2.0492337789370723, "grad_norm": 0.12358732521533966, "learning_rate": 0.0001, "loss": 0.1854, "step": 1257 }, { "epoch": 2.05086403651777, "grad_norm": 0.11842041462659836, "learning_rate": 0.0001, "loss": 0.1708, "step": 1258 }, { "epoch": 2.0524942940984676, "grad_norm": 0.11563142389059067, "learning_rate": 0.0001, "loss": 0.1622, "step": 1259 }, { "epoch": 2.0541245516791653, "grad_norm": 0.12804879248142242, "learning_rate": 0.0001, "loss": 0.1718, "step": 1260 }, { "epoch": 2.055754809259863, "grad_norm": 0.13581717014312744, "learning_rate": 0.0001, "loss": 0.1779, "step": 1261 }, { "epoch": 2.0573850668405607, "grad_norm": 0.1498471200466156, "learning_rate": 0.0001, "loss": 0.1843, "step": 1262 }, { "epoch": 2.0590153244212583, "grad_norm": 0.11999404430389404, "learning_rate": 0.0001, "loss": 0.1581, "step": 1263 }, { "epoch": 2.0606455820019565, "grad_norm": 0.15887455642223358, "learning_rate": 0.0001, "loss": 0.1803, "step": 1264 }, { "epoch": 2.062275839582654, "grad_norm": 0.139693021774292, "learning_rate": 0.0001, "loss": 0.1636, "step": 1265 }, { "epoch": 2.063906097163352, "grad_norm": 0.15679016709327698, "learning_rate": 0.0001, "loss": 0.1664, "step": 1266 }, { "epoch": 2.0655363547440495, "grad_norm": 0.1563042551279068, "learning_rate": 0.0001, "loss": 0.2025, "step": 1267 }, { "epoch": 2.067166612324747, "grad_norm": 0.13360831141471863, "learning_rate": 0.0001, "loss": 0.2039, "step": 1268 }, { "epoch": 2.068796869905445, "grad_norm": 0.123869888484478, "learning_rate": 0.0001, "loss": 0.169, "step": 1269 }, { "epoch": 2.070427127486143, "grad_norm": 0.1273512840270996, "learning_rate": 0.0001, "loss": 0.1652, "step": 1270 }, { "epoch": 2.0720573850668407, "grad_norm": 0.1433614194393158, "learning_rate": 0.0001, "loss": 0.167, "step": 1271 }, { "epoch": 2.0736876426475384, "grad_norm": 0.1275768131017685, "learning_rate": 0.0001, "loss": 0.1742, "step": 1272 }, { "epoch": 2.075317900228236, "grad_norm": 0.113472118973732, "learning_rate": 0.0001, "loss": 0.1689, "step": 1273 }, { "epoch": 2.0769481578089337, "grad_norm": 0.14065606892108917, "learning_rate": 0.0001, "loss": 0.1852, "step": 1274 }, { "epoch": 2.0785784153896314, "grad_norm": 0.10859358310699463, "learning_rate": 0.0001, "loss": 0.1692, "step": 1275 }, { "epoch": 2.0802086729703295, "grad_norm": 0.12903261184692383, "learning_rate": 0.0001, "loss": 0.1571, "step": 1276 }, { "epoch": 2.081838930551027, "grad_norm": 0.1369621455669403, "learning_rate": 0.0001, "loss": 0.1946, "step": 1277 }, { "epoch": 2.083469188131725, "grad_norm": 0.13974396884441376, "learning_rate": 0.0001, "loss": 0.1648, "step": 1278 }, { "epoch": 2.0850994457124226, "grad_norm": 0.1181628480553627, "learning_rate": 0.0001, "loss": 0.1778, "step": 1279 }, { "epoch": 2.0867297032931202, "grad_norm": 0.14868682622909546, "learning_rate": 0.0001, "loss": 0.1807, "step": 1280 }, { "epoch": 2.088359960873818, "grad_norm": 0.1333058476448059, "learning_rate": 0.0001, "loss": 0.1796, "step": 1281 }, { "epoch": 2.0899902184545156, "grad_norm": 0.14653992652893066, "learning_rate": 0.0001, "loss": 0.1805, "step": 1282 }, { "epoch": 2.0916204760352137, "grad_norm": 0.15411017835140228, "learning_rate": 0.0001, "loss": 0.1601, "step": 1283 }, { "epoch": 2.0932507336159114, "grad_norm": 0.1162109524011612, "learning_rate": 0.0001, "loss": 0.1757, "step": 1284 }, { "epoch": 2.094880991196609, "grad_norm": 0.14571483433246613, "learning_rate": 0.0001, "loss": 0.174, "step": 1285 }, { "epoch": 2.0965112487773068, "grad_norm": 0.15159551799297333, "learning_rate": 0.0001, "loss": 0.1709, "step": 1286 }, { "epoch": 2.0981415063580044, "grad_norm": 0.1452435553073883, "learning_rate": 0.0001, "loss": 0.1749, "step": 1287 }, { "epoch": 2.099771763938702, "grad_norm": 0.1417376548051834, "learning_rate": 0.0001, "loss": 0.1793, "step": 1288 }, { "epoch": 2.1014020215194003, "grad_norm": 0.15437503159046173, "learning_rate": 0.0001, "loss": 0.1753, "step": 1289 }, { "epoch": 2.103032279100098, "grad_norm": 0.13350600004196167, "learning_rate": 0.0001, "loss": 0.1689, "step": 1290 }, { "epoch": 2.1046625366807956, "grad_norm": 0.12360616028308868, "learning_rate": 0.0001, "loss": 0.1649, "step": 1291 }, { "epoch": 2.1062927942614933, "grad_norm": 0.12257912009954453, "learning_rate": 0.0001, "loss": 0.1683, "step": 1292 }, { "epoch": 2.107923051842191, "grad_norm": 0.1374562382698059, "learning_rate": 0.0001, "loss": 0.1786, "step": 1293 }, { "epoch": 2.1095533094228887, "grad_norm": 0.17024345695972443, "learning_rate": 0.0001, "loss": 0.173, "step": 1294 }, { "epoch": 2.111183567003587, "grad_norm": 0.13668283820152283, "learning_rate": 0.0001, "loss": 0.1769, "step": 1295 }, { "epoch": 2.1128138245842845, "grad_norm": 0.12531708180904388, "learning_rate": 0.0001, "loss": 0.1693, "step": 1296 }, { "epoch": 2.114444082164982, "grad_norm": 0.12956015765666962, "learning_rate": 0.0001, "loss": 0.1749, "step": 1297 }, { "epoch": 2.11607433974568, "grad_norm": 0.11305827647447586, "learning_rate": 0.0001, "loss": 0.1661, "step": 1298 }, { "epoch": 2.1177045973263775, "grad_norm": 0.1285475343465805, "learning_rate": 0.0001, "loss": 0.174, "step": 1299 }, { "epoch": 2.119334854907075, "grad_norm": 0.10780393332242966, "learning_rate": 0.0001, "loss": 0.1609, "step": 1300 }, { "epoch": 2.120965112487773, "grad_norm": 0.14362122118473053, "learning_rate": 0.0001, "loss": 0.1818, "step": 1301 }, { "epoch": 2.122595370068471, "grad_norm": 0.13147400319576263, "learning_rate": 0.0001, "loss": 0.1663, "step": 1302 }, { "epoch": 2.1242256276491687, "grad_norm": 0.11345292627811432, "learning_rate": 0.0001, "loss": 0.1595, "step": 1303 }, { "epoch": 2.1258558852298663, "grad_norm": 0.12018296867609024, "learning_rate": 0.0001, "loss": 0.1632, "step": 1304 }, { "epoch": 2.127486142810564, "grad_norm": 0.12988026440143585, "learning_rate": 0.0001, "loss": 0.1911, "step": 1305 }, { "epoch": 2.1291164003912617, "grad_norm": 0.12335322052240372, "learning_rate": 0.0001, "loss": 0.1599, "step": 1306 }, { "epoch": 2.1307466579719594, "grad_norm": 0.13894210755825043, "learning_rate": 0.0001, "loss": 0.189, "step": 1307 }, { "epoch": 2.1323769155526575, "grad_norm": 0.13918493688106537, "learning_rate": 0.0001, "loss": 0.1694, "step": 1308 }, { "epoch": 2.134007173133355, "grad_norm": 0.12675361335277557, "learning_rate": 0.0001, "loss": 0.1769, "step": 1309 }, { "epoch": 2.135637430714053, "grad_norm": 0.13385078310966492, "learning_rate": 0.0001, "loss": 0.1849, "step": 1310 }, { "epoch": 2.1372676882947506, "grad_norm": 0.17780406773090363, "learning_rate": 0.0001, "loss": 0.1674, "step": 1311 }, { "epoch": 2.1388979458754482, "grad_norm": 0.16155046224594116, "learning_rate": 0.0001, "loss": 0.1861, "step": 1312 }, { "epoch": 2.140528203456146, "grad_norm": 0.177040234208107, "learning_rate": 0.0001, "loss": 0.1884, "step": 1313 }, { "epoch": 2.1421584610368436, "grad_norm": 0.14751054346561432, "learning_rate": 0.0001, "loss": 0.1723, "step": 1314 }, { "epoch": 2.1437887186175417, "grad_norm": 0.11751122027635574, "learning_rate": 0.0001, "loss": 0.1629, "step": 1315 }, { "epoch": 2.1454189761982394, "grad_norm": 0.12306412309408188, "learning_rate": 0.0001, "loss": 0.1793, "step": 1316 }, { "epoch": 2.147049233778937, "grad_norm": 0.10852048546075821, "learning_rate": 0.0001, "loss": 0.1628, "step": 1317 }, { "epoch": 2.1486794913596348, "grad_norm": 0.11529412865638733, "learning_rate": 0.0001, "loss": 0.1675, "step": 1318 }, { "epoch": 2.1503097489403324, "grad_norm": 0.11132712662220001, "learning_rate": 0.0001, "loss": 0.162, "step": 1319 }, { "epoch": 2.15194000652103, "grad_norm": 0.1375824362039566, "learning_rate": 0.0001, "loss": 0.1807, "step": 1320 }, { "epoch": 2.1535702641017282, "grad_norm": 0.1289290189743042, "learning_rate": 0.0001, "loss": 0.1883, "step": 1321 }, { "epoch": 2.155200521682426, "grad_norm": 0.1294177919626236, "learning_rate": 0.0001, "loss": 0.1873, "step": 1322 }, { "epoch": 2.1568307792631236, "grad_norm": 0.13281673192977905, "learning_rate": 0.0001, "loss": 0.159, "step": 1323 }, { "epoch": 2.1584610368438213, "grad_norm": 0.1543353796005249, "learning_rate": 0.0001, "loss": 0.1744, "step": 1324 }, { "epoch": 2.160091294424519, "grad_norm": 0.13094060122966766, "learning_rate": 0.0001, "loss": 0.1737, "step": 1325 }, { "epoch": 2.1617215520052167, "grad_norm": 0.1372898817062378, "learning_rate": 0.0001, "loss": 0.1693, "step": 1326 }, { "epoch": 2.1633518095859148, "grad_norm": 0.1310727298259735, "learning_rate": 0.0001, "loss": 0.1934, "step": 1327 }, { "epoch": 2.1649820671666125, "grad_norm": 0.13207446038722992, "learning_rate": 0.0001, "loss": 0.1691, "step": 1328 }, { "epoch": 2.16661232474731, "grad_norm": 0.13898542523384094, "learning_rate": 0.0001, "loss": 0.1783, "step": 1329 }, { "epoch": 2.168242582328008, "grad_norm": 0.1400642991065979, "learning_rate": 0.0001, "loss": 0.1783, "step": 1330 }, { "epoch": 2.1698728399087055, "grad_norm": 0.12981773912906647, "learning_rate": 0.0001, "loss": 0.1772, "step": 1331 }, { "epoch": 2.171503097489403, "grad_norm": 0.15546952188014984, "learning_rate": 0.0001, "loss": 0.1708, "step": 1332 }, { "epoch": 2.1731333550701013, "grad_norm": 0.11434978246688843, "learning_rate": 0.0001, "loss": 0.1679, "step": 1333 }, { "epoch": 2.174763612650799, "grad_norm": 0.13666783273220062, "learning_rate": 0.0001, "loss": 0.1712, "step": 1334 }, { "epoch": 2.1763938702314967, "grad_norm": 0.12208392471075058, "learning_rate": 0.0001, "loss": 0.1744, "step": 1335 }, { "epoch": 2.1780241278121943, "grad_norm": 0.13914120197296143, "learning_rate": 0.0001, "loss": 0.1871, "step": 1336 }, { "epoch": 2.179654385392892, "grad_norm": 0.13285982608795166, "learning_rate": 0.0001, "loss": 0.171, "step": 1337 }, { "epoch": 2.1812846429735897, "grad_norm": 0.1425054520368576, "learning_rate": 0.0001, "loss": 0.1707, "step": 1338 }, { "epoch": 2.1829149005542874, "grad_norm": 0.12354496121406555, "learning_rate": 0.0001, "loss": 0.1912, "step": 1339 }, { "epoch": 2.1845451581349855, "grad_norm": 0.12002517282962799, "learning_rate": 0.0001, "loss": 0.1915, "step": 1340 }, { "epoch": 2.186175415715683, "grad_norm": 0.1574515998363495, "learning_rate": 0.0001, "loss": 0.175, "step": 1341 }, { "epoch": 2.187805673296381, "grad_norm": 0.12115081399679184, "learning_rate": 0.0001, "loss": 0.1662, "step": 1342 }, { "epoch": 2.1894359308770786, "grad_norm": 0.1330757737159729, "learning_rate": 0.0001, "loss": 0.162, "step": 1343 }, { "epoch": 2.1910661884577762, "grad_norm": 0.11849580705165863, "learning_rate": 0.0001, "loss": 0.1806, "step": 1344 }, { "epoch": 2.192696446038474, "grad_norm": 0.13124977052211761, "learning_rate": 0.0001, "loss": 0.1737, "step": 1345 }, { "epoch": 2.194326703619172, "grad_norm": 0.15525256097316742, "learning_rate": 0.0001, "loss": 0.1767, "step": 1346 }, { "epoch": 2.1959569611998697, "grad_norm": 0.13997358083724976, "learning_rate": 0.0001, "loss": 0.1708, "step": 1347 }, { "epoch": 2.1975872187805674, "grad_norm": 0.12414538115262985, "learning_rate": 0.0001, "loss": 0.1803, "step": 1348 }, { "epoch": 2.199217476361265, "grad_norm": 0.12983959913253784, "learning_rate": 0.0001, "loss": 0.175, "step": 1349 }, { "epoch": 2.2008477339419628, "grad_norm": 0.10924775898456573, "learning_rate": 0.0001, "loss": 0.1671, "step": 1350 }, { "epoch": 2.2024779915226604, "grad_norm": 0.12970325350761414, "learning_rate": 0.0001, "loss": 0.1997, "step": 1351 }, { "epoch": 2.204108249103358, "grad_norm": 0.13614393770694733, "learning_rate": 0.0001, "loss": 0.1772, "step": 1352 }, { "epoch": 2.2057385066840562, "grad_norm": 0.16756513714790344, "learning_rate": 0.0001, "loss": 0.1666, "step": 1353 }, { "epoch": 2.207368764264754, "grad_norm": 0.12025320529937744, "learning_rate": 0.0001, "loss": 0.1729, "step": 1354 }, { "epoch": 2.2089990218454516, "grad_norm": 0.13931293785572052, "learning_rate": 0.0001, "loss": 0.1709, "step": 1355 }, { "epoch": 2.2106292794261493, "grad_norm": 0.14095361530780792, "learning_rate": 0.0001, "loss": 0.1794, "step": 1356 }, { "epoch": 2.212259537006847, "grad_norm": 0.14122013747692108, "learning_rate": 0.0001, "loss": 0.1732, "step": 1357 }, { "epoch": 2.2138897945875446, "grad_norm": 0.14121028780937195, "learning_rate": 0.0001, "loss": 0.1711, "step": 1358 }, { "epoch": 2.2155200521682428, "grad_norm": 0.1547023057937622, "learning_rate": 0.0001, "loss": 0.1962, "step": 1359 }, { "epoch": 2.2171503097489405, "grad_norm": 0.12952187657356262, "learning_rate": 0.0001, "loss": 0.164, "step": 1360 }, { "epoch": 2.218780567329638, "grad_norm": 0.17359581589698792, "learning_rate": 0.0001, "loss": 0.1707, "step": 1361 }, { "epoch": 2.220410824910336, "grad_norm": 0.12868142127990723, "learning_rate": 0.0001, "loss": 0.1834, "step": 1362 }, { "epoch": 2.2220410824910335, "grad_norm": 0.15855832397937775, "learning_rate": 0.0001, "loss": 0.1985, "step": 1363 }, { "epoch": 2.223671340071731, "grad_norm": 0.14433158934116364, "learning_rate": 0.0001, "loss": 0.1943, "step": 1364 }, { "epoch": 2.225301597652429, "grad_norm": 0.13805247843265533, "learning_rate": 0.0001, "loss": 0.1771, "step": 1365 }, { "epoch": 2.226931855233127, "grad_norm": 0.12476252019405365, "learning_rate": 0.0001, "loss": 0.1835, "step": 1366 }, { "epoch": 2.2285621128138247, "grad_norm": 0.14098286628723145, "learning_rate": 0.0001, "loss": 0.1797, "step": 1367 }, { "epoch": 2.2301923703945223, "grad_norm": 0.10581690073013306, "learning_rate": 0.0001, "loss": 0.1603, "step": 1368 }, { "epoch": 2.23182262797522, "grad_norm": 0.13662414252758026, "learning_rate": 0.0001, "loss": 0.1598, "step": 1369 }, { "epoch": 2.2334528855559177, "grad_norm": 0.13440090417861938, "learning_rate": 0.0001, "loss": 0.1762, "step": 1370 }, { "epoch": 2.2350831431366154, "grad_norm": 0.17255081236362457, "learning_rate": 0.0001, "loss": 0.1885, "step": 1371 }, { "epoch": 2.2367134007173135, "grad_norm": 0.13008393347263336, "learning_rate": 0.0001, "loss": 0.1834, "step": 1372 }, { "epoch": 2.238343658298011, "grad_norm": 0.19530728459358215, "learning_rate": 0.0001, "loss": 0.1802, "step": 1373 }, { "epoch": 2.239973915878709, "grad_norm": 0.12233468890190125, "learning_rate": 0.0001, "loss": 0.1655, "step": 1374 }, { "epoch": 2.2416041734594065, "grad_norm": 0.12828123569488525, "learning_rate": 0.0001, "loss": 0.1693, "step": 1375 }, { "epoch": 2.2432344310401042, "grad_norm": 0.1313515156507492, "learning_rate": 0.0001, "loss": 0.1698, "step": 1376 }, { "epoch": 2.244864688620802, "grad_norm": 0.1494060903787613, "learning_rate": 0.0001, "loss": 0.1845, "step": 1377 }, { "epoch": 2.2464949462015, "grad_norm": 0.13343721628189087, "learning_rate": 0.0001, "loss": 0.1628, "step": 1378 }, { "epoch": 2.2481252037821977, "grad_norm": 0.14259859919548035, "learning_rate": 0.0001, "loss": 0.185, "step": 1379 }, { "epoch": 2.2497554613628954, "grad_norm": 0.14472530782222748, "learning_rate": 0.0001, "loss": 0.185, "step": 1380 }, { "epoch": 2.251385718943593, "grad_norm": 0.1287330836057663, "learning_rate": 0.0001, "loss": 0.1822, "step": 1381 }, { "epoch": 2.2530159765242908, "grad_norm": 0.11863413453102112, "learning_rate": 0.0001, "loss": 0.1704, "step": 1382 }, { "epoch": 2.2546462341049884, "grad_norm": 0.10795796662569046, "learning_rate": 0.0001, "loss": 0.1687, "step": 1383 }, { "epoch": 2.2562764916856866, "grad_norm": 0.10524063557386398, "learning_rate": 0.0001, "loss": 0.172, "step": 1384 }, { "epoch": 2.2579067492663842, "grad_norm": 0.1203174889087677, "learning_rate": 0.0001, "loss": 0.1722, "step": 1385 }, { "epoch": 2.259537006847082, "grad_norm": 0.1155901700258255, "learning_rate": 0.0001, "loss": 0.1642, "step": 1386 }, { "epoch": 2.2611672644277796, "grad_norm": 0.13262033462524414, "learning_rate": 0.0001, "loss": 0.1767, "step": 1387 }, { "epoch": 2.2627975220084773, "grad_norm": 0.13157077133655548, "learning_rate": 0.0001, "loss": 0.1634, "step": 1388 }, { "epoch": 2.264427779589175, "grad_norm": 0.12929612398147583, "learning_rate": 0.0001, "loss": 0.1694, "step": 1389 }, { "epoch": 2.2660580371698726, "grad_norm": 0.12153502553701401, "learning_rate": 0.0001, "loss": 0.1616, "step": 1390 }, { "epoch": 2.2676882947505708, "grad_norm": 0.15442398190498352, "learning_rate": 0.0001, "loss": 0.1773, "step": 1391 }, { "epoch": 2.2693185523312684, "grad_norm": 0.15606394410133362, "learning_rate": 0.0001, "loss": 0.177, "step": 1392 }, { "epoch": 2.270948809911966, "grad_norm": 0.13780561089515686, "learning_rate": 0.0001, "loss": 0.1709, "step": 1393 }, { "epoch": 2.272579067492664, "grad_norm": 0.4755675792694092, "learning_rate": 0.0001, "loss": 0.1643, "step": 1394 }, { "epoch": 2.2742093250733615, "grad_norm": 0.1509324312210083, "learning_rate": 0.0001, "loss": 0.1932, "step": 1395 }, { "epoch": 2.275839582654059, "grad_norm": 0.18392273783683777, "learning_rate": 0.0001, "loss": 0.1861, "step": 1396 }, { "epoch": 2.2774698402347573, "grad_norm": 0.12845247983932495, "learning_rate": 0.0001, "loss": 0.1762, "step": 1397 }, { "epoch": 2.279100097815455, "grad_norm": 0.13720661401748657, "learning_rate": 0.0001, "loss": 0.1717, "step": 1398 }, { "epoch": 2.2807303553961527, "grad_norm": 0.13770131766796112, "learning_rate": 0.0001, "loss": 0.1682, "step": 1399 }, { "epoch": 2.2823606129768503, "grad_norm": 0.11446554213762283, "learning_rate": 0.0001, "loss": 0.1661, "step": 1400 }, { "epoch": 2.283990870557548, "grad_norm": 0.12670235335826874, "learning_rate": 0.0001, "loss": 0.1639, "step": 1401 }, { "epoch": 2.2856211281382457, "grad_norm": 0.14371562004089355, "learning_rate": 0.0001, "loss": 0.1895, "step": 1402 }, { "epoch": 2.2872513857189434, "grad_norm": 0.12471149116754532, "learning_rate": 0.0001, "loss": 0.1804, "step": 1403 }, { "epoch": 2.2888816432996415, "grad_norm": 0.13023298978805542, "learning_rate": 0.0001, "loss": 0.1715, "step": 1404 }, { "epoch": 2.290511900880339, "grad_norm": 0.1387406587600708, "learning_rate": 0.0001, "loss": 0.1694, "step": 1405 }, { "epoch": 2.292142158461037, "grad_norm": 0.127483531832695, "learning_rate": 0.0001, "loss": 0.1921, "step": 1406 }, { "epoch": 2.2937724160417345, "grad_norm": 0.14067308604717255, "learning_rate": 0.0001, "loss": 0.18, "step": 1407 }, { "epoch": 2.295402673622432, "grad_norm": 0.12968185544013977, "learning_rate": 0.0001, "loss": 0.182, "step": 1408 }, { "epoch": 2.29703293120313, "grad_norm": 0.23598945140838623, "learning_rate": 0.0001, "loss": 0.1727, "step": 1409 }, { "epoch": 2.298663188783828, "grad_norm": 0.1529974788427353, "learning_rate": 0.0001, "loss": 0.1738, "step": 1410 }, { "epoch": 2.3002934463645257, "grad_norm": 0.1288285255432129, "learning_rate": 0.0001, "loss": 0.1828, "step": 1411 }, { "epoch": 2.3019237039452234, "grad_norm": 0.12833067774772644, "learning_rate": 0.0001, "loss": 0.165, "step": 1412 }, { "epoch": 2.303553961525921, "grad_norm": 0.11376657336950302, "learning_rate": 0.0001, "loss": 0.1576, "step": 1413 }, { "epoch": 2.3051842191066187, "grad_norm": 0.23640869557857513, "learning_rate": 0.0001, "loss": 0.1678, "step": 1414 }, { "epoch": 2.3068144766873164, "grad_norm": 0.11846059560775757, "learning_rate": 0.0001, "loss": 0.1643, "step": 1415 }, { "epoch": 2.308444734268014, "grad_norm": 0.1381588727235794, "learning_rate": 0.0001, "loss": 0.1764, "step": 1416 }, { "epoch": 2.3100749918487122, "grad_norm": 0.28972718119621277, "learning_rate": 0.0001, "loss": 0.1956, "step": 1417 }, { "epoch": 2.31170524942941, "grad_norm": 0.1268901377916336, "learning_rate": 0.0001, "loss": 0.1725, "step": 1418 }, { "epoch": 2.3133355070101076, "grad_norm": 0.13899771869182587, "learning_rate": 0.0001, "loss": 0.1827, "step": 1419 }, { "epoch": 2.3149657645908053, "grad_norm": 0.16093842685222626, "learning_rate": 0.0001, "loss": 0.1987, "step": 1420 }, { "epoch": 2.316596022171503, "grad_norm": 0.13014528155326843, "learning_rate": 0.0001, "loss": 0.1937, "step": 1421 }, { "epoch": 2.318226279752201, "grad_norm": 0.13447855412960052, "learning_rate": 0.0001, "loss": 0.1819, "step": 1422 }, { "epoch": 2.3198565373328988, "grad_norm": 0.1403612345457077, "learning_rate": 0.0001, "loss": 0.1784, "step": 1423 }, { "epoch": 2.3214867949135964, "grad_norm": 0.12157420068979263, "learning_rate": 0.0001, "loss": 0.1851, "step": 1424 }, { "epoch": 2.323117052494294, "grad_norm": 0.11106284707784653, "learning_rate": 0.0001, "loss": 0.1635, "step": 1425 }, { "epoch": 2.324747310074992, "grad_norm": 0.1440584808588028, "learning_rate": 0.0001, "loss": 0.1811, "step": 1426 }, { "epoch": 2.3263775676556895, "grad_norm": 0.143922358751297, "learning_rate": 0.0001, "loss": 0.1781, "step": 1427 }, { "epoch": 2.328007825236387, "grad_norm": 0.13588687777519226, "learning_rate": 0.0001, "loss": 0.1661, "step": 1428 }, { "epoch": 2.3296380828170853, "grad_norm": 0.12500828504562378, "learning_rate": 0.0001, "loss": 0.1798, "step": 1429 }, { "epoch": 2.331268340397783, "grad_norm": 0.13726584613323212, "learning_rate": 0.0001, "loss": 0.1887, "step": 1430 }, { "epoch": 2.3328985979784806, "grad_norm": 0.12741105258464813, "learning_rate": 0.0001, "loss": 0.1622, "step": 1431 }, { "epoch": 2.3345288555591783, "grad_norm": 0.15056177973747253, "learning_rate": 0.0001, "loss": 0.1873, "step": 1432 }, { "epoch": 2.336159113139876, "grad_norm": 0.12522774934768677, "learning_rate": 0.0001, "loss": 0.1681, "step": 1433 }, { "epoch": 2.3377893707205737, "grad_norm": 0.3474383056163788, "learning_rate": 0.0001, "loss": 0.1884, "step": 1434 }, { "epoch": 2.339419628301272, "grad_norm": 0.12175151705741882, "learning_rate": 0.0001, "loss": 0.1659, "step": 1435 }, { "epoch": 2.3410498858819695, "grad_norm": 0.12453535199165344, "learning_rate": 0.0001, "loss": 0.1592, "step": 1436 }, { "epoch": 2.342680143462667, "grad_norm": 0.12441691756248474, "learning_rate": 0.0001, "loss": 0.1827, "step": 1437 }, { "epoch": 2.344310401043365, "grad_norm": 0.14301401376724243, "learning_rate": 0.0001, "loss": 0.1865, "step": 1438 }, { "epoch": 2.3459406586240625, "grad_norm": 0.11441715806722641, "learning_rate": 0.0001, "loss": 0.167, "step": 1439 }, { "epoch": 2.34757091620476, "grad_norm": 0.12235062569379807, "learning_rate": 0.0001, "loss": 0.1738, "step": 1440 }, { "epoch": 2.349201173785458, "grad_norm": 0.1444825977087021, "learning_rate": 0.0001, "loss": 0.177, "step": 1441 }, { "epoch": 2.350831431366156, "grad_norm": 0.12760686874389648, "learning_rate": 0.0001, "loss": 0.1584, "step": 1442 }, { "epoch": 2.3524616889468537, "grad_norm": 0.12846995890140533, "learning_rate": 0.0001, "loss": 0.1659, "step": 1443 }, { "epoch": 2.3540919465275514, "grad_norm": 0.12887585163116455, "learning_rate": 0.0001, "loss": 0.1755, "step": 1444 }, { "epoch": 2.355722204108249, "grad_norm": 0.15019284188747406, "learning_rate": 0.0001, "loss": 0.1722, "step": 1445 }, { "epoch": 2.3573524616889467, "grad_norm": 0.13723956048488617, "learning_rate": 0.0001, "loss": 0.1918, "step": 1446 }, { "epoch": 2.3589827192696444, "grad_norm": 0.13037189841270447, "learning_rate": 0.0001, "loss": 0.1725, "step": 1447 }, { "epoch": 2.3606129768503425, "grad_norm": 0.12533016502857208, "learning_rate": 0.0001, "loss": 0.1726, "step": 1448 }, { "epoch": 2.3622432344310402, "grad_norm": 0.11401855945587158, "learning_rate": 0.0001, "loss": 0.1597, "step": 1449 }, { "epoch": 2.363873492011738, "grad_norm": 0.12723423540592194, "learning_rate": 0.0001, "loss": 0.1597, "step": 1450 }, { "epoch": 2.3655037495924356, "grad_norm": 0.12735813856124878, "learning_rate": 0.0001, "loss": 0.176, "step": 1451 }, { "epoch": 2.3671340071731333, "grad_norm": 0.12557192146778107, "learning_rate": 0.0001, "loss": 0.1639, "step": 1452 }, { "epoch": 2.368764264753831, "grad_norm": 0.1219494640827179, "learning_rate": 0.0001, "loss": 0.1845, "step": 1453 }, { "epoch": 2.3703945223345286, "grad_norm": 0.1263827085494995, "learning_rate": 0.0001, "loss": 0.1635, "step": 1454 }, { "epoch": 2.3720247799152268, "grad_norm": 0.13217493891716003, "learning_rate": 0.0001, "loss": 0.1833, "step": 1455 }, { "epoch": 2.3736550374959244, "grad_norm": 0.1541173756122589, "learning_rate": 0.0001, "loss": 0.1744, "step": 1456 }, { "epoch": 2.375285295076622, "grad_norm": 0.13489516079425812, "learning_rate": 0.0001, "loss": 0.1801, "step": 1457 }, { "epoch": 2.37691555265732, "grad_norm": 0.13315707445144653, "learning_rate": 0.0001, "loss": 0.1697, "step": 1458 }, { "epoch": 2.3785458102380175, "grad_norm": 0.14040572941303253, "learning_rate": 0.0001, "loss": 0.1663, "step": 1459 }, { "epoch": 2.3801760678187156, "grad_norm": 0.1226130798459053, "learning_rate": 0.0001, "loss": 0.1745, "step": 1460 }, { "epoch": 2.3818063253994133, "grad_norm": 0.14467410743236542, "learning_rate": 0.0001, "loss": 0.193, "step": 1461 }, { "epoch": 2.383436582980111, "grad_norm": 0.1495625525712967, "learning_rate": 0.0001, "loss": 0.1709, "step": 1462 }, { "epoch": 2.3850668405608086, "grad_norm": 0.17007926106452942, "learning_rate": 0.0001, "loss": 0.1708, "step": 1463 }, { "epoch": 2.3866970981415063, "grad_norm": 0.09908633679151535, "learning_rate": 0.0001, "loss": 0.166, "step": 1464 }, { "epoch": 2.388327355722204, "grad_norm": 0.1391552835702896, "learning_rate": 0.0001, "loss": 0.1842, "step": 1465 }, { "epoch": 2.3899576133029017, "grad_norm": 0.12647175788879395, "learning_rate": 0.0001, "loss": 0.1737, "step": 1466 }, { "epoch": 2.3915878708835994, "grad_norm": 0.13236671686172485, "learning_rate": 0.0001, "loss": 0.1911, "step": 1467 }, { "epoch": 2.3932181284642975, "grad_norm": 0.11911871284246445, "learning_rate": 0.0001, "loss": 0.1675, "step": 1468 }, { "epoch": 2.394848386044995, "grad_norm": 0.11741984635591507, "learning_rate": 0.0001, "loss": 0.1764, "step": 1469 }, { "epoch": 2.396478643625693, "grad_norm": 0.13254094123840332, "learning_rate": 0.0001, "loss": 0.1638, "step": 1470 }, { "epoch": 2.3981089012063905, "grad_norm": 0.12681660056114197, "learning_rate": 0.0001, "loss": 0.1728, "step": 1471 }, { "epoch": 2.399739158787088, "grad_norm": 0.14695464074611664, "learning_rate": 0.0001, "loss": 0.1721, "step": 1472 }, { "epoch": 2.4013694163677863, "grad_norm": 0.15564168989658356, "learning_rate": 0.0001, "loss": 0.1719, "step": 1473 }, { "epoch": 2.402999673948484, "grad_norm": 0.12581582367420197, "learning_rate": 0.0001, "loss": 0.1757, "step": 1474 }, { "epoch": 2.4046299315291817, "grad_norm": 0.13274727761745453, "learning_rate": 0.0001, "loss": 0.177, "step": 1475 }, { "epoch": 2.4062601891098794, "grad_norm": 0.12894096970558167, "learning_rate": 0.0001, "loss": 0.1752, "step": 1476 }, { "epoch": 2.407890446690577, "grad_norm": 0.11936718225479126, "learning_rate": 0.0001, "loss": 0.1838, "step": 1477 }, { "epoch": 2.4095207042712747, "grad_norm": 0.16726456582546234, "learning_rate": 0.0001, "loss": 0.1727, "step": 1478 }, { "epoch": 2.4111509618519724, "grad_norm": 0.12068488448858261, "learning_rate": 0.0001, "loss": 0.1823, "step": 1479 }, { "epoch": 2.4127812194326705, "grad_norm": 0.11540605872869492, "learning_rate": 0.0001, "loss": 0.1856, "step": 1480 }, { "epoch": 2.4144114770133682, "grad_norm": 0.1343366652727127, "learning_rate": 0.0001, "loss": 0.1693, "step": 1481 }, { "epoch": 2.416041734594066, "grad_norm": 0.11893413960933685, "learning_rate": 0.0001, "loss": 0.1744, "step": 1482 }, { "epoch": 2.4176719921747636, "grad_norm": 0.124918632209301, "learning_rate": 0.0001, "loss": 0.1742, "step": 1483 }, { "epoch": 2.4193022497554613, "grad_norm": 0.13789139688014984, "learning_rate": 0.0001, "loss": 0.178, "step": 1484 }, { "epoch": 2.420932507336159, "grad_norm": 0.13478578627109528, "learning_rate": 0.0001, "loss": 0.1757, "step": 1485 }, { "epoch": 2.422562764916857, "grad_norm": 0.13447391986846924, "learning_rate": 0.0001, "loss": 0.1851, "step": 1486 }, { "epoch": 2.4241930224975548, "grad_norm": 0.12459027767181396, "learning_rate": 0.0001, "loss": 0.1622, "step": 1487 }, { "epoch": 2.4258232800782524, "grad_norm": 0.12010418623685837, "learning_rate": 0.0001, "loss": 0.1724, "step": 1488 }, { "epoch": 2.42745353765895, "grad_norm": 0.14664210379123688, "learning_rate": 0.0001, "loss": 0.1797, "step": 1489 }, { "epoch": 2.429083795239648, "grad_norm": 0.12377094477415085, "learning_rate": 0.0001, "loss": 0.1693, "step": 1490 }, { "epoch": 2.4307140528203455, "grad_norm": 0.12321583181619644, "learning_rate": 0.0001, "loss": 0.1778, "step": 1491 }, { "epoch": 2.432344310401043, "grad_norm": 0.14363206923007965, "learning_rate": 0.0001, "loss": 0.1738, "step": 1492 }, { "epoch": 2.4339745679817413, "grad_norm": 0.1732596904039383, "learning_rate": 0.0001, "loss": 0.1909, "step": 1493 }, { "epoch": 2.435604825562439, "grad_norm": 0.12334398180246353, "learning_rate": 0.0001, "loss": 0.163, "step": 1494 }, { "epoch": 2.4372350831431366, "grad_norm": 0.13248227536678314, "learning_rate": 0.0001, "loss": 0.183, "step": 1495 }, { "epoch": 2.4388653407238343, "grad_norm": 0.13810336589813232, "learning_rate": 0.0001, "loss": 0.1833, "step": 1496 }, { "epoch": 2.440495598304532, "grad_norm": 0.1454666256904602, "learning_rate": 0.0001, "loss": 0.1764, "step": 1497 }, { "epoch": 2.4421258558852297, "grad_norm": 0.14574459195137024, "learning_rate": 0.0001, "loss": 0.1632, "step": 1498 }, { "epoch": 2.443756113465928, "grad_norm": 0.13363990187644958, "learning_rate": 0.0001, "loss": 0.1881, "step": 1499 }, { "epoch": 2.4453863710466255, "grad_norm": 0.14776456356048584, "learning_rate": 0.0001, "loss": 0.1843, "step": 1500 }, { "epoch": 2.4453863710466255, "eval_loss": 0.1997021734714508, "eval_runtime": 460.2618, "eval_samples_per_second": 4.1, "eval_steps_per_second": 1.026, "step": 1500 }, { "epoch": 2.447016628627323, "grad_norm": 0.10768138617277145, "learning_rate": 0.0001, "loss": 0.1714, "step": 1501 }, { "epoch": 2.448646886208021, "grad_norm": 0.13541437685489655, "learning_rate": 0.0001, "loss": 0.1735, "step": 1502 }, { "epoch": 2.4502771437887185, "grad_norm": 0.12808628380298615, "learning_rate": 0.0001, "loss": 0.1558, "step": 1503 }, { "epoch": 2.451907401369416, "grad_norm": 0.11601930111646652, "learning_rate": 0.0001, "loss": 0.1671, "step": 1504 }, { "epoch": 2.453537658950114, "grad_norm": 0.12584732472896576, "learning_rate": 0.0001, "loss": 0.1618, "step": 1505 }, { "epoch": 2.455167916530812, "grad_norm": 0.12000913172960281, "learning_rate": 0.0001, "loss": 0.1719, "step": 1506 }, { "epoch": 2.4567981741115097, "grad_norm": 0.1270546168088913, "learning_rate": 0.0001, "loss": 0.1796, "step": 1507 }, { "epoch": 2.4584284316922074, "grad_norm": 0.12946002185344696, "learning_rate": 0.0001, "loss": 0.1731, "step": 1508 }, { "epoch": 2.460058689272905, "grad_norm": 0.14367902278900146, "learning_rate": 0.0001, "loss": 0.1767, "step": 1509 }, { "epoch": 2.4616889468536027, "grad_norm": 0.14034403860569, "learning_rate": 0.0001, "loss": 0.1757, "step": 1510 }, { "epoch": 2.463319204434301, "grad_norm": 0.11473418027162552, "learning_rate": 0.0001, "loss": 0.1522, "step": 1511 }, { "epoch": 2.4649494620149985, "grad_norm": 0.12510861456394196, "learning_rate": 0.0001, "loss": 0.1695, "step": 1512 }, { "epoch": 2.466579719595696, "grad_norm": 0.12651176750659943, "learning_rate": 0.0001, "loss": 0.1877, "step": 1513 }, { "epoch": 2.468209977176394, "grad_norm": 0.13196395337581635, "learning_rate": 0.0001, "loss": 0.1667, "step": 1514 }, { "epoch": 2.4698402347570916, "grad_norm": 0.13804642856121063, "learning_rate": 0.0001, "loss": 0.1652, "step": 1515 }, { "epoch": 2.4714704923377893, "grad_norm": 0.14535880088806152, "learning_rate": 0.0001, "loss": 0.1864, "step": 1516 }, { "epoch": 2.473100749918487, "grad_norm": 0.12995412945747375, "learning_rate": 0.0001, "loss": 0.1796, "step": 1517 }, { "epoch": 2.474731007499185, "grad_norm": 0.1427994966506958, "learning_rate": 0.0001, "loss": 0.1838, "step": 1518 }, { "epoch": 2.4763612650798827, "grad_norm": 0.11478241533041, "learning_rate": 0.0001, "loss": 0.1627, "step": 1519 }, { "epoch": 2.4779915226605804, "grad_norm": 0.1354149729013443, "learning_rate": 0.0001, "loss": 0.1819, "step": 1520 }, { "epoch": 2.479621780241278, "grad_norm": 0.11658424139022827, "learning_rate": 0.0001, "loss": 0.1617, "step": 1521 }, { "epoch": 2.481252037821976, "grad_norm": 0.12623891234397888, "learning_rate": 0.0001, "loss": 0.1809, "step": 1522 }, { "epoch": 2.4828822954026735, "grad_norm": 0.12294737994670868, "learning_rate": 0.0001, "loss": 0.1713, "step": 1523 }, { "epoch": 2.4845125529833716, "grad_norm": 0.1138739064335823, "learning_rate": 0.0001, "loss": 0.1817, "step": 1524 }, { "epoch": 2.4861428105640693, "grad_norm": 0.13236074149608612, "learning_rate": 0.0001, "loss": 0.1753, "step": 1525 }, { "epoch": 2.487773068144767, "grad_norm": 0.11817993223667145, "learning_rate": 0.0001, "loss": 0.1849, "step": 1526 }, { "epoch": 2.4894033257254646, "grad_norm": 0.10676276683807373, "learning_rate": 0.0001, "loss": 0.1736, "step": 1527 }, { "epoch": 2.4910335833061623, "grad_norm": 0.13718129694461823, "learning_rate": 0.0001, "loss": 0.1734, "step": 1528 }, { "epoch": 2.49266384088686, "grad_norm": 0.13279683887958527, "learning_rate": 0.0001, "loss": 0.188, "step": 1529 }, { "epoch": 2.4942940984675577, "grad_norm": 0.13865776360034943, "learning_rate": 0.0001, "loss": 0.1692, "step": 1530 }, { "epoch": 2.495924356048256, "grad_norm": 0.13090378046035767, "learning_rate": 0.0001, "loss": 0.1777, "step": 1531 }, { "epoch": 2.4975546136289535, "grad_norm": 0.12446148693561554, "learning_rate": 0.0001, "loss": 0.1634, "step": 1532 }, { "epoch": 2.499184871209651, "grad_norm": 0.1395845115184784, "learning_rate": 0.0001, "loss": 0.1986, "step": 1533 }, { "epoch": 2.500815128790349, "grad_norm": 0.11929094046354294, "learning_rate": 0.0001, "loss": 0.1686, "step": 1534 }, { "epoch": 2.5024453863710465, "grad_norm": 0.14516448974609375, "learning_rate": 0.0001, "loss": 0.18, "step": 1535 }, { "epoch": 2.5040756439517446, "grad_norm": 0.11706582456827164, "learning_rate": 0.0001, "loss": 0.175, "step": 1536 }, { "epoch": 2.5057059015324423, "grad_norm": 0.13064873218536377, "learning_rate": 0.0001, "loss": 0.1623, "step": 1537 }, { "epoch": 2.50733615911314, "grad_norm": 0.12183361500501633, "learning_rate": 0.0001, "loss": 0.1739, "step": 1538 }, { "epoch": 2.5089664166938377, "grad_norm": 0.11860349029302597, "learning_rate": 0.0001, "loss": 0.1694, "step": 1539 }, { "epoch": 2.5105966742745354, "grad_norm": 0.11384200304746628, "learning_rate": 0.0001, "loss": 0.167, "step": 1540 }, { "epoch": 2.512226931855233, "grad_norm": 0.13300910592079163, "learning_rate": 0.0001, "loss": 0.1665, "step": 1541 }, { "epoch": 2.5138571894359307, "grad_norm": 0.11307025700807571, "learning_rate": 0.0001, "loss": 0.1778, "step": 1542 }, { "epoch": 2.5154874470166284, "grad_norm": 0.11402298510074615, "learning_rate": 0.0001, "loss": 0.1622, "step": 1543 }, { "epoch": 2.5171177045973265, "grad_norm": 0.13868348300457, "learning_rate": 0.0001, "loss": 0.1831, "step": 1544 }, { "epoch": 2.518747962178024, "grad_norm": 0.14691469073295593, "learning_rate": 0.0001, "loss": 0.1707, "step": 1545 }, { "epoch": 2.520378219758722, "grad_norm": 0.1223243921995163, "learning_rate": 0.0001, "loss": 0.1828, "step": 1546 }, { "epoch": 2.5220084773394196, "grad_norm": 0.1313992589712143, "learning_rate": 0.0001, "loss": 0.167, "step": 1547 }, { "epoch": 2.5236387349201173, "grad_norm": 0.12788835167884827, "learning_rate": 0.0001, "loss": 0.1786, "step": 1548 }, { "epoch": 2.5252689925008154, "grad_norm": 0.11976204812526703, "learning_rate": 0.0001, "loss": 0.1641, "step": 1549 }, { "epoch": 2.526899250081513, "grad_norm": 0.11993979662656784, "learning_rate": 0.0001, "loss": 0.1705, "step": 1550 }, { "epoch": 2.5285295076622107, "grad_norm": 0.1298224925994873, "learning_rate": 0.0001, "loss": 0.1801, "step": 1551 }, { "epoch": 2.5301597652429084, "grad_norm": 0.12879474461078644, "learning_rate": 0.0001, "loss": 0.1566, "step": 1552 }, { "epoch": 2.531790022823606, "grad_norm": 0.11655417829751968, "learning_rate": 0.0001, "loss": 0.1542, "step": 1553 }, { "epoch": 2.533420280404304, "grad_norm": 0.14049752056598663, "learning_rate": 0.0001, "loss": 0.1899, "step": 1554 }, { "epoch": 2.5350505379850015, "grad_norm": 0.13857971131801605, "learning_rate": 0.0001, "loss": 0.1755, "step": 1555 }, { "epoch": 2.536680795565699, "grad_norm": 0.12514682114124298, "learning_rate": 0.0001, "loss": 0.1743, "step": 1556 }, { "epoch": 2.5383110531463973, "grad_norm": 0.12018663436174393, "learning_rate": 0.0001, "loss": 0.1788, "step": 1557 }, { "epoch": 2.539941310727095, "grad_norm": 0.14534510672092438, "learning_rate": 0.0001, "loss": 0.1679, "step": 1558 }, { "epoch": 2.5415715683077926, "grad_norm": 0.1236572340130806, "learning_rate": 0.0001, "loss": 0.1699, "step": 1559 }, { "epoch": 2.5432018258884903, "grad_norm": 0.13306783139705658, "learning_rate": 0.0001, "loss": 0.1845, "step": 1560 }, { "epoch": 2.544832083469188, "grad_norm": 0.10747732222080231, "learning_rate": 0.0001, "loss": 0.1766, "step": 1561 }, { "epoch": 2.546462341049886, "grad_norm": 0.12410397827625275, "learning_rate": 0.0001, "loss": 0.1829, "step": 1562 }, { "epoch": 2.548092598630584, "grad_norm": 0.14367325603961945, "learning_rate": 0.0001, "loss": 0.1845, "step": 1563 }, { "epoch": 2.5497228562112815, "grad_norm": 0.12617376446723938, "learning_rate": 0.0001, "loss": 0.1613, "step": 1564 }, { "epoch": 2.551353113791979, "grad_norm": 0.13464583456516266, "learning_rate": 0.0001, "loss": 0.1825, "step": 1565 }, { "epoch": 2.552983371372677, "grad_norm": 0.15118366479873657, "learning_rate": 0.0001, "loss": 0.1831, "step": 1566 }, { "epoch": 2.5546136289533745, "grad_norm": 0.13260018825531006, "learning_rate": 0.0001, "loss": 0.1734, "step": 1567 }, { "epoch": 2.556243886534072, "grad_norm": 0.1262848824262619, "learning_rate": 0.0001, "loss": 0.168, "step": 1568 }, { "epoch": 2.55787414411477, "grad_norm": 0.1359010487794876, "learning_rate": 0.0001, "loss": 0.1752, "step": 1569 }, { "epoch": 2.559504401695468, "grad_norm": 0.12151306122541428, "learning_rate": 0.0001, "loss": 0.1809, "step": 1570 }, { "epoch": 2.5611346592761657, "grad_norm": 0.144693985581398, "learning_rate": 0.0001, "loss": 0.1637, "step": 1571 }, { "epoch": 2.5627649168568634, "grad_norm": 0.13245713710784912, "learning_rate": 0.0001, "loss": 0.1741, "step": 1572 }, { "epoch": 2.564395174437561, "grad_norm": 0.14154277741909027, "learning_rate": 0.0001, "loss": 0.1844, "step": 1573 }, { "epoch": 2.5660254320182587, "grad_norm": 0.14346501231193542, "learning_rate": 0.0001, "loss": 0.1849, "step": 1574 }, { "epoch": 2.567655689598957, "grad_norm": 0.12976010143756866, "learning_rate": 0.0001, "loss": 0.1784, "step": 1575 }, { "epoch": 2.5692859471796545, "grad_norm": 0.11773863434791565, "learning_rate": 0.0001, "loss": 0.168, "step": 1576 }, { "epoch": 2.570916204760352, "grad_norm": 0.12860426306724548, "learning_rate": 0.0001, "loss": 0.1708, "step": 1577 }, { "epoch": 2.57254646234105, "grad_norm": 0.12525953352451324, "learning_rate": 0.0001, "loss": 0.1731, "step": 1578 }, { "epoch": 2.5741767199217476, "grad_norm": 0.13797353208065033, "learning_rate": 0.0001, "loss": 0.1886, "step": 1579 }, { "epoch": 2.5758069775024452, "grad_norm": 0.13514308631420135, "learning_rate": 0.0001, "loss": 0.1769, "step": 1580 }, { "epoch": 2.577437235083143, "grad_norm": 0.1197233498096466, "learning_rate": 0.0001, "loss": 0.1793, "step": 1581 }, { "epoch": 2.5790674926638406, "grad_norm": 0.1371515542268753, "learning_rate": 0.0001, "loss": 0.1608, "step": 1582 }, { "epoch": 2.5806977502445387, "grad_norm": 0.12182509154081345, "learning_rate": 0.0001, "loss": 0.1808, "step": 1583 }, { "epoch": 2.5823280078252364, "grad_norm": 0.12575587630271912, "learning_rate": 0.0001, "loss": 0.1873, "step": 1584 }, { "epoch": 2.583958265405934, "grad_norm": 0.11890911310911179, "learning_rate": 0.0001, "loss": 0.1666, "step": 1585 }, { "epoch": 2.5855885229866318, "grad_norm": 0.12488003820180893, "learning_rate": 0.0001, "loss": 0.1937, "step": 1586 }, { "epoch": 2.58721878056733, "grad_norm": 0.13378724455833435, "learning_rate": 0.0001, "loss": 0.1894, "step": 1587 }, { "epoch": 2.5888490381480276, "grad_norm": 0.14022649824619293, "learning_rate": 0.0001, "loss": 0.1727, "step": 1588 }, { "epoch": 2.5904792957287253, "grad_norm": 0.12191809713840485, "learning_rate": 0.0001, "loss": 0.1657, "step": 1589 }, { "epoch": 2.592109553309423, "grad_norm": 0.11590269953012466, "learning_rate": 0.0001, "loss": 0.1559, "step": 1590 }, { "epoch": 2.5937398108901206, "grad_norm": 0.18004612624645233, "learning_rate": 0.0001, "loss": 0.1821, "step": 1591 }, { "epoch": 2.5953700684708183, "grad_norm": 0.11519701033830643, "learning_rate": 0.0001, "loss": 0.1734, "step": 1592 }, { "epoch": 2.597000326051516, "grad_norm": 0.110859714448452, "learning_rate": 0.0001, "loss": 0.1752, "step": 1593 }, { "epoch": 2.5986305836322137, "grad_norm": 0.11771699786186218, "learning_rate": 0.0001, "loss": 0.1929, "step": 1594 }, { "epoch": 2.600260841212912, "grad_norm": 0.1058066114783287, "learning_rate": 0.0001, "loss": 0.1635, "step": 1595 }, { "epoch": 2.6018910987936095, "grad_norm": 0.13187898695468903, "learning_rate": 0.0001, "loss": 0.1653, "step": 1596 }, { "epoch": 2.603521356374307, "grad_norm": 0.14017753303050995, "learning_rate": 0.0001, "loss": 0.1767, "step": 1597 }, { "epoch": 2.605151613955005, "grad_norm": 0.13309310376644135, "learning_rate": 0.0001, "loss": 0.1684, "step": 1598 }, { "epoch": 2.6067818715357025, "grad_norm": 0.1273549348115921, "learning_rate": 0.0001, "loss": 0.1756, "step": 1599 }, { "epoch": 2.6084121291164006, "grad_norm": 0.16523495316505432, "learning_rate": 0.0001, "loss": 0.1751, "step": 1600 }, { "epoch": 2.6100423866970983, "grad_norm": 0.1349019855260849, "learning_rate": 0.0001, "loss": 0.1807, "step": 1601 }, { "epoch": 2.611672644277796, "grad_norm": 0.1419772207736969, "learning_rate": 0.0001, "loss": 0.1627, "step": 1602 }, { "epoch": 2.6133029018584937, "grad_norm": 0.12186914682388306, "learning_rate": 0.0001, "loss": 0.1745, "step": 1603 }, { "epoch": 2.6149331594391914, "grad_norm": 0.13109511137008667, "learning_rate": 0.0001, "loss": 0.1785, "step": 1604 }, { "epoch": 2.616563417019889, "grad_norm": 0.14104005694389343, "learning_rate": 0.0001, "loss": 0.1839, "step": 1605 }, { "epoch": 2.6181936746005867, "grad_norm": 0.13870073854923248, "learning_rate": 0.0001, "loss": 0.1758, "step": 1606 }, { "epoch": 2.6198239321812844, "grad_norm": 0.1275268942117691, "learning_rate": 0.0001, "loss": 0.1806, "step": 1607 }, { "epoch": 2.6214541897619825, "grad_norm": 0.14321641623973846, "learning_rate": 0.0001, "loss": 0.1734, "step": 1608 }, { "epoch": 2.62308444734268, "grad_norm": 0.13970732688903809, "learning_rate": 0.0001, "loss": 0.1723, "step": 1609 }, { "epoch": 2.624714704923378, "grad_norm": 0.13474468886852264, "learning_rate": 0.0001, "loss": 0.1903, "step": 1610 }, { "epoch": 2.6263449625040756, "grad_norm": 0.12260457128286362, "learning_rate": 0.0001, "loss": 0.1824, "step": 1611 }, { "epoch": 2.6279752200847732, "grad_norm": 0.112511046230793, "learning_rate": 0.0001, "loss": 0.1612, "step": 1612 }, { "epoch": 2.6296054776654714, "grad_norm": 0.13217821717262268, "learning_rate": 0.0001, "loss": 0.1662, "step": 1613 }, { "epoch": 2.631235735246169, "grad_norm": 0.13116757571697235, "learning_rate": 0.0001, "loss": 0.1777, "step": 1614 }, { "epoch": 2.6328659928268667, "grad_norm": 0.1306123584508896, "learning_rate": 0.0001, "loss": 0.1654, "step": 1615 }, { "epoch": 2.6344962504075644, "grad_norm": 0.13786140084266663, "learning_rate": 0.0001, "loss": 0.187, "step": 1616 }, { "epoch": 2.636126507988262, "grad_norm": 0.14164811372756958, "learning_rate": 0.0001, "loss": 0.1711, "step": 1617 }, { "epoch": 2.6377567655689598, "grad_norm": 0.1309959590435028, "learning_rate": 0.0001, "loss": 0.1691, "step": 1618 }, { "epoch": 2.6393870231496575, "grad_norm": 0.12645256519317627, "learning_rate": 0.0001, "loss": 0.1685, "step": 1619 }, { "epoch": 2.641017280730355, "grad_norm": 0.13473668694496155, "learning_rate": 0.0001, "loss": 0.1672, "step": 1620 }, { "epoch": 2.6426475383110533, "grad_norm": 0.11596634984016418, "learning_rate": 0.0001, "loss": 0.1634, "step": 1621 }, { "epoch": 2.644277795891751, "grad_norm": 0.12235169112682343, "learning_rate": 0.0001, "loss": 0.1694, "step": 1622 }, { "epoch": 2.6459080534724486, "grad_norm": 0.13206972181797028, "learning_rate": 0.0001, "loss": 0.1884, "step": 1623 }, { "epoch": 2.6475383110531463, "grad_norm": 0.12648923695087433, "learning_rate": 0.0001, "loss": 0.1906, "step": 1624 }, { "epoch": 2.6491685686338444, "grad_norm": 0.11728816479444504, "learning_rate": 0.0001, "loss": 0.1914, "step": 1625 }, { "epoch": 2.650798826214542, "grad_norm": 0.13588158786296844, "learning_rate": 0.0001, "loss": 0.1724, "step": 1626 }, { "epoch": 2.65242908379524, "grad_norm": 0.25838354229927063, "learning_rate": 0.0001, "loss": 0.1806, "step": 1627 }, { "epoch": 2.6540593413759375, "grad_norm": 0.13654319941997528, "learning_rate": 0.0001, "loss": 0.1904, "step": 1628 }, { "epoch": 2.655689598956635, "grad_norm": 0.11923132836818695, "learning_rate": 0.0001, "loss": 0.1797, "step": 1629 }, { "epoch": 2.657319856537333, "grad_norm": 0.13744457066059113, "learning_rate": 0.0001, "loss": 0.1681, "step": 1630 }, { "epoch": 2.6589501141180305, "grad_norm": 0.14435473084449768, "learning_rate": 0.0001, "loss": 0.1783, "step": 1631 }, { "epoch": 2.660580371698728, "grad_norm": 0.1060493215918541, "learning_rate": 0.0001, "loss": 0.1448, "step": 1632 }, { "epoch": 2.6622106292794263, "grad_norm": 0.12072440981864929, "learning_rate": 0.0001, "loss": 0.1766, "step": 1633 }, { "epoch": 2.663840886860124, "grad_norm": 0.11674067378044128, "learning_rate": 0.0001, "loss": 0.1771, "step": 1634 }, { "epoch": 2.6654711444408217, "grad_norm": 0.12532764673233032, "learning_rate": 0.0001, "loss": 0.1867, "step": 1635 }, { "epoch": 2.6671014020215194, "grad_norm": 0.10460726171731949, "learning_rate": 0.0001, "loss": 0.156, "step": 1636 }, { "epoch": 2.668731659602217, "grad_norm": 0.1254202276468277, "learning_rate": 0.0001, "loss": 0.1837, "step": 1637 }, { "epoch": 2.670361917182915, "grad_norm": 0.1245632991194725, "learning_rate": 0.0001, "loss": 0.1721, "step": 1638 }, { "epoch": 2.671992174763613, "grad_norm": 0.1212785616517067, "learning_rate": 0.0001, "loss": 0.1781, "step": 1639 }, { "epoch": 2.6736224323443105, "grad_norm": 0.1357593685388565, "learning_rate": 0.0001, "loss": 0.1699, "step": 1640 }, { "epoch": 2.675252689925008, "grad_norm": 0.13232427835464478, "learning_rate": 0.0001, "loss": 0.182, "step": 1641 }, { "epoch": 2.676882947505706, "grad_norm": 0.12210696190595627, "learning_rate": 0.0001, "loss": 0.1701, "step": 1642 }, { "epoch": 2.6785132050864036, "grad_norm": 0.12548168003559113, "learning_rate": 0.0001, "loss": 0.1624, "step": 1643 }, { "epoch": 2.6801434626671012, "grad_norm": 0.13502956926822662, "learning_rate": 0.0001, "loss": 0.1718, "step": 1644 }, { "epoch": 2.681773720247799, "grad_norm": 0.12499292939901352, "learning_rate": 0.0001, "loss": 0.1717, "step": 1645 }, { "epoch": 2.683403977828497, "grad_norm": 0.12501081824302673, "learning_rate": 0.0001, "loss": 0.1741, "step": 1646 }, { "epoch": 2.6850342354091947, "grad_norm": 0.13593406975269318, "learning_rate": 0.0001, "loss": 0.1932, "step": 1647 }, { "epoch": 2.6866644929898924, "grad_norm": 0.17511261999607086, "learning_rate": 0.0001, "loss": 0.1932, "step": 1648 }, { "epoch": 2.68829475057059, "grad_norm": 0.14490318298339844, "learning_rate": 0.0001, "loss": 0.1684, "step": 1649 }, { "epoch": 2.6899250081512878, "grad_norm": 0.22610324621200562, "learning_rate": 0.0001, "loss": 0.1852, "step": 1650 }, { "epoch": 2.691555265731986, "grad_norm": 0.15227797627449036, "learning_rate": 0.0001, "loss": 0.1655, "step": 1651 }, { "epoch": 2.6931855233126836, "grad_norm": 0.13343138992786407, "learning_rate": 0.0001, "loss": 0.1783, "step": 1652 }, { "epoch": 2.6948157808933813, "grad_norm": 0.1372375637292862, "learning_rate": 0.0001, "loss": 0.1824, "step": 1653 }, { "epoch": 2.696446038474079, "grad_norm": 0.10308275371789932, "learning_rate": 0.0001, "loss": 0.1531, "step": 1654 }, { "epoch": 2.6980762960547766, "grad_norm": 0.11079935729503632, "learning_rate": 0.0001, "loss": 0.165, "step": 1655 }, { "epoch": 2.6997065536354743, "grad_norm": 0.12217281013727188, "learning_rate": 0.0001, "loss": 0.1782, "step": 1656 }, { "epoch": 2.701336811216172, "grad_norm": 0.10372958332300186, "learning_rate": 0.0001, "loss": 0.1587, "step": 1657 }, { "epoch": 2.7029670687968697, "grad_norm": 0.10747478157281876, "learning_rate": 0.0001, "loss": 0.1698, "step": 1658 }, { "epoch": 2.704597326377568, "grad_norm": 0.12394733726978302, "learning_rate": 0.0001, "loss": 0.165, "step": 1659 }, { "epoch": 2.7062275839582655, "grad_norm": 0.11882739514112473, "learning_rate": 0.0001, "loss": 0.1722, "step": 1660 }, { "epoch": 2.707857841538963, "grad_norm": 0.11708527058362961, "learning_rate": 0.0001, "loss": 0.1708, "step": 1661 }, { "epoch": 2.709488099119661, "grad_norm": 0.1281583458185196, "learning_rate": 0.0001, "loss": 0.1773, "step": 1662 }, { "epoch": 2.7111183567003585, "grad_norm": 0.11309745907783508, "learning_rate": 0.0001, "loss": 0.167, "step": 1663 }, { "epoch": 2.7127486142810566, "grad_norm": 0.1279565989971161, "learning_rate": 0.0001, "loss": 0.1788, "step": 1664 }, { "epoch": 2.7143788718617543, "grad_norm": 0.11305457353591919, "learning_rate": 0.0001, "loss": 0.1625, "step": 1665 }, { "epoch": 2.716009129442452, "grad_norm": 0.18214936554431915, "learning_rate": 0.0001, "loss": 0.1532, "step": 1666 }, { "epoch": 2.7176393870231497, "grad_norm": 0.15045790374279022, "learning_rate": 0.0001, "loss": 0.1677, "step": 1667 }, { "epoch": 2.7192696446038473, "grad_norm": 0.1176130399107933, "learning_rate": 0.0001, "loss": 0.1685, "step": 1668 }, { "epoch": 2.720899902184545, "grad_norm": 0.12828783690929413, "learning_rate": 0.0001, "loss": 0.176, "step": 1669 }, { "epoch": 2.7225301597652427, "grad_norm": 0.1470414698123932, "learning_rate": 0.0001, "loss": 0.1911, "step": 1670 }, { "epoch": 2.7241604173459404, "grad_norm": 0.13573643565177917, "learning_rate": 0.0001, "loss": 0.1714, "step": 1671 }, { "epoch": 2.7257906749266385, "grad_norm": 0.13036182522773743, "learning_rate": 0.0001, "loss": 0.1868, "step": 1672 }, { "epoch": 2.727420932507336, "grad_norm": 0.13708871603012085, "learning_rate": 0.0001, "loss": 0.1632, "step": 1673 }, { "epoch": 2.729051190088034, "grad_norm": 0.1127118393778801, "learning_rate": 0.0001, "loss": 0.1653, "step": 1674 }, { "epoch": 2.7306814476687316, "grad_norm": 0.19948649406433105, "learning_rate": 0.0001, "loss": 0.1708, "step": 1675 }, { "epoch": 2.7323117052494297, "grad_norm": 0.12707367539405823, "learning_rate": 0.0001, "loss": 0.1781, "step": 1676 }, { "epoch": 2.7339419628301274, "grad_norm": 0.11897812783718109, "learning_rate": 0.0001, "loss": 0.1584, "step": 1677 }, { "epoch": 2.735572220410825, "grad_norm": 0.11845947057008743, "learning_rate": 0.0001, "loss": 0.1838, "step": 1678 }, { "epoch": 2.7372024779915227, "grad_norm": 0.11247242242097855, "learning_rate": 0.0001, "loss": 0.1795, "step": 1679 }, { "epoch": 2.7388327355722204, "grad_norm": 0.1320265382528305, "learning_rate": 0.0001, "loss": 0.1791, "step": 1680 }, { "epoch": 2.740462993152918, "grad_norm": 0.13398633897304535, "learning_rate": 0.0001, "loss": 0.1799, "step": 1681 }, { "epoch": 2.7420932507336158, "grad_norm": 0.13365308940410614, "learning_rate": 0.0001, "loss": 0.1869, "step": 1682 }, { "epoch": 2.7437235083143134, "grad_norm": 0.15530569851398468, "learning_rate": 0.0001, "loss": 0.1769, "step": 1683 }, { "epoch": 2.7453537658950116, "grad_norm": 0.16007980704307556, "learning_rate": 0.0001, "loss": 0.1847, "step": 1684 }, { "epoch": 2.7469840234757092, "grad_norm": 0.13407476246356964, "learning_rate": 0.0001, "loss": 0.1818, "step": 1685 }, { "epoch": 2.748614281056407, "grad_norm": 0.1119256541132927, "learning_rate": 0.0001, "loss": 0.1679, "step": 1686 }, { "epoch": 2.7502445386371046, "grad_norm": 0.1142873466014862, "learning_rate": 0.0001, "loss": 0.1783, "step": 1687 }, { "epoch": 2.7518747962178023, "grad_norm": 0.3455221951007843, "learning_rate": 0.0001, "loss": 0.1765, "step": 1688 }, { "epoch": 2.7535050537985004, "grad_norm": 0.11033251881599426, "learning_rate": 0.0001, "loss": 0.1571, "step": 1689 }, { "epoch": 2.755135311379198, "grad_norm": 0.10998693853616714, "learning_rate": 0.0001, "loss": 0.1795, "step": 1690 }, { "epoch": 2.7567655689598958, "grad_norm": 0.12498677521944046, "learning_rate": 0.0001, "loss": 0.1682, "step": 1691 }, { "epoch": 2.7583958265405935, "grad_norm": 0.10666051506996155, "learning_rate": 0.0001, "loss": 0.159, "step": 1692 }, { "epoch": 2.760026084121291, "grad_norm": 0.11262010782957077, "learning_rate": 0.0001, "loss": 0.1649, "step": 1693 }, { "epoch": 2.761656341701989, "grad_norm": 0.12692642211914062, "learning_rate": 0.0001, "loss": 0.1619, "step": 1694 }, { "epoch": 2.7632865992826865, "grad_norm": 0.13128408789634705, "learning_rate": 0.0001, "loss": 0.1657, "step": 1695 }, { "epoch": 2.764916856863384, "grad_norm": 0.12888456881046295, "learning_rate": 0.0001, "loss": 0.1855, "step": 1696 }, { "epoch": 2.7665471144440823, "grad_norm": 0.15095765888690948, "learning_rate": 0.0001, "loss": 0.1847, "step": 1697 }, { "epoch": 2.76817737202478, "grad_norm": 0.14810307323932648, "learning_rate": 0.0001, "loss": 0.1749, "step": 1698 }, { "epoch": 2.7698076296054777, "grad_norm": 0.1464967578649521, "learning_rate": 0.0001, "loss": 0.1713, "step": 1699 }, { "epoch": 2.7714378871861753, "grad_norm": 0.11904679238796234, "learning_rate": 0.0001, "loss": 0.1762, "step": 1700 }, { "epoch": 2.773068144766873, "grad_norm": 0.1260305941104889, "learning_rate": 0.0001, "loss": 0.1557, "step": 1701 }, { "epoch": 2.774698402347571, "grad_norm": 0.12736454606056213, "learning_rate": 0.0001, "loss": 0.1773, "step": 1702 }, { "epoch": 2.776328659928269, "grad_norm": 0.14235186576843262, "learning_rate": 0.0001, "loss": 0.1567, "step": 1703 }, { "epoch": 2.7779589175089665, "grad_norm": 0.13851489126682281, "learning_rate": 0.0001, "loss": 0.1683, "step": 1704 }, { "epoch": 2.779589175089664, "grad_norm": 0.15208859741687775, "learning_rate": 0.0001, "loss": 0.1867, "step": 1705 }, { "epoch": 2.781219432670362, "grad_norm": 0.12781186401844025, "learning_rate": 0.0001, "loss": 0.1784, "step": 1706 }, { "epoch": 2.7828496902510595, "grad_norm": 0.16855067014694214, "learning_rate": 0.0001, "loss": 0.1587, "step": 1707 }, { "epoch": 2.7844799478317572, "grad_norm": 0.10379106551408768, "learning_rate": 0.0001, "loss": 0.1569, "step": 1708 }, { "epoch": 2.786110205412455, "grad_norm": 0.13109523057937622, "learning_rate": 0.0001, "loss": 0.1858, "step": 1709 }, { "epoch": 2.787740462993153, "grad_norm": 0.11424817889928818, "learning_rate": 0.0001, "loss": 0.1849, "step": 1710 }, { "epoch": 2.7893707205738507, "grad_norm": 0.1378762423992157, "learning_rate": 0.0001, "loss": 0.1817, "step": 1711 }, { "epoch": 2.7910009781545484, "grad_norm": 0.13009703159332275, "learning_rate": 0.0001, "loss": 0.1743, "step": 1712 }, { "epoch": 2.792631235735246, "grad_norm": 0.1202034205198288, "learning_rate": 0.0001, "loss": 0.1674, "step": 1713 }, { "epoch": 2.7942614933159438, "grad_norm": 0.11197958886623383, "learning_rate": 0.0001, "loss": 0.1755, "step": 1714 }, { "epoch": 2.795891750896642, "grad_norm": 0.1207757517695427, "learning_rate": 0.0001, "loss": 0.1756, "step": 1715 }, { "epoch": 2.7975220084773396, "grad_norm": 0.13465166091918945, "learning_rate": 0.0001, "loss": 0.1793, "step": 1716 }, { "epoch": 2.7991522660580372, "grad_norm": 0.12799595296382904, "learning_rate": 0.0001, "loss": 0.164, "step": 1717 }, { "epoch": 2.800782523638735, "grad_norm": 0.10873240232467651, "learning_rate": 0.0001, "loss": 0.1586, "step": 1718 }, { "epoch": 2.8024127812194326, "grad_norm": 0.204450324177742, "learning_rate": 0.0001, "loss": 0.177, "step": 1719 }, { "epoch": 2.8040430388001303, "grad_norm": 0.12960658967494965, "learning_rate": 0.0001, "loss": 0.1698, "step": 1720 }, { "epoch": 2.805673296380828, "grad_norm": 0.14518047869205475, "learning_rate": 0.0001, "loss": 0.1732, "step": 1721 }, { "epoch": 2.8073035539615256, "grad_norm": 0.13319043815135956, "learning_rate": 0.0001, "loss": 0.178, "step": 1722 }, { "epoch": 2.8089338115422238, "grad_norm": 0.13059182465076447, "learning_rate": 0.0001, "loss": 0.182, "step": 1723 }, { "epoch": 2.8105640691229214, "grad_norm": 0.13546039164066315, "learning_rate": 0.0001, "loss": 0.1754, "step": 1724 }, { "epoch": 2.812194326703619, "grad_norm": 0.10475914925336838, "learning_rate": 0.0001, "loss": 0.1648, "step": 1725 }, { "epoch": 2.813824584284317, "grad_norm": 0.11786188930273056, "learning_rate": 0.0001, "loss": 0.1655, "step": 1726 }, { "epoch": 2.815454841865015, "grad_norm": 0.12777145206928253, "learning_rate": 0.0001, "loss": 0.1907, "step": 1727 }, { "epoch": 2.8170850994457126, "grad_norm": 0.11510585993528366, "learning_rate": 0.0001, "loss": 0.1625, "step": 1728 }, { "epoch": 2.8187153570264103, "grad_norm": 0.13350418210029602, "learning_rate": 0.0001, "loss": 0.1758, "step": 1729 }, { "epoch": 2.820345614607108, "grad_norm": 0.12786592543125153, "learning_rate": 0.0001, "loss": 0.1767, "step": 1730 }, { "epoch": 2.8219758721878057, "grad_norm": 0.1287817358970642, "learning_rate": 0.0001, "loss": 0.1718, "step": 1731 }, { "epoch": 2.8236061297685033, "grad_norm": 0.12238069623708725, "learning_rate": 0.0001, "loss": 0.1669, "step": 1732 }, { "epoch": 2.825236387349201, "grad_norm": 0.12411892414093018, "learning_rate": 0.0001, "loss": 0.1785, "step": 1733 }, { "epoch": 2.8268666449298987, "grad_norm": 0.12480080127716064, "learning_rate": 0.0001, "loss": 0.1814, "step": 1734 }, { "epoch": 2.828496902510597, "grad_norm": 0.13746987283229828, "learning_rate": 0.0001, "loss": 0.1806, "step": 1735 }, { "epoch": 2.8301271600912945, "grad_norm": 0.14211837947368622, "learning_rate": 0.0001, "loss": 0.1728, "step": 1736 }, { "epoch": 2.831757417671992, "grad_norm": 0.13357684016227722, "learning_rate": 0.0001, "loss": 0.1751, "step": 1737 }, { "epoch": 2.83338767525269, "grad_norm": 0.13745687901973724, "learning_rate": 0.0001, "loss": 0.1646, "step": 1738 }, { "epoch": 2.8350179328333875, "grad_norm": 0.1132083386182785, "learning_rate": 0.0001, "loss": 0.1583, "step": 1739 }, { "epoch": 2.8366481904140857, "grad_norm": 0.13176746666431427, "learning_rate": 0.0001, "loss": 0.1852, "step": 1740 }, { "epoch": 2.8382784479947833, "grad_norm": 0.12045666575431824, "learning_rate": 0.0001, "loss": 0.1774, "step": 1741 }, { "epoch": 2.839908705575481, "grad_norm": 0.1171451285481453, "learning_rate": 0.0001, "loss": 0.1728, "step": 1742 }, { "epoch": 2.8415389631561787, "grad_norm": 0.13992704451084137, "learning_rate": 0.0001, "loss": 0.1853, "step": 1743 }, { "epoch": 2.8431692207368764, "grad_norm": 0.1277761161327362, "learning_rate": 0.0001, "loss": 0.1714, "step": 1744 }, { "epoch": 2.844799478317574, "grad_norm": 0.14471198618412018, "learning_rate": 0.0001, "loss": 0.1753, "step": 1745 }, { "epoch": 2.8464297358982718, "grad_norm": 0.1245482787489891, "learning_rate": 0.0001, "loss": 0.1783, "step": 1746 }, { "epoch": 2.8480599934789694, "grad_norm": 0.1273997575044632, "learning_rate": 0.0001, "loss": 0.1832, "step": 1747 }, { "epoch": 2.8496902510596676, "grad_norm": 0.1431506723165512, "learning_rate": 0.0001, "loss": 0.1731, "step": 1748 }, { "epoch": 2.8513205086403652, "grad_norm": 0.12924441695213318, "learning_rate": 0.0001, "loss": 0.1686, "step": 1749 }, { "epoch": 2.852950766221063, "grad_norm": 0.12894988059997559, "learning_rate": 0.0001, "loss": 0.1753, "step": 1750 }, { "epoch": 2.8545810238017606, "grad_norm": 0.1198873519897461, "learning_rate": 0.0001, "loss": 0.1771, "step": 1751 }, { "epoch": 2.8562112813824583, "grad_norm": 0.12302883714437485, "learning_rate": 0.0001, "loss": 0.1699, "step": 1752 }, { "epoch": 2.8578415389631564, "grad_norm": 0.14283844828605652, "learning_rate": 0.0001, "loss": 0.1811, "step": 1753 }, { "epoch": 2.859471796543854, "grad_norm": 0.1237570121884346, "learning_rate": 0.0001, "loss": 0.159, "step": 1754 }, { "epoch": 2.8611020541245518, "grad_norm": 0.12284982204437256, "learning_rate": 0.0001, "loss": 0.1813, "step": 1755 }, { "epoch": 2.8627323117052494, "grad_norm": 0.16714490950107574, "learning_rate": 0.0001, "loss": 0.1661, "step": 1756 }, { "epoch": 2.864362569285947, "grad_norm": 0.12366341799497604, "learning_rate": 0.0001, "loss": 0.1715, "step": 1757 }, { "epoch": 2.865992826866645, "grad_norm": 0.13020524382591248, "learning_rate": 0.0001, "loss": 0.1703, "step": 1758 }, { "epoch": 2.8676230844473425, "grad_norm": 0.11508645117282867, "learning_rate": 0.0001, "loss": 0.1704, "step": 1759 }, { "epoch": 2.86925334202804, "grad_norm": 0.13335858285427094, "learning_rate": 0.0001, "loss": 0.168, "step": 1760 }, { "epoch": 2.8708835996087383, "grad_norm": 0.12083142250776291, "learning_rate": 0.0001, "loss": 0.1626, "step": 1761 }, { "epoch": 2.872513857189436, "grad_norm": 0.12425024807453156, "learning_rate": 0.0001, "loss": 0.1731, "step": 1762 }, { "epoch": 2.8741441147701337, "grad_norm": 0.11999020725488663, "learning_rate": 0.0001, "loss": 0.1796, "step": 1763 }, { "epoch": 2.8757743723508313, "grad_norm": 0.12801139056682587, "learning_rate": 0.0001, "loss": 0.1811, "step": 1764 }, { "epoch": 2.8774046299315295, "grad_norm": 0.12506726384162903, "learning_rate": 0.0001, "loss": 0.1699, "step": 1765 }, { "epoch": 2.879034887512227, "grad_norm": 0.12049156427383423, "learning_rate": 0.0001, "loss": 0.1794, "step": 1766 }, { "epoch": 2.880665145092925, "grad_norm": 0.11800320446491241, "learning_rate": 0.0001, "loss": 0.1764, "step": 1767 }, { "epoch": 2.8822954026736225, "grad_norm": 0.13116998970508575, "learning_rate": 0.0001, "loss": 0.1742, "step": 1768 }, { "epoch": 2.88392566025432, "grad_norm": 0.11840593814849854, "learning_rate": 0.0001, "loss": 0.1721, "step": 1769 }, { "epoch": 2.885555917835018, "grad_norm": 0.11977692693471909, "learning_rate": 0.0001, "loss": 0.1699, "step": 1770 }, { "epoch": 2.8871861754157155, "grad_norm": 0.13329051434993744, "learning_rate": 0.0001, "loss": 0.1786, "step": 1771 }, { "epoch": 2.888816432996413, "grad_norm": 0.19213466346263885, "learning_rate": 0.0001, "loss": 0.173, "step": 1772 }, { "epoch": 2.8904466905771113, "grad_norm": 0.1305350363254547, "learning_rate": 0.0001, "loss": 0.1713, "step": 1773 }, { "epoch": 2.892076948157809, "grad_norm": 0.14388902485370636, "learning_rate": 0.0001, "loss": 0.1734, "step": 1774 }, { "epoch": 2.8937072057385067, "grad_norm": 0.1325220912694931, "learning_rate": 0.0001, "loss": 0.1674, "step": 1775 }, { "epoch": 2.8953374633192044, "grad_norm": 0.14095838367938995, "learning_rate": 0.0001, "loss": 0.1733, "step": 1776 }, { "epoch": 2.896967720899902, "grad_norm": 0.12072951346635818, "learning_rate": 0.0001, "loss": 0.1789, "step": 1777 }, { "epoch": 2.8985979784806, "grad_norm": 0.15178942680358887, "learning_rate": 0.0001, "loss": 0.1891, "step": 1778 }, { "epoch": 2.900228236061298, "grad_norm": 0.19420669972896576, "learning_rate": 0.0001, "loss": 0.1833, "step": 1779 }, { "epoch": 2.9018584936419956, "grad_norm": 0.125785693526268, "learning_rate": 0.0001, "loss": 0.1873, "step": 1780 }, { "epoch": 2.9034887512226932, "grad_norm": 0.13364505767822266, "learning_rate": 0.0001, "loss": 0.1561, "step": 1781 }, { "epoch": 2.905119008803391, "grad_norm": 0.12486680597066879, "learning_rate": 0.0001, "loss": 0.1758, "step": 1782 }, { "epoch": 2.9067492663840886, "grad_norm": 0.13013507425785065, "learning_rate": 0.0001, "loss": 0.1781, "step": 1783 }, { "epoch": 2.9083795239647863, "grad_norm": 0.1089881956577301, "learning_rate": 0.0001, "loss": 0.1653, "step": 1784 }, { "epoch": 2.910009781545484, "grad_norm": 0.145898699760437, "learning_rate": 0.0001, "loss": 0.1796, "step": 1785 }, { "epoch": 2.911640039126182, "grad_norm": 0.12093661725521088, "learning_rate": 0.0001, "loss": 0.1829, "step": 1786 }, { "epoch": 2.9132702967068798, "grad_norm": 0.12347777187824249, "learning_rate": 0.0001, "loss": 0.1582, "step": 1787 }, { "epoch": 2.9149005542875774, "grad_norm": 0.10860280692577362, "learning_rate": 0.0001, "loss": 0.1674, "step": 1788 }, { "epoch": 2.916530811868275, "grad_norm": 0.12860281765460968, "learning_rate": 0.0001, "loss": 0.1748, "step": 1789 }, { "epoch": 2.918161069448973, "grad_norm": 0.12744852900505066, "learning_rate": 0.0001, "loss": 0.1667, "step": 1790 }, { "epoch": 2.919791327029671, "grad_norm": 0.13266021013259888, "learning_rate": 0.0001, "loss": 0.1804, "step": 1791 }, { "epoch": 2.9214215846103686, "grad_norm": 0.1323481798171997, "learning_rate": 0.0001, "loss": 0.1786, "step": 1792 }, { "epoch": 2.9230518421910663, "grad_norm": 0.12896117568016052, "learning_rate": 0.0001, "loss": 0.1753, "step": 1793 }, { "epoch": 2.924682099771764, "grad_norm": 0.12157560884952545, "learning_rate": 0.0001, "loss": 0.1669, "step": 1794 }, { "epoch": 2.9263123573524616, "grad_norm": 0.12362449616193771, "learning_rate": 0.0001, "loss": 0.1651, "step": 1795 }, { "epoch": 2.9279426149331593, "grad_norm": 0.1297728419303894, "learning_rate": 0.0001, "loss": 0.1897, "step": 1796 }, { "epoch": 2.929572872513857, "grad_norm": 0.10805771499872208, "learning_rate": 0.0001, "loss": 0.162, "step": 1797 }, { "epoch": 2.9312031300945547, "grad_norm": 0.1192641630768776, "learning_rate": 0.0001, "loss": 0.176, "step": 1798 }, { "epoch": 2.932833387675253, "grad_norm": 0.14953552186489105, "learning_rate": 0.0001, "loss": 0.1826, "step": 1799 }, { "epoch": 2.9344636452559505, "grad_norm": 0.12457464635372162, "learning_rate": 0.0001, "loss": 0.168, "step": 1800 }, { "epoch": 2.936093902836648, "grad_norm": 0.12195872515439987, "learning_rate": 0.0001, "loss": 0.1842, "step": 1801 }, { "epoch": 2.937724160417346, "grad_norm": 0.1217537596821785, "learning_rate": 0.0001, "loss": 0.1662, "step": 1802 }, { "epoch": 2.9393544179980435, "grad_norm": 0.10911030322313309, "learning_rate": 0.0001, "loss": 0.1635, "step": 1803 }, { "epoch": 2.9409846755787417, "grad_norm": 0.15903405845165253, "learning_rate": 0.0001, "loss": 0.1767, "step": 1804 }, { "epoch": 2.9426149331594393, "grad_norm": 0.14546822011470795, "learning_rate": 0.0001, "loss": 0.1735, "step": 1805 }, { "epoch": 2.944245190740137, "grad_norm": 0.13845416903495789, "learning_rate": 0.0001, "loss": 0.1869, "step": 1806 }, { "epoch": 2.9458754483208347, "grad_norm": 0.12575270235538483, "learning_rate": 0.0001, "loss": 0.1719, "step": 1807 }, { "epoch": 2.9475057059015324, "grad_norm": 0.11531825363636017, "learning_rate": 0.0001, "loss": 0.1677, "step": 1808 }, { "epoch": 2.94913596348223, "grad_norm": 0.12678653001785278, "learning_rate": 0.0001, "loss": 0.1782, "step": 1809 }, { "epoch": 2.9507662210629277, "grad_norm": 0.11455704271793365, "learning_rate": 0.0001, "loss": 0.1664, "step": 1810 }, { "epoch": 2.9523964786436254, "grad_norm": 0.12815020978450775, "learning_rate": 0.0001, "loss": 0.1821, "step": 1811 }, { "epoch": 2.9540267362243235, "grad_norm": 0.14364176988601685, "learning_rate": 0.0001, "loss": 0.1793, "step": 1812 }, { "epoch": 2.9556569938050212, "grad_norm": 0.1432567834854126, "learning_rate": 0.0001, "loss": 0.1642, "step": 1813 }, { "epoch": 2.957287251385719, "grad_norm": 0.1255834549665451, "learning_rate": 0.0001, "loss": 0.17, "step": 1814 }, { "epoch": 2.9589175089664166, "grad_norm": 0.13830745220184326, "learning_rate": 0.0001, "loss": 0.1481, "step": 1815 }, { "epoch": 2.9605477665471147, "grad_norm": 0.1475990116596222, "learning_rate": 0.0001, "loss": 0.1938, "step": 1816 }, { "epoch": 2.9621780241278124, "grad_norm": 0.13100960850715637, "learning_rate": 0.0001, "loss": 0.1877, "step": 1817 }, { "epoch": 2.96380828170851, "grad_norm": 0.11911682784557343, "learning_rate": 0.0001, "loss": 0.1668, "step": 1818 }, { "epoch": 2.9654385392892078, "grad_norm": 0.13728202879428864, "learning_rate": 0.0001, "loss": 0.181, "step": 1819 }, { "epoch": 2.9670687968699054, "grad_norm": 0.1287861168384552, "learning_rate": 0.0001, "loss": 0.1685, "step": 1820 }, { "epoch": 2.968699054450603, "grad_norm": 0.1377815157175064, "learning_rate": 0.0001, "loss": 0.1708, "step": 1821 }, { "epoch": 2.970329312031301, "grad_norm": 0.13828015327453613, "learning_rate": 0.0001, "loss": 0.1751, "step": 1822 }, { "epoch": 2.9719595696119985, "grad_norm": 0.13699176907539368, "learning_rate": 0.0001, "loss": 0.1707, "step": 1823 }, { "epoch": 2.9735898271926966, "grad_norm": 0.13141410052776337, "learning_rate": 0.0001, "loss": 0.1686, "step": 1824 }, { "epoch": 2.9752200847733943, "grad_norm": 0.11644890159368515, "learning_rate": 0.0001, "loss": 0.1707, "step": 1825 }, { "epoch": 2.976850342354092, "grad_norm": 0.12482846528291702, "learning_rate": 0.0001, "loss": 0.1762, "step": 1826 }, { "epoch": 2.9784805999347896, "grad_norm": 0.1434062272310257, "learning_rate": 0.0001, "loss": 0.1727, "step": 1827 }, { "epoch": 2.9801108575154873, "grad_norm": 0.1083054170012474, "learning_rate": 0.0001, "loss": 0.1793, "step": 1828 }, { "epoch": 2.9817411150961854, "grad_norm": 0.10897631198167801, "learning_rate": 0.0001, "loss": 0.1676, "step": 1829 }, { "epoch": 2.983371372676883, "grad_norm": 0.12798282504081726, "learning_rate": 0.0001, "loss": 0.1733, "step": 1830 }, { "epoch": 2.985001630257581, "grad_norm": 0.13066311180591583, "learning_rate": 0.0001, "loss": 0.1663, "step": 1831 }, { "epoch": 2.9866318878382785, "grad_norm": 0.12644776701927185, "learning_rate": 0.0001, "loss": 0.1772, "step": 1832 }, { "epoch": 2.988262145418976, "grad_norm": 0.14311029016971588, "learning_rate": 0.0001, "loss": 0.1932, "step": 1833 }, { "epoch": 2.989892402999674, "grad_norm": 0.12313991785049438, "learning_rate": 0.0001, "loss": 0.1771, "step": 1834 }, { "epoch": 2.9915226605803715, "grad_norm": 0.13823242485523224, "learning_rate": 0.0001, "loss": 0.1849, "step": 1835 }, { "epoch": 2.993152918161069, "grad_norm": 0.12686224281787872, "learning_rate": 0.0001, "loss": 0.1841, "step": 1836 }, { "epoch": 2.9947831757417673, "grad_norm": 0.17587055265903473, "learning_rate": 0.0001, "loss": 0.1826, "step": 1837 }, { "epoch": 2.996413433322465, "grad_norm": 0.11246396601200104, "learning_rate": 0.0001, "loss": 0.1594, "step": 1838 }, { "epoch": 2.9980436909031627, "grad_norm": 0.13493315875530243, "learning_rate": 0.0001, "loss": 0.184, "step": 1839 }, { "epoch": 2.9996739484838604, "grad_norm": 0.12286293506622314, "learning_rate": 0.0001, "loss": 0.175, "step": 1840 }, { "epoch": 3.001304206064558, "grad_norm": 0.1269206702709198, "learning_rate": 0.0001, "loss": 0.1589, "step": 1841 }, { "epoch": 3.0029344636452557, "grad_norm": 0.1355823576450348, "learning_rate": 0.0001, "loss": 0.1583, "step": 1842 }, { "epoch": 3.004564721225954, "grad_norm": 0.11628458648920059, "learning_rate": 0.0001, "loss": 0.1678, "step": 1843 }, { "epoch": 3.0061949788066515, "grad_norm": 0.11154302209615707, "learning_rate": 0.0001, "loss": 0.1444, "step": 1844 }, { "epoch": 3.007825236387349, "grad_norm": 0.14044183492660522, "learning_rate": 0.0001, "loss": 0.1484, "step": 1845 }, { "epoch": 3.009455493968047, "grad_norm": 0.11825685203075409, "learning_rate": 0.0001, "loss": 0.1495, "step": 1846 }, { "epoch": 3.0110857515487446, "grad_norm": 0.17750664055347443, "learning_rate": 0.0001, "loss": 0.1606, "step": 1847 }, { "epoch": 3.0127160091294423, "grad_norm": 0.16660550236701965, "learning_rate": 0.0001, "loss": 0.1604, "step": 1848 }, { "epoch": 3.0143462667101404, "grad_norm": 0.16394905745983124, "learning_rate": 0.0001, "loss": 0.1738, "step": 1849 }, { "epoch": 3.015976524290838, "grad_norm": 0.1530742645263672, "learning_rate": 0.0001, "loss": 0.1594, "step": 1850 }, { "epoch": 3.0176067818715357, "grad_norm": 0.13188475370407104, "learning_rate": 0.0001, "loss": 0.1564, "step": 1851 }, { "epoch": 3.0192370394522334, "grad_norm": 0.13797006011009216, "learning_rate": 0.0001, "loss": 0.1565, "step": 1852 }, { "epoch": 3.020867297032931, "grad_norm": 0.16686192154884338, "learning_rate": 0.0001, "loss": 0.1639, "step": 1853 }, { "epoch": 3.022497554613629, "grad_norm": 0.13372887670993805, "learning_rate": 0.0001, "loss": 0.1522, "step": 1854 }, { "epoch": 3.024127812194327, "grad_norm": 0.1713315099477768, "learning_rate": 0.0001, "loss": 0.1647, "step": 1855 }, { "epoch": 3.0257580697750246, "grad_norm": 0.17664030194282532, "learning_rate": 0.0001, "loss": 0.1706, "step": 1856 }, { "epoch": 3.0273883273557223, "grad_norm": 0.15478695929050446, "learning_rate": 0.0001, "loss": 0.1584, "step": 1857 }, { "epoch": 3.02901858493642, "grad_norm": 0.12312593311071396, "learning_rate": 0.0001, "loss": 0.1444, "step": 1858 }, { "epoch": 3.0306488425171176, "grad_norm": 0.13076375424861908, "learning_rate": 0.0001, "loss": 0.1644, "step": 1859 }, { "epoch": 3.0322791000978153, "grad_norm": 0.17195327579975128, "learning_rate": 0.0001, "loss": 0.1568, "step": 1860 }, { "epoch": 3.033909357678513, "grad_norm": 0.15723173320293427, "learning_rate": 0.0001, "loss": 0.1581, "step": 1861 }, { "epoch": 3.035539615259211, "grad_norm": 0.13662908971309662, "learning_rate": 0.0001, "loss": 0.1603, "step": 1862 }, { "epoch": 3.037169872839909, "grad_norm": 0.14388419687747955, "learning_rate": 0.0001, "loss": 0.1547, "step": 1863 }, { "epoch": 3.0388001304206065, "grad_norm": 0.1430109143257141, "learning_rate": 0.0001, "loss": 0.1448, "step": 1864 }, { "epoch": 3.040430388001304, "grad_norm": 0.18873119354248047, "learning_rate": 0.0001, "loss": 0.1737, "step": 1865 }, { "epoch": 3.042060645582002, "grad_norm": 0.13666655123233795, "learning_rate": 0.0001, "loss": 0.1489, "step": 1866 }, { "epoch": 3.0436909031626995, "grad_norm": 0.12187852710485458, "learning_rate": 0.0001, "loss": 0.1486, "step": 1867 }, { "epoch": 3.0453211607433976, "grad_norm": 0.14884653687477112, "learning_rate": 0.0001, "loss": 0.1532, "step": 1868 }, { "epoch": 3.0469514183240953, "grad_norm": 0.1332685947418213, "learning_rate": 0.0001, "loss": 0.1504, "step": 1869 }, { "epoch": 3.048581675904793, "grad_norm": 0.15968628227710724, "learning_rate": 0.0001, "loss": 0.1626, "step": 1870 }, { "epoch": 3.0502119334854907, "grad_norm": 0.13830626010894775, "learning_rate": 0.0001, "loss": 0.1556, "step": 1871 }, { "epoch": 3.0518421910661884, "grad_norm": 0.14742417633533478, "learning_rate": 0.0001, "loss": 0.1661, "step": 1872 }, { "epoch": 3.053472448646886, "grad_norm": 0.1588188111782074, "learning_rate": 0.0001, "loss": 0.1626, "step": 1873 }, { "epoch": 3.055102706227584, "grad_norm": 0.14762498438358307, "learning_rate": 0.0001, "loss": 0.1531, "step": 1874 }, { "epoch": 3.056732963808282, "grad_norm": 0.13328132033348083, "learning_rate": 0.0001, "loss": 0.1606, "step": 1875 }, { "epoch": 3.0583632213889795, "grad_norm": 0.12717348337173462, "learning_rate": 0.0001, "loss": 0.1661, "step": 1876 }, { "epoch": 3.059993478969677, "grad_norm": 0.15513722598552704, "learning_rate": 0.0001, "loss": 0.1734, "step": 1877 }, { "epoch": 3.061623736550375, "grad_norm": 0.12475986033678055, "learning_rate": 0.0001, "loss": 0.1587, "step": 1878 }, { "epoch": 3.0632539941310726, "grad_norm": 0.1064193844795227, "learning_rate": 0.0001, "loss": 0.1357, "step": 1879 }, { "epoch": 3.0648842517117703, "grad_norm": 0.14102140069007874, "learning_rate": 0.0001, "loss": 0.1474, "step": 1880 }, { "epoch": 3.0665145092924684, "grad_norm": 0.1496179848909378, "learning_rate": 0.0001, "loss": 0.157, "step": 1881 }, { "epoch": 3.068144766873166, "grad_norm": 0.13995377719402313, "learning_rate": 0.0001, "loss": 0.1513, "step": 1882 }, { "epoch": 3.0697750244538637, "grad_norm": 0.1414022445678711, "learning_rate": 0.0001, "loss": 0.1515, "step": 1883 }, { "epoch": 3.0714052820345614, "grad_norm": 0.15052585303783417, "learning_rate": 0.0001, "loss": 0.1732, "step": 1884 }, { "epoch": 3.073035539615259, "grad_norm": 0.1524229496717453, "learning_rate": 0.0001, "loss": 0.1721, "step": 1885 }, { "epoch": 3.074665797195957, "grad_norm": 0.13956795632839203, "learning_rate": 0.0001, "loss": 0.1634, "step": 1886 }, { "epoch": 3.076296054776655, "grad_norm": 0.13106916844844818, "learning_rate": 0.0001, "loss": 0.1541, "step": 1887 }, { "epoch": 3.0779263123573526, "grad_norm": 0.1438571810722351, "learning_rate": 0.0001, "loss": 0.1634, "step": 1888 }, { "epoch": 3.0795565699380503, "grad_norm": 0.12486065924167633, "learning_rate": 0.0001, "loss": 0.1558, "step": 1889 }, { "epoch": 3.081186827518748, "grad_norm": 0.15477533638477325, "learning_rate": 0.0001, "loss": 0.1634, "step": 1890 }, { "epoch": 3.0828170850994456, "grad_norm": 0.14669184386730194, "learning_rate": 0.0001, "loss": 0.1688, "step": 1891 }, { "epoch": 3.0844473426801433, "grad_norm": 0.12497131526470184, "learning_rate": 0.0001, "loss": 0.1452, "step": 1892 }, { "epoch": 3.0860776002608414, "grad_norm": 0.1318637579679489, "learning_rate": 0.0001, "loss": 0.1501, "step": 1893 }, { "epoch": 3.087707857841539, "grad_norm": 0.1398518979549408, "learning_rate": 0.0001, "loss": 0.1518, "step": 1894 }, { "epoch": 3.089338115422237, "grad_norm": 0.13344673812389374, "learning_rate": 0.0001, "loss": 0.1516, "step": 1895 }, { "epoch": 3.0909683730029345, "grad_norm": 0.13853240013122559, "learning_rate": 0.0001, "loss": 0.1438, "step": 1896 }, { "epoch": 3.092598630583632, "grad_norm": 0.15498271584510803, "learning_rate": 0.0001, "loss": 0.1536, "step": 1897 }, { "epoch": 3.09422888816433, "grad_norm": 0.1456194967031479, "learning_rate": 0.0001, "loss": 0.1535, "step": 1898 }, { "epoch": 3.0958591457450275, "grad_norm": 0.15903611481189728, "learning_rate": 0.0001, "loss": 0.169, "step": 1899 }, { "epoch": 3.0974894033257256, "grad_norm": 0.1665191799402237, "learning_rate": 0.0001, "loss": 0.1694, "step": 1900 }, { "epoch": 3.0991196609064233, "grad_norm": 0.1596616506576538, "learning_rate": 0.0001, "loss": 0.1645, "step": 1901 }, { "epoch": 3.100749918487121, "grad_norm": 0.14981094002723694, "learning_rate": 0.0001, "loss": 0.1555, "step": 1902 }, { "epoch": 3.1023801760678187, "grad_norm": 0.1317712813615799, "learning_rate": 0.0001, "loss": 0.1527, "step": 1903 }, { "epoch": 3.1040104336485164, "grad_norm": 0.12632454931735992, "learning_rate": 0.0001, "loss": 0.1364, "step": 1904 }, { "epoch": 3.105640691229214, "grad_norm": 0.16619497537612915, "learning_rate": 0.0001, "loss": 0.1648, "step": 1905 }, { "epoch": 3.107270948809912, "grad_norm": 0.12969553470611572, "learning_rate": 0.0001, "loss": 0.1545, "step": 1906 }, { "epoch": 3.10890120639061, "grad_norm": 0.1580219268798828, "learning_rate": 0.0001, "loss": 0.1613, "step": 1907 }, { "epoch": 3.1105314639713075, "grad_norm": 0.14800585806369781, "learning_rate": 0.0001, "loss": 0.1775, "step": 1908 }, { "epoch": 3.112161721552005, "grad_norm": 0.13492430746555328, "learning_rate": 0.0001, "loss": 0.1354, "step": 1909 }, { "epoch": 3.113791979132703, "grad_norm": 0.13859906792640686, "learning_rate": 0.0001, "loss": 0.1737, "step": 1910 }, { "epoch": 3.1154222367134006, "grad_norm": 0.1361161321401596, "learning_rate": 0.0001, "loss": 0.1537, "step": 1911 }, { "epoch": 3.1170524942940983, "grad_norm": 0.17924432456493378, "learning_rate": 0.0001, "loss": 0.1744, "step": 1912 }, { "epoch": 3.1186827518747964, "grad_norm": 0.14079366624355316, "learning_rate": 0.0001, "loss": 0.1501, "step": 1913 }, { "epoch": 3.120313009455494, "grad_norm": 0.13240069150924683, "learning_rate": 0.0001, "loss": 0.1614, "step": 1914 }, { "epoch": 3.1219432670361917, "grad_norm": 0.1292378306388855, "learning_rate": 0.0001, "loss": 0.1533, "step": 1915 }, { "epoch": 3.1235735246168894, "grad_norm": 0.1428757756948471, "learning_rate": 0.0001, "loss": 0.1518, "step": 1916 }, { "epoch": 3.125203782197587, "grad_norm": 0.1274784803390503, "learning_rate": 0.0001, "loss": 0.1676, "step": 1917 }, { "epoch": 3.126834039778285, "grad_norm": 0.13806386291980743, "learning_rate": 0.0001, "loss": 0.1426, "step": 1918 }, { "epoch": 3.128464297358983, "grad_norm": 0.12677744030952454, "learning_rate": 0.0001, "loss": 0.1546, "step": 1919 }, { "epoch": 3.1300945549396806, "grad_norm": 0.13189762830734253, "learning_rate": 0.0001, "loss": 0.1519, "step": 1920 }, { "epoch": 3.1317248125203783, "grad_norm": 0.18951299786567688, "learning_rate": 0.0001, "loss": 0.1674, "step": 1921 }, { "epoch": 3.133355070101076, "grad_norm": 0.1415640413761139, "learning_rate": 0.0001, "loss": 0.1534, "step": 1922 }, { "epoch": 3.1349853276817736, "grad_norm": 0.1354334056377411, "learning_rate": 0.0001, "loss": 0.1584, "step": 1923 }, { "epoch": 3.1366155852624713, "grad_norm": 0.15447008609771729, "learning_rate": 0.0001, "loss": 0.1632, "step": 1924 }, { "epoch": 3.138245842843169, "grad_norm": 0.14421270787715912, "learning_rate": 0.0001, "loss": 0.1648, "step": 1925 }, { "epoch": 3.139876100423867, "grad_norm": 0.13568207621574402, "learning_rate": 0.0001, "loss": 0.1692, "step": 1926 }, { "epoch": 3.141506358004565, "grad_norm": 0.140906423330307, "learning_rate": 0.0001, "loss": 0.1622, "step": 1927 }, { "epoch": 3.1431366155852625, "grad_norm": 0.11631222814321518, "learning_rate": 0.0001, "loss": 0.1616, "step": 1928 }, { "epoch": 3.14476687316596, "grad_norm": 0.15498870611190796, "learning_rate": 0.0001, "loss": 0.1718, "step": 1929 }, { "epoch": 3.146397130746658, "grad_norm": 0.16634507477283478, "learning_rate": 0.0001, "loss": 0.1642, "step": 1930 }, { "epoch": 3.148027388327356, "grad_norm": 0.1286221295595169, "learning_rate": 0.0001, "loss": 0.1515, "step": 1931 }, { "epoch": 3.1496576459080536, "grad_norm": 0.14258523285388947, "learning_rate": 0.0001, "loss": 0.1556, "step": 1932 }, { "epoch": 3.1512879034887513, "grad_norm": 0.1418234407901764, "learning_rate": 0.0001, "loss": 0.1585, "step": 1933 }, { "epoch": 3.152918161069449, "grad_norm": 0.13352856040000916, "learning_rate": 0.0001, "loss": 0.1534, "step": 1934 }, { "epoch": 3.1545484186501467, "grad_norm": 0.12372912466526031, "learning_rate": 0.0001, "loss": 0.1515, "step": 1935 }, { "epoch": 3.1561786762308444, "grad_norm": 0.15148986876010895, "learning_rate": 0.0001, "loss": 0.1733, "step": 1936 }, { "epoch": 3.157808933811542, "grad_norm": 0.13402307033538818, "learning_rate": 0.0001, "loss": 0.1523, "step": 1937 }, { "epoch": 3.15943919139224, "grad_norm": 0.15002478659152985, "learning_rate": 0.0001, "loss": 0.1552, "step": 1938 }, { "epoch": 3.161069448972938, "grad_norm": 0.14147819578647614, "learning_rate": 0.0001, "loss": 0.1654, "step": 1939 }, { "epoch": 3.1626997065536355, "grad_norm": 0.16846194863319397, "learning_rate": 0.0001, "loss": 0.1745, "step": 1940 }, { "epoch": 3.164329964134333, "grad_norm": 0.17385046184062958, "learning_rate": 0.0001, "loss": 0.1618, "step": 1941 }, { "epoch": 3.165960221715031, "grad_norm": 0.14873671531677246, "learning_rate": 0.0001, "loss": 0.1617, "step": 1942 }, { "epoch": 3.1675904792957286, "grad_norm": 0.13968585431575775, "learning_rate": 0.0001, "loss": 0.155, "step": 1943 }, { "epoch": 3.1692207368764267, "grad_norm": 0.15636034309864044, "learning_rate": 0.0001, "loss": 0.1612, "step": 1944 }, { "epoch": 3.1708509944571244, "grad_norm": 0.17083799839019775, "learning_rate": 0.0001, "loss": 0.1654, "step": 1945 }, { "epoch": 3.172481252037822, "grad_norm": 0.13680125772953033, "learning_rate": 0.0001, "loss": 0.1647, "step": 1946 }, { "epoch": 3.1741115096185197, "grad_norm": 0.14011834561824799, "learning_rate": 0.0001, "loss": 0.1407, "step": 1947 }, { "epoch": 3.1757417671992174, "grad_norm": 0.14125655591487885, "learning_rate": 0.0001, "loss": 0.1547, "step": 1948 }, { "epoch": 3.177372024779915, "grad_norm": 0.13686692714691162, "learning_rate": 0.0001, "loss": 0.1638, "step": 1949 }, { "epoch": 3.1790022823606128, "grad_norm": 0.14849920570850372, "learning_rate": 0.0001, "loss": 0.159, "step": 1950 }, { "epoch": 3.180632539941311, "grad_norm": 0.2647811770439148, "learning_rate": 0.0001, "loss": 0.1541, "step": 1951 }, { "epoch": 3.1822627975220086, "grad_norm": 0.1454608142375946, "learning_rate": 0.0001, "loss": 0.1531, "step": 1952 }, { "epoch": 3.1838930551027063, "grad_norm": 0.15348565578460693, "learning_rate": 0.0001, "loss": 0.1642, "step": 1953 }, { "epoch": 3.185523312683404, "grad_norm": 0.14148078858852386, "learning_rate": 0.0001, "loss": 0.1719, "step": 1954 }, { "epoch": 3.1871535702641016, "grad_norm": 0.16094723343849182, "learning_rate": 0.0001, "loss": 0.1669, "step": 1955 }, { "epoch": 3.1887838278447993, "grad_norm": 0.6947292685508728, "learning_rate": 0.0001, "loss": 0.1537, "step": 1956 }, { "epoch": 3.1904140854254974, "grad_norm": 0.138764888048172, "learning_rate": 0.0001, "loss": 0.1532, "step": 1957 }, { "epoch": 3.192044343006195, "grad_norm": 0.19771906733512878, "learning_rate": 0.0001, "loss": 0.1529, "step": 1958 }, { "epoch": 3.193674600586893, "grad_norm": 0.15238133072853088, "learning_rate": 0.0001, "loss": 0.1603, "step": 1959 }, { "epoch": 3.1953048581675905, "grad_norm": 0.14184920489788055, "learning_rate": 0.0001, "loss": 0.1532, "step": 1960 }, { "epoch": 3.196935115748288, "grad_norm": 0.13385626673698425, "learning_rate": 0.0001, "loss": 0.1545, "step": 1961 }, { "epoch": 3.198565373328986, "grad_norm": 0.1413835883140564, "learning_rate": 0.0001, "loss": 0.1588, "step": 1962 }, { "epoch": 3.2001956309096835, "grad_norm": 0.14667753875255585, "learning_rate": 0.0001, "loss": 0.1573, "step": 1963 }, { "epoch": 3.2018258884903816, "grad_norm": 0.16244405508041382, "learning_rate": 0.0001, "loss": 0.1658, "step": 1964 }, { "epoch": 3.2034561460710793, "grad_norm": 0.157258540391922, "learning_rate": 0.0001, "loss": 0.1567, "step": 1965 }, { "epoch": 3.205086403651777, "grad_norm": 0.16143973171710968, "learning_rate": 0.0001, "loss": 0.1584, "step": 1966 }, { "epoch": 3.2067166612324747, "grad_norm": 0.15137411653995514, "learning_rate": 0.0001, "loss": 0.1636, "step": 1967 }, { "epoch": 3.2083469188131724, "grad_norm": 0.13595400750637054, "learning_rate": 0.0001, "loss": 0.1566, "step": 1968 }, { "epoch": 3.20997717639387, "grad_norm": 0.1366918534040451, "learning_rate": 0.0001, "loss": 0.1479, "step": 1969 }, { "epoch": 3.211607433974568, "grad_norm": 0.12027698010206223, "learning_rate": 0.0001, "loss": 0.1475, "step": 1970 }, { "epoch": 3.213237691555266, "grad_norm": 0.11735312640666962, "learning_rate": 0.0001, "loss": 0.1436, "step": 1971 }, { "epoch": 3.2148679491359635, "grad_norm": 0.13770481944084167, "learning_rate": 0.0001, "loss": 0.1463, "step": 1972 }, { "epoch": 3.216498206716661, "grad_norm": 0.14083652198314667, "learning_rate": 0.0001, "loss": 0.1596, "step": 1973 }, { "epoch": 3.218128464297359, "grad_norm": 0.13524475693702698, "learning_rate": 0.0001, "loss": 0.1626, "step": 1974 }, { "epoch": 3.2197587218780566, "grad_norm": 0.13120479881763458, "learning_rate": 0.0001, "loss": 0.1596, "step": 1975 }, { "epoch": 3.2213889794587547, "grad_norm": 0.17046691477298737, "learning_rate": 0.0001, "loss": 0.1579, "step": 1976 }, { "epoch": 3.2230192370394524, "grad_norm": 0.13418637216091156, "learning_rate": 0.0001, "loss": 0.1628, "step": 1977 }, { "epoch": 3.22464949462015, "grad_norm": 0.11807394027709961, "learning_rate": 0.0001, "loss": 0.1495, "step": 1978 }, { "epoch": 3.2262797522008477, "grad_norm": 0.15155768394470215, "learning_rate": 0.0001, "loss": 0.1644, "step": 1979 }, { "epoch": 3.2279100097815454, "grad_norm": 0.1263338178396225, "learning_rate": 0.0001, "loss": 0.1558, "step": 1980 }, { "epoch": 3.229540267362243, "grad_norm": 0.1663949191570282, "learning_rate": 0.0001, "loss": 0.159, "step": 1981 }, { "epoch": 3.231170524942941, "grad_norm": 0.1378049999475479, "learning_rate": 0.0001, "loss": 0.1514, "step": 1982 }, { "epoch": 3.232800782523639, "grad_norm": 0.1389368176460266, "learning_rate": 0.0001, "loss": 0.1487, "step": 1983 }, { "epoch": 3.2344310401043366, "grad_norm": 0.15180104970932007, "learning_rate": 0.0001, "loss": 0.1643, "step": 1984 }, { "epoch": 3.2360612976850343, "grad_norm": 0.14475220441818237, "learning_rate": 0.0001, "loss": 0.156, "step": 1985 }, { "epoch": 3.237691555265732, "grad_norm": 0.14919638633728027, "learning_rate": 0.0001, "loss": 0.1723, "step": 1986 }, { "epoch": 3.2393218128464296, "grad_norm": 0.14045490324497223, "learning_rate": 0.0001, "loss": 0.156, "step": 1987 }, { "epoch": 3.2409520704271273, "grad_norm": 0.1450393944978714, "learning_rate": 0.0001, "loss": 0.1495, "step": 1988 }, { "epoch": 3.2425823280078254, "grad_norm": 0.12991821765899658, "learning_rate": 0.0001, "loss": 0.1417, "step": 1989 }, { "epoch": 3.244212585588523, "grad_norm": 0.1459466964006424, "learning_rate": 0.0001, "loss": 0.167, "step": 1990 }, { "epoch": 3.245842843169221, "grad_norm": 0.1518615484237671, "learning_rate": 0.0001, "loss": 0.1639, "step": 1991 }, { "epoch": 3.2474731007499185, "grad_norm": 0.14151298999786377, "learning_rate": 0.0001, "loss": 0.1683, "step": 1992 }, { "epoch": 3.249103358330616, "grad_norm": 0.13873840868473053, "learning_rate": 0.0001, "loss": 0.1434, "step": 1993 }, { "epoch": 3.250733615911314, "grad_norm": 0.15724745392799377, "learning_rate": 0.0001, "loss": 0.1719, "step": 1994 }, { "epoch": 3.252363873492012, "grad_norm": 0.12847794592380524, "learning_rate": 0.0001, "loss": 0.152, "step": 1995 }, { "epoch": 3.2539941310727096, "grad_norm": 0.1553923785686493, "learning_rate": 0.0001, "loss": 0.1673, "step": 1996 }, { "epoch": 3.2556243886534073, "grad_norm": 0.17050054669380188, "learning_rate": 0.0001, "loss": 0.18, "step": 1997 }, { "epoch": 3.257254646234105, "grad_norm": 0.13154077529907227, "learning_rate": 0.0001, "loss": 0.1625, "step": 1998 }, { "epoch": 3.2588849038148027, "grad_norm": 0.1389462798833847, "learning_rate": 0.0001, "loss": 0.1585, "step": 1999 }, { "epoch": 3.2605151613955003, "grad_norm": 0.12226948142051697, "learning_rate": 0.0001, "loss": 0.1641, "step": 2000 }, { "epoch": 3.2605151613955003, "eval_loss": 0.2004072368144989, "eval_runtime": 463.7076, "eval_samples_per_second": 4.069, "eval_steps_per_second": 1.018, "step": 2000 }, { "epoch": 3.262145418976198, "grad_norm": 0.13234886527061462, "learning_rate": 0.0001, "loss": 0.1664, "step": 2001 }, { "epoch": 3.263775676556896, "grad_norm": 0.12151322513818741, "learning_rate": 0.0001, "loss": 0.1433, "step": 2002 }, { "epoch": 3.265405934137594, "grad_norm": 0.16095128655433655, "learning_rate": 0.0001, "loss": 0.1662, "step": 2003 }, { "epoch": 3.2670361917182915, "grad_norm": 0.1361403614282608, "learning_rate": 0.0001, "loss": 0.16, "step": 2004 }, { "epoch": 3.268666449298989, "grad_norm": 0.13729725778102875, "learning_rate": 0.0001, "loss": 0.1626, "step": 2005 }, { "epoch": 3.270296706879687, "grad_norm": 0.16737613081932068, "learning_rate": 0.0001, "loss": 0.1532, "step": 2006 }, { "epoch": 3.2719269644603846, "grad_norm": 0.13619449734687805, "learning_rate": 0.0001, "loss": 0.1667, "step": 2007 }, { "epoch": 3.2735572220410827, "grad_norm": 0.1604672521352768, "learning_rate": 0.0001, "loss": 0.1576, "step": 2008 }, { "epoch": 3.2751874796217804, "grad_norm": 0.14666776359081268, "learning_rate": 0.0001, "loss": 0.1723, "step": 2009 }, { "epoch": 3.276817737202478, "grad_norm": 0.1353665441274643, "learning_rate": 0.0001, "loss": 0.1579, "step": 2010 }, { "epoch": 3.2784479947831757, "grad_norm": 0.1277618557214737, "learning_rate": 0.0001, "loss": 0.1518, "step": 2011 }, { "epoch": 3.2800782523638734, "grad_norm": 0.16270460188388824, "learning_rate": 0.0001, "loss": 0.1645, "step": 2012 }, { "epoch": 3.281708509944571, "grad_norm": 0.15774473547935486, "learning_rate": 0.0001, "loss": 0.1678, "step": 2013 }, { "epoch": 3.2833387675252688, "grad_norm": 0.13945020735263824, "learning_rate": 0.0001, "loss": 0.1484, "step": 2014 }, { "epoch": 3.284969025105967, "grad_norm": 0.14141727983951569, "learning_rate": 0.0001, "loss": 0.164, "step": 2015 }, { "epoch": 3.2865992826866646, "grad_norm": 0.12424743920564651, "learning_rate": 0.0001, "loss": 0.161, "step": 2016 }, { "epoch": 3.2882295402673622, "grad_norm": 0.12121134996414185, "learning_rate": 0.0001, "loss": 0.1513, "step": 2017 }, { "epoch": 3.28985979784806, "grad_norm": 0.1380610466003418, "learning_rate": 0.0001, "loss": 0.1615, "step": 2018 }, { "epoch": 3.2914900554287576, "grad_norm": 0.10836822539567947, "learning_rate": 0.0001, "loss": 0.1328, "step": 2019 }, { "epoch": 3.2931203130094557, "grad_norm": 0.12385184317827225, "learning_rate": 0.0001, "loss": 0.1446, "step": 2020 }, { "epoch": 3.2947505705901534, "grad_norm": 0.13645856082439423, "learning_rate": 0.0001, "loss": 0.1594, "step": 2021 }, { "epoch": 3.296380828170851, "grad_norm": 0.14024437963962555, "learning_rate": 0.0001, "loss": 0.1728, "step": 2022 }, { "epoch": 3.2980110857515488, "grad_norm": 0.13787047564983368, "learning_rate": 0.0001, "loss": 0.1601, "step": 2023 }, { "epoch": 3.2996413433322465, "grad_norm": 0.14804314076900482, "learning_rate": 0.0001, "loss": 0.1548, "step": 2024 }, { "epoch": 3.301271600912944, "grad_norm": 0.1506766825914383, "learning_rate": 0.0001, "loss": 0.1609, "step": 2025 }, { "epoch": 3.302901858493642, "grad_norm": 0.1486261785030365, "learning_rate": 0.0001, "loss": 0.165, "step": 2026 }, { "epoch": 3.3045321160743395, "grad_norm": 0.16172239184379578, "learning_rate": 0.0001, "loss": 0.1694, "step": 2027 }, { "epoch": 3.3061623736550376, "grad_norm": 0.1388084590435028, "learning_rate": 0.0001, "loss": 0.1529, "step": 2028 }, { "epoch": 3.3077926312357353, "grad_norm": 0.15074113011360168, "learning_rate": 0.0001, "loss": 0.1638, "step": 2029 }, { "epoch": 3.309422888816433, "grad_norm": 0.14456892013549805, "learning_rate": 0.0001, "loss": 0.1683, "step": 2030 }, { "epoch": 3.3110531463971307, "grad_norm": 0.1312306523323059, "learning_rate": 0.0001, "loss": 0.149, "step": 2031 }, { "epoch": 3.3126834039778283, "grad_norm": 0.17245103418827057, "learning_rate": 0.0001, "loss": 0.1638, "step": 2032 }, { "epoch": 3.3143136615585265, "grad_norm": 0.16342630982398987, "learning_rate": 0.0001, "loss": 0.166, "step": 2033 }, { "epoch": 3.315943919139224, "grad_norm": 0.13496838510036469, "learning_rate": 0.0001, "loss": 0.1599, "step": 2034 }, { "epoch": 3.317574176719922, "grad_norm": 0.1370801329612732, "learning_rate": 0.0001, "loss": 0.1556, "step": 2035 }, { "epoch": 3.3192044343006195, "grad_norm": 0.12318190932273865, "learning_rate": 0.0001, "loss": 0.1604, "step": 2036 }, { "epoch": 3.320834691881317, "grad_norm": 0.13986453413963318, "learning_rate": 0.0001, "loss": 0.1564, "step": 2037 }, { "epoch": 3.322464949462015, "grad_norm": 0.14522337913513184, "learning_rate": 0.0001, "loss": 0.1597, "step": 2038 }, { "epoch": 3.3240952070427126, "grad_norm": 0.1366148144006729, "learning_rate": 0.0001, "loss": 0.1391, "step": 2039 }, { "epoch": 3.3257254646234107, "grad_norm": 0.1322958618402481, "learning_rate": 0.0001, "loss": 0.1471, "step": 2040 }, { "epoch": 3.3273557222041084, "grad_norm": 0.1638648360967636, "learning_rate": 0.0001, "loss": 0.1507, "step": 2041 }, { "epoch": 3.328985979784806, "grad_norm": 0.15232039988040924, "learning_rate": 0.0001, "loss": 0.1757, "step": 2042 }, { "epoch": 3.3306162373655037, "grad_norm": 0.13481836020946503, "learning_rate": 0.0001, "loss": 0.1458, "step": 2043 }, { "epoch": 3.3322464949462014, "grad_norm": 0.1377725750207901, "learning_rate": 0.0001, "loss": 0.1552, "step": 2044 }, { "epoch": 3.333876752526899, "grad_norm": 0.1482841521501541, "learning_rate": 0.0001, "loss": 0.17, "step": 2045 }, { "epoch": 3.335507010107597, "grad_norm": 0.15259544551372528, "learning_rate": 0.0001, "loss": 0.1479, "step": 2046 }, { "epoch": 3.337137267688295, "grad_norm": 0.14434483647346497, "learning_rate": 0.0001, "loss": 0.1579, "step": 2047 }, { "epoch": 3.3387675252689926, "grad_norm": 0.14351776242256165, "learning_rate": 0.0001, "loss": 0.1605, "step": 2048 }, { "epoch": 3.3403977828496902, "grad_norm": 0.1376352459192276, "learning_rate": 0.0001, "loss": 0.1571, "step": 2049 }, { "epoch": 3.342028040430388, "grad_norm": 0.15512721240520477, "learning_rate": 0.0001, "loss": 0.1656, "step": 2050 }, { "epoch": 3.3436582980110856, "grad_norm": 0.1421147584915161, "learning_rate": 0.0001, "loss": 0.1563, "step": 2051 }, { "epoch": 3.3452885555917833, "grad_norm": 0.12368530035018921, "learning_rate": 0.0001, "loss": 0.1382, "step": 2052 }, { "epoch": 3.3469188131724814, "grad_norm": 0.14277908205986023, "learning_rate": 0.0001, "loss": 0.1407, "step": 2053 }, { "epoch": 3.348549070753179, "grad_norm": 0.15306757390499115, "learning_rate": 0.0001, "loss": 0.1574, "step": 2054 }, { "epoch": 3.3501793283338768, "grad_norm": 0.28750717639923096, "learning_rate": 0.0001, "loss": 0.1658, "step": 2055 }, { "epoch": 3.3518095859145745, "grad_norm": 0.33853110671043396, "learning_rate": 0.0001, "loss": 0.1491, "step": 2056 }, { "epoch": 3.353439843495272, "grad_norm": 0.1322954297065735, "learning_rate": 0.0001, "loss": 0.1553, "step": 2057 }, { "epoch": 3.35507010107597, "grad_norm": 0.15321014821529388, "learning_rate": 0.0001, "loss": 0.1651, "step": 2058 }, { "epoch": 3.356700358656668, "grad_norm": 0.15461274981498718, "learning_rate": 0.0001, "loss": 0.1648, "step": 2059 }, { "epoch": 3.3583306162373656, "grad_norm": 0.14355158805847168, "learning_rate": 0.0001, "loss": 0.167, "step": 2060 }, { "epoch": 3.3599608738180633, "grad_norm": 0.14021973311901093, "learning_rate": 0.0001, "loss": 0.1393, "step": 2061 }, { "epoch": 3.361591131398761, "grad_norm": 0.13298651576042175, "learning_rate": 0.0001, "loss": 0.1612, "step": 2062 }, { "epoch": 3.3632213889794587, "grad_norm": 0.12698578834533691, "learning_rate": 0.0001, "loss": 0.1518, "step": 2063 }, { "epoch": 3.3648516465601563, "grad_norm": 0.13323427736759186, "learning_rate": 0.0001, "loss": 0.147, "step": 2064 }, { "epoch": 3.366481904140854, "grad_norm": 0.14920176565647125, "learning_rate": 0.0001, "loss": 0.1669, "step": 2065 }, { "epoch": 3.368112161721552, "grad_norm": 0.13470523059368134, "learning_rate": 0.0001, "loss": 0.1588, "step": 2066 }, { "epoch": 3.36974241930225, "grad_norm": 0.13527804613113403, "learning_rate": 0.0001, "loss": 0.159, "step": 2067 }, { "epoch": 3.3713726768829475, "grad_norm": 0.14219944179058075, "learning_rate": 0.0001, "loss": 0.1575, "step": 2068 }, { "epoch": 3.373002934463645, "grad_norm": 0.1605711281299591, "learning_rate": 0.0001, "loss": 0.1781, "step": 2069 }, { "epoch": 3.374633192044343, "grad_norm": 0.12055234611034393, "learning_rate": 0.0001, "loss": 0.1543, "step": 2070 }, { "epoch": 3.376263449625041, "grad_norm": 0.13583442568778992, "learning_rate": 0.0001, "loss": 0.1629, "step": 2071 }, { "epoch": 3.3778937072057387, "grad_norm": 0.16419711709022522, "learning_rate": 0.0001, "loss": 0.1671, "step": 2072 }, { "epoch": 3.3795239647864364, "grad_norm": 0.16373597085475922, "learning_rate": 0.0001, "loss": 0.1711, "step": 2073 }, { "epoch": 3.381154222367134, "grad_norm": 0.15173253417015076, "learning_rate": 0.0001, "loss": 0.1458, "step": 2074 }, { "epoch": 3.3827844799478317, "grad_norm": 0.13307532668113708, "learning_rate": 0.0001, "loss": 0.1647, "step": 2075 }, { "epoch": 3.3844147375285294, "grad_norm": 0.13389620184898376, "learning_rate": 0.0001, "loss": 0.1622, "step": 2076 }, { "epoch": 3.386044995109227, "grad_norm": 0.1596551537513733, "learning_rate": 0.0001, "loss": 0.1756, "step": 2077 }, { "epoch": 3.387675252689925, "grad_norm": 0.13653887808322906, "learning_rate": 0.0001, "loss": 0.1571, "step": 2078 }, { "epoch": 3.389305510270623, "grad_norm": 0.1583099216222763, "learning_rate": 0.0001, "loss": 0.1611, "step": 2079 }, { "epoch": 3.3909357678513206, "grad_norm": 0.1411380171775818, "learning_rate": 0.0001, "loss": 0.1598, "step": 2080 }, { "epoch": 3.3925660254320182, "grad_norm": 0.1537819504737854, "learning_rate": 0.0001, "loss": 0.1717, "step": 2081 }, { "epoch": 3.394196283012716, "grad_norm": 0.13339395821094513, "learning_rate": 0.0001, "loss": 0.1507, "step": 2082 }, { "epoch": 3.3958265405934136, "grad_norm": 0.1422421634197235, "learning_rate": 0.0001, "loss": 0.1644, "step": 2083 }, { "epoch": 3.3974567981741117, "grad_norm": 0.21194976568222046, "learning_rate": 0.0001, "loss": 0.1677, "step": 2084 }, { "epoch": 3.3990870557548094, "grad_norm": 0.21863634884357452, "learning_rate": 0.0001, "loss": 0.1538, "step": 2085 }, { "epoch": 3.400717313335507, "grad_norm": 0.13825783133506775, "learning_rate": 0.0001, "loss": 0.1602, "step": 2086 }, { "epoch": 3.4023475709162048, "grad_norm": 0.154763326048851, "learning_rate": 0.0001, "loss": 0.1867, "step": 2087 }, { "epoch": 3.4039778284969024, "grad_norm": 0.14275521039962769, "learning_rate": 0.0001, "loss": 0.1655, "step": 2088 }, { "epoch": 3.4056080860776, "grad_norm": 0.14550544321537018, "learning_rate": 0.0001, "loss": 0.1642, "step": 2089 }, { "epoch": 3.407238343658298, "grad_norm": 0.1512393355369568, "learning_rate": 0.0001, "loss": 0.1612, "step": 2090 }, { "epoch": 3.408868601238996, "grad_norm": 0.14268140494823456, "learning_rate": 0.0001, "loss": 0.1621, "step": 2091 }, { "epoch": 3.4104988588196936, "grad_norm": 0.14155443012714386, "learning_rate": 0.0001, "loss": 0.1669, "step": 2092 }, { "epoch": 3.4121291164003913, "grad_norm": 0.1586080640554428, "learning_rate": 0.0001, "loss": 0.169, "step": 2093 }, { "epoch": 3.413759373981089, "grad_norm": 0.14126574993133545, "learning_rate": 0.0001, "loss": 0.1577, "step": 2094 }, { "epoch": 3.4153896315617867, "grad_norm": 0.12679395079612732, "learning_rate": 0.0001, "loss": 0.148, "step": 2095 }, { "epoch": 3.4170198891424843, "grad_norm": 0.17171818017959595, "learning_rate": 0.0001, "loss": 0.1628, "step": 2096 }, { "epoch": 3.4186501467231825, "grad_norm": 0.1353333592414856, "learning_rate": 0.0001, "loss": 0.155, "step": 2097 }, { "epoch": 3.42028040430388, "grad_norm": 0.20119301974773407, "learning_rate": 0.0001, "loss": 0.1555, "step": 2098 }, { "epoch": 3.421910661884578, "grad_norm": 0.1484251767396927, "learning_rate": 0.0001, "loss": 0.1631, "step": 2099 }, { "epoch": 3.4235409194652755, "grad_norm": 0.1589980572462082, "learning_rate": 0.0001, "loss": 0.1587, "step": 2100 }, { "epoch": 3.425171177045973, "grad_norm": 0.13816691935062408, "learning_rate": 0.0001, "loss": 0.1556, "step": 2101 }, { "epoch": 3.426801434626671, "grad_norm": 0.12436775863170624, "learning_rate": 0.0001, "loss": 0.1573, "step": 2102 }, { "epoch": 3.4284316922073685, "grad_norm": 0.12955281138420105, "learning_rate": 0.0001, "loss": 0.1662, "step": 2103 }, { "epoch": 3.4300619497880667, "grad_norm": 0.17406727373600006, "learning_rate": 0.0001, "loss": 0.1765, "step": 2104 }, { "epoch": 3.4316922073687643, "grad_norm": 0.17298480868339539, "learning_rate": 0.0001, "loss": 0.1766, "step": 2105 }, { "epoch": 3.433322464949462, "grad_norm": 0.15015456080436707, "learning_rate": 0.0001, "loss": 0.1541, "step": 2106 }, { "epoch": 3.4349527225301597, "grad_norm": 0.13401339948177338, "learning_rate": 0.0001, "loss": 0.1531, "step": 2107 }, { "epoch": 3.4365829801108574, "grad_norm": 0.15644237399101257, "learning_rate": 0.0001, "loss": 0.1665, "step": 2108 }, { "epoch": 3.4382132376915555, "grad_norm": 0.1544109582901001, "learning_rate": 0.0001, "loss": 0.1577, "step": 2109 }, { "epoch": 3.439843495272253, "grad_norm": 0.1289900243282318, "learning_rate": 0.0001, "loss": 0.1647, "step": 2110 }, { "epoch": 3.441473752852951, "grad_norm": 0.12115547060966492, "learning_rate": 0.0001, "loss": 0.1497, "step": 2111 }, { "epoch": 3.4431040104336486, "grad_norm": 0.16109564900398254, "learning_rate": 0.0001, "loss": 0.1519, "step": 2112 }, { "epoch": 3.4447342680143462, "grad_norm": 0.12790006399154663, "learning_rate": 0.0001, "loss": 0.1607, "step": 2113 }, { "epoch": 3.446364525595044, "grad_norm": 0.12892436981201172, "learning_rate": 0.0001, "loss": 0.1471, "step": 2114 }, { "epoch": 3.4479947831757416, "grad_norm": 0.1363893300294876, "learning_rate": 0.0001, "loss": 0.1535, "step": 2115 }, { "epoch": 3.4496250407564393, "grad_norm": 0.1328737437725067, "learning_rate": 0.0001, "loss": 0.1682, "step": 2116 }, { "epoch": 3.4512552983371374, "grad_norm": 0.14212438464164734, "learning_rate": 0.0001, "loss": 0.1639, "step": 2117 }, { "epoch": 3.452885555917835, "grad_norm": 0.16319912672042847, "learning_rate": 0.0001, "loss": 0.1659, "step": 2118 }, { "epoch": 3.4545158134985328, "grad_norm": 0.14669449627399445, "learning_rate": 0.0001, "loss": 0.1726, "step": 2119 }, { "epoch": 3.4561460710792304, "grad_norm": 0.12865161895751953, "learning_rate": 0.0001, "loss": 0.1431, "step": 2120 }, { "epoch": 3.457776328659928, "grad_norm": 0.14727066457271576, "learning_rate": 0.0001, "loss": 0.1591, "step": 2121 }, { "epoch": 3.4594065862406262, "grad_norm": 0.12576113641262054, "learning_rate": 0.0001, "loss": 0.1624, "step": 2122 }, { "epoch": 3.461036843821324, "grad_norm": 0.14870908856391907, "learning_rate": 0.0001, "loss": 0.1649, "step": 2123 }, { "epoch": 3.4626671014020216, "grad_norm": 0.14071956276893616, "learning_rate": 0.0001, "loss": 0.1621, "step": 2124 }, { "epoch": 3.4642973589827193, "grad_norm": 0.13211530447006226, "learning_rate": 0.0001, "loss": 0.1549, "step": 2125 }, { "epoch": 3.465927616563417, "grad_norm": 0.14687129855155945, "learning_rate": 0.0001, "loss": 0.1518, "step": 2126 }, { "epoch": 3.4675578741441146, "grad_norm": 0.1233501061797142, "learning_rate": 0.0001, "loss": 0.1553, "step": 2127 }, { "epoch": 3.4691881317248123, "grad_norm": 0.1451001763343811, "learning_rate": 0.0001, "loss": 0.1689, "step": 2128 }, { "epoch": 3.4708183893055105, "grad_norm": 0.14870783686637878, "learning_rate": 0.0001, "loss": 0.18, "step": 2129 }, { "epoch": 3.472448646886208, "grad_norm": 0.1319887787103653, "learning_rate": 0.0001, "loss": 0.1603, "step": 2130 }, { "epoch": 3.474078904466906, "grad_norm": 0.12232866883277893, "learning_rate": 0.0001, "loss": 0.1533, "step": 2131 }, { "epoch": 3.4757091620476035, "grad_norm": 0.12974146008491516, "learning_rate": 0.0001, "loss": 0.1554, "step": 2132 }, { "epoch": 3.477339419628301, "grad_norm": 0.14855380356311798, "learning_rate": 0.0001, "loss": 0.1671, "step": 2133 }, { "epoch": 3.478969677208999, "grad_norm": 0.15491904318332672, "learning_rate": 0.0001, "loss": 0.1796, "step": 2134 }, { "epoch": 3.480599934789697, "grad_norm": 0.12838788330554962, "learning_rate": 0.0001, "loss": 0.1495, "step": 2135 }, { "epoch": 3.4822301923703947, "grad_norm": 0.13425928354263306, "learning_rate": 0.0001, "loss": 0.1462, "step": 2136 }, { "epoch": 3.4838604499510923, "grad_norm": 0.1410258710384369, "learning_rate": 0.0001, "loss": 0.1704, "step": 2137 }, { "epoch": 3.48549070753179, "grad_norm": 0.1354324221611023, "learning_rate": 0.0001, "loss": 0.1474, "step": 2138 }, { "epoch": 3.4871209651124877, "grad_norm": 0.13679568469524384, "learning_rate": 0.0001, "loss": 0.1583, "step": 2139 }, { "epoch": 3.4887512226931854, "grad_norm": 0.1500730812549591, "learning_rate": 0.0001, "loss": 0.1763, "step": 2140 }, { "epoch": 3.490381480273883, "grad_norm": 0.13870522379875183, "learning_rate": 0.0001, "loss": 0.1645, "step": 2141 }, { "epoch": 3.492011737854581, "grad_norm": 0.13639062643051147, "learning_rate": 0.0001, "loss": 0.1679, "step": 2142 }, { "epoch": 3.493641995435279, "grad_norm": 0.16066312789916992, "learning_rate": 0.0001, "loss": 0.1582, "step": 2143 }, { "epoch": 3.4952722530159765, "grad_norm": 0.12547770142555237, "learning_rate": 0.0001, "loss": 0.152, "step": 2144 }, { "epoch": 3.4969025105966742, "grad_norm": 0.14830149710178375, "learning_rate": 0.0001, "loss": 0.1651, "step": 2145 }, { "epoch": 3.498532768177372, "grad_norm": 0.13417202234268188, "learning_rate": 0.0001, "loss": 0.1606, "step": 2146 }, { "epoch": 3.50016302575807, "grad_norm": 0.12787555158138275, "learning_rate": 0.0001, "loss": 0.1526, "step": 2147 }, { "epoch": 3.5017932833387677, "grad_norm": 0.1516534686088562, "learning_rate": 0.0001, "loss": 0.184, "step": 2148 }, { "epoch": 3.5034235409194654, "grad_norm": 0.11404507607221603, "learning_rate": 0.0001, "loss": 0.1433, "step": 2149 }, { "epoch": 3.505053798500163, "grad_norm": 0.1648181825876236, "learning_rate": 0.0001, "loss": 0.1655, "step": 2150 }, { "epoch": 3.5066840560808608, "grad_norm": 0.15040086209774017, "learning_rate": 0.0001, "loss": 0.1555, "step": 2151 }, { "epoch": 3.5083143136615584, "grad_norm": 0.17668725550174713, "learning_rate": 0.0001, "loss": 0.1672, "step": 2152 }, { "epoch": 3.509944571242256, "grad_norm": 0.13613073527812958, "learning_rate": 0.0001, "loss": 0.1639, "step": 2153 }, { "epoch": 3.511574828822954, "grad_norm": 0.18130412697792053, "learning_rate": 0.0001, "loss": 0.1689, "step": 2154 }, { "epoch": 3.513205086403652, "grad_norm": 0.1680803745985031, "learning_rate": 0.0001, "loss": 0.1496, "step": 2155 }, { "epoch": 3.5148353439843496, "grad_norm": 0.13636229932308197, "learning_rate": 0.0001, "loss": 0.1653, "step": 2156 }, { "epoch": 3.5164656015650473, "grad_norm": 0.16443794965744019, "learning_rate": 0.0001, "loss": 0.1668, "step": 2157 }, { "epoch": 3.518095859145745, "grad_norm": 0.15313531458377838, "learning_rate": 0.0001, "loss": 0.1604, "step": 2158 }, { "epoch": 3.5197261167264426, "grad_norm": 0.12920008599758148, "learning_rate": 0.0001, "loss": 0.1576, "step": 2159 }, { "epoch": 3.5213563743071408, "grad_norm": 0.14635585248470306, "learning_rate": 0.0001, "loss": 0.1644, "step": 2160 }, { "epoch": 3.5229866318878384, "grad_norm": 0.15921540558338165, "learning_rate": 0.0001, "loss": 0.1612, "step": 2161 }, { "epoch": 3.524616889468536, "grad_norm": 0.14027996361255646, "learning_rate": 0.0001, "loss": 0.1637, "step": 2162 }, { "epoch": 3.526247147049234, "grad_norm": 0.17607349157333374, "learning_rate": 0.0001, "loss": 0.1652, "step": 2163 }, { "epoch": 3.5278774046299315, "grad_norm": 0.14340530335903168, "learning_rate": 0.0001, "loss": 0.167, "step": 2164 }, { "epoch": 3.529507662210629, "grad_norm": 0.13496717810630798, "learning_rate": 0.0001, "loss": 0.1474, "step": 2165 }, { "epoch": 3.531137919791327, "grad_norm": 0.13388283550739288, "learning_rate": 0.0001, "loss": 0.1628, "step": 2166 }, { "epoch": 3.5327681773720245, "grad_norm": 0.12602393329143524, "learning_rate": 0.0001, "loss": 0.1426, "step": 2167 }, { "epoch": 3.5343984349527227, "grad_norm": 0.13006533682346344, "learning_rate": 0.0001, "loss": 0.1483, "step": 2168 }, { "epoch": 3.5360286925334203, "grad_norm": 0.14154121279716492, "learning_rate": 0.0001, "loss": 0.1618, "step": 2169 }, { "epoch": 3.537658950114118, "grad_norm": 0.16337460279464722, "learning_rate": 0.0001, "loss": 0.1591, "step": 2170 }, { "epoch": 3.5392892076948157, "grad_norm": 0.13401515781879425, "learning_rate": 0.0001, "loss": 0.1534, "step": 2171 }, { "epoch": 3.5409194652755134, "grad_norm": 0.16181372106075287, "learning_rate": 0.0001, "loss": 0.1657, "step": 2172 }, { "epoch": 3.5425497228562115, "grad_norm": 0.14788036048412323, "learning_rate": 0.0001, "loss": 0.1658, "step": 2173 }, { "epoch": 3.544179980436909, "grad_norm": 0.16251921653747559, "learning_rate": 0.0001, "loss": 0.1526, "step": 2174 }, { "epoch": 3.545810238017607, "grad_norm": 0.15175074338912964, "learning_rate": 0.0001, "loss": 0.1576, "step": 2175 }, { "epoch": 3.5474404955983045, "grad_norm": 0.1437595635652542, "learning_rate": 0.0001, "loss": 0.1731, "step": 2176 }, { "epoch": 3.5490707531790022, "grad_norm": 0.13593018054962158, "learning_rate": 0.0001, "loss": 0.1553, "step": 2177 }, { "epoch": 3.5507010107597, "grad_norm": 0.1520191729068756, "learning_rate": 0.0001, "loss": 0.169, "step": 2178 }, { "epoch": 3.5523312683403976, "grad_norm": 0.13462144136428833, "learning_rate": 0.0001, "loss": 0.1537, "step": 2179 }, { "epoch": 3.5539615259210953, "grad_norm": 0.12073640525341034, "learning_rate": 0.0001, "loss": 0.1544, "step": 2180 }, { "epoch": 3.5555917835017934, "grad_norm": 0.15898512303829193, "learning_rate": 0.0001, "loss": 0.1587, "step": 2181 }, { "epoch": 3.557222041082491, "grad_norm": 0.15629935264587402, "learning_rate": 0.0001, "loss": 0.1665, "step": 2182 }, { "epoch": 3.5588522986631888, "grad_norm": 0.12848952412605286, "learning_rate": 0.0001, "loss": 0.1492, "step": 2183 }, { "epoch": 3.5604825562438864, "grad_norm": 0.13705319166183472, "learning_rate": 0.0001, "loss": 0.1515, "step": 2184 }, { "epoch": 3.5621128138245846, "grad_norm": 0.1567762941122055, "learning_rate": 0.0001, "loss": 0.1671, "step": 2185 }, { "epoch": 3.5637430714052822, "grad_norm": 0.13675236701965332, "learning_rate": 0.0001, "loss": 0.1642, "step": 2186 }, { "epoch": 3.56537332898598, "grad_norm": 0.14318861067295074, "learning_rate": 0.0001, "loss": 0.1602, "step": 2187 }, { "epoch": 3.5670035865666776, "grad_norm": 0.16439467668533325, "learning_rate": 0.0001, "loss": 0.1798, "step": 2188 }, { "epoch": 3.5686338441473753, "grad_norm": 0.15110914409160614, "learning_rate": 0.0001, "loss": 0.1665, "step": 2189 }, { "epoch": 3.570264101728073, "grad_norm": 0.1775410771369934, "learning_rate": 0.0001, "loss": 0.1812, "step": 2190 }, { "epoch": 3.5718943593087706, "grad_norm": 0.14808377623558044, "learning_rate": 0.0001, "loss": 0.1588, "step": 2191 }, { "epoch": 3.5735246168894683, "grad_norm": 0.14409534633159637, "learning_rate": 0.0001, "loss": 0.1662, "step": 2192 }, { "epoch": 3.5751548744701664, "grad_norm": 0.14742523431777954, "learning_rate": 0.0001, "loss": 0.1763, "step": 2193 }, { "epoch": 3.576785132050864, "grad_norm": 0.15724779665470123, "learning_rate": 0.0001, "loss": 0.1603, "step": 2194 }, { "epoch": 3.578415389631562, "grad_norm": 0.1302788108587265, "learning_rate": 0.0001, "loss": 0.1561, "step": 2195 }, { "epoch": 3.5800456472122595, "grad_norm": 0.1408357322216034, "learning_rate": 0.0001, "loss": 0.1638, "step": 2196 }, { "epoch": 3.581675904792957, "grad_norm": 0.13979199528694153, "learning_rate": 0.0001, "loss": 0.1603, "step": 2197 }, { "epoch": 3.5833061623736553, "grad_norm": 0.12203294783830643, "learning_rate": 0.0001, "loss": 0.1613, "step": 2198 }, { "epoch": 3.584936419954353, "grad_norm": 0.12141190469264984, "learning_rate": 0.0001, "loss": 0.1705, "step": 2199 }, { "epoch": 3.5865666775350507, "grad_norm": 0.13076384365558624, "learning_rate": 0.0001, "loss": 0.17, "step": 2200 }, { "epoch": 3.5881969351157483, "grad_norm": 0.14992623031139374, "learning_rate": 0.0001, "loss": 0.1617, "step": 2201 }, { "epoch": 3.589827192696446, "grad_norm": 0.1519753336906433, "learning_rate": 0.0001, "loss": 0.1767, "step": 2202 }, { "epoch": 3.5914574502771437, "grad_norm": 0.1315367966890335, "learning_rate": 0.0001, "loss": 0.1606, "step": 2203 }, { "epoch": 3.5930877078578414, "grad_norm": 0.15619060397148132, "learning_rate": 0.0001, "loss": 0.1514, "step": 2204 }, { "epoch": 3.594717965438539, "grad_norm": 0.13551609218120575, "learning_rate": 0.0001, "loss": 0.1569, "step": 2205 }, { "epoch": 3.596348223019237, "grad_norm": 0.12931670248508453, "learning_rate": 0.0001, "loss": 0.156, "step": 2206 }, { "epoch": 3.597978480599935, "grad_norm": 0.1995784491300583, "learning_rate": 0.0001, "loss": 0.1624, "step": 2207 }, { "epoch": 3.5996087381806325, "grad_norm": 0.1508382260799408, "learning_rate": 0.0001, "loss": 0.1639, "step": 2208 }, { "epoch": 3.60123899576133, "grad_norm": 0.15891651809215546, "learning_rate": 0.0001, "loss": 0.172, "step": 2209 }, { "epoch": 3.602869253342028, "grad_norm": 0.14910927414894104, "learning_rate": 0.0001, "loss": 0.1675, "step": 2210 }, { "epoch": 3.604499510922726, "grad_norm": 0.1462751179933548, "learning_rate": 0.0001, "loss": 0.159, "step": 2211 }, { "epoch": 3.6061297685034237, "grad_norm": 0.1275903880596161, "learning_rate": 0.0001, "loss": 0.1517, "step": 2212 }, { "epoch": 3.6077600260841214, "grad_norm": 0.13851912319660187, "learning_rate": 0.0001, "loss": 0.1464, "step": 2213 }, { "epoch": 3.609390283664819, "grad_norm": 0.121219202876091, "learning_rate": 0.0001, "loss": 0.161, "step": 2214 }, { "epoch": 3.6110205412455167, "grad_norm": 0.14287428557872772, "learning_rate": 0.0001, "loss": 0.1731, "step": 2215 }, { "epoch": 3.6126507988262144, "grad_norm": 0.14093217253684998, "learning_rate": 0.0001, "loss": 0.1705, "step": 2216 }, { "epoch": 3.614281056406912, "grad_norm": 0.12914489209651947, "learning_rate": 0.0001, "loss": 0.1497, "step": 2217 }, { "epoch": 3.61591131398761, "grad_norm": 0.14418743550777435, "learning_rate": 0.0001, "loss": 0.1566, "step": 2218 }, { "epoch": 3.617541571568308, "grad_norm": 0.13824236392974854, "learning_rate": 0.0001, "loss": 0.1595, "step": 2219 }, { "epoch": 3.6191718291490056, "grad_norm": 0.14314696192741394, "learning_rate": 0.0001, "loss": 0.1695, "step": 2220 }, { "epoch": 3.6208020867297033, "grad_norm": 0.14805221557617188, "learning_rate": 0.0001, "loss": 0.1763, "step": 2221 }, { "epoch": 3.622432344310401, "grad_norm": 0.1371402144432068, "learning_rate": 0.0001, "loss": 0.1585, "step": 2222 }, { "epoch": 3.6240626018910986, "grad_norm": 0.1396588832139969, "learning_rate": 0.0001, "loss": 0.1531, "step": 2223 }, { "epoch": 3.6256928594717968, "grad_norm": 0.15313009917736053, "learning_rate": 0.0001, "loss": 0.1703, "step": 2224 }, { "epoch": 3.6273231170524944, "grad_norm": 0.12829270958900452, "learning_rate": 0.0001, "loss": 0.1638, "step": 2225 }, { "epoch": 3.628953374633192, "grad_norm": 0.14311690628528595, "learning_rate": 0.0001, "loss": 0.1614, "step": 2226 }, { "epoch": 3.63058363221389, "grad_norm": 0.1597692370414734, "learning_rate": 0.0001, "loss": 0.1579, "step": 2227 }, { "epoch": 3.6322138897945875, "grad_norm": 0.16670234501361847, "learning_rate": 0.0001, "loss": 0.1611, "step": 2228 }, { "epoch": 3.633844147375285, "grad_norm": 0.13492721319198608, "learning_rate": 0.0001, "loss": 0.1606, "step": 2229 }, { "epoch": 3.635474404955983, "grad_norm": 0.17079739272594452, "learning_rate": 0.0001, "loss": 0.1565, "step": 2230 }, { "epoch": 3.6371046625366805, "grad_norm": 0.1510683000087738, "learning_rate": 0.0001, "loss": 0.165, "step": 2231 }, { "epoch": 3.6387349201173786, "grad_norm": 0.16345694661140442, "learning_rate": 0.0001, "loss": 0.1604, "step": 2232 }, { "epoch": 3.6403651776980763, "grad_norm": 0.16322366893291473, "learning_rate": 0.0001, "loss": 0.1586, "step": 2233 }, { "epoch": 3.641995435278774, "grad_norm": 0.14221617579460144, "learning_rate": 0.0001, "loss": 0.1639, "step": 2234 }, { "epoch": 3.6436256928594717, "grad_norm": 0.1472931206226349, "learning_rate": 0.0001, "loss": 0.1652, "step": 2235 }, { "epoch": 3.64525595044017, "grad_norm": 0.15251535177230835, "learning_rate": 0.0001, "loss": 0.1814, "step": 2236 }, { "epoch": 3.6468862080208675, "grad_norm": 0.12713898718357086, "learning_rate": 0.0001, "loss": 0.1523, "step": 2237 }, { "epoch": 3.648516465601565, "grad_norm": 0.13150040805339813, "learning_rate": 0.0001, "loss": 0.1665, "step": 2238 }, { "epoch": 3.650146723182263, "grad_norm": 0.1565570831298828, "learning_rate": 0.0001, "loss": 0.1705, "step": 2239 }, { "epoch": 3.6517769807629605, "grad_norm": 0.13739293813705444, "learning_rate": 0.0001, "loss": 0.1638, "step": 2240 }, { "epoch": 3.653407238343658, "grad_norm": 0.1197710707783699, "learning_rate": 0.0001, "loss": 0.1458, "step": 2241 }, { "epoch": 3.655037495924356, "grad_norm": 0.1372688263654709, "learning_rate": 0.0001, "loss": 0.1569, "step": 2242 }, { "epoch": 3.6566677535050536, "grad_norm": 0.14185823500156403, "learning_rate": 0.0001, "loss": 0.1581, "step": 2243 }, { "epoch": 3.6582980110857517, "grad_norm": 0.18424132466316223, "learning_rate": 0.0001, "loss": 0.1542, "step": 2244 }, { "epoch": 3.6599282686664494, "grad_norm": 0.15011794865131378, "learning_rate": 0.0001, "loss": 0.1665, "step": 2245 }, { "epoch": 3.661558526247147, "grad_norm": 0.15816602110862732, "learning_rate": 0.0001, "loss": 0.1644, "step": 2246 }, { "epoch": 3.6631887838278447, "grad_norm": 0.14764584600925446, "learning_rate": 0.0001, "loss": 0.1632, "step": 2247 }, { "epoch": 3.6648190414085424, "grad_norm": 0.1543309986591339, "learning_rate": 0.0001, "loss": 0.1547, "step": 2248 }, { "epoch": 3.6664492989892405, "grad_norm": 0.16506819427013397, "learning_rate": 0.0001, "loss": 0.1711, "step": 2249 }, { "epoch": 3.6680795565699382, "grad_norm": 0.18144920468330383, "learning_rate": 0.0001, "loss": 0.1795, "step": 2250 }, { "epoch": 3.669709814150636, "grad_norm": 0.1368333399295807, "learning_rate": 0.0001, "loss": 0.1537, "step": 2251 }, { "epoch": 3.6713400717313336, "grad_norm": 0.14611852169036865, "learning_rate": 0.0001, "loss": 0.1659, "step": 2252 }, { "epoch": 3.6729703293120313, "grad_norm": 0.14789550006389618, "learning_rate": 0.0001, "loss": 0.1671, "step": 2253 }, { "epoch": 3.674600586892729, "grad_norm": 0.14431506395339966, "learning_rate": 0.0001, "loss": 0.1535, "step": 2254 }, { "epoch": 3.6762308444734266, "grad_norm": 0.14077717065811157, "learning_rate": 0.0001, "loss": 0.1596, "step": 2255 }, { "epoch": 3.6778611020541243, "grad_norm": 0.1420920193195343, "learning_rate": 0.0001, "loss": 0.1572, "step": 2256 }, { "epoch": 3.6794913596348224, "grad_norm": 0.13903354108333588, "learning_rate": 0.0001, "loss": 0.157, "step": 2257 }, { "epoch": 3.68112161721552, "grad_norm": 0.12706349790096283, "learning_rate": 0.0001, "loss": 0.1514, "step": 2258 }, { "epoch": 3.682751874796218, "grad_norm": 0.134929820895195, "learning_rate": 0.0001, "loss": 0.1614, "step": 2259 }, { "epoch": 3.6843821323769155, "grad_norm": 0.13152171671390533, "learning_rate": 0.0001, "loss": 0.1642, "step": 2260 }, { "epoch": 3.686012389957613, "grad_norm": 0.12819035351276398, "learning_rate": 0.0001, "loss": 0.1664, "step": 2261 }, { "epoch": 3.6876426475383113, "grad_norm": 0.14969497919082642, "learning_rate": 0.0001, "loss": 0.171, "step": 2262 }, { "epoch": 3.689272905119009, "grad_norm": 0.1479291170835495, "learning_rate": 0.0001, "loss": 0.1621, "step": 2263 }, { "epoch": 3.6909031626997066, "grad_norm": 0.13185615837574005, "learning_rate": 0.0001, "loss": 0.145, "step": 2264 }, { "epoch": 3.6925334202804043, "grad_norm": 0.1268220990896225, "learning_rate": 0.0001, "loss": 0.1616, "step": 2265 }, { "epoch": 3.694163677861102, "grad_norm": 0.15159299969673157, "learning_rate": 0.0001, "loss": 0.1571, "step": 2266 }, { "epoch": 3.6957939354417997, "grad_norm": 0.16078874468803406, "learning_rate": 0.0001, "loss": 0.1683, "step": 2267 }, { "epoch": 3.6974241930224974, "grad_norm": 0.13626697659492493, "learning_rate": 0.0001, "loss": 0.1447, "step": 2268 }, { "epoch": 3.699054450603195, "grad_norm": 0.14417946338653564, "learning_rate": 0.0001, "loss": 0.1632, "step": 2269 }, { "epoch": 3.700684708183893, "grad_norm": 0.13131751120090485, "learning_rate": 0.0001, "loss": 0.1698, "step": 2270 }, { "epoch": 3.702314965764591, "grad_norm": 0.142496719956398, "learning_rate": 0.0001, "loss": 0.1569, "step": 2271 }, { "epoch": 3.7039452233452885, "grad_norm": 0.14883071184158325, "learning_rate": 0.0001, "loss": 0.1595, "step": 2272 }, { "epoch": 3.705575480925986, "grad_norm": 0.1475469022989273, "learning_rate": 0.0001, "loss": 0.1678, "step": 2273 }, { "epoch": 3.707205738506684, "grad_norm": 0.14427977800369263, "learning_rate": 0.0001, "loss": 0.1631, "step": 2274 }, { "epoch": 3.708835996087382, "grad_norm": 0.13665615022182465, "learning_rate": 0.0001, "loss": 0.1513, "step": 2275 }, { "epoch": 3.7104662536680797, "grad_norm": 0.1295589655637741, "learning_rate": 0.0001, "loss": 0.1635, "step": 2276 }, { "epoch": 3.7120965112487774, "grad_norm": 0.139891117811203, "learning_rate": 0.0001, "loss": 0.1631, "step": 2277 }, { "epoch": 3.713726768829475, "grad_norm": 0.14485996961593628, "learning_rate": 0.0001, "loss": 0.1643, "step": 2278 }, { "epoch": 3.7153570264101727, "grad_norm": 0.13603661954402924, "learning_rate": 0.0001, "loss": 0.1559, "step": 2279 }, { "epoch": 3.7169872839908704, "grad_norm": 0.13382630050182343, "learning_rate": 0.0001, "loss": 0.1613, "step": 2280 }, { "epoch": 3.718617541571568, "grad_norm": 0.12754695117473602, "learning_rate": 0.0001, "loss": 0.1472, "step": 2281 }, { "epoch": 3.720247799152266, "grad_norm": 0.13873539865016937, "learning_rate": 0.0001, "loss": 0.1645, "step": 2282 }, { "epoch": 3.721878056732964, "grad_norm": 0.1300293654203415, "learning_rate": 0.0001, "loss": 0.1596, "step": 2283 }, { "epoch": 3.7235083143136616, "grad_norm": 0.1459476351737976, "learning_rate": 0.0001, "loss": 0.1664, "step": 2284 }, { "epoch": 3.7251385718943593, "grad_norm": 0.12731602787971497, "learning_rate": 0.0001, "loss": 0.1488, "step": 2285 }, { "epoch": 3.726768829475057, "grad_norm": 0.15073290467262268, "learning_rate": 0.0001, "loss": 0.159, "step": 2286 }, { "epoch": 3.728399087055755, "grad_norm": 0.13490375876426697, "learning_rate": 0.0001, "loss": 0.1683, "step": 2287 }, { "epoch": 3.7300293446364527, "grad_norm": 0.15296167135238647, "learning_rate": 0.0001, "loss": 0.1528, "step": 2288 }, { "epoch": 3.7316596022171504, "grad_norm": 0.18614895641803741, "learning_rate": 0.0001, "loss": 0.1731, "step": 2289 }, { "epoch": 3.733289859797848, "grad_norm": 0.14630430936813354, "learning_rate": 0.0001, "loss": 0.1499, "step": 2290 }, { "epoch": 3.734920117378546, "grad_norm": 0.13443906605243683, "learning_rate": 0.0001, "loss": 0.1526, "step": 2291 }, { "epoch": 3.7365503749592435, "grad_norm": 0.14118340611457825, "learning_rate": 0.0001, "loss": 0.1747, "step": 2292 }, { "epoch": 3.738180632539941, "grad_norm": 0.13624626398086548, "learning_rate": 0.0001, "loss": 0.1498, "step": 2293 }, { "epoch": 3.739810890120639, "grad_norm": 0.13312043249607086, "learning_rate": 0.0001, "loss": 0.1542, "step": 2294 }, { "epoch": 3.741441147701337, "grad_norm": 0.13018035888671875, "learning_rate": 0.0001, "loss": 0.1643, "step": 2295 }, { "epoch": 3.7430714052820346, "grad_norm": 0.178181990981102, "learning_rate": 0.0001, "loss": 0.1538, "step": 2296 }, { "epoch": 3.7447016628627323, "grad_norm": 0.13893936574459076, "learning_rate": 0.0001, "loss": 0.1557, "step": 2297 }, { "epoch": 3.74633192044343, "grad_norm": 0.13853946328163147, "learning_rate": 0.0001, "loss": 0.1595, "step": 2298 }, { "epoch": 3.7479621780241277, "grad_norm": 0.13181748986244202, "learning_rate": 0.0001, "loss": 0.1636, "step": 2299 }, { "epoch": 3.749592435604826, "grad_norm": 0.16607768833637238, "learning_rate": 0.0001, "loss": 0.164, "step": 2300 }, { "epoch": 3.7512226931855235, "grad_norm": 0.1501704752445221, "learning_rate": 0.0001, "loss": 0.1763, "step": 2301 }, { "epoch": 3.752852950766221, "grad_norm": 0.15002481639385223, "learning_rate": 0.0001, "loss": 0.149, "step": 2302 }, { "epoch": 3.754483208346919, "grad_norm": 0.14106477797031403, "learning_rate": 0.0001, "loss": 0.1553, "step": 2303 }, { "epoch": 3.7561134659276165, "grad_norm": 0.16274330019950867, "learning_rate": 0.0001, "loss": 0.1647, "step": 2304 }, { "epoch": 3.757743723508314, "grad_norm": 0.15609531104564667, "learning_rate": 0.0001, "loss": 0.151, "step": 2305 }, { "epoch": 3.759373981089012, "grad_norm": 0.1472497135400772, "learning_rate": 0.0001, "loss": 0.1649, "step": 2306 }, { "epoch": 3.7610042386697096, "grad_norm": 0.1381707787513733, "learning_rate": 0.0001, "loss": 0.1536, "step": 2307 }, { "epoch": 3.7626344962504077, "grad_norm": 0.14111392199993134, "learning_rate": 0.0001, "loss": 0.1484, "step": 2308 }, { "epoch": 3.7642647538311054, "grad_norm": 0.14470559358596802, "learning_rate": 0.0001, "loss": 0.161, "step": 2309 }, { "epoch": 3.765895011411803, "grad_norm": 0.13996532559394836, "learning_rate": 0.0001, "loss": 0.1552, "step": 2310 }, { "epoch": 3.7675252689925007, "grad_norm": 0.14922195672988892, "learning_rate": 0.0001, "loss": 0.1548, "step": 2311 }, { "epoch": 3.7691555265731984, "grad_norm": 0.15004834532737732, "learning_rate": 0.0001, "loss": 0.1613, "step": 2312 }, { "epoch": 3.7707857841538965, "grad_norm": 0.13584741950035095, "learning_rate": 0.0001, "loss": 0.1586, "step": 2313 }, { "epoch": 3.772416041734594, "grad_norm": 0.14363713562488556, "learning_rate": 0.0001, "loss": 0.1595, "step": 2314 }, { "epoch": 3.774046299315292, "grad_norm": 0.24087132513523102, "learning_rate": 0.0001, "loss": 0.1697, "step": 2315 }, { "epoch": 3.7756765568959896, "grad_norm": 0.15411627292633057, "learning_rate": 0.0001, "loss": 0.1666, "step": 2316 }, { "epoch": 3.7773068144766873, "grad_norm": 0.12080325186252594, "learning_rate": 0.0001, "loss": 0.1469, "step": 2317 }, { "epoch": 3.778937072057385, "grad_norm": 0.14731939136981964, "learning_rate": 0.0001, "loss": 0.1592, "step": 2318 }, { "epoch": 3.7805673296380826, "grad_norm": 0.13772052526474, "learning_rate": 0.0001, "loss": 0.1609, "step": 2319 }, { "epoch": 3.7821975872187803, "grad_norm": 0.14156171679496765, "learning_rate": 0.0001, "loss": 0.171, "step": 2320 }, { "epoch": 3.7838278447994784, "grad_norm": 0.14452658593654633, "learning_rate": 0.0001, "loss": 0.1507, "step": 2321 }, { "epoch": 3.785458102380176, "grad_norm": 0.12419889867305756, "learning_rate": 0.0001, "loss": 0.1623, "step": 2322 }, { "epoch": 3.787088359960874, "grad_norm": 0.11582159996032715, "learning_rate": 0.0001, "loss": 0.1492, "step": 2323 }, { "epoch": 3.7887186175415715, "grad_norm": 0.13737718760967255, "learning_rate": 0.0001, "loss": 0.1542, "step": 2324 }, { "epoch": 3.7903488751222696, "grad_norm": 0.1628282368183136, "learning_rate": 0.0001, "loss": 0.1666, "step": 2325 }, { "epoch": 3.7919791327029673, "grad_norm": 0.13625358045101166, "learning_rate": 0.0001, "loss": 0.1675, "step": 2326 }, { "epoch": 3.793609390283665, "grad_norm": 0.1334877908229828, "learning_rate": 0.0001, "loss": 0.1532, "step": 2327 }, { "epoch": 3.7952396478643626, "grad_norm": 0.13337381184101105, "learning_rate": 0.0001, "loss": 0.153, "step": 2328 }, { "epoch": 3.7968699054450603, "grad_norm": 0.1452982872724533, "learning_rate": 0.0001, "loss": 0.1541, "step": 2329 }, { "epoch": 3.798500163025758, "grad_norm": 0.14199711382389069, "learning_rate": 0.0001, "loss": 0.1553, "step": 2330 }, { "epoch": 3.8001304206064557, "grad_norm": 0.14179185032844543, "learning_rate": 0.0001, "loss": 0.1679, "step": 2331 }, { "epoch": 3.8017606781871534, "grad_norm": 0.14089013636112213, "learning_rate": 0.0001, "loss": 0.1571, "step": 2332 }, { "epoch": 3.8033909357678515, "grad_norm": 0.1408660113811493, "learning_rate": 0.0001, "loss": 0.1719, "step": 2333 }, { "epoch": 3.805021193348549, "grad_norm": 0.21150575578212738, "learning_rate": 0.0001, "loss": 0.1625, "step": 2334 }, { "epoch": 3.806651450929247, "grad_norm": 0.1231960654258728, "learning_rate": 0.0001, "loss": 0.1541, "step": 2335 }, { "epoch": 3.8082817085099445, "grad_norm": 0.16461803019046783, "learning_rate": 0.0001, "loss": 0.1662, "step": 2336 }, { "epoch": 3.809911966090642, "grad_norm": 0.14631450176239014, "learning_rate": 0.0001, "loss": 0.1788, "step": 2337 }, { "epoch": 3.8115422236713403, "grad_norm": 0.1487826257944107, "learning_rate": 0.0001, "loss": 0.1645, "step": 2338 }, { "epoch": 3.813172481252038, "grad_norm": 0.13970328867435455, "learning_rate": 0.0001, "loss": 0.1614, "step": 2339 }, { "epoch": 3.8148027388327357, "grad_norm": 0.13937072455883026, "learning_rate": 0.0001, "loss": 0.175, "step": 2340 }, { "epoch": 3.8164329964134334, "grad_norm": 0.13930930197238922, "learning_rate": 0.0001, "loss": 0.1491, "step": 2341 }, { "epoch": 3.818063253994131, "grad_norm": 0.13589458167552948, "learning_rate": 0.0001, "loss": 0.155, "step": 2342 }, { "epoch": 3.8196935115748287, "grad_norm": 0.1309502124786377, "learning_rate": 0.0001, "loss": 0.1772, "step": 2343 }, { "epoch": 3.8213237691555264, "grad_norm": 0.13713008165359497, "learning_rate": 0.0001, "loss": 0.1623, "step": 2344 }, { "epoch": 3.822954026736224, "grad_norm": 0.1149279922246933, "learning_rate": 0.0001, "loss": 0.149, "step": 2345 }, { "epoch": 3.824584284316922, "grad_norm": 0.14622916281223297, "learning_rate": 0.0001, "loss": 0.1583, "step": 2346 }, { "epoch": 3.82621454189762, "grad_norm": 0.1591426283121109, "learning_rate": 0.0001, "loss": 0.1575, "step": 2347 }, { "epoch": 3.8278447994783176, "grad_norm": 0.15121379494667053, "learning_rate": 0.0001, "loss": 0.1487, "step": 2348 }, { "epoch": 3.8294750570590153, "grad_norm": 0.14661197364330292, "learning_rate": 0.0001, "loss": 0.1587, "step": 2349 }, { "epoch": 3.831105314639713, "grad_norm": 0.21391494572162628, "learning_rate": 0.0001, "loss": 0.1811, "step": 2350 }, { "epoch": 3.832735572220411, "grad_norm": 0.14501704275608063, "learning_rate": 0.0001, "loss": 0.1503, "step": 2351 }, { "epoch": 3.8343658298011087, "grad_norm": 0.17319004237651825, "learning_rate": 0.0001, "loss": 0.1681, "step": 2352 }, { "epoch": 3.8359960873818064, "grad_norm": 0.15633517503738403, "learning_rate": 0.0001, "loss": 0.1607, "step": 2353 }, { "epoch": 3.837626344962504, "grad_norm": 0.15051516890525818, "learning_rate": 0.0001, "loss": 0.1651, "step": 2354 }, { "epoch": 3.839256602543202, "grad_norm": 0.1435638666152954, "learning_rate": 0.0001, "loss": 0.1574, "step": 2355 }, { "epoch": 3.8408868601238995, "grad_norm": 0.14581768214702606, "learning_rate": 0.0001, "loss": 0.1533, "step": 2356 }, { "epoch": 3.842517117704597, "grad_norm": 0.1498011201620102, "learning_rate": 0.0001, "loss": 0.1697, "step": 2357 }, { "epoch": 3.844147375285295, "grad_norm": 0.14038564264774323, "learning_rate": 0.0001, "loss": 0.1378, "step": 2358 }, { "epoch": 3.845777632865993, "grad_norm": 0.14458616077899933, "learning_rate": 0.0001, "loss": 0.1535, "step": 2359 }, { "epoch": 3.8474078904466906, "grad_norm": 0.17851635813713074, "learning_rate": 0.0001, "loss": 0.1602, "step": 2360 }, { "epoch": 3.8490381480273883, "grad_norm": 0.12941935658454895, "learning_rate": 0.0001, "loss": 0.1553, "step": 2361 }, { "epoch": 3.850668405608086, "grad_norm": 0.1391555517911911, "learning_rate": 0.0001, "loss": 0.1799, "step": 2362 }, { "epoch": 3.8522986631887837, "grad_norm": 0.13640303909778595, "learning_rate": 0.0001, "loss": 0.1613, "step": 2363 }, { "epoch": 3.853928920769482, "grad_norm": 0.1430894136428833, "learning_rate": 0.0001, "loss": 0.1631, "step": 2364 }, { "epoch": 3.8555591783501795, "grad_norm": 0.14540916681289673, "learning_rate": 0.0001, "loss": 0.1611, "step": 2365 }, { "epoch": 3.857189435930877, "grad_norm": 0.1320265531539917, "learning_rate": 0.0001, "loss": 0.1661, "step": 2366 }, { "epoch": 3.858819693511575, "grad_norm": 0.12641210854053497, "learning_rate": 0.0001, "loss": 0.1609, "step": 2367 }, { "epoch": 3.8604499510922725, "grad_norm": 0.1293664276599884, "learning_rate": 0.0001, "loss": 0.1429, "step": 2368 }, { "epoch": 3.86208020867297, "grad_norm": 0.13504886627197266, "learning_rate": 0.0001, "loss": 0.1616, "step": 2369 }, { "epoch": 3.863710466253668, "grad_norm": 0.13120906054973602, "learning_rate": 0.0001, "loss": 0.1678, "step": 2370 }, { "epoch": 3.8653407238343656, "grad_norm": 0.1444963663816452, "learning_rate": 0.0001, "loss": 0.1744, "step": 2371 }, { "epoch": 3.8669709814150637, "grad_norm": 0.13688752055168152, "learning_rate": 0.0001, "loss": 0.1707, "step": 2372 }, { "epoch": 3.8686012389957614, "grad_norm": 0.13113614916801453, "learning_rate": 0.0001, "loss": 0.163, "step": 2373 }, { "epoch": 3.870231496576459, "grad_norm": 0.1431058645248413, "learning_rate": 0.0001, "loss": 0.1677, "step": 2374 }, { "epoch": 3.8718617541571567, "grad_norm": 0.16322541236877441, "learning_rate": 0.0001, "loss": 0.1492, "step": 2375 }, { "epoch": 3.873492011737855, "grad_norm": 0.15727584064006805, "learning_rate": 0.0001, "loss": 0.1773, "step": 2376 }, { "epoch": 3.8751222693185525, "grad_norm": 0.13210931420326233, "learning_rate": 0.0001, "loss": 0.1583, "step": 2377 }, { "epoch": 3.87675252689925, "grad_norm": 0.12558962404727936, "learning_rate": 0.0001, "loss": 0.1461, "step": 2378 }, { "epoch": 3.878382784479948, "grad_norm": 0.14045056700706482, "learning_rate": 0.0001, "loss": 0.1541, "step": 2379 }, { "epoch": 3.8800130420606456, "grad_norm": 0.1418190896511078, "learning_rate": 0.0001, "loss": 0.1581, "step": 2380 }, { "epoch": 3.8816432996413432, "grad_norm": 0.12244697660207748, "learning_rate": 0.0001, "loss": 0.1405, "step": 2381 }, { "epoch": 3.883273557222041, "grad_norm": 0.164667546749115, "learning_rate": 0.0001, "loss": 0.1672, "step": 2382 }, { "epoch": 3.8849038148027386, "grad_norm": 0.15115118026733398, "learning_rate": 0.0001, "loss": 0.1755, "step": 2383 }, { "epoch": 3.8865340723834367, "grad_norm": 0.13539472222328186, "learning_rate": 0.0001, "loss": 0.1639, "step": 2384 }, { "epoch": 3.8881643299641344, "grad_norm": 0.13099908828735352, "learning_rate": 0.0001, "loss": 0.1495, "step": 2385 }, { "epoch": 3.889794587544832, "grad_norm": 0.12721744179725647, "learning_rate": 0.0001, "loss": 0.1589, "step": 2386 }, { "epoch": 3.8914248451255298, "grad_norm": 0.1410721391439438, "learning_rate": 0.0001, "loss": 0.1614, "step": 2387 }, { "epoch": 3.8930551027062275, "grad_norm": 0.1388472616672516, "learning_rate": 0.0001, "loss": 0.1563, "step": 2388 }, { "epoch": 3.8946853602869256, "grad_norm": 0.157010018825531, "learning_rate": 0.0001, "loss": 0.1618, "step": 2389 }, { "epoch": 3.8963156178676233, "grad_norm": 0.14445838332176208, "learning_rate": 0.0001, "loss": 0.153, "step": 2390 }, { "epoch": 3.897945875448321, "grad_norm": 0.14753592014312744, "learning_rate": 0.0001, "loss": 0.1582, "step": 2391 }, { "epoch": 3.8995761330290186, "grad_norm": 0.13478566706180573, "learning_rate": 0.0001, "loss": 0.1586, "step": 2392 }, { "epoch": 3.9012063906097163, "grad_norm": 0.13900446891784668, "learning_rate": 0.0001, "loss": 0.1685, "step": 2393 }, { "epoch": 3.902836648190414, "grad_norm": 0.13876253366470337, "learning_rate": 0.0001, "loss": 0.1592, "step": 2394 }, { "epoch": 3.9044669057711117, "grad_norm": 0.14419294893741608, "learning_rate": 0.0001, "loss": 0.1593, "step": 2395 }, { "epoch": 3.9060971633518093, "grad_norm": 0.14167584478855133, "learning_rate": 0.0001, "loss": 0.161, "step": 2396 }, { "epoch": 3.9077274209325075, "grad_norm": 0.12826170027256012, "learning_rate": 0.0001, "loss": 0.147, "step": 2397 }, { "epoch": 3.909357678513205, "grad_norm": 0.14393731951713562, "learning_rate": 0.0001, "loss": 0.1557, "step": 2398 }, { "epoch": 3.910987936093903, "grad_norm": 0.15396815538406372, "learning_rate": 0.0001, "loss": 0.18, "step": 2399 }, { "epoch": 3.9126181936746005, "grad_norm": 0.11657989025115967, "learning_rate": 0.0001, "loss": 0.1543, "step": 2400 }, { "epoch": 3.914248451255298, "grad_norm": 0.1413910686969757, "learning_rate": 0.0001, "loss": 0.1626, "step": 2401 }, { "epoch": 3.9158787088359963, "grad_norm": 0.12488115578889847, "learning_rate": 0.0001, "loss": 0.1519, "step": 2402 }, { "epoch": 3.917508966416694, "grad_norm": 0.1491551250219345, "learning_rate": 0.0001, "loss": 0.1692, "step": 2403 }, { "epoch": 3.9191392239973917, "grad_norm": 0.13683412969112396, "learning_rate": 0.0001, "loss": 0.1503, "step": 2404 }, { "epoch": 3.9207694815780894, "grad_norm": 0.15317296981811523, "learning_rate": 0.0001, "loss": 0.1623, "step": 2405 }, { "epoch": 3.922399739158787, "grad_norm": 0.12439368665218353, "learning_rate": 0.0001, "loss": 0.1536, "step": 2406 }, { "epoch": 3.9240299967394847, "grad_norm": 0.12986606359481812, "learning_rate": 0.0001, "loss": 0.1566, "step": 2407 }, { "epoch": 3.9256602543201824, "grad_norm": 0.1569644659757614, "learning_rate": 0.0001, "loss": 0.1662, "step": 2408 }, { "epoch": 3.92729051190088, "grad_norm": 0.14531882107257843, "learning_rate": 0.0001, "loss": 0.1604, "step": 2409 }, { "epoch": 3.928920769481578, "grad_norm": 0.1490756869316101, "learning_rate": 0.0001, "loss": 0.1631, "step": 2410 }, { "epoch": 3.930551027062276, "grad_norm": 0.11785148829221725, "learning_rate": 0.0001, "loss": 0.1498, "step": 2411 }, { "epoch": 3.9321812846429736, "grad_norm": 0.13614097237586975, "learning_rate": 0.0001, "loss": 0.169, "step": 2412 }, { "epoch": 3.9338115422236712, "grad_norm": 0.12401864677667618, "learning_rate": 0.0001, "loss": 0.1607, "step": 2413 }, { "epoch": 3.9354417998043694, "grad_norm": 0.12099269777536392, "learning_rate": 0.0001, "loss": 0.1305, "step": 2414 }, { "epoch": 3.937072057385067, "grad_norm": 0.14183704555034637, "learning_rate": 0.0001, "loss": 0.1711, "step": 2415 }, { "epoch": 3.9387023149657647, "grad_norm": 0.12612907588481903, "learning_rate": 0.0001, "loss": 0.1503, "step": 2416 }, { "epoch": 3.9403325725464624, "grad_norm": 0.15672796964645386, "learning_rate": 0.0001, "loss": 0.1586, "step": 2417 }, { "epoch": 3.94196283012716, "grad_norm": 0.1381552368402481, "learning_rate": 0.0001, "loss": 0.1541, "step": 2418 }, { "epoch": 3.9435930877078578, "grad_norm": 0.2042001485824585, "learning_rate": 0.0001, "loss": 0.1497, "step": 2419 }, { "epoch": 3.9452233452885554, "grad_norm": 0.13539600372314453, "learning_rate": 0.0001, "loss": 0.1485, "step": 2420 }, { "epoch": 3.946853602869253, "grad_norm": 0.15428173542022705, "learning_rate": 0.0001, "loss": 0.1541, "step": 2421 }, { "epoch": 3.9484838604499513, "grad_norm": 0.1534593552350998, "learning_rate": 0.0001, "loss": 0.1723, "step": 2422 }, { "epoch": 3.950114118030649, "grad_norm": 0.15443755686283112, "learning_rate": 0.0001, "loss": 0.155, "step": 2423 }, { "epoch": 3.9517443756113466, "grad_norm": 0.1520153284072876, "learning_rate": 0.0001, "loss": 0.1518, "step": 2424 }, { "epoch": 3.9533746331920443, "grad_norm": 0.14066120982170105, "learning_rate": 0.0001, "loss": 0.165, "step": 2425 }, { "epoch": 3.955004890772742, "grad_norm": 0.14006389677524567, "learning_rate": 0.0001, "loss": 0.1505, "step": 2426 }, { "epoch": 3.95663514835344, "grad_norm": 0.1307193785905838, "learning_rate": 0.0001, "loss": 0.1526, "step": 2427 }, { "epoch": 3.958265405934138, "grad_norm": 0.1285211592912674, "learning_rate": 0.0001, "loss": 0.1653, "step": 2428 }, { "epoch": 3.9598956635148355, "grad_norm": 0.14015939831733704, "learning_rate": 0.0001, "loss": 0.1544, "step": 2429 }, { "epoch": 3.961525921095533, "grad_norm": 0.13942451775074005, "learning_rate": 0.0001, "loss": 0.1606, "step": 2430 }, { "epoch": 3.963156178676231, "grad_norm": 0.12587985396385193, "learning_rate": 0.0001, "loss": 0.1578, "step": 2431 }, { "epoch": 3.9647864362569285, "grad_norm": 0.14944875240325928, "learning_rate": 0.0001, "loss": 0.1635, "step": 2432 }, { "epoch": 3.966416693837626, "grad_norm": 0.13582269847393036, "learning_rate": 0.0001, "loss": 0.1502, "step": 2433 }, { "epoch": 3.968046951418324, "grad_norm": 0.15742120146751404, "learning_rate": 0.0001, "loss": 0.1562, "step": 2434 }, { "epoch": 3.969677208999022, "grad_norm": 0.14265507459640503, "learning_rate": 0.0001, "loss": 0.1395, "step": 2435 }, { "epoch": 3.9713074665797197, "grad_norm": 0.14941944181919098, "learning_rate": 0.0001, "loss": 0.1584, "step": 2436 }, { "epoch": 3.9729377241604173, "grad_norm": 0.13401104509830475, "learning_rate": 0.0001, "loss": 0.1575, "step": 2437 }, { "epoch": 3.974567981741115, "grad_norm": 0.13355430960655212, "learning_rate": 0.0001, "loss": 0.1732, "step": 2438 }, { "epoch": 3.9761982393218127, "grad_norm": 0.14246457815170288, "learning_rate": 0.0001, "loss": 0.154, "step": 2439 }, { "epoch": 3.977828496902511, "grad_norm": 0.1443282514810562, "learning_rate": 0.0001, "loss": 0.1737, "step": 2440 }, { "epoch": 3.9794587544832085, "grad_norm": 0.13674692809581757, "learning_rate": 0.0001, "loss": 0.1526, "step": 2441 }, { "epoch": 3.981089012063906, "grad_norm": 0.14663022756576538, "learning_rate": 0.0001, "loss": 0.1817, "step": 2442 }, { "epoch": 3.982719269644604, "grad_norm": 0.23058007657527924, "learning_rate": 0.0001, "loss": 0.1485, "step": 2443 }, { "epoch": 3.9843495272253016, "grad_norm": 0.12496799230575562, "learning_rate": 0.0001, "loss": 0.1625, "step": 2444 }, { "epoch": 3.9859797848059992, "grad_norm": 0.14396335184574127, "learning_rate": 0.0001, "loss": 0.1577, "step": 2445 }, { "epoch": 3.987610042386697, "grad_norm": 0.11941339075565338, "learning_rate": 0.0001, "loss": 0.1524, "step": 2446 }, { "epoch": 3.9892402999673946, "grad_norm": 0.15236441791057587, "learning_rate": 0.0001, "loss": 0.1736, "step": 2447 }, { "epoch": 3.9908705575480927, "grad_norm": 0.134572833776474, "learning_rate": 0.0001, "loss": 0.1731, "step": 2448 }, { "epoch": 3.9925008151287904, "grad_norm": 0.1253572255373001, "learning_rate": 0.0001, "loss": 0.1513, "step": 2449 }, { "epoch": 3.994131072709488, "grad_norm": 0.1377570629119873, "learning_rate": 0.0001, "loss": 0.179, "step": 2450 }, { "epoch": 3.9957613302901858, "grad_norm": 0.1263006627559662, "learning_rate": 0.0001, "loss": 0.167, "step": 2451 }, { "epoch": 3.9973915878708834, "grad_norm": 0.1317623406648636, "learning_rate": 0.0001, "loss": 0.1691, "step": 2452 }, { "epoch": 3.9973915878708834, "step": 2452, "total_flos": 5.923561418853974e+18, "train_loss": 0.19042991894201586, "train_runtime": 49765.6162, "train_samples_per_second": 1.479, "train_steps_per_second": 0.049 } ], "logging_steps": 1.0, "max_steps": 2452, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "total_flos": 5.923561418853974e+18, "train_batch_size": 6, "trial_name": null, "trial_params": null }