diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.50012156576708, + "epoch": 1.00024313153416, "eval_steps": 500, - "global_step": 2057, + "global_step": 4114, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -14406,6 +14406,14413 @@ "learning_rate": 2.158971359258265e-06, "loss": 0.4969, "step": 2057 + }, + { + "epoch": 0.50036469730124, + "grad_norm": 21.25, + "learning_rate": 2.1586395558563363e-06, + "loss": 1.3187, + "step": 2058 + }, + { + "epoch": 0.5006078288354, + "grad_norm": 18.5, + "learning_rate": 2.1583076166425328e-06, + "loss": 1.2774, + "step": 2059 + }, + { + "epoch": 0.5008509603695599, + "grad_norm": 22.125, + "learning_rate": 2.157975541666469e-06, + "loss": 1.0634, + "step": 2060 + }, + { + "epoch": 0.5010940919037199, + "grad_norm": 18.0, + "learning_rate": 2.1576433309777794e-06, + "loss": 0.8712, + "step": 2061 + }, + { + "epoch": 0.5013372234378799, + "grad_norm": 15.3125, + "learning_rate": 2.157310984626118e-06, + "loss": 0.6967, + "step": 2062 + }, + { + "epoch": 0.5015803549720399, + "grad_norm": 23.375, + "learning_rate": 2.1569785026611605e-06, + "loss": 1.2955, + "step": 2063 + }, + { + "epoch": 0.5018234865061999, + "grad_norm": 21.25, + "learning_rate": 2.1566458851326015e-06, + "loss": 0.8218, + "step": 2064 + }, + { + "epoch": 0.5020666180403598, + "grad_norm": 15.125, + "learning_rate": 2.156313132090157e-06, + "loss": 0.6786, + "step": 2065 + }, + { + "epoch": 0.5023097495745198, + "grad_norm": 18.875, + "learning_rate": 2.1559802435835623e-06, + "loss": 0.669, + "step": 2066 + }, + { + "epoch": 0.5025528811086798, + "grad_norm": 16.375, + "learning_rate": 2.1556472196625733e-06, + "loss": 0.71, + "step": 2067 + }, + { + "epoch": 0.5027960126428398, + "grad_norm": 16.25, + "learning_rate": 2.155314060376966e-06, + "loss": 0.6031, + "step": 2068 + }, + { + "epoch": 0.5030391441769998, + "grad_norm": 19.375, + "learning_rate": 2.1549807657765375e-06, + "loss": 1.1718, + "step": 2069 + }, + { + "epoch": 0.5032822757111597, + "grad_norm": 26.125, + "learning_rate": 2.1546473359111037e-06, + "loss": 0.8826, + "step": 2070 + }, + { + "epoch": 0.5035254072453197, + "grad_norm": 19.625, + "learning_rate": 2.154313770830502e-06, + "loss": 0.8892, + "step": 2071 + }, + { + "epoch": 0.5037685387794797, + "grad_norm": 21.0, + "learning_rate": 2.1539800705845886e-06, + "loss": 1.0622, + "step": 2072 + }, + { + "epoch": 0.5040116703136397, + "grad_norm": 17.375, + "learning_rate": 2.1536462352232416e-06, + "loss": 0.7742, + "step": 2073 + }, + { + "epoch": 0.5042548018477997, + "grad_norm": 16.25, + "learning_rate": 2.153312264796359e-06, + "loss": 1.0432, + "step": 2074 + }, + { + "epoch": 0.5044979333819597, + "grad_norm": 20.75, + "learning_rate": 2.152978159353857e-06, + "loss": 0.6966, + "step": 2075 + }, + { + "epoch": 0.5047410649161196, + "grad_norm": 20.75, + "learning_rate": 2.152643918945674e-06, + "loss": 0.9008, + "step": 2076 + }, + { + "epoch": 0.5049841964502796, + "grad_norm": 15.375, + "learning_rate": 2.1523095436217685e-06, + "loss": 0.6613, + "step": 2077 + }, + { + "epoch": 0.5052273279844396, + "grad_norm": 20.75, + "learning_rate": 2.151975033432118e-06, + "loss": 1.3671, + "step": 2078 + }, + { + "epoch": 0.5054704595185996, + "grad_norm": 19.25, + "learning_rate": 2.151640388426721e-06, + "loss": 1.0413, + "step": 2079 + }, + { + "epoch": 0.5057135910527596, + "grad_norm": 19.125, + "learning_rate": 2.151305608655597e-06, + "loss": 0.8604, + "step": 2080 + }, + { + "epoch": 0.5059567225869195, + "grad_norm": 19.0, + "learning_rate": 2.1509706941687824e-06, + "loss": 0.794, + "step": 2081 + }, + { + "epoch": 0.5061998541210795, + "grad_norm": 17.5, + "learning_rate": 2.150635645016338e-06, + "loss": 1.0629, + "step": 2082 + }, + { + "epoch": 0.5064429856552395, + "grad_norm": 16.625, + "learning_rate": 2.150300461248342e-06, + "loss": 1.1524, + "step": 2083 + }, + { + "epoch": 0.5066861171893995, + "grad_norm": 22.5, + "learning_rate": 2.149965142914893e-06, + "loss": 1.0185, + "step": 2084 + }, + { + "epoch": 0.5069292487235595, + "grad_norm": 18.875, + "learning_rate": 2.1496296900661106e-06, + "loss": 1.0102, + "step": 2085 + }, + { + "epoch": 0.5071723802577194, + "grad_norm": 16.75, + "learning_rate": 2.149294102752134e-06, + "loss": 0.8522, + "step": 2086 + }, + { + "epoch": 0.5074155117918794, + "grad_norm": 17.75, + "learning_rate": 2.1489583810231217e-06, + "loss": 0.6616, + "step": 2087 + }, + { + "epoch": 0.5076586433260394, + "grad_norm": 22.25, + "learning_rate": 2.148622524929255e-06, + "loss": 1.2406, + "step": 2088 + }, + { + "epoch": 0.5079017748601994, + "grad_norm": 25.75, + "learning_rate": 2.148286534520731e-06, + "loss": 0.9268, + "step": 2089 + }, + { + "epoch": 0.5081449063943594, + "grad_norm": 21.0, + "learning_rate": 2.147950409847771e-06, + "loss": 1.0492, + "step": 2090 + }, + { + "epoch": 0.5083880379285193, + "grad_norm": 20.125, + "learning_rate": 2.1476141509606144e-06, + "loss": 1.084, + "step": 2091 + }, + { + "epoch": 0.5086311694626793, + "grad_norm": 18.875, + "learning_rate": 2.1472777579095205e-06, + "loss": 0.9973, + "step": 2092 + }, + { + "epoch": 0.5088743009968393, + "grad_norm": 16.125, + "learning_rate": 2.146941230744769e-06, + "loss": 0.3655, + "step": 2093 + }, + { + "epoch": 0.5091174325309993, + "grad_norm": 12.8125, + "learning_rate": 2.14660456951666e-06, + "loss": 0.7654, + "step": 2094 + }, + { + "epoch": 0.5093605640651593, + "grad_norm": 18.625, + "learning_rate": 2.146267774275513e-06, + "loss": 0.9237, + "step": 2095 + }, + { + "epoch": 0.5096036955993193, + "grad_norm": 17.25, + "learning_rate": 2.145930845071668e-06, + "loss": 0.7004, + "step": 2096 + }, + { + "epoch": 0.5098468271334792, + "grad_norm": 26.5, + "learning_rate": 2.145593781955485e-06, + "loss": 0.9624, + "step": 2097 + }, + { + "epoch": 0.5100899586676392, + "grad_norm": 22.75, + "learning_rate": 2.145256584977344e-06, + "loss": 0.8197, + "step": 2098 + }, + { + "epoch": 0.5103330902017992, + "grad_norm": 22.625, + "learning_rate": 2.1449192541876447e-06, + "loss": 1.3957, + "step": 2099 + }, + { + "epoch": 0.5105762217359592, + "grad_norm": 25.5, + "learning_rate": 2.144581789636807e-06, + "loss": 0.9053, + "step": 2100 + }, + { + "epoch": 0.5108193532701192, + "grad_norm": 17.125, + "learning_rate": 2.144244191375271e-06, + "loss": 0.8517, + "step": 2101 + }, + { + "epoch": 0.5110624848042791, + "grad_norm": 17.125, + "learning_rate": 2.143906459453496e-06, + "loss": 0.8091, + "step": 2102 + }, + { + "epoch": 0.5113056163384391, + "grad_norm": 23.125, + "learning_rate": 2.143568593921963e-06, + "loss": 0.8385, + "step": 2103 + }, + { + "epoch": 0.5115487478725991, + "grad_norm": 20.75, + "learning_rate": 2.143230594831171e-06, + "loss": 0.9638, + "step": 2104 + }, + { + "epoch": 0.5117918794067591, + "grad_norm": 19.25, + "learning_rate": 2.1428924622316396e-06, + "loss": 0.9676, + "step": 2105 + }, + { + "epoch": 0.5120350109409191, + "grad_norm": 19.125, + "learning_rate": 2.1425541961739093e-06, + "loss": 0.8893, + "step": 2106 + }, + { + "epoch": 0.512278142475079, + "grad_norm": 17.375, + "learning_rate": 2.1422157967085394e-06, + "loss": 0.8004, + "step": 2107 + }, + { + "epoch": 0.512521274009239, + "grad_norm": 25.625, + "learning_rate": 2.1418772638861095e-06, + "loss": 0.8912, + "step": 2108 + }, + { + "epoch": 0.512764405543399, + "grad_norm": 18.5, + "learning_rate": 2.141538597757219e-06, + "loss": 0.6878, + "step": 2109 + }, + { + "epoch": 0.513007537077559, + "grad_norm": 20.0, + "learning_rate": 2.141199798372488e-06, + "loss": 0.8896, + "step": 2110 + }, + { + "epoch": 0.513250668611719, + "grad_norm": 25.25, + "learning_rate": 2.140860865782556e-06, + "loss": 1.0707, + "step": 2111 + }, + { + "epoch": 0.513493800145879, + "grad_norm": 31.25, + "learning_rate": 2.1405218000380813e-06, + "loss": 0.9574, + "step": 2112 + }, + { + "epoch": 0.5137369316800389, + "grad_norm": 21.0, + "learning_rate": 2.1401826011897436e-06, + "loss": 0.8259, + "step": 2113 + }, + { + "epoch": 0.5139800632141989, + "grad_norm": 19.5, + "learning_rate": 2.1398432692882423e-06, + "loss": 1.1377, + "step": 2114 + }, + { + "epoch": 0.5142231947483589, + "grad_norm": 17.125, + "learning_rate": 2.1395038043842966e-06, + "loss": 0.5954, + "step": 2115 + }, + { + "epoch": 0.5144663262825189, + "grad_norm": 21.75, + "learning_rate": 2.139164206528645e-06, + "loss": 0.9733, + "step": 2116 + }, + { + "epoch": 0.5147094578166789, + "grad_norm": 16.25, + "learning_rate": 2.138824475772046e-06, + "loss": 0.6823, + "step": 2117 + }, + { + "epoch": 0.5149525893508388, + "grad_norm": 15.375, + "learning_rate": 2.138484612165279e-06, + "loss": 1.3318, + "step": 2118 + }, + { + "epoch": 0.5151957208849988, + "grad_norm": 24.625, + "learning_rate": 2.138144615759142e-06, + "loss": 1.221, + "step": 2119 + }, + { + "epoch": 0.5154388524191588, + "grad_norm": 17.125, + "learning_rate": 2.137804486604453e-06, + "loss": 0.7901, + "step": 2120 + }, + { + "epoch": 0.5156819839533188, + "grad_norm": 16.125, + "learning_rate": 2.1374642247520506e-06, + "loss": 0.819, + "step": 2121 + }, + { + "epoch": 0.5159251154874788, + "grad_norm": 17.375, + "learning_rate": 2.137123830252793e-06, + "loss": 0.6808, + "step": 2122 + }, + { + "epoch": 0.5161682470216387, + "grad_norm": 20.0, + "learning_rate": 2.1367833031575576e-06, + "loss": 0.6986, + "step": 2123 + }, + { + "epoch": 0.5164113785557987, + "grad_norm": 28.625, + "learning_rate": 2.1364426435172426e-06, + "loss": 0.9064, + "step": 2124 + }, + { + "epoch": 0.5166545100899587, + "grad_norm": 26.875, + "learning_rate": 2.136101851382765e-06, + "loss": 0.8917, + "step": 2125 + }, + { + "epoch": 0.5168976416241187, + "grad_norm": 17.375, + "learning_rate": 2.1357609268050623e-06, + "loss": 0.7116, + "step": 2126 + }, + { + "epoch": 0.5171407731582787, + "grad_norm": 23.125, + "learning_rate": 2.135419869835091e-06, + "loss": 1.0916, + "step": 2127 + }, + { + "epoch": 0.5173839046924386, + "grad_norm": 26.25, + "learning_rate": 2.1350786805238287e-06, + "loss": 1.1954, + "step": 2128 + }, + { + "epoch": 0.5176270362265986, + "grad_norm": 28.0, + "learning_rate": 2.1347373589222718e-06, + "loss": 1.0237, + "step": 2129 + }, + { + "epoch": 0.5178701677607586, + "grad_norm": 18.75, + "learning_rate": 2.134395905081437e-06, + "loss": 0.6715, + "step": 2130 + }, + { + "epoch": 0.5181132992949186, + "grad_norm": 18.25, + "learning_rate": 2.13405431905236e-06, + "loss": 1.0831, + "step": 2131 + }, + { + "epoch": 0.5183564308290786, + "grad_norm": 17.25, + "learning_rate": 2.1337126008860964e-06, + "loss": 0.7293, + "step": 2132 + }, + { + "epoch": 0.5185995623632386, + "grad_norm": 20.125, + "learning_rate": 2.1333707506337227e-06, + "loss": 0.7882, + "step": 2133 + }, + { + "epoch": 0.5188426938973985, + "grad_norm": 19.375, + "learning_rate": 2.133028768346334e-06, + "loss": 0.761, + "step": 2134 + }, + { + "epoch": 0.5190858254315585, + "grad_norm": 16.5, + "learning_rate": 2.132686654075045e-06, + "loss": 0.5887, + "step": 2135 + }, + { + "epoch": 0.5193289569657185, + "grad_norm": 22.0, + "learning_rate": 2.132344407870992e-06, + "loss": 0.9873, + "step": 2136 + }, + { + "epoch": 0.5195720884998785, + "grad_norm": 28.25, + "learning_rate": 2.1320020297853276e-06, + "loss": 1.2165, + "step": 2137 + }, + { + "epoch": 0.5198152200340385, + "grad_norm": 18.75, + "learning_rate": 2.1316595198692274e-06, + "loss": 1.0632, + "step": 2138 + }, + { + "epoch": 0.5200583515681984, + "grad_norm": 25.625, + "learning_rate": 2.1313168781738855e-06, + "loss": 1.2686, + "step": 2139 + }, + { + "epoch": 0.5203014831023584, + "grad_norm": 17.125, + "learning_rate": 2.130974104750515e-06, + "loss": 0.75, + "step": 2140 + }, + { + "epoch": 0.5205446146365184, + "grad_norm": 22.0, + "learning_rate": 2.13063119965035e-06, + "loss": 1.0972, + "step": 2141 + }, + { + "epoch": 0.5207877461706784, + "grad_norm": 16.125, + "learning_rate": 2.1302881629246426e-06, + "loss": 0.5599, + "step": 2142 + }, + { + "epoch": 0.5210308777048384, + "grad_norm": 19.875, + "learning_rate": 2.1299449946246666e-06, + "loss": 1.2064, + "step": 2143 + }, + { + "epoch": 0.5212740092389982, + "grad_norm": 21.75, + "learning_rate": 2.129601694801714e-06, + "loss": 1.2053, + "step": 2144 + }, + { + "epoch": 0.5215171407731582, + "grad_norm": 20.375, + "learning_rate": 2.1292582635070966e-06, + "loss": 0.8525, + "step": 2145 + }, + { + "epoch": 0.5217602723073183, + "grad_norm": 14.3125, + "learning_rate": 2.128914700792146e-06, + "loss": 0.859, + "step": 2146 + }, + { + "epoch": 0.5220034038414783, + "grad_norm": 14.8125, + "learning_rate": 2.1285710067082147e-06, + "loss": 0.4744, + "step": 2147 + }, + { + "epoch": 0.5222465353756383, + "grad_norm": 16.125, + "learning_rate": 2.128227181306673e-06, + "loss": 0.8446, + "step": 2148 + }, + { + "epoch": 0.5224896669097983, + "grad_norm": 34.5, + "learning_rate": 2.1278832246389116e-06, + "loss": 0.8184, + "step": 2149 + }, + { + "epoch": 0.5227327984439581, + "grad_norm": 24.375, + "learning_rate": 2.1275391367563403e-06, + "loss": 1.5662, + "step": 2150 + }, + { + "epoch": 0.5229759299781181, + "grad_norm": 17.75, + "learning_rate": 2.1271949177103894e-06, + "loss": 0.6558, + "step": 2151 + }, + { + "epoch": 0.5232190615122781, + "grad_norm": 23.375, + "learning_rate": 2.1268505675525084e-06, + "loss": 0.8729, + "step": 2152 + }, + { + "epoch": 0.5234621930464382, + "grad_norm": 21.0, + "learning_rate": 2.1265060863341665e-06, + "loss": 1.039, + "step": 2153 + }, + { + "epoch": 0.5237053245805982, + "grad_norm": 17.5, + "learning_rate": 2.1261614741068522e-06, + "loss": 0.7471, + "step": 2154 + }, + { + "epoch": 0.523948456114758, + "grad_norm": 15.9375, + "learning_rate": 2.1258167309220737e-06, + "loss": 0.5395, + "step": 2155 + }, + { + "epoch": 0.524191587648918, + "grad_norm": 17.125, + "learning_rate": 2.125471856831359e-06, + "loss": 0.7584, + "step": 2156 + }, + { + "epoch": 0.524434719183078, + "grad_norm": 41.25, + "learning_rate": 2.125126851886255e-06, + "loss": 0.7686, + "step": 2157 + }, + { + "epoch": 0.524677850717238, + "grad_norm": 15.5, + "learning_rate": 2.1247817161383295e-06, + "loss": 0.4855, + "step": 2158 + }, + { + "epoch": 0.524920982251398, + "grad_norm": 16.875, + "learning_rate": 2.1244364496391684e-06, + "loss": 0.7316, + "step": 2159 + }, + { + "epoch": 0.5251641137855579, + "grad_norm": 20.375, + "learning_rate": 2.124091052440378e-06, + "loss": 1.142, + "step": 2160 + }, + { + "epoch": 0.5254072453197179, + "grad_norm": 22.75, + "learning_rate": 2.123745524593583e-06, + "loss": 0.729, + "step": 2161 + }, + { + "epoch": 0.5256503768538779, + "grad_norm": 21.0, + "learning_rate": 2.1233998661504297e-06, + "loss": 0.9577, + "step": 2162 + }, + { + "epoch": 0.525893508388038, + "grad_norm": 15.5625, + "learning_rate": 2.1230540771625823e-06, + "loss": 0.657, + "step": 2163 + }, + { + "epoch": 0.526136639922198, + "grad_norm": 16.75, + "learning_rate": 2.122708157681725e-06, + "loss": 1.1673, + "step": 2164 + }, + { + "epoch": 0.5263797714563578, + "grad_norm": 14.9375, + "learning_rate": 2.122362107759561e-06, + "loss": 0.7474, + "step": 2165 + }, + { + "epoch": 0.5266229029905178, + "grad_norm": 19.625, + "learning_rate": 2.1220159274478145e-06, + "loss": 0.9294, + "step": 2166 + }, + { + "epoch": 0.5268660345246778, + "grad_norm": 17.25, + "learning_rate": 2.121669616798227e-06, + "loss": 0.7146, + "step": 2167 + }, + { + "epoch": 0.5271091660588378, + "grad_norm": 23.25, + "learning_rate": 2.1213231758625606e-06, + "loss": 0.9958, + "step": 2168 + }, + { + "epoch": 0.5273522975929978, + "grad_norm": 18.5, + "learning_rate": 2.1209766046925976e-06, + "loss": 0.7999, + "step": 2169 + }, + { + "epoch": 0.5275954291271578, + "grad_norm": 15.0625, + "learning_rate": 2.120629903340139e-06, + "loss": 0.6183, + "step": 2170 + }, + { + "epoch": 0.5278385606613177, + "grad_norm": 20.75, + "learning_rate": 2.120283071857005e-06, + "loss": 0.8308, + "step": 2171 + }, + { + "epoch": 0.5280816921954777, + "grad_norm": 18.625, + "learning_rate": 2.1199361102950357e-06, + "loss": 0.6661, + "step": 2172 + }, + { + "epoch": 0.5283248237296377, + "grad_norm": 22.375, + "learning_rate": 2.11958901870609e-06, + "loss": 0.8004, + "step": 2173 + }, + { + "epoch": 0.5285679552637977, + "grad_norm": 23.5, + "learning_rate": 2.119241797142047e-06, + "loss": 1.2852, + "step": 2174 + }, + { + "epoch": 0.5288110867979577, + "grad_norm": 27.0, + "learning_rate": 2.1188944456548054e-06, + "loss": 0.9778, + "step": 2175 + }, + { + "epoch": 0.5290542183321176, + "grad_norm": 23.0, + "learning_rate": 2.1185469642962826e-06, + "loss": 0.8277, + "step": 2176 + }, + { + "epoch": 0.5292973498662776, + "grad_norm": 17.75, + "learning_rate": 2.1181993531184156e-06, + "loss": 0.6125, + "step": 2177 + }, + { + "epoch": 0.5295404814004376, + "grad_norm": 20.375, + "learning_rate": 2.117851612173161e-06, + "loss": 1.027, + "step": 2178 + }, + { + "epoch": 0.5297836129345976, + "grad_norm": 17.875, + "learning_rate": 2.1175037415124947e-06, + "loss": 0.8801, + "step": 2179 + }, + { + "epoch": 0.5300267444687576, + "grad_norm": 21.0, + "learning_rate": 2.1171557411884116e-06, + "loss": 1.0453, + "step": 2180 + }, + { + "epoch": 0.5302698760029175, + "grad_norm": 25.875, + "learning_rate": 2.116807611252927e-06, + "loss": 1.467, + "step": 2181 + }, + { + "epoch": 0.5305130075370775, + "grad_norm": 16.875, + "learning_rate": 2.1164593517580746e-06, + "loss": 0.9979, + "step": 2182 + }, + { + "epoch": 0.5307561390712375, + "grad_norm": 15.375, + "learning_rate": 2.116110962755908e-06, + "loss": 0.6979, + "step": 2183 + }, + { + "epoch": 0.5309992706053975, + "grad_norm": 31.0, + "learning_rate": 2.1157624442984993e-06, + "loss": 1.4906, + "step": 2184 + }, + { + "epoch": 0.5312424021395575, + "grad_norm": 19.625, + "learning_rate": 2.115413796437941e-06, + "loss": 0.9826, + "step": 2185 + }, + { + "epoch": 0.5314855336737175, + "grad_norm": 20.625, + "learning_rate": 2.115065019226345e-06, + "loss": 0.7923, + "step": 2186 + }, + { + "epoch": 0.5317286652078774, + "grad_norm": 20.625, + "learning_rate": 2.114716112715842e-06, + "loss": 1.0459, + "step": 2187 + }, + { + "epoch": 0.5319717967420374, + "grad_norm": 17.25, + "learning_rate": 2.114367076958581e-06, + "loss": 1.155, + "step": 2188 + }, + { + "epoch": 0.5322149282761974, + "grad_norm": 18.625, + "learning_rate": 2.1140179120067324e-06, + "loss": 1.0049, + "step": 2189 + }, + { + "epoch": 0.5324580598103574, + "grad_norm": 19.75, + "learning_rate": 2.113668617912485e-06, + "loss": 0.8863, + "step": 2190 + }, + { + "epoch": 0.5327011913445174, + "grad_norm": 15.375, + "learning_rate": 2.1133191947280465e-06, + "loss": 0.7787, + "step": 2191 + }, + { + "epoch": 0.5329443228786773, + "grad_norm": 21.125, + "learning_rate": 2.112969642505644e-06, + "loss": 0.8467, + "step": 2192 + }, + { + "epoch": 0.5331874544128373, + "grad_norm": 22.25, + "learning_rate": 2.112619961297525e-06, + "loss": 0.7615, + "step": 2193 + }, + { + "epoch": 0.5334305859469973, + "grad_norm": 23.625, + "learning_rate": 2.1122701511559548e-06, + "loss": 0.7331, + "step": 2194 + }, + { + "epoch": 0.5336737174811573, + "grad_norm": 17.875, + "learning_rate": 2.1119202121332185e-06, + "loss": 0.751, + "step": 2195 + }, + { + "epoch": 0.5339168490153173, + "grad_norm": 24.625, + "learning_rate": 2.11157014428162e-06, + "loss": 1.0139, + "step": 2196 + }, + { + "epoch": 0.5341599805494772, + "grad_norm": 17.125, + "learning_rate": 2.111219947653484e-06, + "loss": 0.7356, + "step": 2197 + }, + { + "epoch": 0.5344031120836372, + "grad_norm": 23.625, + "learning_rate": 2.1108696223011534e-06, + "loss": 1.0466, + "step": 2198 + }, + { + "epoch": 0.5346462436177972, + "grad_norm": 19.0, + "learning_rate": 2.1105191682769895e-06, + "loss": 0.7842, + "step": 2199 + }, + { + "epoch": 0.5348893751519572, + "grad_norm": 25.125, + "learning_rate": 2.1101685856333744e-06, + "loss": 1.2724, + "step": 2200 + }, + { + "epoch": 0.5351325066861172, + "grad_norm": 20.5, + "learning_rate": 2.1098178744227088e-06, + "loss": 0.8252, + "step": 2201 + }, + { + "epoch": 0.5353756382202771, + "grad_norm": 24.5, + "learning_rate": 2.109467034697412e-06, + "loss": 0.5705, + "step": 2202 + }, + { + "epoch": 0.5356187697544371, + "grad_norm": 22.25, + "learning_rate": 2.1091160665099235e-06, + "loss": 0.9459, + "step": 2203 + }, + { + "epoch": 0.5358619012885971, + "grad_norm": 19.875, + "learning_rate": 2.108764969912701e-06, + "loss": 1.189, + "step": 2204 + }, + { + "epoch": 0.5361050328227571, + "grad_norm": 23.625, + "learning_rate": 2.108413744958223e-06, + "loss": 1.2873, + "step": 2205 + }, + { + "epoch": 0.5363481643569171, + "grad_norm": 17.125, + "learning_rate": 2.108062391698985e-06, + "loss": 0.7731, + "step": 2206 + }, + { + "epoch": 0.5365912958910771, + "grad_norm": 18.625, + "learning_rate": 2.1077109101875036e-06, + "loss": 0.8861, + "step": 2207 + }, + { + "epoch": 0.536834427425237, + "grad_norm": 18.5, + "learning_rate": 2.1073593004763134e-06, + "loss": 1.0385, + "step": 2208 + }, + { + "epoch": 0.537077558959397, + "grad_norm": 24.375, + "learning_rate": 2.1070075626179686e-06, + "loss": 0.8896, + "step": 2209 + }, + { + "epoch": 0.537320690493557, + "grad_norm": 15.5625, + "learning_rate": 2.1066556966650427e-06, + "loss": 0.6847, + "step": 2210 + }, + { + "epoch": 0.537563822027717, + "grad_norm": 18.25, + "learning_rate": 2.1063037026701277e-06, + "loss": 0.8065, + "step": 2211 + }, + { + "epoch": 0.537806953561877, + "grad_norm": 24.75, + "learning_rate": 2.1059515806858357e-06, + "loss": 0.9747, + "step": 2212 + }, + { + "epoch": 0.5380500850960369, + "grad_norm": 22.75, + "learning_rate": 2.105599330764797e-06, + "loss": 1.2146, + "step": 2213 + }, + { + "epoch": 0.5382932166301969, + "grad_norm": 21.0, + "learning_rate": 2.105246952959662e-06, + "loss": 0.8581, + "step": 2214 + }, + { + "epoch": 0.5385363481643569, + "grad_norm": 24.5, + "learning_rate": 2.104894447323099e-06, + "loss": 1.2318, + "step": 2215 + }, + { + "epoch": 0.5387794796985169, + "grad_norm": 22.875, + "learning_rate": 2.104541813907796e-06, + "loss": 1.3717, + "step": 2216 + }, + { + "epoch": 0.5390226112326769, + "grad_norm": 16.375, + "learning_rate": 2.104189052766461e-06, + "loss": 0.9492, + "step": 2217 + }, + { + "epoch": 0.5392657427668368, + "grad_norm": 26.75, + "learning_rate": 2.1038361639518195e-06, + "loss": 0.9797, + "step": 2218 + }, + { + "epoch": 0.5395088743009968, + "grad_norm": 35.5, + "learning_rate": 2.1034831475166166e-06, + "loss": 0.8712, + "step": 2219 + }, + { + "epoch": 0.5397520058351568, + "grad_norm": 25.125, + "learning_rate": 2.103130003513618e-06, + "loss": 1.3308, + "step": 2220 + }, + { + "epoch": 0.5399951373693168, + "grad_norm": 13.1875, + "learning_rate": 2.1027767319956055e-06, + "loss": 0.3165, + "step": 2221 + }, + { + "epoch": 0.5402382689034768, + "grad_norm": 20.375, + "learning_rate": 2.1024233330153828e-06, + "loss": 0.7538, + "step": 2222 + }, + { + "epoch": 0.5404814004376368, + "grad_norm": 20.25, + "learning_rate": 2.1020698066257707e-06, + "loss": 0.9751, + "step": 2223 + }, + { + "epoch": 0.5407245319717967, + "grad_norm": 17.875, + "learning_rate": 2.101716152879611e-06, + "loss": 0.8774, + "step": 2224 + }, + { + "epoch": 0.5409676635059567, + "grad_norm": 17.75, + "learning_rate": 2.1013623718297623e-06, + "loss": 0.8974, + "step": 2225 + }, + { + "epoch": 0.5412107950401167, + "grad_norm": 18.875, + "learning_rate": 2.1010084635291036e-06, + "loss": 0.8043, + "step": 2226 + }, + { + "epoch": 0.5414539265742767, + "grad_norm": 21.75, + "learning_rate": 2.1006544280305325e-06, + "loss": 0.8488, + "step": 2227 + }, + { + "epoch": 0.5416970581084367, + "grad_norm": 22.0, + "learning_rate": 2.1003002653869658e-06, + "loss": 0.9424, + "step": 2228 + }, + { + "epoch": 0.5419401896425966, + "grad_norm": 20.75, + "learning_rate": 2.099945975651339e-06, + "loss": 0.9605, + "step": 2229 + }, + { + "epoch": 0.5421833211767566, + "grad_norm": 24.375, + "learning_rate": 2.0995915588766074e-06, + "loss": 1.0632, + "step": 2230 + }, + { + "epoch": 0.5424264527109166, + "grad_norm": 25.0, + "learning_rate": 2.0992370151157444e-06, + "loss": 1.0454, + "step": 2231 + }, + { + "epoch": 0.5426695842450766, + "grad_norm": 17.25, + "learning_rate": 2.0988823444217426e-06, + "loss": 0.8084, + "step": 2232 + }, + { + "epoch": 0.5429127157792366, + "grad_norm": 17.75, + "learning_rate": 2.0985275468476137e-06, + "loss": 0.9918, + "step": 2233 + }, + { + "epoch": 0.5431558473133965, + "grad_norm": 19.5, + "learning_rate": 2.098172622446388e-06, + "loss": 0.9706, + "step": 2234 + }, + { + "epoch": 0.5433989788475565, + "grad_norm": 23.625, + "learning_rate": 2.097817571271116e-06, + "loss": 0.8768, + "step": 2235 + }, + { + "epoch": 0.5436421103817165, + "grad_norm": 18.75, + "learning_rate": 2.0974623933748655e-06, + "loss": 0.9007, + "step": 2236 + }, + { + "epoch": 0.5438852419158765, + "grad_norm": 14.8125, + "learning_rate": 2.097107088810724e-06, + "loss": 0.5056, + "step": 2237 + }, + { + "epoch": 0.5441283734500365, + "grad_norm": 17.25, + "learning_rate": 2.096751657631798e-06, + "loss": 1.0783, + "step": 2238 + }, + { + "epoch": 0.5443715049841964, + "grad_norm": 22.875, + "learning_rate": 2.0963960998912132e-06, + "loss": 0.9539, + "step": 2239 + }, + { + "epoch": 0.5446146365183564, + "grad_norm": 20.375, + "learning_rate": 2.0960404156421133e-06, + "loss": 0.9082, + "step": 2240 + }, + { + "epoch": 0.5448577680525164, + "grad_norm": 23.375, + "learning_rate": 2.095684604937662e-06, + "loss": 1.0703, + "step": 2241 + }, + { + "epoch": 0.5451008995866764, + "grad_norm": 15.3125, + "learning_rate": 2.09532866783104e-06, + "loss": 0.6463, + "step": 2242 + }, + { + "epoch": 0.5453440311208364, + "grad_norm": 18.5, + "learning_rate": 2.09497260437545e-06, + "loss": 0.8202, + "step": 2243 + }, + { + "epoch": 0.5455871626549964, + "grad_norm": 21.0, + "learning_rate": 2.094616414624111e-06, + "loss": 0.8521, + "step": 2244 + }, + { + "epoch": 0.5458302941891563, + "grad_norm": 33.0, + "learning_rate": 2.094260098630262e-06, + "loss": 1.0215, + "step": 2245 + }, + { + "epoch": 0.5460734257233163, + "grad_norm": 22.375, + "learning_rate": 2.09390365644716e-06, + "loss": 1.2234, + "step": 2246 + }, + { + "epoch": 0.5463165572574763, + "grad_norm": 20.5, + "learning_rate": 2.093547088128082e-06, + "loss": 0.7184, + "step": 2247 + }, + { + "epoch": 0.5465596887916363, + "grad_norm": 34.25, + "learning_rate": 2.093190393726323e-06, + "loss": 1.2451, + "step": 2248 + }, + { + "epoch": 0.5468028203257963, + "grad_norm": 14.375, + "learning_rate": 2.0928335732951976e-06, + "loss": 0.5257, + "step": 2249 + }, + { + "epoch": 0.5470459518599562, + "grad_norm": 21.125, + "learning_rate": 2.0924766268880382e-06, + "loss": 0.7474, + "step": 2250 + }, + { + "epoch": 0.5472890833941162, + "grad_norm": 34.0, + "learning_rate": 2.0921195545581967e-06, + "loss": 0.8276, + "step": 2251 + }, + { + "epoch": 0.5475322149282762, + "grad_norm": 31.625, + "learning_rate": 2.091762356359044e-06, + "loss": 0.9416, + "step": 2252 + }, + { + "epoch": 0.5477753464624362, + "grad_norm": 21.75, + "learning_rate": 2.0914050323439695e-06, + "loss": 1.4075, + "step": 2253 + }, + { + "epoch": 0.5480184779965962, + "grad_norm": 24.5, + "learning_rate": 2.0910475825663813e-06, + "loss": 1.0821, + "step": 2254 + }, + { + "epoch": 0.5482616095307561, + "grad_norm": 15.0, + "learning_rate": 2.0906900070797067e-06, + "loss": 0.6176, + "step": 2255 + }, + { + "epoch": 0.5485047410649161, + "grad_norm": 12.8125, + "learning_rate": 2.090332305937391e-06, + "loss": 0.4484, + "step": 2256 + }, + { + "epoch": 0.5487478725990761, + "grad_norm": 24.375, + "learning_rate": 2.089974479192899e-06, + "loss": 1.1816, + "step": 2257 + }, + { + "epoch": 0.5489910041332361, + "grad_norm": 15.4375, + "learning_rate": 2.0896165268997145e-06, + "loss": 1.0037, + "step": 2258 + }, + { + "epoch": 0.5492341356673961, + "grad_norm": 16.25, + "learning_rate": 2.089258449111339e-06, + "loss": 0.5961, + "step": 2259 + }, + { + "epoch": 0.5494772672015561, + "grad_norm": 27.75, + "learning_rate": 2.088900245881294e-06, + "loss": 0.9547, + "step": 2260 + }, + { + "epoch": 0.549720398735716, + "grad_norm": 18.375, + "learning_rate": 2.0885419172631192e-06, + "loss": 1.0561, + "step": 2261 + }, + { + "epoch": 0.549963530269876, + "grad_norm": 20.0, + "learning_rate": 2.088183463310373e-06, + "loss": 0.9799, + "step": 2262 + }, + { + "epoch": 0.550206661804036, + "grad_norm": 15.625, + "learning_rate": 2.0878248840766317e-06, + "loss": 0.5904, + "step": 2263 + }, + { + "epoch": 0.550449793338196, + "grad_norm": 24.75, + "learning_rate": 2.0874661796154916e-06, + "loss": 0.7311, + "step": 2264 + }, + { + "epoch": 0.550692924872356, + "grad_norm": 18.5, + "learning_rate": 2.087107349980568e-06, + "loss": 0.7602, + "step": 2265 + }, + { + "epoch": 0.5509360564065159, + "grad_norm": 15.9375, + "learning_rate": 2.086748395225493e-06, + "loss": 0.6092, + "step": 2266 + }, + { + "epoch": 0.5511791879406759, + "grad_norm": 18.625, + "learning_rate": 2.08638931540392e-06, + "loss": 0.7429, + "step": 2267 + }, + { + "epoch": 0.5514223194748359, + "grad_norm": 32.5, + "learning_rate": 2.086030110569518e-06, + "loss": 0.9813, + "step": 2268 + }, + { + "epoch": 0.5516654510089959, + "grad_norm": 15.6875, + "learning_rate": 2.0856707807759776e-06, + "loss": 0.6213, + "step": 2269 + }, + { + "epoch": 0.5519085825431559, + "grad_norm": 21.5, + "learning_rate": 2.0853113260770063e-06, + "loss": 1.0692, + "step": 2270 + }, + { + "epoch": 0.5521517140773158, + "grad_norm": 23.0, + "learning_rate": 2.0849517465263315e-06, + "loss": 1.0135, + "step": 2271 + }, + { + "epoch": 0.5523948456114758, + "grad_norm": 18.0, + "learning_rate": 2.084592042177698e-06, + "loss": 1.0545, + "step": 2272 + }, + { + "epoch": 0.5526379771456358, + "grad_norm": 12.3125, + "learning_rate": 2.084232213084869e-06, + "loss": 0.5815, + "step": 2273 + }, + { + "epoch": 0.5528811086797958, + "grad_norm": 21.375, + "learning_rate": 2.0838722593016288e-06, + "loss": 0.8707, + "step": 2274 + }, + { + "epoch": 0.5531242402139558, + "grad_norm": 25.625, + "learning_rate": 2.083512180881778e-06, + "loss": 0.76, + "step": 2275 + }, + { + "epoch": 0.5533673717481157, + "grad_norm": 14.25, + "learning_rate": 2.0831519778791363e-06, + "loss": 0.4806, + "step": 2276 + }, + { + "epoch": 0.5536105032822757, + "grad_norm": 13.75, + "learning_rate": 2.0827916503475425e-06, + "loss": 0.5682, + "step": 2277 + }, + { + "epoch": 0.5538536348164357, + "grad_norm": 26.875, + "learning_rate": 2.082431198340854e-06, + "loss": 0.9897, + "step": 2278 + }, + { + "epoch": 0.5540967663505957, + "grad_norm": 18.75, + "learning_rate": 2.082070621912946e-06, + "loss": 1.3372, + "step": 2279 + }, + { + "epoch": 0.5543398978847557, + "grad_norm": 24.0, + "learning_rate": 2.0817099211177137e-06, + "loss": 1.2054, + "step": 2280 + }, + { + "epoch": 0.5545830294189157, + "grad_norm": 26.5, + "learning_rate": 2.081349096009069e-06, + "loss": 0.9381, + "step": 2281 + }, + { + "epoch": 0.5548261609530756, + "grad_norm": 22.375, + "learning_rate": 2.0809881466409444e-06, + "loss": 0.9875, + "step": 2282 + }, + { + "epoch": 0.5550692924872356, + "grad_norm": 23.375, + "learning_rate": 2.080627073067289e-06, + "loss": 0.9427, + "step": 2283 + }, + { + "epoch": 0.5553124240213956, + "grad_norm": 18.75, + "learning_rate": 2.0802658753420728e-06, + "loss": 0.8702, + "step": 2284 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 17.5, + "learning_rate": 2.0799045535192817e-06, + "loss": 0.5937, + "step": 2285 + }, + { + "epoch": 0.5557986870897156, + "grad_norm": 21.75, + "learning_rate": 2.0795431076529226e-06, + "loss": 1.0784, + "step": 2286 + }, + { + "epoch": 0.5560418186238755, + "grad_norm": 21.25, + "learning_rate": 2.0791815377970197e-06, + "loss": 1.1514, + "step": 2287 + }, + { + "epoch": 0.5562849501580355, + "grad_norm": 16.25, + "learning_rate": 2.078819844005615e-06, + "loss": 0.6522, + "step": 2288 + }, + { + "epoch": 0.5565280816921955, + "grad_norm": 17.5, + "learning_rate": 2.07845802633277e-06, + "loss": 0.9068, + "step": 2289 + }, + { + "epoch": 0.5567712132263555, + "grad_norm": 21.0, + "learning_rate": 2.078096084832566e-06, + "loss": 0.8342, + "step": 2290 + }, + { + "epoch": 0.5570143447605155, + "grad_norm": 21.5, + "learning_rate": 2.0777340195590996e-06, + "loss": 0.6393, + "step": 2291 + }, + { + "epoch": 0.5572574762946754, + "grad_norm": 21.375, + "learning_rate": 2.0773718305664887e-06, + "loss": 0.6652, + "step": 2292 + }, + { + "epoch": 0.5575006078288354, + "grad_norm": 22.75, + "learning_rate": 2.0770095179088688e-06, + "loss": 0.7814, + "step": 2293 + }, + { + "epoch": 0.5577437393629954, + "grad_norm": 18.375, + "learning_rate": 2.0766470816403935e-06, + "loss": 0.9092, + "step": 2294 + }, + { + "epoch": 0.5579868708971554, + "grad_norm": 21.5, + "learning_rate": 2.076284521815235e-06, + "loss": 0.9198, + "step": 2295 + }, + { + "epoch": 0.5582300024313154, + "grad_norm": 21.125, + "learning_rate": 2.075921838487584e-06, + "loss": 0.7984, + "step": 2296 + }, + { + "epoch": 0.5584731339654754, + "grad_norm": 17.75, + "learning_rate": 2.07555903171165e-06, + "loss": 0.8968, + "step": 2297 + }, + { + "epoch": 0.5587162654996353, + "grad_norm": 22.0, + "learning_rate": 2.0751961015416617e-06, + "loss": 1.0454, + "step": 2298 + }, + { + "epoch": 0.5589593970337953, + "grad_norm": 14.9375, + "learning_rate": 2.0748330480318637e-06, + "loss": 0.7281, + "step": 2299 + }, + { + "epoch": 0.5592025285679553, + "grad_norm": 15.1875, + "learning_rate": 2.0744698712365215e-06, + "loss": 0.7593, + "step": 2300 + }, + { + "epoch": 0.5594456601021153, + "grad_norm": 22.25, + "learning_rate": 2.074106571209918e-06, + "loss": 0.8927, + "step": 2301 + }, + { + "epoch": 0.5596887916362753, + "grad_norm": 16.75, + "learning_rate": 2.073743148006354e-06, + "loss": 0.9008, + "step": 2302 + }, + { + "epoch": 0.5599319231704352, + "grad_norm": 30.25, + "learning_rate": 2.073379601680151e-06, + "loss": 0.936, + "step": 2303 + }, + { + "epoch": 0.5601750547045952, + "grad_norm": 19.25, + "learning_rate": 2.0730159322856454e-06, + "loss": 0.7952, + "step": 2304 + }, + { + "epoch": 0.5604181862387552, + "grad_norm": 17.5, + "learning_rate": 2.0726521398771956e-06, + "loss": 0.6307, + "step": 2305 + }, + { + "epoch": 0.5606613177729152, + "grad_norm": 19.125, + "learning_rate": 2.0722882245091753e-06, + "loss": 0.8667, + "step": 2306 + }, + { + "epoch": 0.5609044493070752, + "grad_norm": 14.0625, + "learning_rate": 2.0719241862359786e-06, + "loss": 0.623, + "step": 2307 + }, + { + "epoch": 0.5611475808412351, + "grad_norm": 21.125, + "learning_rate": 2.0715600251120167e-06, + "loss": 1.173, + "step": 2308 + }, + { + "epoch": 0.5613907123753951, + "grad_norm": 16.25, + "learning_rate": 2.0711957411917207e-06, + "loss": 0.8189, + "step": 2309 + }, + { + "epoch": 0.5616338439095551, + "grad_norm": 16.25, + "learning_rate": 2.0708313345295384e-06, + "loss": 0.6672, + "step": 2310 + }, + { + "epoch": 0.5618769754437151, + "grad_norm": 19.125, + "learning_rate": 2.070466805179937e-06, + "loss": 0.7677, + "step": 2311 + }, + { + "epoch": 0.5621201069778751, + "grad_norm": 18.625, + "learning_rate": 2.0701021531974014e-06, + "loss": 0.9986, + "step": 2312 + }, + { + "epoch": 0.562363238512035, + "grad_norm": 26.0, + "learning_rate": 2.0697373786364357e-06, + "loss": 0.9489, + "step": 2313 + }, + { + "epoch": 0.562606370046195, + "grad_norm": 42.75, + "learning_rate": 2.0693724815515612e-06, + "loss": 1.7043, + "step": 2314 + }, + { + "epoch": 0.562849501580355, + "grad_norm": 18.375, + "learning_rate": 2.0690074619973185e-06, + "loss": 0.7949, + "step": 2315 + }, + { + "epoch": 0.563092633114515, + "grad_norm": 19.875, + "learning_rate": 2.0686423200282652e-06, + "loss": 1.1615, + "step": 2316 + }, + { + "epoch": 0.563335764648675, + "grad_norm": 16.875, + "learning_rate": 2.0682770556989797e-06, + "loss": 0.8947, + "step": 2317 + }, + { + "epoch": 0.563578896182835, + "grad_norm": 15.375, + "learning_rate": 2.0679116690640556e-06, + "loss": 0.5107, + "step": 2318 + }, + { + "epoch": 0.5638220277169949, + "grad_norm": 17.75, + "learning_rate": 2.0675461601781067e-06, + "loss": 0.8202, + "step": 2319 + }, + { + "epoch": 0.5640651592511549, + "grad_norm": 20.75, + "learning_rate": 2.0671805290957646e-06, + "loss": 1.0881, + "step": 2320 + }, + { + "epoch": 0.5643082907853149, + "grad_norm": 19.125, + "learning_rate": 2.0668147758716792e-06, + "loss": 0.9051, + "step": 2321 + }, + { + "epoch": 0.5645514223194749, + "grad_norm": 28.375, + "learning_rate": 2.0664489005605187e-06, + "loss": 0.9758, + "step": 2322 + }, + { + "epoch": 0.5647945538536349, + "grad_norm": 18.375, + "learning_rate": 2.0660829032169695e-06, + "loss": 0.8816, + "step": 2323 + }, + { + "epoch": 0.5650376853877948, + "grad_norm": 17.875, + "learning_rate": 2.0657167838957365e-06, + "loss": 0.7318, + "step": 2324 + }, + { + "epoch": 0.5652808169219548, + "grad_norm": 17.5, + "learning_rate": 2.065350542651542e-06, + "loss": 0.6686, + "step": 2325 + }, + { + "epoch": 0.5655239484561148, + "grad_norm": 12.125, + "learning_rate": 2.064984179539127e-06, + "loss": 0.3821, + "step": 2326 + }, + { + "epoch": 0.5657670799902748, + "grad_norm": 22.375, + "learning_rate": 2.064617694613251e-06, + "loss": 1.1379, + "step": 2327 + }, + { + "epoch": 0.5660102115244348, + "grad_norm": 17.75, + "learning_rate": 2.0642510879286924e-06, + "loss": 0.8428, + "step": 2328 + }, + { + "epoch": 0.5662533430585946, + "grad_norm": 20.0, + "learning_rate": 2.0638843595402456e-06, + "loss": 1.07, + "step": 2329 + }, + { + "epoch": 0.5664964745927547, + "grad_norm": 27.625, + "learning_rate": 2.063517509502725e-06, + "loss": 1.1483, + "step": 2330 + }, + { + "epoch": 0.5667396061269147, + "grad_norm": 20.25, + "learning_rate": 2.063150537870963e-06, + "loss": 0.9096, + "step": 2331 + }, + { + "epoch": 0.5669827376610747, + "grad_norm": 19.25, + "learning_rate": 2.062783444699809e-06, + "loss": 0.633, + "step": 2332 + }, + { + "epoch": 0.5672258691952347, + "grad_norm": 30.875, + "learning_rate": 2.0624162300441327e-06, + "loss": 1.0003, + "step": 2333 + }, + { + "epoch": 0.5674690007293947, + "grad_norm": 30.375, + "learning_rate": 2.062048893958819e-06, + "loss": 1.4803, + "step": 2334 + }, + { + "epoch": 0.5677121322635545, + "grad_norm": 30.75, + "learning_rate": 2.0616814364987738e-06, + "loss": 1.3023, + "step": 2335 + }, + { + "epoch": 0.5679552637977145, + "grad_norm": 23.375, + "learning_rate": 2.0613138577189203e-06, + "loss": 0.8766, + "step": 2336 + }, + { + "epoch": 0.5681983953318746, + "grad_norm": 21.0, + "learning_rate": 2.060946157674198e-06, + "loss": 0.7785, + "step": 2337 + }, + { + "epoch": 0.5684415268660346, + "grad_norm": 18.375, + "learning_rate": 2.0605783364195676e-06, + "loss": 0.9711, + "step": 2338 + }, + { + "epoch": 0.5686846584001946, + "grad_norm": 17.625, + "learning_rate": 2.060210394010005e-06, + "loss": 0.9325, + "step": 2339 + }, + { + "epoch": 0.5689277899343544, + "grad_norm": 16.25, + "learning_rate": 2.0598423305005065e-06, + "loss": 0.8241, + "step": 2340 + }, + { + "epoch": 0.5691709214685144, + "grad_norm": 17.875, + "learning_rate": 2.059474145946086e-06, + "loss": 0.7612, + "step": 2341 + }, + { + "epoch": 0.5694140530026744, + "grad_norm": 16.0, + "learning_rate": 2.0591058404017735e-06, + "loss": 0.7962, + "step": 2342 + }, + { + "epoch": 0.5696571845368344, + "grad_norm": 18.75, + "learning_rate": 2.058737413922619e-06, + "loss": 1.0329, + "step": 2343 + }, + { + "epoch": 0.5699003160709945, + "grad_norm": 30.375, + "learning_rate": 2.0583688665636915e-06, + "loss": 1.0044, + "step": 2344 + }, + { + "epoch": 0.5701434476051543, + "grad_norm": 21.75, + "learning_rate": 2.0580001983800752e-06, + "loss": 1.1053, + "step": 2345 + }, + { + "epoch": 0.5703865791393143, + "grad_norm": 18.5, + "learning_rate": 2.0576314094268753e-06, + "loss": 1.1189, + "step": 2346 + }, + { + "epoch": 0.5706297106734743, + "grad_norm": 16.375, + "learning_rate": 2.0572624997592124e-06, + "loss": 0.6924, + "step": 2347 + }, + { + "epoch": 0.5708728422076343, + "grad_norm": 26.125, + "learning_rate": 2.0568934694322274e-06, + "loss": 1.156, + "step": 2348 + }, + { + "epoch": 0.5711159737417943, + "grad_norm": 16.625, + "learning_rate": 2.0565243185010776e-06, + "loss": 0.633, + "step": 2349 + }, + { + "epoch": 0.5713591052759542, + "grad_norm": 26.875, + "learning_rate": 2.056155047020939e-06, + "loss": 0.922, + "step": 2350 + }, + { + "epoch": 0.5716022368101142, + "grad_norm": 19.875, + "learning_rate": 2.055785655047006e-06, + "loss": 0.7393, + "step": 2351 + }, + { + "epoch": 0.5718453683442742, + "grad_norm": 16.5, + "learning_rate": 2.055416142634491e-06, + "loss": 0.7964, + "step": 2352 + }, + { + "epoch": 0.5720884998784342, + "grad_norm": 20.5, + "learning_rate": 2.055046509838623e-06, + "loss": 0.9522, + "step": 2353 + }, + { + "epoch": 0.5723316314125942, + "grad_norm": 14.625, + "learning_rate": 2.05467675671465e-06, + "loss": 0.7326, + "step": 2354 + }, + { + "epoch": 0.5725747629467542, + "grad_norm": 20.0, + "learning_rate": 2.0543068833178394e-06, + "loss": 0.659, + "step": 2355 + }, + { + "epoch": 0.5728178944809141, + "grad_norm": 20.125, + "learning_rate": 2.053936889703474e-06, + "loss": 0.7541, + "step": 2356 + }, + { + "epoch": 0.5730610260150741, + "grad_norm": 16.625, + "learning_rate": 2.0535667759268556e-06, + "loss": 0.9131, + "step": 2357 + }, + { + "epoch": 0.5733041575492341, + "grad_norm": 29.125, + "learning_rate": 2.0531965420433046e-06, + "loss": 1.2867, + "step": 2358 + }, + { + "epoch": 0.5735472890833941, + "grad_norm": 21.25, + "learning_rate": 2.0528261881081587e-06, + "loss": 1.1126, + "step": 2359 + }, + { + "epoch": 0.5737904206175541, + "grad_norm": 17.375, + "learning_rate": 2.052455714176774e-06, + "loss": 0.9647, + "step": 2360 + }, + { + "epoch": 0.574033552151714, + "grad_norm": 25.375, + "learning_rate": 2.0520851203045236e-06, + "loss": 0.9846, + "step": 2361 + }, + { + "epoch": 0.574276683685874, + "grad_norm": 20.5, + "learning_rate": 2.0517144065467993e-06, + "loss": 0.9121, + "step": 2362 + }, + { + "epoch": 0.574519815220034, + "grad_norm": 19.25, + "learning_rate": 2.0513435729590114e-06, + "loss": 0.7062, + "step": 2363 + }, + { + "epoch": 0.574762946754194, + "grad_norm": 20.5, + "learning_rate": 2.0509726195965863e-06, + "loss": 0.8853, + "step": 2364 + }, + { + "epoch": 0.575006078288354, + "grad_norm": 18.375, + "learning_rate": 2.0506015465149704e-06, + "loss": 0.7734, + "step": 2365 + }, + { + "epoch": 0.5752492098225139, + "grad_norm": 17.5, + "learning_rate": 2.050230353769626e-06, + "loss": 0.7197, + "step": 2366 + }, + { + "epoch": 0.5754923413566739, + "grad_norm": 20.125, + "learning_rate": 2.049859041416035e-06, + "loss": 0.8364, + "step": 2367 + }, + { + "epoch": 0.5757354728908339, + "grad_norm": 14.5, + "learning_rate": 2.0494876095096964e-06, + "loss": 0.7339, + "step": 2368 + }, + { + "epoch": 0.5759786044249939, + "grad_norm": 14.9375, + "learning_rate": 2.049116058106127e-06, + "loss": 0.757, + "step": 2369 + }, + { + "epoch": 0.5762217359591539, + "grad_norm": 21.75, + "learning_rate": 2.0487443872608613e-06, + "loss": 1.1455, + "step": 2370 + }, + { + "epoch": 0.5764648674933139, + "grad_norm": 21.5, + "learning_rate": 2.048372597029452e-06, + "loss": 0.7505, + "step": 2371 + }, + { + "epoch": 0.5767079990274738, + "grad_norm": 15.8125, + "learning_rate": 2.04800068746747e-06, + "loss": 0.6198, + "step": 2372 + }, + { + "epoch": 0.5769511305616338, + "grad_norm": 18.0, + "learning_rate": 2.047628658630503e-06, + "loss": 0.9446, + "step": 2373 + }, + { + "epoch": 0.5771942620957938, + "grad_norm": 15.5, + "learning_rate": 2.0472565105741578e-06, + "loss": 1.1734, + "step": 2374 + }, + { + "epoch": 0.5774373936299538, + "grad_norm": 19.625, + "learning_rate": 2.0468842433540576e-06, + "loss": 1.1002, + "step": 2375 + }, + { + "epoch": 0.5776805251641138, + "grad_norm": 16.625, + "learning_rate": 2.046511857025845e-06, + "loss": 1.0567, + "step": 2376 + }, + { + "epoch": 0.5779236566982737, + "grad_norm": 17.875, + "learning_rate": 2.0461393516451785e-06, + "loss": 0.7922, + "step": 2377 + }, + { + "epoch": 0.5781667882324337, + "grad_norm": 24.375, + "learning_rate": 2.0457667272677366e-06, + "loss": 1.0617, + "step": 2378 + }, + { + "epoch": 0.5784099197665937, + "grad_norm": 24.375, + "learning_rate": 2.0453939839492137e-06, + "loss": 0.9944, + "step": 2379 + }, + { + "epoch": 0.5786530513007537, + "grad_norm": 17.25, + "learning_rate": 2.0450211217453235e-06, + "loss": 0.962, + "step": 2380 + }, + { + "epoch": 0.5788961828349137, + "grad_norm": 15.5, + "learning_rate": 2.0446481407117953e-06, + "loss": 0.5756, + "step": 2381 + }, + { + "epoch": 0.5791393143690736, + "grad_norm": 17.75, + "learning_rate": 2.044275040904379e-06, + "loss": 1.2556, + "step": 2382 + }, + { + "epoch": 0.5793824459032336, + "grad_norm": 22.875, + "learning_rate": 2.0439018223788402e-06, + "loss": 0.9041, + "step": 2383 + }, + { + "epoch": 0.5796255774373936, + "grad_norm": 14.5625, + "learning_rate": 2.043528485190963e-06, + "loss": 0.414, + "step": 2384 + }, + { + "epoch": 0.5798687089715536, + "grad_norm": 18.375, + "learning_rate": 2.0431550293965486e-06, + "loss": 0.8813, + "step": 2385 + }, + { + "epoch": 0.5801118405057136, + "grad_norm": 21.75, + "learning_rate": 2.042781455051417e-06, + "loss": 0.7461, + "step": 2386 + }, + { + "epoch": 0.5803549720398735, + "grad_norm": 19.75, + "learning_rate": 2.042407762211405e-06, + "loss": 1.0001, + "step": 2387 + }, + { + "epoch": 0.5805981035740335, + "grad_norm": 19.125, + "learning_rate": 2.042033950932368e-06, + "loss": 1.0402, + "step": 2388 + }, + { + "epoch": 0.5808412351081935, + "grad_norm": 20.125, + "learning_rate": 2.0416600212701777e-06, + "loss": 1.1356, + "step": 2389 + }, + { + "epoch": 0.5810843666423535, + "grad_norm": 21.125, + "learning_rate": 2.041285973280725e-06, + "loss": 0.9779, + "step": 2390 + }, + { + "epoch": 0.5813274981765135, + "grad_norm": 15.5625, + "learning_rate": 2.0409118070199175e-06, + "loss": 0.5775, + "step": 2391 + }, + { + "epoch": 0.5815706297106735, + "grad_norm": 19.375, + "learning_rate": 2.040537522543681e-06, + "loss": 0.9431, + "step": 2392 + }, + { + "epoch": 0.5818137612448334, + "grad_norm": 18.625, + "learning_rate": 2.0401631199079584e-06, + "loss": 0.9297, + "step": 2393 + }, + { + "epoch": 0.5820568927789934, + "grad_norm": 25.0, + "learning_rate": 2.039788599168711e-06, + "loss": 1.3637, + "step": 2394 + }, + { + "epoch": 0.5823000243131534, + "grad_norm": 23.75, + "learning_rate": 2.0394139603819176e-06, + "loss": 1.4877, + "step": 2395 + }, + { + "epoch": 0.5825431558473134, + "grad_norm": 24.75, + "learning_rate": 2.039039203603574e-06, + "loss": 1.1025, + "step": 2396 + }, + { + "epoch": 0.5827862873814734, + "grad_norm": 23.625, + "learning_rate": 2.0386643288896944e-06, + "loss": 1.1648, + "step": 2397 + }, + { + "epoch": 0.5830294189156333, + "grad_norm": 20.375, + "learning_rate": 2.0382893362963102e-06, + "loss": 0.6469, + "step": 2398 + }, + { + "epoch": 0.5832725504497933, + "grad_norm": 20.375, + "learning_rate": 2.0379142258794703e-06, + "loss": 0.8905, + "step": 2399 + }, + { + "epoch": 0.5835156819839533, + "grad_norm": 14.4375, + "learning_rate": 2.0375389976952416e-06, + "loss": 0.4437, + "step": 2400 + }, + { + "epoch": 0.5837588135181133, + "grad_norm": 19.625, + "learning_rate": 2.0371636517997085e-06, + "loss": 1.5071, + "step": 2401 + }, + { + "epoch": 0.5840019450522733, + "grad_norm": 18.5, + "learning_rate": 2.0367881882489727e-06, + "loss": 0.8795, + "step": 2402 + }, + { + "epoch": 0.5842450765864332, + "grad_norm": 21.5, + "learning_rate": 2.0364126070991543e-06, + "loss": 0.9467, + "step": 2403 + }, + { + "epoch": 0.5844882081205932, + "grad_norm": 19.75, + "learning_rate": 2.036036908406389e-06, + "loss": 1.3867, + "step": 2404 + }, + { + "epoch": 0.5847313396547532, + "grad_norm": 20.125, + "learning_rate": 2.0356610922268335e-06, + "loss": 0.7784, + "step": 2405 + }, + { + "epoch": 0.5849744711889132, + "grad_norm": 23.125, + "learning_rate": 2.035285158616658e-06, + "loss": 0.912, + "step": 2406 + }, + { + "epoch": 0.5852176027230732, + "grad_norm": 26.125, + "learning_rate": 2.034909107632054e-06, + "loss": 1.0614, + "step": 2407 + }, + { + "epoch": 0.5854607342572332, + "grad_norm": 17.25, + "learning_rate": 2.0345329393292272e-06, + "loss": 0.643, + "step": 2408 + }, + { + "epoch": 0.5857038657913931, + "grad_norm": 19.625, + "learning_rate": 2.034156653764404e-06, + "loss": 0.7394, + "step": 2409 + }, + { + "epoch": 0.5859469973255531, + "grad_norm": 18.625, + "learning_rate": 2.033780250993826e-06, + "loss": 0.7713, + "step": 2410 + }, + { + "epoch": 0.5861901288597131, + "grad_norm": 21.5, + "learning_rate": 2.033403731073753e-06, + "loss": 1.119, + "step": 2411 + }, + { + "epoch": 0.5864332603938731, + "grad_norm": 19.75, + "learning_rate": 2.033027094060462e-06, + "loss": 1.2968, + "step": 2412 + }, + { + "epoch": 0.5866763919280331, + "grad_norm": 15.5, + "learning_rate": 2.0326503400102494e-06, + "loss": 0.6065, + "step": 2413 + }, + { + "epoch": 0.586919523462193, + "grad_norm": 18.0, + "learning_rate": 2.0322734689794262e-06, + "loss": 0.7435, + "step": 2414 + }, + { + "epoch": 0.587162654996353, + "grad_norm": 14.75, + "learning_rate": 2.0318964810243224e-06, + "loss": 0.4709, + "step": 2415 + }, + { + "epoch": 0.587405786530513, + "grad_norm": 27.125, + "learning_rate": 2.031519376201286e-06, + "loss": 0.9531, + "step": 2416 + }, + { + "epoch": 0.587648918064673, + "grad_norm": 16.25, + "learning_rate": 2.0311421545666817e-06, + "loss": 0.7211, + "step": 2417 + }, + { + "epoch": 0.587892049598833, + "grad_norm": 17.625, + "learning_rate": 2.0307648161768914e-06, + "loss": 0.83, + "step": 2418 + }, + { + "epoch": 0.5881351811329929, + "grad_norm": 17.25, + "learning_rate": 2.030387361088315e-06, + "loss": 0.8202, + "step": 2419 + }, + { + "epoch": 0.5883783126671529, + "grad_norm": 13.625, + "learning_rate": 2.0300097893573694e-06, + "loss": 0.3608, + "step": 2420 + }, + { + "epoch": 0.5886214442013129, + "grad_norm": 20.375, + "learning_rate": 2.02963210104049e-06, + "loss": 0.7299, + "step": 2421 + }, + { + "epoch": 0.5888645757354729, + "grad_norm": 17.125, + "learning_rate": 2.0292542961941285e-06, + "loss": 0.8545, + "step": 2422 + }, + { + "epoch": 0.5891077072696329, + "grad_norm": 22.0, + "learning_rate": 2.028876374874754e-06, + "loss": 1.1379, + "step": 2423 + }, + { + "epoch": 0.5893508388037928, + "grad_norm": 18.75, + "learning_rate": 2.028498337138853e-06, + "loss": 1.0685, + "step": 2424 + }, + { + "epoch": 0.5895939703379528, + "grad_norm": 15.3125, + "learning_rate": 2.0281201830429316e-06, + "loss": 0.9339, + "step": 2425 + }, + { + "epoch": 0.5898371018721128, + "grad_norm": 17.75, + "learning_rate": 2.02774191264351e-06, + "loss": 1.0232, + "step": 2426 + }, + { + "epoch": 0.5900802334062728, + "grad_norm": 18.375, + "learning_rate": 2.0273635259971268e-06, + "loss": 1.0813, + "step": 2427 + }, + { + "epoch": 0.5903233649404328, + "grad_norm": 23.625, + "learning_rate": 2.0269850231603393e-06, + "loss": 1.1019, + "step": 2428 + }, + { + "epoch": 0.5905664964745928, + "grad_norm": 17.25, + "learning_rate": 2.0266064041897216e-06, + "loss": 0.6596, + "step": 2429 + }, + { + "epoch": 0.5908096280087527, + "grad_norm": 22.25, + "learning_rate": 2.026227669141864e-06, + "loss": 1.3176, + "step": 2430 + }, + { + "epoch": 0.5910527595429127, + "grad_norm": 19.875, + "learning_rate": 2.0258488180733755e-06, + "loss": 0.7592, + "step": 2431 + }, + { + "epoch": 0.5912958910770727, + "grad_norm": 17.875, + "learning_rate": 2.0254698510408815e-06, + "loss": 1.0825, + "step": 2432 + }, + { + "epoch": 0.5915390226112327, + "grad_norm": 17.0, + "learning_rate": 2.0250907681010255e-06, + "loss": 1.1475, + "step": 2433 + }, + { + "epoch": 0.5917821541453927, + "grad_norm": 17.125, + "learning_rate": 2.024711569310468e-06, + "loss": 0.8014, + "step": 2434 + }, + { + "epoch": 0.5920252856795526, + "grad_norm": 20.0, + "learning_rate": 2.0243322547258866e-06, + "loss": 1.1512, + "step": 2435 + }, + { + "epoch": 0.5922684172137126, + "grad_norm": 23.0, + "learning_rate": 2.0239528244039767e-06, + "loss": 0.7642, + "step": 2436 + }, + { + "epoch": 0.5925115487478726, + "grad_norm": 23.75, + "learning_rate": 2.0235732784014507e-06, + "loss": 1.2959, + "step": 2437 + }, + { + "epoch": 0.5927546802820326, + "grad_norm": 16.875, + "learning_rate": 2.0231936167750378e-06, + "loss": 0.6246, + "step": 2438 + }, + { + "epoch": 0.5929978118161926, + "grad_norm": 24.125, + "learning_rate": 2.0228138395814854e-06, + "loss": 1.1494, + "step": 2439 + }, + { + "epoch": 0.5932409433503525, + "grad_norm": 13.25, + "learning_rate": 2.022433946877558e-06, + "loss": 0.3006, + "step": 2440 + }, + { + "epoch": 0.5934840748845125, + "grad_norm": 37.5, + "learning_rate": 2.0220539387200365e-06, + "loss": 1.3829, + "step": 2441 + }, + { + "epoch": 0.5937272064186725, + "grad_norm": 15.75, + "learning_rate": 2.0216738151657208e-06, + "loss": 0.7871, + "step": 2442 + }, + { + "epoch": 0.5939703379528325, + "grad_norm": 15.25, + "learning_rate": 2.0212935762714254e-06, + "loss": 0.817, + "step": 2443 + }, + { + "epoch": 0.5942134694869925, + "grad_norm": 15.125, + "learning_rate": 2.0209132220939845e-06, + "loss": 0.5611, + "step": 2444 + }, + { + "epoch": 0.5944566010211525, + "grad_norm": 17.375, + "learning_rate": 2.0205327526902486e-06, + "loss": 0.873, + "step": 2445 + }, + { + "epoch": 0.5946997325553124, + "grad_norm": 17.125, + "learning_rate": 2.020152168117085e-06, + "loss": 1.2222, + "step": 2446 + }, + { + "epoch": 0.5949428640894724, + "grad_norm": 21.875, + "learning_rate": 2.0197714684313786e-06, + "loss": 0.689, + "step": 2447 + }, + { + "epoch": 0.5951859956236324, + "grad_norm": 22.5, + "learning_rate": 2.019390653690033e-06, + "loss": 1.0182, + "step": 2448 + }, + { + "epoch": 0.5954291271577924, + "grad_norm": 20.0, + "learning_rate": 2.019009723949965e-06, + "loss": 1.0079, + "step": 2449 + }, + { + "epoch": 0.5956722586919524, + "grad_norm": 23.5, + "learning_rate": 2.018628679268113e-06, + "loss": 0.9264, + "step": 2450 + }, + { + "epoch": 0.5959153902261123, + "grad_norm": 21.625, + "learning_rate": 2.0182475197014306e-06, + "loss": 0.9782, + "step": 2451 + }, + { + "epoch": 0.5961585217602723, + "grad_norm": 16.25, + "learning_rate": 2.0178662453068877e-06, + "loss": 0.6295, + "step": 2452 + }, + { + "epoch": 0.5964016532944323, + "grad_norm": 19.375, + "learning_rate": 2.0174848561414734e-06, + "loss": 0.6381, + "step": 2453 + }, + { + "epoch": 0.5966447848285923, + "grad_norm": 17.625, + "learning_rate": 2.017103352262192e-06, + "loss": 0.589, + "step": 2454 + }, + { + "epoch": 0.5968879163627523, + "grad_norm": 16.875, + "learning_rate": 2.0167217337260665e-06, + "loss": 0.4894, + "step": 2455 + }, + { + "epoch": 0.5971310478969122, + "grad_norm": 17.875, + "learning_rate": 2.0163400005901362e-06, + "loss": 0.8663, + "step": 2456 + }, + { + "epoch": 0.5973741794310722, + "grad_norm": 13.875, + "learning_rate": 2.015958152911458e-06, + "loss": 0.6678, + "step": 2457 + }, + { + "epoch": 0.5976173109652322, + "grad_norm": 16.625, + "learning_rate": 2.0155761907471043e-06, + "loss": 0.7652, + "step": 2458 + }, + { + "epoch": 0.5978604424993922, + "grad_norm": 23.375, + "learning_rate": 2.015194114154168e-06, + "loss": 0.9417, + "step": 2459 + }, + { + "epoch": 0.5981035740335522, + "grad_norm": 18.125, + "learning_rate": 2.0148119231897556e-06, + "loss": 0.912, + "step": 2460 + }, + { + "epoch": 0.5983467055677121, + "grad_norm": 24.0, + "learning_rate": 2.0144296179109923e-06, + "loss": 0.8892, + "step": 2461 + }, + { + "epoch": 0.5985898371018721, + "grad_norm": 26.25, + "learning_rate": 2.0140471983750205e-06, + "loss": 0.9228, + "step": 2462 + }, + { + "epoch": 0.5988329686360321, + "grad_norm": 18.75, + "learning_rate": 2.0136646646389996e-06, + "loss": 1.0494, + "step": 2463 + }, + { + "epoch": 0.5990761001701921, + "grad_norm": 15.5625, + "learning_rate": 2.013282016760105e-06, + "loss": 0.5649, + "step": 2464 + }, + { + "epoch": 0.5993192317043521, + "grad_norm": 39.75, + "learning_rate": 2.0128992547955315e-06, + "loss": 1.1498, + "step": 2465 + }, + { + "epoch": 0.5995623632385121, + "grad_norm": 22.125, + "learning_rate": 2.012516378802488e-06, + "loss": 1.0061, + "step": 2466 + }, + { + "epoch": 0.599805494772672, + "grad_norm": 23.75, + "learning_rate": 2.0121333888382032e-06, + "loss": 1.245, + "step": 2467 + }, + { + "epoch": 0.600048626306832, + "grad_norm": 16.5, + "learning_rate": 2.0117502849599204e-06, + "loss": 0.6547, + "step": 2468 + }, + { + "epoch": 0.600291757840992, + "grad_norm": 19.0, + "learning_rate": 2.011367067224902e-06, + "loss": 0.6211, + "step": 2469 + }, + { + "epoch": 0.600534889375152, + "grad_norm": 19.5, + "learning_rate": 2.0109837356904257e-06, + "loss": 0.8737, + "step": 2470 + }, + { + "epoch": 0.600778020909312, + "grad_norm": 23.375, + "learning_rate": 2.0106002904137877e-06, + "loss": 0.9821, + "step": 2471 + }, + { + "epoch": 0.6010211524434719, + "grad_norm": 18.125, + "learning_rate": 2.0102167314523004e-06, + "loss": 0.9827, + "step": 2472 + }, + { + "epoch": 0.6012642839776319, + "grad_norm": 16.75, + "learning_rate": 2.009833058863293e-06, + "loss": 0.8096, + "step": 2473 + }, + { + "epoch": 0.6015074155117919, + "grad_norm": 24.25, + "learning_rate": 2.0094492727041124e-06, + "loss": 0.8818, + "step": 2474 + }, + { + "epoch": 0.6017505470459519, + "grad_norm": 14.375, + "learning_rate": 2.009065373032122e-06, + "loss": 0.4699, + "step": 2475 + }, + { + "epoch": 0.6019936785801119, + "grad_norm": 14.125, + "learning_rate": 2.0086813599047012e-06, + "loss": 0.6093, + "step": 2476 + }, + { + "epoch": 0.6022368101142718, + "grad_norm": 17.25, + "learning_rate": 2.0082972333792496e-06, + "loss": 0.6792, + "step": 2477 + }, + { + "epoch": 0.6024799416484318, + "grad_norm": 13.5, + "learning_rate": 2.007912993513179e-06, + "loss": 0.4687, + "step": 2478 + }, + { + "epoch": 0.6027230731825918, + "grad_norm": 23.875, + "learning_rate": 2.0075286403639226e-06, + "loss": 0.792, + "step": 2479 + }, + { + "epoch": 0.6029662047167518, + "grad_norm": 15.9375, + "learning_rate": 2.0071441739889278e-06, + "loss": 0.7047, + "step": 2480 + }, + { + "epoch": 0.6032093362509118, + "grad_norm": 25.0, + "learning_rate": 2.0067595944456598e-06, + "loss": 1.1387, + "step": 2481 + }, + { + "epoch": 0.6034524677850718, + "grad_norm": 37.25, + "learning_rate": 2.006374901791601e-06, + "loss": 1.6589, + "step": 2482 + }, + { + "epoch": 0.6036955993192317, + "grad_norm": 16.75, + "learning_rate": 2.0059900960842493e-06, + "loss": 0.6608, + "step": 2483 + }, + { + "epoch": 0.6039387308533917, + "grad_norm": 20.625, + "learning_rate": 2.005605177381122e-06, + "loss": 0.7116, + "step": 2484 + }, + { + "epoch": 0.6041818623875517, + "grad_norm": 32.5, + "learning_rate": 2.0052201457397507e-06, + "loss": 1.7751, + "step": 2485 + }, + { + "epoch": 0.6044249939217117, + "grad_norm": 25.125, + "learning_rate": 2.004835001217686e-06, + "loss": 1.0185, + "step": 2486 + }, + { + "epoch": 0.6046681254558717, + "grad_norm": 15.75, + "learning_rate": 2.004449743872494e-06, + "loss": 0.5704, + "step": 2487 + }, + { + "epoch": 0.6049112569900316, + "grad_norm": 19.375, + "learning_rate": 2.0040643737617577e-06, + "loss": 0.7855, + "step": 2488 + }, + { + "epoch": 0.6051543885241916, + "grad_norm": 14.5625, + "learning_rate": 2.0036788909430774e-06, + "loss": 0.4763, + "step": 2489 + }, + { + "epoch": 0.6053975200583516, + "grad_norm": 20.75, + "learning_rate": 2.0032932954740707e-06, + "loss": 0.9844, + "step": 2490 + }, + { + "epoch": 0.6056406515925116, + "grad_norm": 17.375, + "learning_rate": 2.002907587412371e-06, + "loss": 0.7559, + "step": 2491 + }, + { + "epoch": 0.6058837831266716, + "grad_norm": 20.25, + "learning_rate": 2.0025217668156295e-06, + "loss": 0.9571, + "step": 2492 + }, + { + "epoch": 0.6061269146608315, + "grad_norm": 16.5, + "learning_rate": 2.002135833741513e-06, + "loss": 0.667, + "step": 2493 + }, + { + "epoch": 0.6063700461949915, + "grad_norm": 23.125, + "learning_rate": 2.0017497882477068e-06, + "loss": 1.0873, + "step": 2494 + }, + { + "epoch": 0.6066131777291515, + "grad_norm": 21.375, + "learning_rate": 2.001363630391911e-06, + "loss": 0.8989, + "step": 2495 + }, + { + "epoch": 0.6068563092633115, + "grad_norm": 28.5, + "learning_rate": 2.0009773602318444e-06, + "loss": 0.8248, + "step": 2496 + }, + { + "epoch": 0.6070994407974715, + "grad_norm": 24.125, + "learning_rate": 2.0005909778252415e-06, + "loss": 0.9701, + "step": 2497 + }, + { + "epoch": 0.6073425723316314, + "grad_norm": 15.5, + "learning_rate": 2.000204483229854e-06, + "loss": 0.7341, + "step": 2498 + }, + { + "epoch": 0.6075857038657914, + "grad_norm": 16.125, + "learning_rate": 1.9998178765034496e-06, + "loss": 0.5449, + "step": 2499 + }, + { + "epoch": 0.6078288353999514, + "grad_norm": 19.625, + "learning_rate": 1.9994311577038146e-06, + "loss": 0.8267, + "step": 2500 + }, + { + "epoch": 0.6080719669341114, + "grad_norm": 22.25, + "learning_rate": 1.999044326888749e-06, + "loss": 0.9458, + "step": 2501 + }, + { + "epoch": 0.6083150984682714, + "grad_norm": 14.8125, + "learning_rate": 1.9986573841160728e-06, + "loss": 0.5654, + "step": 2502 + }, + { + "epoch": 0.6085582300024314, + "grad_norm": 15.9375, + "learning_rate": 1.9982703294436206e-06, + "loss": 0.6877, + "step": 2503 + }, + { + "epoch": 0.6088013615365913, + "grad_norm": 18.375, + "learning_rate": 1.9978831629292444e-06, + "loss": 0.7262, + "step": 2504 + }, + { + "epoch": 0.6090444930707513, + "grad_norm": 22.375, + "learning_rate": 1.9974958846308136e-06, + "loss": 0.7039, + "step": 2505 + }, + { + "epoch": 0.6092876246049113, + "grad_norm": 20.125, + "learning_rate": 1.9971084946062126e-06, + "loss": 0.7864, + "step": 2506 + }, + { + "epoch": 0.6095307561390713, + "grad_norm": 20.375, + "learning_rate": 1.996720992913345e-06, + "loss": 0.7615, + "step": 2507 + }, + { + "epoch": 0.6097738876732313, + "grad_norm": 19.625, + "learning_rate": 1.9963333796101275e-06, + "loss": 0.6926, + "step": 2508 + }, + { + "epoch": 0.6100170192073912, + "grad_norm": 18.375, + "learning_rate": 1.995945654754497e-06, + "loss": 0.777, + "step": 2509 + }, + { + "epoch": 0.6102601507415512, + "grad_norm": 27.875, + "learning_rate": 1.9955578184044062e-06, + "loss": 1.2121, + "step": 2510 + }, + { + "epoch": 0.6105032822757112, + "grad_norm": 18.375, + "learning_rate": 1.995169870617823e-06, + "loss": 0.7759, + "step": 2511 + }, + { + "epoch": 0.6107464138098712, + "grad_norm": 19.75, + "learning_rate": 1.994781811452733e-06, + "loss": 0.672, + "step": 2512 + }, + { + "epoch": 0.6109895453440312, + "grad_norm": 23.25, + "learning_rate": 1.994393640967138e-06, + "loss": 1.1134, + "step": 2513 + }, + { + "epoch": 0.611232676878191, + "grad_norm": 26.5, + "learning_rate": 1.994005359219058e-06, + "loss": 0.9399, + "step": 2514 + }, + { + "epoch": 0.611475808412351, + "grad_norm": 23.125, + "learning_rate": 1.993616966266527e-06, + "loss": 1.2102, + "step": 2515 + }, + { + "epoch": 0.611718939946511, + "grad_norm": 16.75, + "learning_rate": 1.993228462167598e-06, + "loss": 0.6831, + "step": 2516 + }, + { + "epoch": 0.611962071480671, + "grad_norm": 16.875, + "learning_rate": 1.992839846980339e-06, + "loss": 0.5729, + "step": 2517 + }, + { + "epoch": 0.6122052030148311, + "grad_norm": 16.625, + "learning_rate": 1.992451120762836e-06, + "loss": 0.8248, + "step": 2518 + }, + { + "epoch": 0.6124483345489911, + "grad_norm": 22.0, + "learning_rate": 1.99206228357319e-06, + "loss": 1.1109, + "step": 2519 + }, + { + "epoch": 0.612691466083151, + "grad_norm": 18.25, + "learning_rate": 1.9916733354695204e-06, + "loss": 0.9915, + "step": 2520 + }, + { + "epoch": 0.612934597617311, + "grad_norm": 17.125, + "learning_rate": 1.9912842765099617e-06, + "loss": 0.7568, + "step": 2521 + }, + { + "epoch": 0.613177729151471, + "grad_norm": 25.875, + "learning_rate": 1.990895106752665e-06, + "loss": 1.5433, + "step": 2522 + }, + { + "epoch": 0.613420860685631, + "grad_norm": 25.125, + "learning_rate": 1.9905058262557993e-06, + "loss": 0.8675, + "step": 2523 + }, + { + "epoch": 0.613663992219791, + "grad_norm": 18.75, + "learning_rate": 1.9901164350775482e-06, + "loss": 0.8637, + "step": 2524 + }, + { + "epoch": 0.6139071237539508, + "grad_norm": 22.625, + "learning_rate": 1.9897269332761145e-06, + "loss": 0.7131, + "step": 2525 + }, + { + "epoch": 0.6141502552881108, + "grad_norm": 17.375, + "learning_rate": 1.9893373209097142e-06, + "loss": 0.7901, + "step": 2526 + }, + { + "epoch": 0.6143933868222708, + "grad_norm": 17.5, + "learning_rate": 1.988947598036583e-06, + "loss": 0.7823, + "step": 2527 + }, + { + "epoch": 0.6146365183564308, + "grad_norm": 17.875, + "learning_rate": 1.988557764714971e-06, + "loss": 0.8806, + "step": 2528 + }, + { + "epoch": 0.6148796498905909, + "grad_norm": 20.5, + "learning_rate": 1.9881678210031462e-06, + "loss": 0.7563, + "step": 2529 + }, + { + "epoch": 0.6151227814247507, + "grad_norm": 21.0, + "learning_rate": 1.9877777669593917e-06, + "loss": 0.9966, + "step": 2530 + }, + { + "epoch": 0.6153659129589107, + "grad_norm": 19.625, + "learning_rate": 1.987387602642008e-06, + "loss": 0.6233, + "step": 2531 + }, + { + "epoch": 0.6156090444930707, + "grad_norm": 18.25, + "learning_rate": 1.986997328109312e-06, + "loss": 1.0035, + "step": 2532 + }, + { + "epoch": 0.6158521760272307, + "grad_norm": 14.0, + "learning_rate": 1.9866069434196367e-06, + "loss": 0.8368, + "step": 2533 + }, + { + "epoch": 0.6160953075613907, + "grad_norm": 17.625, + "learning_rate": 1.9862164486313323e-06, + "loss": 0.6484, + "step": 2534 + }, + { + "epoch": 0.6163384390955506, + "grad_norm": 17.0, + "learning_rate": 1.985825843802765e-06, + "loss": 0.6463, + "step": 2535 + }, + { + "epoch": 0.6165815706297106, + "grad_norm": 19.875, + "learning_rate": 1.985435128992317e-06, + "loss": 0.7574, + "step": 2536 + }, + { + "epoch": 0.6168247021638706, + "grad_norm": 18.5, + "learning_rate": 1.9850443042583872e-06, + "loss": 0.7423, + "step": 2537 + }, + { + "epoch": 0.6170678336980306, + "grad_norm": 15.125, + "learning_rate": 1.984653369659392e-06, + "loss": 0.6347, + "step": 2538 + }, + { + "epoch": 0.6173109652321906, + "grad_norm": 27.125, + "learning_rate": 1.9842623252537624e-06, + "loss": 1.0904, + "step": 2539 + }, + { + "epoch": 0.6175540967663506, + "grad_norm": 21.375, + "learning_rate": 1.983871171099947e-06, + "loss": 0.8306, + "step": 2540 + }, + { + "epoch": 0.6177972283005105, + "grad_norm": 18.75, + "learning_rate": 1.983479907256411e-06, + "loss": 1.1409, + "step": 2541 + }, + { + "epoch": 0.6180403598346705, + "grad_norm": 14.3125, + "learning_rate": 1.983088533781635e-06, + "loss": 0.5206, + "step": 2542 + }, + { + "epoch": 0.6182834913688305, + "grad_norm": 21.25, + "learning_rate": 1.9826970507341173e-06, + "loss": 0.9233, + "step": 2543 + }, + { + "epoch": 0.6185266229029905, + "grad_norm": 27.125, + "learning_rate": 1.982305458172371e-06, + "loss": 0.7664, + "step": 2544 + }, + { + "epoch": 0.6187697544371505, + "grad_norm": 21.25, + "learning_rate": 1.9819137561549265e-06, + "loss": 1.3842, + "step": 2545 + }, + { + "epoch": 0.6190128859713104, + "grad_norm": 18.375, + "learning_rate": 1.9815219447403305e-06, + "loss": 0.8527, + "step": 2546 + }, + { + "epoch": 0.6192560175054704, + "grad_norm": 19.0, + "learning_rate": 1.9811300239871463e-06, + "loss": 1.0483, + "step": 2547 + }, + { + "epoch": 0.6194991490396304, + "grad_norm": 16.625, + "learning_rate": 1.9807379939539527e-06, + "loss": 0.7207, + "step": 2548 + }, + { + "epoch": 0.6197422805737904, + "grad_norm": 16.0, + "learning_rate": 1.9803458546993456e-06, + "loss": 0.5849, + "step": 2549 + }, + { + "epoch": 0.6199854121079504, + "grad_norm": 19.625, + "learning_rate": 1.9799536062819376e-06, + "loss": 0.6721, + "step": 2550 + }, + { + "epoch": 0.6202285436421103, + "grad_norm": 21.25, + "learning_rate": 1.9795612487603553e-06, + "loss": 0.7343, + "step": 2551 + }, + { + "epoch": 0.6204716751762703, + "grad_norm": 18.875, + "learning_rate": 1.9791687821932456e-06, + "loss": 0.6469, + "step": 2552 + }, + { + "epoch": 0.6207148067104303, + "grad_norm": 13.375, + "learning_rate": 1.9787762066392675e-06, + "loss": 0.4419, + "step": 2553 + }, + { + "epoch": 0.6209579382445903, + "grad_norm": 17.25, + "learning_rate": 1.978383522157099e-06, + "loss": 0.6656, + "step": 2554 + }, + { + "epoch": 0.6212010697787503, + "grad_norm": 17.875, + "learning_rate": 1.9779907288054332e-06, + "loss": 0.5274, + "step": 2555 + }, + { + "epoch": 0.6214442013129103, + "grad_norm": 24.625, + "learning_rate": 1.977597826642981e-06, + "loss": 1.051, + "step": 2556 + }, + { + "epoch": 0.6216873328470702, + "grad_norm": 20.0, + "learning_rate": 1.9772048157284666e-06, + "loss": 0.9632, + "step": 2557 + }, + { + "epoch": 0.6219304643812302, + "grad_norm": 19.0, + "learning_rate": 1.976811696120634e-06, + "loss": 1.0646, + "step": 2558 + }, + { + "epoch": 0.6221735959153902, + "grad_norm": 17.0, + "learning_rate": 1.9764184678782406e-06, + "loss": 0.711, + "step": 2559 + }, + { + "epoch": 0.6224167274495502, + "grad_norm": 21.75, + "learning_rate": 1.9760251310600614e-06, + "loss": 1.0041, + "step": 2560 + }, + { + "epoch": 0.6226598589837102, + "grad_norm": 17.75, + "learning_rate": 1.9756316857248877e-06, + "loss": 0.7743, + "step": 2561 + }, + { + "epoch": 0.6229029905178701, + "grad_norm": 18.625, + "learning_rate": 1.9752381319315267e-06, + "loss": 0.9294, + "step": 2562 + }, + { + "epoch": 0.6231461220520301, + "grad_norm": 16.125, + "learning_rate": 1.9748444697388008e-06, + "loss": 0.631, + "step": 2563 + }, + { + "epoch": 0.6233892535861901, + "grad_norm": 17.875, + "learning_rate": 1.974450699205551e-06, + "loss": 0.8134, + "step": 2564 + }, + { + "epoch": 0.6236323851203501, + "grad_norm": 16.375, + "learning_rate": 1.9740568203906325e-06, + "loss": 0.4797, + "step": 2565 + }, + { + "epoch": 0.6238755166545101, + "grad_norm": 26.0, + "learning_rate": 1.973662833352917e-06, + "loss": 1.2025, + "step": 2566 + }, + { + "epoch": 0.62411864818867, + "grad_norm": 17.25, + "learning_rate": 1.9732687381512933e-06, + "loss": 0.8941, + "step": 2567 + }, + { + "epoch": 0.62436177972283, + "grad_norm": 18.5, + "learning_rate": 1.9728745348446654e-06, + "loss": 1.0143, + "step": 2568 + }, + { + "epoch": 0.62460491125699, + "grad_norm": 20.0, + "learning_rate": 1.9724802234919535e-06, + "loss": 0.9828, + "step": 2569 + }, + { + "epoch": 0.62484804279115, + "grad_norm": 17.125, + "learning_rate": 1.9720858041520944e-06, + "loss": 0.7606, + "step": 2570 + }, + { + "epoch": 0.62509117432531, + "grad_norm": 18.375, + "learning_rate": 1.9716912768840417e-06, + "loss": 0.7916, + "step": 2571 + }, + { + "epoch": 0.6253343058594699, + "grad_norm": 19.25, + "learning_rate": 1.9712966417467634e-06, + "loss": 0.9992, + "step": 2572 + }, + { + "epoch": 0.6255774373936299, + "grad_norm": 13.75, + "learning_rate": 1.970901898799244e-06, + "loss": 0.3656, + "step": 2573 + }, + { + "epoch": 0.6258205689277899, + "grad_norm": 19.0, + "learning_rate": 1.9705070481004862e-06, + "loss": 0.8741, + "step": 2574 + }, + { + "epoch": 0.6260637004619499, + "grad_norm": 20.0, + "learning_rate": 1.9701120897095063e-06, + "loss": 0.6176, + "step": 2575 + }, + { + "epoch": 0.6263068319961099, + "grad_norm": 16.25, + "learning_rate": 1.969717023685338e-06, + "loss": 0.7102, + "step": 2576 + }, + { + "epoch": 0.6265499635302699, + "grad_norm": 25.625, + "learning_rate": 1.9693218500870303e-06, + "loss": 1.0332, + "step": 2577 + }, + { + "epoch": 0.6267930950644298, + "grad_norm": 17.375, + "learning_rate": 1.968926568973649e-06, + "loss": 0.8218, + "step": 2578 + }, + { + "epoch": 0.6270362265985898, + "grad_norm": 18.25, + "learning_rate": 1.9685311804042756e-06, + "loss": 0.8964, + "step": 2579 + }, + { + "epoch": 0.6272793581327498, + "grad_norm": 17.75, + "learning_rate": 1.968135684438008e-06, + "loss": 1.0285, + "step": 2580 + }, + { + "epoch": 0.6275224896669098, + "grad_norm": 25.375, + "learning_rate": 1.96774008113396e-06, + "loss": 1.0416, + "step": 2581 + }, + { + "epoch": 0.6277656212010698, + "grad_norm": 19.75, + "learning_rate": 1.9673443705512605e-06, + "loss": 0.8864, + "step": 2582 + }, + { + "epoch": 0.6280087527352297, + "grad_norm": 22.125, + "learning_rate": 1.9669485527490563e-06, + "loss": 1.0568, + "step": 2583 + }, + { + "epoch": 0.6282518842693897, + "grad_norm": 17.375, + "learning_rate": 1.9665526277865084e-06, + "loss": 0.6917, + "step": 2584 + }, + { + "epoch": 0.6284950158035497, + "grad_norm": 21.5, + "learning_rate": 1.9661565957227954e-06, + "loss": 1.0235, + "step": 2585 + }, + { + "epoch": 0.6287381473377097, + "grad_norm": 20.375, + "learning_rate": 1.96576045661711e-06, + "loss": 0.9047, + "step": 2586 + }, + { + "epoch": 0.6289812788718697, + "grad_norm": 21.375, + "learning_rate": 1.9653642105286636e-06, + "loss": 1.0859, + "step": 2587 + }, + { + "epoch": 0.6292244104060296, + "grad_norm": 17.375, + "learning_rate": 1.9649678575166808e-06, + "loss": 0.7796, + "step": 2588 + }, + { + "epoch": 0.6294675419401896, + "grad_norm": 17.875, + "learning_rate": 1.9645713976404036e-06, + "loss": 0.9683, + "step": 2589 + }, + { + "epoch": 0.6297106734743496, + "grad_norm": 17.0, + "learning_rate": 1.96417483095909e-06, + "loss": 0.3621, + "step": 2590 + }, + { + "epoch": 0.6299538050085096, + "grad_norm": 15.5625, + "learning_rate": 1.9637781575320138e-06, + "loss": 0.7793, + "step": 2591 + }, + { + "epoch": 0.6301969365426696, + "grad_norm": 20.25, + "learning_rate": 1.9633813774184646e-06, + "loss": 0.94, + "step": 2592 + }, + { + "epoch": 0.6304400680768296, + "grad_norm": 19.875, + "learning_rate": 1.9629844906777483e-06, + "loss": 0.7717, + "step": 2593 + }, + { + "epoch": 0.6306831996109895, + "grad_norm": 17.125, + "learning_rate": 1.9625874973691856e-06, + "loss": 0.6891, + "step": 2594 + }, + { + "epoch": 0.6309263311451495, + "grad_norm": 19.375, + "learning_rate": 1.962190397552115e-06, + "loss": 0.9662, + "step": 2595 + }, + { + "epoch": 0.6311694626793095, + "grad_norm": 21.75, + "learning_rate": 1.9617931912858897e-06, + "loss": 0.7517, + "step": 2596 + }, + { + "epoch": 0.6314125942134695, + "grad_norm": 15.8125, + "learning_rate": 1.9613958786298783e-06, + "loss": 0.551, + "step": 2597 + }, + { + "epoch": 0.6316557257476295, + "grad_norm": 25.25, + "learning_rate": 1.960998459643467e-06, + "loss": 0.9711, + "step": 2598 + }, + { + "epoch": 0.6318988572817894, + "grad_norm": 18.5, + "learning_rate": 1.9606009343860566e-06, + "loss": 1.0415, + "step": 2599 + }, + { + "epoch": 0.6321419888159494, + "grad_norm": 18.125, + "learning_rate": 1.9602033029170637e-06, + "loss": 1.009, + "step": 2600 + }, + { + "epoch": 0.6323851203501094, + "grad_norm": 19.75, + "learning_rate": 1.959805565295922e-06, + "loss": 0.7581, + "step": 2601 + }, + { + "epoch": 0.6326282518842694, + "grad_norm": 16.625, + "learning_rate": 1.9594077215820795e-06, + "loss": 0.6777, + "step": 2602 + }, + { + "epoch": 0.6328713834184294, + "grad_norm": 20.75, + "learning_rate": 1.959009771835001e-06, + "loss": 0.7276, + "step": 2603 + }, + { + "epoch": 0.6331145149525893, + "grad_norm": 17.875, + "learning_rate": 1.9586117161141672e-06, + "loss": 1.1352, + "step": 2604 + }, + { + "epoch": 0.6333576464867493, + "grad_norm": 45.0, + "learning_rate": 1.958213554479074e-06, + "loss": 0.9656, + "step": 2605 + }, + { + "epoch": 0.6336007780209093, + "grad_norm": 16.75, + "learning_rate": 1.957815286989235e-06, + "loss": 0.7441, + "step": 2606 + }, + { + "epoch": 0.6338439095550693, + "grad_norm": 15.5, + "learning_rate": 1.957416913704176e-06, + "loss": 0.5765, + "step": 2607 + }, + { + "epoch": 0.6340870410892293, + "grad_norm": 24.75, + "learning_rate": 1.9570184346834415e-06, + "loss": 1.3351, + "step": 2608 + }, + { + "epoch": 0.6343301726233892, + "grad_norm": 24.0, + "learning_rate": 1.9566198499865917e-06, + "loss": 0.9138, + "step": 2609 + }, + { + "epoch": 0.6345733041575492, + "grad_norm": 16.375, + "learning_rate": 1.9562211596732012e-06, + "loss": 0.5859, + "step": 2610 + }, + { + "epoch": 0.6348164356917092, + "grad_norm": 17.5, + "learning_rate": 1.955822363802862e-06, + "loss": 0.6955, + "step": 2611 + }, + { + "epoch": 0.6350595672258692, + "grad_norm": 22.375, + "learning_rate": 1.9554234624351807e-06, + "loss": 0.9278, + "step": 2612 + }, + { + "epoch": 0.6353026987600292, + "grad_norm": 18.375, + "learning_rate": 1.9550244556297794e-06, + "loss": 0.5837, + "step": 2613 + }, + { + "epoch": 0.6355458302941892, + "grad_norm": 17.875, + "learning_rate": 1.954625343446297e-06, + "loss": 0.9824, + "step": 2614 + }, + { + "epoch": 0.6357889618283491, + "grad_norm": 23.625, + "learning_rate": 1.954226125944388e-06, + "loss": 1.0017, + "step": 2615 + }, + { + "epoch": 0.6360320933625091, + "grad_norm": 21.125, + "learning_rate": 1.953826803183722e-06, + "loss": 1.2419, + "step": 2616 + }, + { + "epoch": 0.6362752248966691, + "grad_norm": 17.5, + "learning_rate": 1.9534273752239844e-06, + "loss": 0.44, + "step": 2617 + }, + { + "epoch": 0.6365183564308291, + "grad_norm": 18.75, + "learning_rate": 1.953027842124878e-06, + "loss": 0.7213, + "step": 2618 + }, + { + "epoch": 0.6367614879649891, + "grad_norm": 17.25, + "learning_rate": 1.9526282039461177e-06, + "loss": 0.7985, + "step": 2619 + }, + { + "epoch": 0.637004619499149, + "grad_norm": 20.0, + "learning_rate": 1.952228460747438e-06, + "loss": 0.8595, + "step": 2620 + }, + { + "epoch": 0.637247751033309, + "grad_norm": 16.625, + "learning_rate": 1.9518286125885872e-06, + "loss": 0.7828, + "step": 2621 + }, + { + "epoch": 0.637490882567469, + "grad_norm": 20.125, + "learning_rate": 1.9514286595293286e-06, + "loss": 0.9267, + "step": 2622 + }, + { + "epoch": 0.637734014101629, + "grad_norm": 12.125, + "learning_rate": 1.9510286016294432e-06, + "loss": 0.544, + "step": 2623 + }, + { + "epoch": 0.637977145635789, + "grad_norm": 22.0, + "learning_rate": 1.9506284389487256e-06, + "loss": 0.8828, + "step": 2624 + }, + { + "epoch": 0.6382202771699489, + "grad_norm": 18.5, + "learning_rate": 1.9502281715469883e-06, + "loss": 0.9487, + "step": 2625 + }, + { + "epoch": 0.6384634087041089, + "grad_norm": 21.75, + "learning_rate": 1.949827799484057e-06, + "loss": 0.8824, + "step": 2626 + }, + { + "epoch": 0.6387065402382689, + "grad_norm": 18.5, + "learning_rate": 1.9494273228197747e-06, + "loss": 1.0268, + "step": 2627 + }, + { + "epoch": 0.6389496717724289, + "grad_norm": 20.125, + "learning_rate": 1.949026741613999e-06, + "loss": 0.6952, + "step": 2628 + }, + { + "epoch": 0.6391928033065889, + "grad_norm": 14.875, + "learning_rate": 1.948626055926605e-06, + "loss": 0.6886, + "step": 2629 + }, + { + "epoch": 0.6394359348407489, + "grad_norm": 19.875, + "learning_rate": 1.948225265817481e-06, + "loss": 0.7787, + "step": 2630 + }, + { + "epoch": 0.6396790663749088, + "grad_norm": 24.625, + "learning_rate": 1.947824371346532e-06, + "loss": 1.0134, + "step": 2631 + }, + { + "epoch": 0.6399221979090688, + "grad_norm": 16.0, + "learning_rate": 1.9474233725736787e-06, + "loss": 0.8336, + "step": 2632 + }, + { + "epoch": 0.6401653294432288, + "grad_norm": 18.125, + "learning_rate": 1.947022269558858e-06, + "loss": 0.9251, + "step": 2633 + }, + { + "epoch": 0.6404084609773888, + "grad_norm": 21.0, + "learning_rate": 1.9466210623620207e-06, + "loss": 1.2374, + "step": 2634 + }, + { + "epoch": 0.6406515925115488, + "grad_norm": 16.75, + "learning_rate": 1.9462197510431346e-06, + "loss": 0.5718, + "step": 2635 + }, + { + "epoch": 0.6408947240457087, + "grad_norm": 18.5, + "learning_rate": 1.9458183356621826e-06, + "loss": 0.7142, + "step": 2636 + }, + { + "epoch": 0.6411378555798687, + "grad_norm": 21.625, + "learning_rate": 1.9454168162791635e-06, + "loss": 0.8723, + "step": 2637 + }, + { + "epoch": 0.6413809871140287, + "grad_norm": 17.0, + "learning_rate": 1.9450151929540908e-06, + "loss": 0.6637, + "step": 2638 + }, + { + "epoch": 0.6416241186481887, + "grad_norm": 16.375, + "learning_rate": 1.944613465746994e-06, + "loss": 0.9261, + "step": 2639 + }, + { + "epoch": 0.6418672501823487, + "grad_norm": 26.25, + "learning_rate": 1.944211634717918e-06, + "loss": 0.985, + "step": 2640 + }, + { + "epoch": 0.6421103817165086, + "grad_norm": 17.625, + "learning_rate": 1.9438096999269243e-06, + "loss": 0.9207, + "step": 2641 + }, + { + "epoch": 0.6423535132506686, + "grad_norm": 22.875, + "learning_rate": 1.9434076614340883e-06, + "loss": 0.7383, + "step": 2642 + }, + { + "epoch": 0.6425966447848286, + "grad_norm": 15.875, + "learning_rate": 1.9430055192995016e-06, + "loss": 0.8852, + "step": 2643 + }, + { + "epoch": 0.6428397763189886, + "grad_norm": 16.5, + "learning_rate": 1.9426032735832717e-06, + "loss": 0.6596, + "step": 2644 + }, + { + "epoch": 0.6430829078531486, + "grad_norm": 23.25, + "learning_rate": 1.94220092434552e-06, + "loss": 1.1477, + "step": 2645 + }, + { + "epoch": 0.6433260393873085, + "grad_norm": 17.625, + "learning_rate": 1.9417984716463868e-06, + "loss": 0.555, + "step": 2646 + }, + { + "epoch": 0.6435691709214685, + "grad_norm": 19.0, + "learning_rate": 1.941395915546024e-06, + "loss": 0.7148, + "step": 2647 + }, + { + "epoch": 0.6438123024556285, + "grad_norm": 26.125, + "learning_rate": 1.9409932561045995e-06, + "loss": 1.0506, + "step": 2648 + }, + { + "epoch": 0.6440554339897885, + "grad_norm": 16.125, + "learning_rate": 1.9405904933823e-06, + "loss": 0.7608, + "step": 2649 + }, + { + "epoch": 0.6442985655239485, + "grad_norm": 21.25, + "learning_rate": 1.940187627439325e-06, + "loss": 1.1644, + "step": 2650 + }, + { + "epoch": 0.6445416970581085, + "grad_norm": 21.5, + "learning_rate": 1.939784658335888e-06, + "loss": 0.9409, + "step": 2651 + }, + { + "epoch": 0.6447848285922684, + "grad_norm": 17.375, + "learning_rate": 1.939381586132221e-06, + "loss": 0.9638, + "step": 2652 + }, + { + "epoch": 0.6450279601264284, + "grad_norm": 19.375, + "learning_rate": 1.93897841088857e-06, + "loss": 1.0011, + "step": 2653 + }, + { + "epoch": 0.6452710916605884, + "grad_norm": 19.375, + "learning_rate": 1.938575132665197e-06, + "loss": 0.9863, + "step": 2654 + }, + { + "epoch": 0.6455142231947484, + "grad_norm": 17.625, + "learning_rate": 1.9381717515223775e-06, + "loss": 0.7573, + "step": 2655 + }, + { + "epoch": 0.6457573547289084, + "grad_norm": 17.375, + "learning_rate": 1.9377682675204053e-06, + "loss": 0.6723, + "step": 2656 + }, + { + "epoch": 0.6460004862630683, + "grad_norm": 20.25, + "learning_rate": 1.9373646807195867e-06, + "loss": 1.2054, + "step": 2657 + }, + { + "epoch": 0.6462436177972283, + "grad_norm": 22.875, + "learning_rate": 1.9369609911802455e-06, + "loss": 1.0758, + "step": 2658 + }, + { + "epoch": 0.6464867493313883, + "grad_norm": 17.375, + "learning_rate": 1.93655719896272e-06, + "loss": 0.9359, + "step": 2659 + }, + { + "epoch": 0.6467298808655483, + "grad_norm": 17.125, + "learning_rate": 1.9361533041273643e-06, + "loss": 0.8533, + "step": 2660 + }, + { + "epoch": 0.6469730123997083, + "grad_norm": 17.875, + "learning_rate": 1.935749306734547e-06, + "loss": 0.9048, + "step": 2661 + }, + { + "epoch": 0.6472161439338682, + "grad_norm": 16.875, + "learning_rate": 1.935345206844652e-06, + "loss": 0.7516, + "step": 2662 + }, + { + "epoch": 0.6474592754680282, + "grad_norm": 15.125, + "learning_rate": 1.9349410045180796e-06, + "loss": 0.6485, + "step": 2663 + }, + { + "epoch": 0.6477024070021882, + "grad_norm": 15.6875, + "learning_rate": 1.9345366998152448e-06, + "loss": 0.7834, + "step": 2664 + }, + { + "epoch": 0.6479455385363482, + "grad_norm": 16.625, + "learning_rate": 1.9341322927965782e-06, + "loss": 1.2497, + "step": 2665 + }, + { + "epoch": 0.6481886700705082, + "grad_norm": 17.25, + "learning_rate": 1.9337277835225248e-06, + "loss": 0.7493, + "step": 2666 + }, + { + "epoch": 0.6484318016046682, + "grad_norm": 19.125, + "learning_rate": 1.9333231720535456e-06, + "loss": 1.232, + "step": 2667 + }, + { + "epoch": 0.6486749331388281, + "grad_norm": 21.625, + "learning_rate": 1.932918458450117e-06, + "loss": 1.0351, + "step": 2668 + }, + { + "epoch": 0.6489180646729881, + "grad_norm": 22.5, + "learning_rate": 1.9325136427727302e-06, + "loss": 0.9951, + "step": 2669 + }, + { + "epoch": 0.6491611962071481, + "grad_norm": 17.5, + "learning_rate": 1.9321087250818927e-06, + "loss": 0.8068, + "step": 2670 + }, + { + "epoch": 0.6494043277413081, + "grad_norm": 18.75, + "learning_rate": 1.9317037054381255e-06, + "loss": 0.5792, + "step": 2671 + }, + { + "epoch": 0.6496474592754681, + "grad_norm": 13.75, + "learning_rate": 1.931298583901966e-06, + "loss": 0.6272, + "step": 2672 + }, + { + "epoch": 0.649890590809628, + "grad_norm": 18.375, + "learning_rate": 1.9308933605339667e-06, + "loss": 0.647, + "step": 2673 + }, + { + "epoch": 0.650133722343788, + "grad_norm": 16.5, + "learning_rate": 1.9304880353946952e-06, + "loss": 0.6139, + "step": 2674 + }, + { + "epoch": 0.650376853877948, + "grad_norm": 14.5625, + "learning_rate": 1.9300826085447345e-06, + "loss": 0.671, + "step": 2675 + }, + { + "epoch": 0.650619985412108, + "grad_norm": 21.625, + "learning_rate": 1.9296770800446825e-06, + "loss": 1.1881, + "step": 2676 + }, + { + "epoch": 0.650863116946268, + "grad_norm": 29.75, + "learning_rate": 1.9292714499551524e-06, + "loss": 0.8084, + "step": 2677 + }, + { + "epoch": 0.6511062484804279, + "grad_norm": 22.0, + "learning_rate": 1.9288657183367725e-06, + "loss": 0.8615, + "step": 2678 + }, + { + "epoch": 0.6513493800145879, + "grad_norm": 21.625, + "learning_rate": 1.9284598852501867e-06, + "loss": 1.3256, + "step": 2679 + }, + { + "epoch": 0.6515925115487479, + "grad_norm": 16.375, + "learning_rate": 1.928053950756054e-06, + "loss": 0.7895, + "step": 2680 + }, + { + "epoch": 0.6518356430829079, + "grad_norm": 19.375, + "learning_rate": 1.9276479149150475e-06, + "loss": 0.5394, + "step": 2681 + }, + { + "epoch": 0.6520787746170679, + "grad_norm": 24.25, + "learning_rate": 1.9272417777878573e-06, + "loss": 0.9726, + "step": 2682 + }, + { + "epoch": 0.6523219061512278, + "grad_norm": 25.25, + "learning_rate": 1.9268355394351862e-06, + "loss": 1.1387, + "step": 2683 + }, + { + "epoch": 0.6525650376853878, + "grad_norm": 22.875, + "learning_rate": 1.9264291999177547e-06, + "loss": 1.2903, + "step": 2684 + }, + { + "epoch": 0.6528081692195478, + "grad_norm": 18.25, + "learning_rate": 1.9260227592962976e-06, + "loss": 0.8315, + "step": 2685 + }, + { + "epoch": 0.6530513007537078, + "grad_norm": 20.75, + "learning_rate": 1.925616217631563e-06, + "loss": 1.2539, + "step": 2686 + }, + { + "epoch": 0.6532944322878678, + "grad_norm": 16.625, + "learning_rate": 1.9252095749843162e-06, + "loss": 0.6728, + "step": 2687 + }, + { + "epoch": 0.6535375638220278, + "grad_norm": 21.5, + "learning_rate": 1.9248028314153383e-06, + "loss": 0.796, + "step": 2688 + }, + { + "epoch": 0.6537806953561877, + "grad_norm": 15.625, + "learning_rate": 1.9243959869854222e-06, + "loss": 0.6722, + "step": 2689 + }, + { + "epoch": 0.6540238268903477, + "grad_norm": 15.75, + "learning_rate": 1.9239890417553786e-06, + "loss": 0.8586, + "step": 2690 + }, + { + "epoch": 0.6542669584245077, + "grad_norm": 19.25, + "learning_rate": 1.9235819957860323e-06, + "loss": 0.9895, + "step": 2691 + }, + { + "epoch": 0.6545100899586677, + "grad_norm": 20.875, + "learning_rate": 1.923174849138224e-06, + "loss": 0.9195, + "step": 2692 + }, + { + "epoch": 0.6547532214928277, + "grad_norm": 18.0, + "learning_rate": 1.9227676018728087e-06, + "loss": 1.1034, + "step": 2693 + }, + { + "epoch": 0.6549963530269876, + "grad_norm": 20.75, + "learning_rate": 1.922360254050655e-06, + "loss": 0.9611, + "step": 2694 + }, + { + "epoch": 0.6552394845611476, + "grad_norm": 18.25, + "learning_rate": 1.9219528057326507e-06, + "loss": 0.5477, + "step": 2695 + }, + { + "epoch": 0.6554826160953076, + "grad_norm": 17.625, + "learning_rate": 1.921545256979694e-06, + "loss": 0.7053, + "step": 2696 + }, + { + "epoch": 0.6557257476294676, + "grad_norm": 26.75, + "learning_rate": 1.9211376078527003e-06, + "loss": 0.9475, + "step": 2697 + }, + { + "epoch": 0.6559688791636276, + "grad_norm": 23.75, + "learning_rate": 1.9207298584126005e-06, + "loss": 0.6847, + "step": 2698 + }, + { + "epoch": 0.6562120106977875, + "grad_norm": 16.875, + "learning_rate": 1.920322008720339e-06, + "loss": 0.5618, + "step": 2699 + }, + { + "epoch": 0.6564551422319475, + "grad_norm": 16.625, + "learning_rate": 1.919914058836877e-06, + "loss": 0.8941, + "step": 2700 + }, + { + "epoch": 0.6566982737661075, + "grad_norm": 22.25, + "learning_rate": 1.919506008823189e-06, + "loss": 0.6823, + "step": 2701 + }, + { + "epoch": 0.6569414053002675, + "grad_norm": 19.375, + "learning_rate": 1.919097858740265e-06, + "loss": 0.9736, + "step": 2702 + }, + { + "epoch": 0.6571845368344275, + "grad_norm": 16.75, + "learning_rate": 1.91868960864911e-06, + "loss": 0.8121, + "step": 2703 + }, + { + "epoch": 0.6574276683685875, + "grad_norm": 16.5, + "learning_rate": 1.9182812586107454e-06, + "loss": 0.6385, + "step": 2704 + }, + { + "epoch": 0.6576707999027473, + "grad_norm": 18.625, + "learning_rate": 1.917872808686204e-06, + "loss": 1.0717, + "step": 2705 + }, + { + "epoch": 0.6579139314369074, + "grad_norm": 17.0, + "learning_rate": 1.9174642589365372e-06, + "loss": 0.6511, + "step": 2706 + }, + { + "epoch": 0.6581570629710674, + "grad_norm": 25.0, + "learning_rate": 1.9170556094228092e-06, + "loss": 0.8808, + "step": 2707 + }, + { + "epoch": 0.6584001945052274, + "grad_norm": 14.9375, + "learning_rate": 1.9166468602061e-06, + "loss": 0.5074, + "step": 2708 + }, + { + "epoch": 0.6586433260393874, + "grad_norm": 18.375, + "learning_rate": 1.9162380113475045e-06, + "loss": 0.6399, + "step": 2709 + }, + { + "epoch": 0.6588864575735472, + "grad_norm": 21.875, + "learning_rate": 1.9158290629081317e-06, + "loss": 1.036, + "step": 2710 + }, + { + "epoch": 0.6591295891077072, + "grad_norm": 46.75, + "learning_rate": 1.915420014949106e-06, + "loss": 1.1031, + "step": 2711 + }, + { + "epoch": 0.6593727206418672, + "grad_norm": 18.5, + "learning_rate": 1.915010867531567e-06, + "loss": 0.5675, + "step": 2712 + }, + { + "epoch": 0.6596158521760273, + "grad_norm": 26.25, + "learning_rate": 1.9146016207166684e-06, + "loss": 0.987, + "step": 2713 + }, + { + "epoch": 0.6598589837101873, + "grad_norm": 17.5, + "learning_rate": 1.91419227456558e-06, + "loss": 0.6809, + "step": 2714 + }, + { + "epoch": 0.6601021152443471, + "grad_norm": 19.125, + "learning_rate": 1.913782829139485e-06, + "loss": 0.8617, + "step": 2715 + }, + { + "epoch": 0.6603452467785071, + "grad_norm": 15.3125, + "learning_rate": 1.9133732844995824e-06, + "loss": 0.5928, + "step": 2716 + }, + { + "epoch": 0.6605883783126671, + "grad_norm": 20.0, + "learning_rate": 1.912963640707085e-06, + "loss": 1.1929, + "step": 2717 + }, + { + "epoch": 0.6608315098468271, + "grad_norm": 12.5625, + "learning_rate": 1.912553897823222e-06, + "loss": 0.5935, + "step": 2718 + }, + { + "epoch": 0.6610746413809871, + "grad_norm": 23.25, + "learning_rate": 1.912144055909237e-06, + "loss": 1.0319, + "step": 2719 + }, + { + "epoch": 0.661317772915147, + "grad_norm": 25.875, + "learning_rate": 1.9117341150263864e-06, + "loss": 1.0592, + "step": 2720 + }, + { + "epoch": 0.661560904449307, + "grad_norm": 19.875, + "learning_rate": 1.911324075235944e-06, + "loss": 0.9669, + "step": 2721 + }, + { + "epoch": 0.661804035983467, + "grad_norm": 23.375, + "learning_rate": 1.910913936599197e-06, + "loss": 0.8921, + "step": 2722 + }, + { + "epoch": 0.662047167517627, + "grad_norm": 18.875, + "learning_rate": 1.9105036991774476e-06, + "loss": 0.8377, + "step": 2723 + }, + { + "epoch": 0.662290299051787, + "grad_norm": 25.5, + "learning_rate": 1.9100933630320135e-06, + "loss": 0.8749, + "step": 2724 + }, + { + "epoch": 0.662533430585947, + "grad_norm": 19.125, + "learning_rate": 1.9096829282242257e-06, + "loss": 0.7983, + "step": 2725 + }, + { + "epoch": 0.6627765621201069, + "grad_norm": 18.75, + "learning_rate": 1.909272394815432e-06, + "loss": 0.935, + "step": 2726 + }, + { + "epoch": 0.6630196936542669, + "grad_norm": 17.5, + "learning_rate": 1.908861762866992e-06, + "loss": 0.8205, + "step": 2727 + }, + { + "epoch": 0.6632628251884269, + "grad_norm": 15.5625, + "learning_rate": 1.908451032440283e-06, + "loss": 0.5174, + "step": 2728 + }, + { + "epoch": 0.6635059567225869, + "grad_norm": 17.125, + "learning_rate": 1.908040203596695e-06, + "loss": 0.6204, + "step": 2729 + }, + { + "epoch": 0.6637490882567469, + "grad_norm": 22.75, + "learning_rate": 1.9076292763976338e-06, + "loss": 0.6751, + "step": 2730 + }, + { + "epoch": 0.6639922197909068, + "grad_norm": 23.875, + "learning_rate": 1.90721825090452e-06, + "loss": 0.939, + "step": 2731 + }, + { + "epoch": 0.6642353513250668, + "grad_norm": 42.0, + "learning_rate": 1.906807127178788e-06, + "loss": 1.2166, + "step": 2732 + }, + { + "epoch": 0.6644784828592268, + "grad_norm": 17.875, + "learning_rate": 1.906395905281887e-06, + "loss": 0.433, + "step": 2733 + }, + { + "epoch": 0.6647216143933868, + "grad_norm": 30.5, + "learning_rate": 1.905984585275282e-06, + "loss": 1.1259, + "step": 2734 + }, + { + "epoch": 0.6649647459275468, + "grad_norm": 18.875, + "learning_rate": 1.9055731672204513e-06, + "loss": 1.1893, + "step": 2735 + }, + { + "epoch": 0.6652078774617067, + "grad_norm": 15.5625, + "learning_rate": 1.9051616511788886e-06, + "loss": 0.7084, + "step": 2736 + }, + { + "epoch": 0.6654510089958667, + "grad_norm": 16.875, + "learning_rate": 1.9047500372121022e-06, + "loss": 0.8062, + "step": 2737 + }, + { + "epoch": 0.6656941405300267, + "grad_norm": 19.625, + "learning_rate": 1.904338325381615e-06, + "loss": 0.9368, + "step": 2738 + }, + { + "epoch": 0.6659372720641867, + "grad_norm": 21.375, + "learning_rate": 1.903926515748964e-06, + "loss": 0.8167, + "step": 2739 + }, + { + "epoch": 0.6661804035983467, + "grad_norm": 23.75, + "learning_rate": 1.9035146083757012e-06, + "loss": 1.1495, + "step": 2740 + }, + { + "epoch": 0.6664235351325067, + "grad_norm": 20.875, + "learning_rate": 1.903102603323394e-06, + "loss": 0.652, + "step": 2741 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 16.875, + "learning_rate": 1.9026905006536234e-06, + "loss": 0.693, + "step": 2742 + }, + { + "epoch": 0.6669097982008266, + "grad_norm": 17.0, + "learning_rate": 1.9022783004279852e-06, + "loss": 0.8309, + "step": 2743 + }, + { + "epoch": 0.6671529297349866, + "grad_norm": 16.75, + "learning_rate": 1.9018660027080893e-06, + "loss": 0.5793, + "step": 2744 + }, + { + "epoch": 0.6673960612691466, + "grad_norm": 16.875, + "learning_rate": 1.9014536075555612e-06, + "loss": 0.7686, + "step": 2745 + }, + { + "epoch": 0.6676391928033066, + "grad_norm": 14.75, + "learning_rate": 1.9010411150320408e-06, + "loss": 0.7207, + "step": 2746 + }, + { + "epoch": 0.6678823243374665, + "grad_norm": 17.5, + "learning_rate": 1.9006285251991818e-06, + "loss": 0.6781, + "step": 2747 + }, + { + "epoch": 0.6681254558716265, + "grad_norm": 17.5, + "learning_rate": 1.9002158381186527e-06, + "loss": 0.5717, + "step": 2748 + }, + { + "epoch": 0.6683685874057865, + "grad_norm": 18.875, + "learning_rate": 1.8998030538521373e-06, + "loss": 0.7242, + "step": 2749 + }, + { + "epoch": 0.6686117189399465, + "grad_norm": 22.125, + "learning_rate": 1.8993901724613328e-06, + "loss": 0.6692, + "step": 2750 + }, + { + "epoch": 0.6688548504741065, + "grad_norm": 20.625, + "learning_rate": 1.8989771940079517e-06, + "loss": 0.7458, + "step": 2751 + }, + { + "epoch": 0.6690979820082664, + "grad_norm": 18.75, + "learning_rate": 1.8985641185537207e-06, + "loss": 1.3859, + "step": 2752 + }, + { + "epoch": 0.6693411135424264, + "grad_norm": 16.75, + "learning_rate": 1.8981509461603815e-06, + "loss": 0.6103, + "step": 2753 + }, + { + "epoch": 0.6695842450765864, + "grad_norm": 17.5, + "learning_rate": 1.8977376768896888e-06, + "loss": 0.8004, + "step": 2754 + }, + { + "epoch": 0.6698273766107464, + "grad_norm": 17.375, + "learning_rate": 1.897324310803414e-06, + "loss": 0.7593, + "step": 2755 + }, + { + "epoch": 0.6700705081449064, + "grad_norm": 20.5, + "learning_rate": 1.8969108479633408e-06, + "loss": 0.9617, + "step": 2756 + }, + { + "epoch": 0.6703136396790663, + "grad_norm": 17.75, + "learning_rate": 1.8964972884312694e-06, + "loss": 0.6147, + "step": 2757 + }, + { + "epoch": 0.6705567712132263, + "grad_norm": 19.375, + "learning_rate": 1.8960836322690124e-06, + "loss": 1.0357, + "step": 2758 + }, + { + "epoch": 0.6707999027473863, + "grad_norm": 24.875, + "learning_rate": 1.8956698795383985e-06, + "loss": 1.1743, + "step": 2759 + }, + { + "epoch": 0.6710430342815463, + "grad_norm": 16.875, + "learning_rate": 1.8952560303012702e-06, + "loss": 0.5745, + "step": 2760 + }, + { + "epoch": 0.6712861658157063, + "grad_norm": 26.375, + "learning_rate": 1.8948420846194837e-06, + "loss": 1.2181, + "step": 2761 + }, + { + "epoch": 0.6715292973498663, + "grad_norm": 17.875, + "learning_rate": 1.894428042554911e-06, + "loss": 1.0507, + "step": 2762 + }, + { + "epoch": 0.6717724288840262, + "grad_norm": 17.0, + "learning_rate": 1.8940139041694377e-06, + "loss": 0.9185, + "step": 2763 + }, + { + "epoch": 0.6720155604181862, + "grad_norm": 25.25, + "learning_rate": 1.8935996695249643e-06, + "loss": 0.8981, + "step": 2764 + }, + { + "epoch": 0.6722586919523462, + "grad_norm": 21.0, + "learning_rate": 1.8931853386834047e-06, + "loss": 0.8092, + "step": 2765 + }, + { + "epoch": 0.6725018234865062, + "grad_norm": 21.375, + "learning_rate": 1.8927709117066878e-06, + "loss": 0.9859, + "step": 2766 + }, + { + "epoch": 0.6727449550206662, + "grad_norm": 19.25, + "learning_rate": 1.8923563886567574e-06, + "loss": 1.413, + "step": 2767 + }, + { + "epoch": 0.6729880865548261, + "grad_norm": 15.625, + "learning_rate": 1.8919417695955705e-06, + "loss": 0.6552, + "step": 2768 + }, + { + "epoch": 0.6732312180889861, + "grad_norm": 15.8125, + "learning_rate": 1.8915270545850998e-06, + "loss": 0.9794, + "step": 2769 + }, + { + "epoch": 0.6734743496231461, + "grad_norm": 17.375, + "learning_rate": 1.8911122436873313e-06, + "loss": 0.726, + "step": 2770 + }, + { + "epoch": 0.6737174811573061, + "grad_norm": 13.5, + "learning_rate": 1.890697336964265e-06, + "loss": 0.4577, + "step": 2771 + }, + { + "epoch": 0.6739606126914661, + "grad_norm": 13.9375, + "learning_rate": 1.890282334477917e-06, + "loss": 0.637, + "step": 2772 + }, + { + "epoch": 0.674203744225626, + "grad_norm": 17.5, + "learning_rate": 1.889867236290316e-06, + "loss": 0.8888, + "step": 2773 + }, + { + "epoch": 0.674446875759786, + "grad_norm": 22.875, + "learning_rate": 1.8894520424635055e-06, + "loss": 0.9943, + "step": 2774 + }, + { + "epoch": 0.674690007293946, + "grad_norm": 19.75, + "learning_rate": 1.8890367530595435e-06, + "loss": 1.2011, + "step": 2775 + }, + { + "epoch": 0.674933138828106, + "grad_norm": 16.625, + "learning_rate": 1.8886213681405022e-06, + "loss": 0.8127, + "step": 2776 + }, + { + "epoch": 0.675176270362266, + "grad_norm": 17.0, + "learning_rate": 1.8882058877684684e-06, + "loss": 0.8076, + "step": 2777 + }, + { + "epoch": 0.675419401896426, + "grad_norm": 18.5, + "learning_rate": 1.887790312005542e-06, + "loss": 1.0948, + "step": 2778 + }, + { + "epoch": 0.6756625334305859, + "grad_norm": 24.75, + "learning_rate": 1.887374640913839e-06, + "loss": 0.8811, + "step": 2779 + }, + { + "epoch": 0.6759056649647459, + "grad_norm": 16.75, + "learning_rate": 1.8869588745554874e-06, + "loss": 0.9598, + "step": 2780 + }, + { + "epoch": 0.6761487964989059, + "grad_norm": 18.5, + "learning_rate": 1.8865430129926316e-06, + "loss": 0.8612, + "step": 2781 + }, + { + "epoch": 0.6763919280330659, + "grad_norm": 15.6875, + "learning_rate": 1.8861270562874295e-06, + "loss": 0.6872, + "step": 2782 + }, + { + "epoch": 0.6766350595672259, + "grad_norm": 17.875, + "learning_rate": 1.8857110045020518e-06, + "loss": 0.836, + "step": 2783 + }, + { + "epoch": 0.6768781911013858, + "grad_norm": 22.125, + "learning_rate": 1.885294857698686e-06, + "loss": 0.9759, + "step": 2784 + }, + { + "epoch": 0.6771213226355458, + "grad_norm": 20.375, + "learning_rate": 1.8848786159395317e-06, + "loss": 1.1223, + "step": 2785 + }, + { + "epoch": 0.6773644541697058, + "grad_norm": 20.375, + "learning_rate": 1.884462279286803e-06, + "loss": 0.7113, + "step": 2786 + }, + { + "epoch": 0.6776075857038658, + "grad_norm": 18.625, + "learning_rate": 1.8840458478027296e-06, + "loss": 0.818, + "step": 2787 + }, + { + "epoch": 0.6778507172380258, + "grad_norm": 19.375, + "learning_rate": 1.8836293215495535e-06, + "loss": 1.1104, + "step": 2788 + }, + { + "epoch": 0.6780938487721857, + "grad_norm": 16.625, + "learning_rate": 1.8832127005895325e-06, + "loss": 0.6935, + "step": 2789 + }, + { + "epoch": 0.6783369803063457, + "grad_norm": 20.0, + "learning_rate": 1.882795984984937e-06, + "loss": 0.8294, + "step": 2790 + }, + { + "epoch": 0.6785801118405057, + "grad_norm": 16.875, + "learning_rate": 1.8823791747980535e-06, + "loss": 0.7348, + "step": 2791 + }, + { + "epoch": 0.6788232433746657, + "grad_norm": 17.875, + "learning_rate": 1.8819622700911804e-06, + "loss": 0.95, + "step": 2792 + }, + { + "epoch": 0.6790663749088257, + "grad_norm": 20.625, + "learning_rate": 1.8815452709266314e-06, + "loss": 0.8172, + "step": 2793 + }, + { + "epoch": 0.6793095064429856, + "grad_norm": 18.75, + "learning_rate": 1.8811281773667347e-06, + "loss": 0.8765, + "step": 2794 + }, + { + "epoch": 0.6795526379771456, + "grad_norm": 20.125, + "learning_rate": 1.8807109894738317e-06, + "loss": 0.7575, + "step": 2795 + }, + { + "epoch": 0.6797957695113056, + "grad_norm": 24.125, + "learning_rate": 1.8802937073102796e-06, + "loss": 0.8822, + "step": 2796 + }, + { + "epoch": 0.6800389010454656, + "grad_norm": 14.875, + "learning_rate": 1.8798763309384463e-06, + "loss": 0.6037, + "step": 2797 + }, + { + "epoch": 0.6802820325796256, + "grad_norm": 22.875, + "learning_rate": 1.8794588604207173e-06, + "loss": 0.8779, + "step": 2798 + }, + { + "epoch": 0.6805251641137856, + "grad_norm": 24.125, + "learning_rate": 1.8790412958194903e-06, + "loss": 0.6963, + "step": 2799 + }, + { + "epoch": 0.6807682956479455, + "grad_norm": 17.375, + "learning_rate": 1.878623637197178e-06, + "loss": 0.6414, + "step": 2800 + }, + { + "epoch": 0.6810114271821055, + "grad_norm": 15.1875, + "learning_rate": 1.8782058846162065e-06, + "loss": 0.7041, + "step": 2801 + }, + { + "epoch": 0.6812545587162655, + "grad_norm": 16.375, + "learning_rate": 1.8777880381390157e-06, + "loss": 0.4329, + "step": 2802 + }, + { + "epoch": 0.6814976902504255, + "grad_norm": 22.75, + "learning_rate": 1.8773700978280607e-06, + "loss": 1.1243, + "step": 2803 + }, + { + "epoch": 0.6817408217845855, + "grad_norm": 16.75, + "learning_rate": 1.8769520637458094e-06, + "loss": 0.8327, + "step": 2804 + }, + { + "epoch": 0.6819839533187454, + "grad_norm": 17.125, + "learning_rate": 1.8765339359547441e-06, + "loss": 0.6192, + "step": 2805 + }, + { + "epoch": 0.6822270848529054, + "grad_norm": 20.875, + "learning_rate": 1.8761157145173613e-06, + "loss": 1.2173, + "step": 2806 + }, + { + "epoch": 0.6824702163870654, + "grad_norm": 18.125, + "learning_rate": 1.875697399496172e-06, + "loss": 0.9912, + "step": 2807 + }, + { + "epoch": 0.6827133479212254, + "grad_norm": 19.375, + "learning_rate": 1.8752789909537005e-06, + "loss": 0.7923, + "step": 2808 + }, + { + "epoch": 0.6829564794553854, + "grad_norm": 31.75, + "learning_rate": 1.8748604889524844e-06, + "loss": 1.0251, + "step": 2809 + }, + { + "epoch": 0.6831996109895453, + "grad_norm": 16.625, + "learning_rate": 1.8744418935550764e-06, + "loss": 0.625, + "step": 2810 + }, + { + "epoch": 0.6834427425237053, + "grad_norm": 17.125, + "learning_rate": 1.874023204824043e-06, + "loss": 1.0231, + "step": 2811 + }, + { + "epoch": 0.6836858740578653, + "grad_norm": 18.0, + "learning_rate": 1.8736044228219647e-06, + "loss": 0.8958, + "step": 2812 + }, + { + "epoch": 0.6839290055920253, + "grad_norm": 22.5, + "learning_rate": 1.8731855476114353e-06, + "loss": 1.2198, + "step": 2813 + }, + { + "epoch": 0.6841721371261853, + "grad_norm": 14.9375, + "learning_rate": 1.8727665792550625e-06, + "loss": 0.4517, + "step": 2814 + }, + { + "epoch": 0.6844152686603453, + "grad_norm": 20.875, + "learning_rate": 1.8723475178154693e-06, + "loss": 0.9555, + "step": 2815 + }, + { + "epoch": 0.6846584001945052, + "grad_norm": 17.875, + "learning_rate": 1.8719283633552913e-06, + "loss": 0.8075, + "step": 2816 + }, + { + "epoch": 0.6849015317286652, + "grad_norm": 16.625, + "learning_rate": 1.8715091159371781e-06, + "loss": 0.7464, + "step": 2817 + }, + { + "epoch": 0.6851446632628252, + "grad_norm": 19.5, + "learning_rate": 1.8710897756237939e-06, + "loss": 0.9057, + "step": 2818 + }, + { + "epoch": 0.6853877947969852, + "grad_norm": 19.0, + "learning_rate": 1.8706703424778159e-06, + "loss": 0.8518, + "step": 2819 + }, + { + "epoch": 0.6856309263311452, + "grad_norm": 18.375, + "learning_rate": 1.8702508165619363e-06, + "loss": 0.9205, + "step": 2820 + }, + { + "epoch": 0.6858740578653051, + "grad_norm": 17.375, + "learning_rate": 1.8698311979388594e-06, + "loss": 0.6844, + "step": 2821 + }, + { + "epoch": 0.6861171893994651, + "grad_norm": 23.5, + "learning_rate": 1.8694114866713056e-06, + "loss": 1.3278, + "step": 2822 + }, + { + "epoch": 0.6863603209336251, + "grad_norm": 30.875, + "learning_rate": 1.8689916828220075e-06, + "loss": 1.3844, + "step": 2823 + }, + { + "epoch": 0.6866034524677851, + "grad_norm": 17.875, + "learning_rate": 1.8685717864537116e-06, + "loss": 0.4919, + "step": 2824 + }, + { + "epoch": 0.6868465840019451, + "grad_norm": 19.75, + "learning_rate": 1.8681517976291796e-06, + "loss": 0.8494, + "step": 2825 + }, + { + "epoch": 0.687089715536105, + "grad_norm": 20.5, + "learning_rate": 1.8677317164111856e-06, + "loss": 1.1265, + "step": 2826 + }, + { + "epoch": 0.687332847070265, + "grad_norm": 18.75, + "learning_rate": 1.867311542862518e-06, + "loss": 0.5187, + "step": 2827 + }, + { + "epoch": 0.687575978604425, + "grad_norm": 17.375, + "learning_rate": 1.8668912770459787e-06, + "loss": 0.8619, + "step": 2828 + }, + { + "epoch": 0.687819110138585, + "grad_norm": 18.625, + "learning_rate": 1.866470919024384e-06, + "loss": 0.9234, + "step": 2829 + }, + { + "epoch": 0.688062241672745, + "grad_norm": 20.0, + "learning_rate": 1.8660504688605638e-06, + "loss": 0.9266, + "step": 2830 + }, + { + "epoch": 0.6883053732069049, + "grad_norm": 18.625, + "learning_rate": 1.8656299266173613e-06, + "loss": 0.9105, + "step": 2831 + }, + { + "epoch": 0.6885485047410649, + "grad_norm": 20.625, + "learning_rate": 1.8652092923576342e-06, + "loss": 0.8332, + "step": 2832 + }, + { + "epoch": 0.6887916362752249, + "grad_norm": 19.875, + "learning_rate": 1.864788566144253e-06, + "loss": 1.1016, + "step": 2833 + }, + { + "epoch": 0.6890347678093849, + "grad_norm": 18.75, + "learning_rate": 1.8643677480401032e-06, + "loss": 0.8181, + "step": 2834 + }, + { + "epoch": 0.6892778993435449, + "grad_norm": 17.875, + "learning_rate": 1.8639468381080828e-06, + "loss": 0.6619, + "step": 2835 + }, + { + "epoch": 0.6895210308777049, + "grad_norm": 15.75, + "learning_rate": 1.8635258364111042e-06, + "loss": 0.7536, + "step": 2836 + }, + { + "epoch": 0.6897641624118648, + "grad_norm": 19.5, + "learning_rate": 1.863104743012093e-06, + "loss": 0.9637, + "step": 2837 + }, + { + "epoch": 0.6900072939460248, + "grad_norm": 20.75, + "learning_rate": 1.86268355797399e-06, + "loss": 1.1947, + "step": 2838 + }, + { + "epoch": 0.6902504254801848, + "grad_norm": 16.125, + "learning_rate": 1.8622622813597474e-06, + "loss": 0.8352, + "step": 2839 + }, + { + "epoch": 0.6904935570143448, + "grad_norm": 22.125, + "learning_rate": 1.8618409132323329e-06, + "loss": 1.2988, + "step": 2840 + }, + { + "epoch": 0.6907366885485048, + "grad_norm": 21.375, + "learning_rate": 1.861419453654727e-06, + "loss": 0.8299, + "step": 2841 + }, + { + "epoch": 0.6909798200826647, + "grad_norm": 20.5, + "learning_rate": 1.8609979026899239e-06, + "loss": 0.5336, + "step": 2842 + }, + { + "epoch": 0.6912229516168247, + "grad_norm": 16.75, + "learning_rate": 1.8605762604009323e-06, + "loss": 0.6185, + "step": 2843 + }, + { + "epoch": 0.6914660831509847, + "grad_norm": 18.5, + "learning_rate": 1.8601545268507734e-06, + "loss": 0.6208, + "step": 2844 + }, + { + "epoch": 0.6917092146851447, + "grad_norm": 23.625, + "learning_rate": 1.8597327021024825e-06, + "loss": 0.8914, + "step": 2845 + }, + { + "epoch": 0.6919523462193047, + "grad_norm": 15.1875, + "learning_rate": 1.8593107862191095e-06, + "loss": 0.6565, + "step": 2846 + }, + { + "epoch": 0.6921954777534646, + "grad_norm": 20.0, + "learning_rate": 1.8588887792637158e-06, + "loss": 0.9686, + "step": 2847 + }, + { + "epoch": 0.6924386092876246, + "grad_norm": 18.75, + "learning_rate": 1.858466681299378e-06, + "loss": 1.0165, + "step": 2848 + }, + { + "epoch": 0.6926817408217846, + "grad_norm": 15.25, + "learning_rate": 1.8580444923891865e-06, + "loss": 0.6777, + "step": 2849 + }, + { + "epoch": 0.6929248723559446, + "grad_norm": 20.75, + "learning_rate": 1.8576222125962442e-06, + "loss": 1.0999, + "step": 2850 + }, + { + "epoch": 0.6931680038901046, + "grad_norm": 28.75, + "learning_rate": 1.8571998419836684e-06, + "loss": 1.1889, + "step": 2851 + }, + { + "epoch": 0.6934111354242646, + "grad_norm": 15.875, + "learning_rate": 1.8567773806145892e-06, + "loss": 0.5628, + "step": 2852 + }, + { + "epoch": 0.6936542669584245, + "grad_norm": 21.375, + "learning_rate": 1.8563548285521515e-06, + "loss": 1.0427, + "step": 2853 + }, + { + "epoch": 0.6938973984925845, + "grad_norm": 22.875, + "learning_rate": 1.8559321858595121e-06, + "loss": 1.1127, + "step": 2854 + }, + { + "epoch": 0.6941405300267445, + "grad_norm": 18.625, + "learning_rate": 1.855509452599843e-06, + "loss": 1.1072, + "step": 2855 + }, + { + "epoch": 0.6943836615609045, + "grad_norm": 14.5, + "learning_rate": 1.8550866288363284e-06, + "loss": 0.471, + "step": 2856 + }, + { + "epoch": 0.6946267930950645, + "grad_norm": 21.875, + "learning_rate": 1.8546637146321672e-06, + "loss": 0.9184, + "step": 2857 + }, + { + "epoch": 0.6948699246292244, + "grad_norm": 122.5, + "learning_rate": 1.854240710050571e-06, + "loss": 0.9554, + "step": 2858 + }, + { + "epoch": 0.6951130561633844, + "grad_norm": 21.125, + "learning_rate": 1.853817615154765e-06, + "loss": 0.8874, + "step": 2859 + }, + { + "epoch": 0.6953561876975444, + "grad_norm": 22.375, + "learning_rate": 1.8533944300079876e-06, + "loss": 0.8626, + "step": 2860 + }, + { + "epoch": 0.6955993192317044, + "grad_norm": 15.25, + "learning_rate": 1.8529711546734925e-06, + "loss": 0.4943, + "step": 2861 + }, + { + "epoch": 0.6958424507658644, + "grad_norm": 18.0, + "learning_rate": 1.852547789214544e-06, + "loss": 1.1291, + "step": 2862 + }, + { + "epoch": 0.6960855823000243, + "grad_norm": 16.625, + "learning_rate": 1.8521243336944227e-06, + "loss": 0.6409, + "step": 2863 + }, + { + "epoch": 0.6963287138341843, + "grad_norm": 20.625, + "learning_rate": 1.85170078817642e-06, + "loss": 0.7132, + "step": 2864 + }, + { + "epoch": 0.6965718453683443, + "grad_norm": 26.75, + "learning_rate": 1.8512771527238433e-06, + "loss": 0.9868, + "step": 2865 + }, + { + "epoch": 0.6968149769025043, + "grad_norm": 12.625, + "learning_rate": 1.8508534274000114e-06, + "loss": 0.4037, + "step": 2866 + }, + { + "epoch": 0.6970581084366643, + "grad_norm": 18.5, + "learning_rate": 1.8504296122682578e-06, + "loss": 0.9511, + "step": 2867 + }, + { + "epoch": 0.6973012399708242, + "grad_norm": 18.75, + "learning_rate": 1.8500057073919286e-06, + "loss": 1.1812, + "step": 2868 + }, + { + "epoch": 0.6975443715049842, + "grad_norm": 20.125, + "learning_rate": 1.8495817128343844e-06, + "loss": 0.7531, + "step": 2869 + }, + { + "epoch": 0.6977875030391442, + "grad_norm": 13.625, + "learning_rate": 1.849157628658998e-06, + "loss": 0.4609, + "step": 2870 + }, + { + "epoch": 0.6980306345733042, + "grad_norm": 16.375, + "learning_rate": 1.8487334549291562e-06, + "loss": 0.7991, + "step": 2871 + }, + { + "epoch": 0.6982737661074642, + "grad_norm": 14.125, + "learning_rate": 1.8483091917082586e-06, + "loss": 0.2674, + "step": 2872 + }, + { + "epoch": 0.6985168976416242, + "grad_norm": 25.875, + "learning_rate": 1.8478848390597195e-06, + "loss": 1.0291, + "step": 2873 + }, + { + "epoch": 0.6987600291757841, + "grad_norm": 19.5, + "learning_rate": 1.8474603970469653e-06, + "loss": 0.8569, + "step": 2874 + }, + { + "epoch": 0.6990031607099441, + "grad_norm": 17.75, + "learning_rate": 1.8470358657334363e-06, + "loss": 1.0268, + "step": 2875 + }, + { + "epoch": 0.6992462922441041, + "grad_norm": 12.9375, + "learning_rate": 1.846611245182586e-06, + "loss": 0.2849, + "step": 2876 + }, + { + "epoch": 0.6994894237782641, + "grad_norm": 19.75, + "learning_rate": 1.8461865354578814e-06, + "loss": 0.7604, + "step": 2877 + }, + { + "epoch": 0.6997325553124241, + "grad_norm": 16.625, + "learning_rate": 1.8457617366228027e-06, + "loss": 0.8744, + "step": 2878 + }, + { + "epoch": 0.699975686846584, + "grad_norm": 24.125, + "learning_rate": 1.8453368487408427e-06, + "loss": 1.1708, + "step": 2879 + }, + { + "epoch": 0.700218818380744, + "grad_norm": 20.625, + "learning_rate": 1.8449118718755094e-06, + "loss": 0.6322, + "step": 2880 + }, + { + "epoch": 0.700461949914904, + "grad_norm": 20.125, + "learning_rate": 1.844486806090322e-06, + "loss": 0.7541, + "step": 2881 + }, + { + "epoch": 0.700705081449064, + "grad_norm": 19.0, + "learning_rate": 1.8440616514488146e-06, + "loss": 0.7688, + "step": 2882 + }, + { + "epoch": 0.700948212983224, + "grad_norm": 28.875, + "learning_rate": 1.8436364080145333e-06, + "loss": 1.0153, + "step": 2883 + }, + { + "epoch": 0.7011913445173839, + "grad_norm": 32.5, + "learning_rate": 1.8432110758510386e-06, + "loss": 1.027, + "step": 2884 + }, + { + "epoch": 0.7014344760515439, + "grad_norm": 24.25, + "learning_rate": 1.8427856550219038e-06, + "loss": 0.7096, + "step": 2885 + }, + { + "epoch": 0.7016776075857039, + "grad_norm": 21.375, + "learning_rate": 1.8423601455907145e-06, + "loss": 0.7179, + "step": 2886 + }, + { + "epoch": 0.7019207391198639, + "grad_norm": 17.625, + "learning_rate": 1.8419345476210712e-06, + "loss": 0.9237, + "step": 2887 + }, + { + "epoch": 0.7021638706540239, + "grad_norm": 17.125, + "learning_rate": 1.8415088611765866e-06, + "loss": 0.7091, + "step": 2888 + }, + { + "epoch": 0.7024070021881839, + "grad_norm": 14.0625, + "learning_rate": 1.8410830863208873e-06, + "loss": 0.4741, + "step": 2889 + }, + { + "epoch": 0.7026501337223438, + "grad_norm": 19.5, + "learning_rate": 1.8406572231176124e-06, + "loss": 0.789, + "step": 2890 + }, + { + "epoch": 0.7028932652565038, + "grad_norm": 16.625, + "learning_rate": 1.8402312716304138e-06, + "loss": 0.7747, + "step": 2891 + }, + { + "epoch": 0.7031363967906638, + "grad_norm": 29.125, + "learning_rate": 1.8398052319229586e-06, + "loss": 0.9976, + "step": 2892 + }, + { + "epoch": 0.7033795283248238, + "grad_norm": 23.375, + "learning_rate": 1.8393791040589255e-06, + "loss": 1.1398, + "step": 2893 + }, + { + "epoch": 0.7036226598589838, + "grad_norm": 19.125, + "learning_rate": 1.8389528881020061e-06, + "loss": 0.8569, + "step": 2894 + }, + { + "epoch": 0.7038657913931436, + "grad_norm": 18.0, + "learning_rate": 1.8385265841159056e-06, + "loss": 0.8613, + "step": 2895 + }, + { + "epoch": 0.7041089229273036, + "grad_norm": 20.375, + "learning_rate": 1.8381001921643431e-06, + "loss": 0.7865, + "step": 2896 + }, + { + "epoch": 0.7043520544614637, + "grad_norm": 16.125, + "learning_rate": 1.8376737123110503e-06, + "loss": 0.6729, + "step": 2897 + }, + { + "epoch": 0.7045951859956237, + "grad_norm": 19.25, + "learning_rate": 1.8372471446197716e-06, + "loss": 0.7436, + "step": 2898 + }, + { + "epoch": 0.7048383175297837, + "grad_norm": 17.75, + "learning_rate": 1.8368204891542648e-06, + "loss": 0.6284, + "step": 2899 + }, + { + "epoch": 0.7050814490639435, + "grad_norm": 23.625, + "learning_rate": 1.8363937459783016e-06, + "loss": 1.0442, + "step": 2900 + }, + { + "epoch": 0.7053245805981035, + "grad_norm": 19.25, + "learning_rate": 1.8359669151556652e-06, + "loss": 0.8138, + "step": 2901 + }, + { + "epoch": 0.7055677121322635, + "grad_norm": 19.0, + "learning_rate": 1.8355399967501538e-06, + "loss": 1.0141, + "step": 2902 + }, + { + "epoch": 0.7058108436664235, + "grad_norm": 20.75, + "learning_rate": 1.8351129908255767e-06, + "loss": 1.1416, + "step": 2903 + }, + { + "epoch": 0.7060539752005836, + "grad_norm": 18.0, + "learning_rate": 1.8346858974457585e-06, + "loss": 0.8712, + "step": 2904 + }, + { + "epoch": 0.7062971067347436, + "grad_norm": 20.0, + "learning_rate": 1.8342587166745346e-06, + "loss": 0.9932, + "step": 2905 + }, + { + "epoch": 0.7065402382689034, + "grad_norm": 22.375, + "learning_rate": 1.8338314485757553e-06, + "loss": 1.1834, + "step": 2906 + }, + { + "epoch": 0.7067833698030634, + "grad_norm": 22.5, + "learning_rate": 1.8334040932132825e-06, + "loss": 1.0299, + "step": 2907 + }, + { + "epoch": 0.7070265013372234, + "grad_norm": 34.0, + "learning_rate": 1.8329766506509925e-06, + "loss": 1.161, + "step": 2908 + }, + { + "epoch": 0.7072696328713834, + "grad_norm": 21.25, + "learning_rate": 1.8325491209527737e-06, + "loss": 1.3779, + "step": 2909 + }, + { + "epoch": 0.7075127644055434, + "grad_norm": 15.875, + "learning_rate": 1.8321215041825276e-06, + "loss": 0.6067, + "step": 2910 + }, + { + "epoch": 0.7077558959397033, + "grad_norm": 26.0, + "learning_rate": 1.8316938004041695e-06, + "loss": 1.2639, + "step": 2911 + }, + { + "epoch": 0.7079990274738633, + "grad_norm": 15.8125, + "learning_rate": 1.8312660096816265e-06, + "loss": 0.8692, + "step": 2912 + }, + { + "epoch": 0.7082421590080233, + "grad_norm": 24.5, + "learning_rate": 1.8308381320788397e-06, + "loss": 0.9224, + "step": 2913 + }, + { + "epoch": 0.7084852905421833, + "grad_norm": 29.5, + "learning_rate": 1.8304101676597624e-06, + "loss": 1.2138, + "step": 2914 + }, + { + "epoch": 0.7087284220763433, + "grad_norm": 28.5, + "learning_rate": 1.8299821164883613e-06, + "loss": 0.9979, + "step": 2915 + }, + { + "epoch": 0.7089715536105032, + "grad_norm": 21.25, + "learning_rate": 1.829553978628617e-06, + "loss": 1.2982, + "step": 2916 + }, + { + "epoch": 0.7092146851446632, + "grad_norm": 17.125, + "learning_rate": 1.8291257541445206e-06, + "loss": 0.807, + "step": 2917 + }, + { + "epoch": 0.7094578166788232, + "grad_norm": 23.75, + "learning_rate": 1.828697443100079e-06, + "loss": 1.2644, + "step": 2918 + }, + { + "epoch": 0.7097009482129832, + "grad_norm": 19.875, + "learning_rate": 1.8282690455593096e-06, + "loss": 0.8658, + "step": 2919 + }, + { + "epoch": 0.7099440797471432, + "grad_norm": 18.125, + "learning_rate": 1.8278405615862444e-06, + "loss": 0.876, + "step": 2920 + }, + { + "epoch": 0.7101872112813031, + "grad_norm": 20.125, + "learning_rate": 1.8274119912449279e-06, + "loss": 1.1041, + "step": 2921 + }, + { + "epoch": 0.7104303428154631, + "grad_norm": 16.875, + "learning_rate": 1.8269833345994168e-06, + "loss": 0.5185, + "step": 2922 + }, + { + "epoch": 0.7106734743496231, + "grad_norm": 16.875, + "learning_rate": 1.8265545917137817e-06, + "loss": 0.878, + "step": 2923 + }, + { + "epoch": 0.7109166058837831, + "grad_norm": 17.375, + "learning_rate": 1.826125762652105e-06, + "loss": 0.8941, + "step": 2924 + }, + { + "epoch": 0.7111597374179431, + "grad_norm": 19.5, + "learning_rate": 1.8256968474784835e-06, + "loss": 0.6803, + "step": 2925 + }, + { + "epoch": 0.7114028689521031, + "grad_norm": 18.875, + "learning_rate": 1.8252678462570253e-06, + "loss": 1.1147, + "step": 2926 + }, + { + "epoch": 0.711646000486263, + "grad_norm": 18.375, + "learning_rate": 1.8248387590518522e-06, + "loss": 1.1708, + "step": 2927 + }, + { + "epoch": 0.711889132020423, + "grad_norm": 19.875, + "learning_rate": 1.8244095859270992e-06, + "loss": 0.8755, + "step": 2928 + }, + { + "epoch": 0.712132263554583, + "grad_norm": 22.0, + "learning_rate": 1.8239803269469126e-06, + "loss": 0.9856, + "step": 2929 + }, + { + "epoch": 0.712375395088743, + "grad_norm": 19.375, + "learning_rate": 1.8235509821754532e-06, + "loss": 0.7377, + "step": 2930 + }, + { + "epoch": 0.712618526622903, + "grad_norm": 20.625, + "learning_rate": 1.823121551676894e-06, + "loss": 0.9506, + "step": 2931 + }, + { + "epoch": 0.7128616581570629, + "grad_norm": 16.375, + "learning_rate": 1.822692035515421e-06, + "loss": 0.7385, + "step": 2932 + }, + { + "epoch": 0.7131047896912229, + "grad_norm": 20.25, + "learning_rate": 1.8222624337552325e-06, + "loss": 0.9211, + "step": 2933 + }, + { + "epoch": 0.7133479212253829, + "grad_norm": 23.375, + "learning_rate": 1.8218327464605397e-06, + "loss": 1.2839, + "step": 2934 + }, + { + "epoch": 0.7135910527595429, + "grad_norm": 22.375, + "learning_rate": 1.8214029736955675e-06, + "loss": 1.0768, + "step": 2935 + }, + { + "epoch": 0.7138341842937029, + "grad_norm": 22.75, + "learning_rate": 1.8209731155245523e-06, + "loss": 1.1243, + "step": 2936 + }, + { + "epoch": 0.7140773158278628, + "grad_norm": 23.375, + "learning_rate": 1.8205431720117436e-06, + "loss": 0.8342, + "step": 2937 + }, + { + "epoch": 0.7143204473620228, + "grad_norm": 15.875, + "learning_rate": 1.8201131432214045e-06, + "loss": 0.6138, + "step": 2938 + }, + { + "epoch": 0.7145635788961828, + "grad_norm": 17.625, + "learning_rate": 1.8196830292178097e-06, + "loss": 0.805, + "step": 2939 + }, + { + "epoch": 0.7148067104303428, + "grad_norm": 16.375, + "learning_rate": 1.8192528300652479e-06, + "loss": 0.7749, + "step": 2940 + }, + { + "epoch": 0.7150498419645028, + "grad_norm": 18.625, + "learning_rate": 1.8188225458280187e-06, + "loss": 0.7135, + "step": 2941 + }, + { + "epoch": 0.7152929734986628, + "grad_norm": 16.25, + "learning_rate": 1.8183921765704365e-06, + "loss": 0.5857, + "step": 2942 + }, + { + "epoch": 0.7155361050328227, + "grad_norm": 19.125, + "learning_rate": 1.8179617223568269e-06, + "loss": 0.7907, + "step": 2943 + }, + { + "epoch": 0.7157792365669827, + "grad_norm": 15.25, + "learning_rate": 1.8175311832515289e-06, + "loss": 0.4061, + "step": 2944 + }, + { + "epoch": 0.7160223681011427, + "grad_norm": 17.0, + "learning_rate": 1.8171005593188939e-06, + "loss": 0.865, + "step": 2945 + }, + { + "epoch": 0.7162654996353027, + "grad_norm": 13.75, + "learning_rate": 1.816669850623286e-06, + "loss": 0.5594, + "step": 2946 + }, + { + "epoch": 0.7165086311694627, + "grad_norm": 29.75, + "learning_rate": 1.8162390572290828e-06, + "loss": 1.004, + "step": 2947 + }, + { + "epoch": 0.7167517627036226, + "grad_norm": 28.75, + "learning_rate": 1.8158081792006727e-06, + "loss": 1.5714, + "step": 2948 + }, + { + "epoch": 0.7169948942377826, + "grad_norm": 16.125, + "learning_rate": 1.8153772166024585e-06, + "loss": 0.6644, + "step": 2949 + }, + { + "epoch": 0.7172380257719426, + "grad_norm": 14.0, + "learning_rate": 1.8149461694988548e-06, + "loss": 0.4888, + "step": 2950 + }, + { + "epoch": 0.7174811573061026, + "grad_norm": 18.625, + "learning_rate": 1.814515037954289e-06, + "loss": 0.5384, + "step": 2951 + }, + { + "epoch": 0.7177242888402626, + "grad_norm": 15.0, + "learning_rate": 1.8140838220332019e-06, + "loss": 0.6208, + "step": 2952 + }, + { + "epoch": 0.7179674203744225, + "grad_norm": 17.75, + "learning_rate": 1.8136525218000448e-06, + "loss": 0.6364, + "step": 2953 + }, + { + "epoch": 0.7182105519085825, + "grad_norm": 17.5, + "learning_rate": 1.8132211373192844e-06, + "loss": 0.8892, + "step": 2954 + }, + { + "epoch": 0.7184536834427425, + "grad_norm": 21.125, + "learning_rate": 1.8127896686553973e-06, + "loss": 0.8518, + "step": 2955 + }, + { + "epoch": 0.7186968149769025, + "grad_norm": 19.25, + "learning_rate": 1.8123581158728744e-06, + "loss": 0.7045, + "step": 2956 + }, + { + "epoch": 0.7189399465110625, + "grad_norm": 14.8125, + "learning_rate": 1.811926479036219e-06, + "loss": 0.5171, + "step": 2957 + }, + { + "epoch": 0.7191830780452224, + "grad_norm": 20.125, + "learning_rate": 1.8114947582099466e-06, + "loss": 0.8784, + "step": 2958 + }, + { + "epoch": 0.7194262095793824, + "grad_norm": 18.625, + "learning_rate": 1.8110629534585854e-06, + "loss": 0.797, + "step": 2959 + }, + { + "epoch": 0.7196693411135424, + "grad_norm": 16.375, + "learning_rate": 1.8106310648466754e-06, + "loss": 0.7181, + "step": 2960 + }, + { + "epoch": 0.7199124726477024, + "grad_norm": 19.75, + "learning_rate": 1.8101990924387708e-06, + "loss": 1.164, + "step": 2961 + }, + { + "epoch": 0.7201556041818624, + "grad_norm": 17.25, + "learning_rate": 1.8097670362994368e-06, + "loss": 1.1201, + "step": 2962 + }, + { + "epoch": 0.7203987357160224, + "grad_norm": 20.875, + "learning_rate": 1.8093348964932516e-06, + "loss": 1.043, + "step": 2963 + }, + { + "epoch": 0.7206418672501823, + "grad_norm": 29.875, + "learning_rate": 1.808902673084806e-06, + "loss": 1.2297, + "step": 2964 + }, + { + "epoch": 0.7208849987843423, + "grad_norm": 19.75, + "learning_rate": 1.8084703661387035e-06, + "loss": 0.9002, + "step": 2965 + }, + { + "epoch": 0.7211281303185023, + "grad_norm": 21.5, + "learning_rate": 1.8080379757195597e-06, + "loss": 0.912, + "step": 2966 + }, + { + "epoch": 0.7213712618526623, + "grad_norm": 15.375, + "learning_rate": 1.8076055018920024e-06, + "loss": 0.65, + "step": 2967 + }, + { + "epoch": 0.7216143933868223, + "grad_norm": 20.625, + "learning_rate": 1.8071729447206731e-06, + "loss": 0.8992, + "step": 2968 + }, + { + "epoch": 0.7218575249209822, + "grad_norm": 19.875, + "learning_rate": 1.8067403042702241e-06, + "loss": 1.1088, + "step": 2969 + }, + { + "epoch": 0.7221006564551422, + "grad_norm": 18.875, + "learning_rate": 1.8063075806053219e-06, + "loss": 0.909, + "step": 2970 + }, + { + "epoch": 0.7223437879893022, + "grad_norm": 16.875, + "learning_rate": 1.8058747737906436e-06, + "loss": 1.0959, + "step": 2971 + }, + { + "epoch": 0.7225869195234622, + "grad_norm": 17.875, + "learning_rate": 1.80544188389088e-06, + "loss": 1.126, + "step": 2972 + }, + { + "epoch": 0.7228300510576222, + "grad_norm": 14.875, + "learning_rate": 1.8050089109707345e-06, + "loss": 0.4312, + "step": 2973 + }, + { + "epoch": 0.7230731825917821, + "grad_norm": 19.375, + "learning_rate": 1.8045758550949217e-06, + "loss": 0.9033, + "step": 2974 + }, + { + "epoch": 0.7233163141259421, + "grad_norm": 19.125, + "learning_rate": 1.8041427163281693e-06, + "loss": 0.9799, + "step": 2975 + }, + { + "epoch": 0.7235594456601021, + "grad_norm": 20.375, + "learning_rate": 1.8037094947352177e-06, + "loss": 0.8835, + "step": 2976 + }, + { + "epoch": 0.7238025771942621, + "grad_norm": 21.625, + "learning_rate": 1.8032761903808194e-06, + "loss": 1.0635, + "step": 2977 + }, + { + "epoch": 0.7240457087284221, + "grad_norm": 17.375, + "learning_rate": 1.802842803329739e-06, + "loss": 0.768, + "step": 2978 + }, + { + "epoch": 0.7242888402625821, + "grad_norm": 20.5, + "learning_rate": 1.8024093336467535e-06, + "loss": 1.0363, + "step": 2979 + }, + { + "epoch": 0.724531971796742, + "grad_norm": 23.875, + "learning_rate": 1.8019757813966526e-06, + "loss": 0.9208, + "step": 2980 + }, + { + "epoch": 0.724775103330902, + "grad_norm": 19.75, + "learning_rate": 1.8015421466442385e-06, + "loss": 0.7719, + "step": 2981 + }, + { + "epoch": 0.725018234865062, + "grad_norm": 16.625, + "learning_rate": 1.8011084294543245e-06, + "loss": 0.6558, + "step": 2982 + }, + { + "epoch": 0.725261366399222, + "grad_norm": 20.0, + "learning_rate": 1.8006746298917389e-06, + "loss": 0.8556, + "step": 2983 + }, + { + "epoch": 0.725504497933382, + "grad_norm": 26.125, + "learning_rate": 1.8002407480213183e-06, + "loss": 1.2889, + "step": 2984 + }, + { + "epoch": 0.7257476294675419, + "grad_norm": 18.875, + "learning_rate": 1.7998067839079154e-06, + "loss": 0.9437, + "step": 2985 + }, + { + "epoch": 0.7259907610017019, + "grad_norm": 23.875, + "learning_rate": 1.799372737616393e-06, + "loss": 1.1154, + "step": 2986 + }, + { + "epoch": 0.7262338925358619, + "grad_norm": 22.5, + "learning_rate": 1.798938609211627e-06, + "loss": 0.8806, + "step": 2987 + }, + { + "epoch": 0.7264770240700219, + "grad_norm": 19.875, + "learning_rate": 1.7985043987585054e-06, + "loss": 1.0027, + "step": 2988 + }, + { + "epoch": 0.7267201556041819, + "grad_norm": 22.875, + "learning_rate": 1.7980701063219286e-06, + "loss": 1.3771, + "step": 2989 + }, + { + "epoch": 0.7269632871383418, + "grad_norm": 24.0, + "learning_rate": 1.7976357319668086e-06, + "loss": 1.0942, + "step": 2990 + }, + { + "epoch": 0.7272064186725018, + "grad_norm": 18.75, + "learning_rate": 1.7972012757580703e-06, + "loss": 0.6214, + "step": 2991 + }, + { + "epoch": 0.7274495502066618, + "grad_norm": 21.125, + "learning_rate": 1.7967667377606515e-06, + "loss": 1.0108, + "step": 2992 + }, + { + "epoch": 0.7276926817408218, + "grad_norm": 20.25, + "learning_rate": 1.7963321180395004e-06, + "loss": 0.8376, + "step": 2993 + }, + { + "epoch": 0.7279358132749818, + "grad_norm": 34.0, + "learning_rate": 1.7958974166595788e-06, + "loss": 0.9713, + "step": 2994 + }, + { + "epoch": 0.7281789448091417, + "grad_norm": 21.5, + "learning_rate": 1.7954626336858602e-06, + "loss": 1.6373, + "step": 2995 + }, + { + "epoch": 0.7284220763433017, + "grad_norm": 12.0625, + "learning_rate": 1.7950277691833308e-06, + "loss": 0.3456, + "step": 2996 + }, + { + "epoch": 0.7286652078774617, + "grad_norm": 22.625, + "learning_rate": 1.7945928232169879e-06, + "loss": 0.9632, + "step": 2997 + }, + { + "epoch": 0.7289083394116217, + "grad_norm": 17.75, + "learning_rate": 1.7941577958518424e-06, + "loss": 0.7156, + "step": 2998 + }, + { + "epoch": 0.7291514709457817, + "grad_norm": 14.9375, + "learning_rate": 1.7937226871529162e-06, + "loss": 0.9448, + "step": 2999 + }, + { + "epoch": 0.7293946024799417, + "grad_norm": 22.625, + "learning_rate": 1.7932874971852443e-06, + "loss": 1.0137, + "step": 3000 + }, + { + "epoch": 0.7296377340141016, + "grad_norm": 21.125, + "learning_rate": 1.7928522260138729e-06, + "loss": 0.891, + "step": 3001 + }, + { + "epoch": 0.7298808655482616, + "grad_norm": 17.25, + "learning_rate": 1.7924168737038612e-06, + "loss": 0.8415, + "step": 3002 + }, + { + "epoch": 0.7301239970824216, + "grad_norm": 21.75, + "learning_rate": 1.791981440320279e-06, + "loss": 0.641, + "step": 3003 + }, + { + "epoch": 0.7303671286165816, + "grad_norm": 15.125, + "learning_rate": 1.791545925928211e-06, + "loss": 0.6934, + "step": 3004 + }, + { + "epoch": 0.7306102601507416, + "grad_norm": 18.125, + "learning_rate": 1.7911103305927512e-06, + "loss": 0.8781, + "step": 3005 + }, + { + "epoch": 0.7308533916849015, + "grad_norm": 15.9375, + "learning_rate": 1.7906746543790075e-06, + "loss": 0.711, + "step": 3006 + }, + { + "epoch": 0.7310965232190615, + "grad_norm": 17.0, + "learning_rate": 1.7902388973520987e-06, + "loss": 0.7602, + "step": 3007 + }, + { + "epoch": 0.7313396547532215, + "grad_norm": 20.5, + "learning_rate": 1.7898030595771566e-06, + "loss": 0.6901, + "step": 3008 + }, + { + "epoch": 0.7315827862873815, + "grad_norm": 19.125, + "learning_rate": 1.7893671411193244e-06, + "loss": 0.6929, + "step": 3009 + }, + { + "epoch": 0.7318259178215415, + "grad_norm": 20.25, + "learning_rate": 1.7889311420437578e-06, + "loss": 1.0812, + "step": 3010 + }, + { + "epoch": 0.7320690493557014, + "grad_norm": 28.25, + "learning_rate": 1.7884950624156242e-06, + "loss": 1.1491, + "step": 3011 + }, + { + "epoch": 0.7323121808898614, + "grad_norm": 18.375, + "learning_rate": 1.7880589023001036e-06, + "loss": 0.7281, + "step": 3012 + }, + { + "epoch": 0.7325553124240214, + "grad_norm": 27.25, + "learning_rate": 1.7876226617623874e-06, + "loss": 0.9335, + "step": 3013 + }, + { + "epoch": 0.7327984439581814, + "grad_norm": 36.25, + "learning_rate": 1.7871863408676796e-06, + "loss": 1.1777, + "step": 3014 + }, + { + "epoch": 0.7330415754923414, + "grad_norm": 23.0, + "learning_rate": 1.7867499396811949e-06, + "loss": 1.0634, + "step": 3015 + }, + { + "epoch": 0.7332847070265014, + "grad_norm": 20.5, + "learning_rate": 1.786313458268162e-06, + "loss": 0.9238, + "step": 3016 + }, + { + "epoch": 0.7335278385606613, + "grad_norm": 16.75, + "learning_rate": 1.785876896693821e-06, + "loss": 0.9873, + "step": 3017 + }, + { + "epoch": 0.7337709700948213, + "grad_norm": 17.125, + "learning_rate": 1.7854402550234218e-06, + "loss": 0.6296, + "step": 3018 + }, + { + "epoch": 0.7340141016289813, + "grad_norm": 18.125, + "learning_rate": 1.7850035333222298e-06, + "loss": 0.6889, + "step": 3019 + }, + { + "epoch": 0.7342572331631413, + "grad_norm": 18.0, + "learning_rate": 1.7845667316555198e-06, + "loss": 0.4127, + "step": 3020 + }, + { + "epoch": 0.7345003646973013, + "grad_norm": 20.0, + "learning_rate": 1.7841298500885798e-06, + "loss": 0.682, + "step": 3021 + }, + { + "epoch": 0.7347434962314612, + "grad_norm": 23.0, + "learning_rate": 1.7836928886867082e-06, + "loss": 1.0004, + "step": 3022 + }, + { + "epoch": 0.7349866277656212, + "grad_norm": 20.75, + "learning_rate": 1.783255847515218e-06, + "loss": 0.78, + "step": 3023 + }, + { + "epoch": 0.7352297592997812, + "grad_norm": 19.625, + "learning_rate": 1.7828187266394312e-06, + "loss": 0.886, + "step": 3024 + }, + { + "epoch": 0.7354728908339412, + "grad_norm": 14.4375, + "learning_rate": 1.7823815261246839e-06, + "loss": 0.6049, + "step": 3025 + }, + { + "epoch": 0.7357160223681012, + "grad_norm": 14.9375, + "learning_rate": 1.7819442460363225e-06, + "loss": 0.9734, + "step": 3026 + }, + { + "epoch": 0.7359591539022611, + "grad_norm": 20.5, + "learning_rate": 1.781506886439707e-06, + "loss": 0.9641, + "step": 3027 + }, + { + "epoch": 0.7362022854364211, + "grad_norm": 37.75, + "learning_rate": 1.7810694474002076e-06, + "loss": 1.4406, + "step": 3028 + }, + { + "epoch": 0.7364454169705811, + "grad_norm": 22.125, + "learning_rate": 1.7806319289832078e-06, + "loss": 1.0294, + "step": 3029 + }, + { + "epoch": 0.7366885485047411, + "grad_norm": 24.0, + "learning_rate": 1.7801943312541014e-06, + "loss": 0.6694, + "step": 3030 + }, + { + "epoch": 0.7369316800389011, + "grad_norm": 18.625, + "learning_rate": 1.7797566542782956e-06, + "loss": 0.6523, + "step": 3031 + }, + { + "epoch": 0.737174811573061, + "grad_norm": 18.75, + "learning_rate": 1.779318898121209e-06, + "loss": 0.8146, + "step": 3032 + }, + { + "epoch": 0.737417943107221, + "grad_norm": 31.875, + "learning_rate": 1.7788810628482708e-06, + "loss": 0.816, + "step": 3033 + }, + { + "epoch": 0.737661074641381, + "grad_norm": 28.125, + "learning_rate": 1.778443148524924e-06, + "loss": 1.3549, + "step": 3034 + }, + { + "epoch": 0.737904206175541, + "grad_norm": 24.0, + "learning_rate": 1.778005155216622e-06, + "loss": 0.9372, + "step": 3035 + }, + { + "epoch": 0.738147337709701, + "grad_norm": 23.125, + "learning_rate": 1.7775670829888309e-06, + "loss": 0.8605, + "step": 3036 + }, + { + "epoch": 0.738390469243861, + "grad_norm": 20.0, + "learning_rate": 1.7771289319070276e-06, + "loss": 0.9511, + "step": 3037 + }, + { + "epoch": 0.7386336007780209, + "grad_norm": 20.25, + "learning_rate": 1.7766907020367013e-06, + "loss": 0.708, + "step": 3038 + }, + { + "epoch": 0.7388767323121809, + "grad_norm": 21.0, + "learning_rate": 1.7762523934433538e-06, + "loss": 0.8422, + "step": 3039 + }, + { + "epoch": 0.7391198638463409, + "grad_norm": 16.875, + "learning_rate": 1.7758140061924971e-06, + "loss": 0.686, + "step": 3040 + }, + { + "epoch": 0.7393629953805009, + "grad_norm": 17.25, + "learning_rate": 1.7753755403496564e-06, + "loss": 0.73, + "step": 3041 + }, + { + "epoch": 0.7396061269146609, + "grad_norm": 17.0, + "learning_rate": 1.774936995980367e-06, + "loss": 0.6003, + "step": 3042 + }, + { + "epoch": 0.7398492584488208, + "grad_norm": 19.25, + "learning_rate": 1.7744983731501783e-06, + "loss": 0.9744, + "step": 3043 + }, + { + "epoch": 0.7400923899829808, + "grad_norm": 20.625, + "learning_rate": 1.774059671924649e-06, + "loss": 1.174, + "step": 3044 + }, + { + "epoch": 0.7403355215171408, + "grad_norm": 17.625, + "learning_rate": 1.773620892369351e-06, + "loss": 0.8853, + "step": 3045 + }, + { + "epoch": 0.7405786530513008, + "grad_norm": 12.6875, + "learning_rate": 1.7731820345498672e-06, + "loss": 0.5966, + "step": 3046 + }, + { + "epoch": 0.7408217845854608, + "grad_norm": 18.625, + "learning_rate": 1.7727430985317927e-06, + "loss": 1.2801, + "step": 3047 + }, + { + "epoch": 0.7410649161196207, + "grad_norm": 17.125, + "learning_rate": 1.7723040843807343e-06, + "loss": 0.8067, + "step": 3048 + }, + { + "epoch": 0.7413080476537807, + "grad_norm": 16.5, + "learning_rate": 1.7718649921623097e-06, + "loss": 0.635, + "step": 3049 + }, + { + "epoch": 0.7415511791879407, + "grad_norm": 16.125, + "learning_rate": 1.7714258219421493e-06, + "loss": 0.6125, + "step": 3050 + }, + { + "epoch": 0.7417943107221007, + "grad_norm": 18.125, + "learning_rate": 1.7709865737858945e-06, + "loss": 1.0174, + "step": 3051 + }, + { + "epoch": 0.7420374422562607, + "grad_norm": 16.625, + "learning_rate": 1.7705472477591982e-06, + "loss": 0.7565, + "step": 3052 + }, + { + "epoch": 0.7422805737904207, + "grad_norm": 23.5, + "learning_rate": 1.7701078439277255e-06, + "loss": 0.4331, + "step": 3053 + }, + { + "epoch": 0.7425237053245806, + "grad_norm": 22.5, + "learning_rate": 1.7696683623571533e-06, + "loss": 0.9624, + "step": 3054 + }, + { + "epoch": 0.7427668368587406, + "grad_norm": 13.3125, + "learning_rate": 1.7692288031131694e-06, + "loss": 0.4313, + "step": 3055 + }, + { + "epoch": 0.7430099683929006, + "grad_norm": 18.375, + "learning_rate": 1.7687891662614733e-06, + "loss": 0.7108, + "step": 3056 + }, + { + "epoch": 0.7432530999270606, + "grad_norm": 21.875, + "learning_rate": 1.7683494518677766e-06, + "loss": 0.8518, + "step": 3057 + }, + { + "epoch": 0.7434962314612206, + "grad_norm": 15.0625, + "learning_rate": 1.7679096599978019e-06, + "loss": 0.62, + "step": 3058 + }, + { + "epoch": 0.7437393629953805, + "grad_norm": 21.75, + "learning_rate": 1.7674697907172841e-06, + "loss": 1.3389, + "step": 3059 + }, + { + "epoch": 0.7439824945295405, + "grad_norm": 22.5, + "learning_rate": 1.7670298440919692e-06, + "loss": 1.0756, + "step": 3060 + }, + { + "epoch": 0.7442256260637005, + "grad_norm": 19.625, + "learning_rate": 1.766589820187614e-06, + "loss": 0.735, + "step": 3061 + }, + { + "epoch": 0.7444687575978605, + "grad_norm": 24.625, + "learning_rate": 1.7661497190699894e-06, + "loss": 0.9854, + "step": 3062 + }, + { + "epoch": 0.7447118891320205, + "grad_norm": 17.75, + "learning_rate": 1.7657095408048744e-06, + "loss": 0.9337, + "step": 3063 + }, + { + "epoch": 0.7449550206661804, + "grad_norm": 16.25, + "learning_rate": 1.7652692854580622e-06, + "loss": 0.5433, + "step": 3064 + }, + { + "epoch": 0.7451981522003404, + "grad_norm": 18.375, + "learning_rate": 1.7648289530953561e-06, + "loss": 0.8421, + "step": 3065 + }, + { + "epoch": 0.7454412837345004, + "grad_norm": 31.875, + "learning_rate": 1.7643885437825715e-06, + "loss": 1.266, + "step": 3066 + }, + { + "epoch": 0.7456844152686604, + "grad_norm": 18.625, + "learning_rate": 1.7639480575855356e-06, + "loss": 0.7353, + "step": 3067 + }, + { + "epoch": 0.7459275468028204, + "grad_norm": 14.0, + "learning_rate": 1.7635074945700858e-06, + "loss": 0.684, + "step": 3068 + }, + { + "epoch": 0.7461706783369803, + "grad_norm": 20.75, + "learning_rate": 1.7630668548020726e-06, + "loss": 0.6465, + "step": 3069 + }, + { + "epoch": 0.7464138098711403, + "grad_norm": 36.5, + "learning_rate": 1.762626138347357e-06, + "loss": 1.2077, + "step": 3070 + }, + { + "epoch": 0.7466569414053003, + "grad_norm": 20.875, + "learning_rate": 1.7621853452718115e-06, + "loss": 1.0533, + "step": 3071 + }, + { + "epoch": 0.7469000729394603, + "grad_norm": 13.875, + "learning_rate": 1.7617444756413205e-06, + "loss": 0.5045, + "step": 3072 + }, + { + "epoch": 0.7471432044736203, + "grad_norm": 16.5, + "learning_rate": 1.7613035295217795e-06, + "loss": 0.6456, + "step": 3073 + }, + { + "epoch": 0.7473863360077803, + "grad_norm": 17.125, + "learning_rate": 1.7608625069790959e-06, + "loss": 0.8867, + "step": 3074 + }, + { + "epoch": 0.7476294675419402, + "grad_norm": 24.0, + "learning_rate": 1.760421408079187e-06, + "loss": 0.805, + "step": 3075 + }, + { + "epoch": 0.7478725990761002, + "grad_norm": 21.5, + "learning_rate": 1.759980232887984e-06, + "loss": 0.5942, + "step": 3076 + }, + { + "epoch": 0.7481157306102602, + "grad_norm": 12.375, + "learning_rate": 1.759538981471427e-06, + "loss": 0.3936, + "step": 3077 + }, + { + "epoch": 0.7483588621444202, + "grad_norm": 18.375, + "learning_rate": 1.7590976538954696e-06, + "loss": 1.0391, + "step": 3078 + }, + { + "epoch": 0.7486019936785802, + "grad_norm": 13.1875, + "learning_rate": 1.7586562502260753e-06, + "loss": 0.4015, + "step": 3079 + }, + { + "epoch": 0.74884512521274, + "grad_norm": 22.0, + "learning_rate": 1.7582147705292192e-06, + "loss": 0.9547, + "step": 3080 + }, + { + "epoch": 0.7490882567469, + "grad_norm": 18.5, + "learning_rate": 1.757773214870889e-06, + "loss": 0.93, + "step": 3081 + }, + { + "epoch": 0.74933138828106, + "grad_norm": 17.25, + "learning_rate": 1.7573315833170821e-06, + "loss": 0.8952, + "step": 3082 + }, + { + "epoch": 0.74957451981522, + "grad_norm": 19.125, + "learning_rate": 1.7568898759338082e-06, + "loss": 0.8293, + "step": 3083 + }, + { + "epoch": 0.74981765134938, + "grad_norm": 18.5, + "learning_rate": 1.756448092787088e-06, + "loss": 0.8411, + "step": 3084 + }, + { + "epoch": 0.75006078288354, + "grad_norm": 13.375, + "learning_rate": 1.7560062339429533e-06, + "loss": 0.4048, + "step": 3085 + }, + { + "epoch": 0.7503039144177, + "grad_norm": 17.75, + "learning_rate": 1.7555642994674489e-06, + "loss": 1.0634, + "step": 3086 + }, + { + "epoch": 0.75054704595186, + "grad_norm": 18.0, + "learning_rate": 1.7551222894266278e-06, + "loss": 0.7873, + "step": 3087 + }, + { + "epoch": 0.75079017748602, + "grad_norm": 18.375, + "learning_rate": 1.7546802038865568e-06, + "loss": 0.7158, + "step": 3088 + }, + { + "epoch": 0.75103330902018, + "grad_norm": 13.1875, + "learning_rate": 1.7542380429133133e-06, + "loss": 0.3718, + "step": 3089 + }, + { + "epoch": 0.75127644055434, + "grad_norm": 12.375, + "learning_rate": 1.7537958065729857e-06, + "loss": 0.3316, + "step": 3090 + }, + { + "epoch": 0.7515195720884998, + "grad_norm": 26.0, + "learning_rate": 1.7533534949316745e-06, + "loss": 1.5041, + "step": 3091 + }, + { + "epoch": 0.7517627036226598, + "grad_norm": 23.25, + "learning_rate": 1.7529111080554894e-06, + "loss": 0.8524, + "step": 3092 + }, + { + "epoch": 0.7520058351568198, + "grad_norm": 15.0625, + "learning_rate": 1.7524686460105542e-06, + "loss": 0.6641, + "step": 3093 + }, + { + "epoch": 0.7522489666909798, + "grad_norm": 22.875, + "learning_rate": 1.7520261088630016e-06, + "loss": 0.5891, + "step": 3094 + }, + { + "epoch": 0.7524920982251398, + "grad_norm": 17.875, + "learning_rate": 1.751583496678977e-06, + "loss": 0.9094, + "step": 3095 + }, + { + "epoch": 0.7527352297592997, + "grad_norm": 15.0625, + "learning_rate": 1.751140809524636e-06, + "loss": 0.4343, + "step": 3096 + }, + { + "epoch": 0.7529783612934597, + "grad_norm": 22.25, + "learning_rate": 1.7506980474661462e-06, + "loss": 0.7665, + "step": 3097 + }, + { + "epoch": 0.7532214928276197, + "grad_norm": 17.0, + "learning_rate": 1.750255210569686e-06, + "loss": 0.8135, + "step": 3098 + }, + { + "epoch": 0.7534646243617797, + "grad_norm": 20.75, + "learning_rate": 1.7498122989014443e-06, + "loss": 0.927, + "step": 3099 + }, + { + "epoch": 0.7537077558959397, + "grad_norm": 18.5, + "learning_rate": 1.749369312527623e-06, + "loss": 0.7866, + "step": 3100 + }, + { + "epoch": 0.7539508874300996, + "grad_norm": 18.5, + "learning_rate": 1.7489262515144333e-06, + "loss": 0.7273, + "step": 3101 + }, + { + "epoch": 0.7541940189642596, + "grad_norm": 18.5, + "learning_rate": 1.7484831159280986e-06, + "loss": 1.1789, + "step": 3102 + }, + { + "epoch": 0.7544371504984196, + "grad_norm": 16.25, + "learning_rate": 1.7480399058348529e-06, + "loss": 0.8168, + "step": 3103 + }, + { + "epoch": 0.7546802820325796, + "grad_norm": 22.625, + "learning_rate": 1.747596621300942e-06, + "loss": 0.8012, + "step": 3104 + }, + { + "epoch": 0.7549234135667396, + "grad_norm": 26.25, + "learning_rate": 1.7471532623926227e-06, + "loss": 1.0752, + "step": 3105 + }, + { + "epoch": 0.7551665451008995, + "grad_norm": 36.25, + "learning_rate": 1.7467098291761616e-06, + "loss": 0.8673, + "step": 3106 + }, + { + "epoch": 0.7554096766350595, + "grad_norm": 20.375, + "learning_rate": 1.7462663217178382e-06, + "loss": 0.9313, + "step": 3107 + }, + { + "epoch": 0.7556528081692195, + "grad_norm": 18.875, + "learning_rate": 1.7458227400839422e-06, + "loss": 0.8523, + "step": 3108 + }, + { + "epoch": 0.7558959397033795, + "grad_norm": 16.75, + "learning_rate": 1.7453790843407747e-06, + "loss": 0.7026, + "step": 3109 + }, + { + "epoch": 0.7561390712375395, + "grad_norm": 15.9375, + "learning_rate": 1.7449353545546477e-06, + "loss": 0.5233, + "step": 3110 + }, + { + "epoch": 0.7563822027716995, + "grad_norm": 14.125, + "learning_rate": 1.7444915507918835e-06, + "loss": 0.653, + "step": 3111 + }, + { + "epoch": 0.7566253343058594, + "grad_norm": 21.75, + "learning_rate": 1.7440476731188175e-06, + "loss": 1.1768, + "step": 3112 + }, + { + "epoch": 0.7568684658400194, + "grad_norm": 17.75, + "learning_rate": 1.743603721601794e-06, + "loss": 0.5922, + "step": 3113 + }, + { + "epoch": 0.7571115973741794, + "grad_norm": 16.375, + "learning_rate": 1.7431596963071695e-06, + "loss": 0.7568, + "step": 3114 + }, + { + "epoch": 0.7573547289083394, + "grad_norm": 15.5, + "learning_rate": 1.742715597301311e-06, + "loss": 0.6965, + "step": 3115 + }, + { + "epoch": 0.7575978604424994, + "grad_norm": 16.0, + "learning_rate": 1.7422714246505972e-06, + "loss": 0.5604, + "step": 3116 + }, + { + "epoch": 0.7578409919766593, + "grad_norm": 24.375, + "learning_rate": 1.7418271784214174e-06, + "loss": 1.4837, + "step": 3117 + }, + { + "epoch": 0.7580841235108193, + "grad_norm": 18.75, + "learning_rate": 1.7413828586801713e-06, + "loss": 0.9794, + "step": 3118 + }, + { + "epoch": 0.7583272550449793, + "grad_norm": 20.125, + "learning_rate": 1.7409384654932707e-06, + "loss": 1.03, + "step": 3119 + }, + { + "epoch": 0.7585703865791393, + "grad_norm": 18.125, + "learning_rate": 1.7404939989271374e-06, + "loss": 0.923, + "step": 3120 + }, + { + "epoch": 0.7588135181132993, + "grad_norm": 19.5, + "learning_rate": 1.7400494590482049e-06, + "loss": 1.0926, + "step": 3121 + }, + { + "epoch": 0.7590566496474592, + "grad_norm": 21.875, + "learning_rate": 1.7396048459229175e-06, + "loss": 0.6412, + "step": 3122 + }, + { + "epoch": 0.7592997811816192, + "grad_norm": 21.375, + "learning_rate": 1.73916015961773e-06, + "loss": 1.078, + "step": 3123 + }, + { + "epoch": 0.7595429127157792, + "grad_norm": 17.125, + "learning_rate": 1.7387154001991086e-06, + "loss": 0.6388, + "step": 3124 + }, + { + "epoch": 0.7597860442499392, + "grad_norm": 17.125, + "learning_rate": 1.73827056773353e-06, + "loss": 0.6687, + "step": 3125 + }, + { + "epoch": 0.7600291757840992, + "grad_norm": 17.5, + "learning_rate": 1.7378256622874826e-06, + "loss": 0.8569, + "step": 3126 + }, + { + "epoch": 0.7602723073182592, + "grad_norm": 18.375, + "learning_rate": 1.7373806839274647e-06, + "loss": 1.1778, + "step": 3127 + }, + { + "epoch": 0.7605154388524191, + "grad_norm": 19.75, + "learning_rate": 1.7369356327199862e-06, + "loss": 1.0933, + "step": 3128 + }, + { + "epoch": 0.7607585703865791, + "grad_norm": 24.375, + "learning_rate": 1.736490508731568e-06, + "loss": 0.9143, + "step": 3129 + }, + { + "epoch": 0.7610017019207391, + "grad_norm": 19.25, + "learning_rate": 1.736045312028741e-06, + "loss": 0.7533, + "step": 3130 + }, + { + "epoch": 0.7612448334548991, + "grad_norm": 18.375, + "learning_rate": 1.735600042678048e-06, + "loss": 0.9688, + "step": 3131 + }, + { + "epoch": 0.7614879649890591, + "grad_norm": 16.875, + "learning_rate": 1.735154700746042e-06, + "loss": 0.3887, + "step": 3132 + }, + { + "epoch": 0.761731096523219, + "grad_norm": 18.875, + "learning_rate": 1.7347092862992871e-06, + "loss": 0.8986, + "step": 3133 + }, + { + "epoch": 0.761974228057379, + "grad_norm": 22.75, + "learning_rate": 1.7342637994043582e-06, + "loss": 1.1174, + "step": 3134 + }, + { + "epoch": 0.762217359591539, + "grad_norm": 14.9375, + "learning_rate": 1.733818240127841e-06, + "loss": 0.7159, + "step": 3135 + }, + { + "epoch": 0.762460491125699, + "grad_norm": 14.5625, + "learning_rate": 1.7333726085363317e-06, + "loss": 0.5699, + "step": 3136 + }, + { + "epoch": 0.762703622659859, + "grad_norm": 19.0, + "learning_rate": 1.732926904696438e-06, + "loss": 0.7077, + "step": 3137 + }, + { + "epoch": 0.7629467541940189, + "grad_norm": 25.25, + "learning_rate": 1.7324811286747779e-06, + "loss": 0.9547, + "step": 3138 + }, + { + "epoch": 0.7631898857281789, + "grad_norm": 16.375, + "learning_rate": 1.7320352805379807e-06, + "loss": 0.7508, + "step": 3139 + }, + { + "epoch": 0.7634330172623389, + "grad_norm": 18.5, + "learning_rate": 1.7315893603526857e-06, + "loss": 1.1658, + "step": 3140 + }, + { + "epoch": 0.7636761487964989, + "grad_norm": 20.375, + "learning_rate": 1.7311433681855432e-06, + "loss": 1.1308, + "step": 3141 + }, + { + "epoch": 0.7639192803306589, + "grad_norm": 17.75, + "learning_rate": 1.7306973041032145e-06, + "loss": 1.1094, + "step": 3142 + }, + { + "epoch": 0.7641624118648188, + "grad_norm": 18.875, + "learning_rate": 1.7302511681723721e-06, + "loss": 1.0631, + "step": 3143 + }, + { + "epoch": 0.7644055433989788, + "grad_norm": 19.75, + "learning_rate": 1.729804960459699e-06, + "loss": 0.7407, + "step": 3144 + }, + { + "epoch": 0.7646486749331388, + "grad_norm": 20.625, + "learning_rate": 1.7293586810318872e-06, + "loss": 1.0228, + "step": 3145 + }, + { + "epoch": 0.7648918064672988, + "grad_norm": 14.6875, + "learning_rate": 1.7289123299556419e-06, + "loss": 0.5856, + "step": 3146 + }, + { + "epoch": 0.7651349380014588, + "grad_norm": 14.0, + "learning_rate": 1.7284659072976778e-06, + "loss": 0.7226, + "step": 3147 + }, + { + "epoch": 0.7653780695356188, + "grad_norm": 17.875, + "learning_rate": 1.7280194131247208e-06, + "loss": 0.8066, + "step": 3148 + }, + { + "epoch": 0.7656212010697787, + "grad_norm": 17.25, + "learning_rate": 1.7275728475035063e-06, + "loss": 0.6307, + "step": 3149 + }, + { + "epoch": 0.7658643326039387, + "grad_norm": 19.125, + "learning_rate": 1.727126210500782e-06, + "loss": 0.8575, + "step": 3150 + }, + { + "epoch": 0.7661074641380987, + "grad_norm": 23.625, + "learning_rate": 1.7266795021833052e-06, + "loss": 0.8573, + "step": 3151 + }, + { + "epoch": 0.7663505956722587, + "grad_norm": 20.875, + "learning_rate": 1.7262327226178445e-06, + "loss": 1.252, + "step": 3152 + }, + { + "epoch": 0.7665937272064187, + "grad_norm": 17.5, + "learning_rate": 1.7257858718711784e-06, + "loss": 0.9626, + "step": 3153 + }, + { + "epoch": 0.7668368587405786, + "grad_norm": 20.25, + "learning_rate": 1.7253389500100965e-06, + "loss": 0.6853, + "step": 3154 + }, + { + "epoch": 0.7670799902747386, + "grad_norm": 16.125, + "learning_rate": 1.724891957101399e-06, + "loss": 0.7116, + "step": 3155 + }, + { + "epoch": 0.7673231218088986, + "grad_norm": 16.75, + "learning_rate": 1.7244448932118976e-06, + "loss": 0.7574, + "step": 3156 + }, + { + "epoch": 0.7675662533430586, + "grad_norm": 20.25, + "learning_rate": 1.7239977584084122e-06, + "loss": 0.9469, + "step": 3157 + }, + { + "epoch": 0.7678093848772186, + "grad_norm": 21.0, + "learning_rate": 1.723550552757776e-06, + "loss": 1.0499, + "step": 3158 + }, + { + "epoch": 0.7680525164113785, + "grad_norm": 16.75, + "learning_rate": 1.7231032763268314e-06, + "loss": 0.4783, + "step": 3159 + }, + { + "epoch": 0.7682956479455385, + "grad_norm": 18.75, + "learning_rate": 1.722655929182431e-06, + "loss": 1.0957, + "step": 3160 + }, + { + "epoch": 0.7685387794796985, + "grad_norm": 17.375, + "learning_rate": 1.7222085113914388e-06, + "loss": 0.6827, + "step": 3161 + }, + { + "epoch": 0.7687819110138585, + "grad_norm": 16.0, + "learning_rate": 1.7217610230207294e-06, + "loss": 0.682, + "step": 3162 + }, + { + "epoch": 0.7690250425480185, + "grad_norm": 42.5, + "learning_rate": 1.7213134641371876e-06, + "loss": 0.893, + "step": 3163 + }, + { + "epoch": 0.7692681740821785, + "grad_norm": 18.75, + "learning_rate": 1.7208658348077087e-06, + "loss": 0.5515, + "step": 3164 + }, + { + "epoch": 0.7695113056163384, + "grad_norm": 19.25, + "learning_rate": 1.7204181350991987e-06, + "loss": 0.762, + "step": 3165 + }, + { + "epoch": 0.7697544371504984, + "grad_norm": 17.5, + "learning_rate": 1.7199703650785738e-06, + "loss": 0.7527, + "step": 3166 + }, + { + "epoch": 0.7699975686846584, + "grad_norm": 14.625, + "learning_rate": 1.7195225248127611e-06, + "loss": 0.764, + "step": 3167 + }, + { + "epoch": 0.7702407002188184, + "grad_norm": 14.875, + "learning_rate": 1.7190746143686986e-06, + "loss": 0.6568, + "step": 3168 + }, + { + "epoch": 0.7704838317529784, + "grad_norm": 21.75, + "learning_rate": 1.7186266338133334e-06, + "loss": 0.9405, + "step": 3169 + }, + { + "epoch": 0.7707269632871383, + "grad_norm": 16.875, + "learning_rate": 1.7181785832136245e-06, + "loss": 0.9862, + "step": 3170 + }, + { + "epoch": 0.7709700948212983, + "grad_norm": 26.0, + "learning_rate": 1.7177304626365404e-06, + "loss": 1.051, + "step": 3171 + }, + { + "epoch": 0.7712132263554583, + "grad_norm": 23.25, + "learning_rate": 1.717282272149061e-06, + "loss": 1.0199, + "step": 3172 + }, + { + "epoch": 0.7714563578896183, + "grad_norm": 25.25, + "learning_rate": 1.7168340118181754e-06, + "loss": 1.2688, + "step": 3173 + }, + { + "epoch": 0.7716994894237783, + "grad_norm": 18.875, + "learning_rate": 1.7163856817108845e-06, + "loss": 0.8713, + "step": 3174 + }, + { + "epoch": 0.7719426209579382, + "grad_norm": 14.125, + "learning_rate": 1.715937281894199e-06, + "loss": 0.4008, + "step": 3175 + }, + { + "epoch": 0.7721857524920982, + "grad_norm": 16.25, + "learning_rate": 1.715488812435139e-06, + "loss": 0.7364, + "step": 3176 + }, + { + "epoch": 0.7724288840262582, + "grad_norm": 20.375, + "learning_rate": 1.7150402734007372e-06, + "loss": 1.1006, + "step": 3177 + }, + { + "epoch": 0.7726720155604182, + "grad_norm": 15.0, + "learning_rate": 1.7145916648580345e-06, + "loss": 0.5837, + "step": 3178 + }, + { + "epoch": 0.7729151470945782, + "grad_norm": 15.9375, + "learning_rate": 1.7141429868740843e-06, + "loss": 0.6657, + "step": 3179 + }, + { + "epoch": 0.7731582786287381, + "grad_norm": 15.25, + "learning_rate": 1.7136942395159487e-06, + "loss": 0.4304, + "step": 3180 + }, + { + "epoch": 0.7734014101628981, + "grad_norm": 16.125, + "learning_rate": 1.7132454228507002e-06, + "loss": 0.6216, + "step": 3181 + }, + { + "epoch": 0.7736445416970581, + "grad_norm": 19.875, + "learning_rate": 1.7127965369454233e-06, + "loss": 1.0955, + "step": 3182 + }, + { + "epoch": 0.7738876732312181, + "grad_norm": 18.25, + "learning_rate": 1.7123475818672108e-06, + "loss": 0.8218, + "step": 3183 + }, + { + "epoch": 0.7741308047653781, + "grad_norm": 22.375, + "learning_rate": 1.7118985576831673e-06, + "loss": 1.0814, + "step": 3184 + }, + { + "epoch": 0.7743739362995381, + "grad_norm": 20.75, + "learning_rate": 1.7114494644604072e-06, + "loss": 1.1863, + "step": 3185 + }, + { + "epoch": 0.774617067833698, + "grad_norm": 18.125, + "learning_rate": 1.7110003022660548e-06, + "loss": 0.825, + "step": 3186 + }, + { + "epoch": 0.774860199367858, + "grad_norm": 25.375, + "learning_rate": 1.7105510711672456e-06, + "loss": 0.7863, + "step": 3187 + }, + { + "epoch": 0.775103330902018, + "grad_norm": 23.5, + "learning_rate": 1.710101771231125e-06, + "loss": 0.903, + "step": 3188 + }, + { + "epoch": 0.775346462436178, + "grad_norm": 20.75, + "learning_rate": 1.7096524025248483e-06, + "loss": 1.0174, + "step": 3189 + }, + { + "epoch": 0.775589593970338, + "grad_norm": 27.0, + "learning_rate": 1.7092029651155816e-06, + "loss": 1.0477, + "step": 3190 + }, + { + "epoch": 0.7758327255044979, + "grad_norm": 19.375, + "learning_rate": 1.7087534590705012e-06, + "loss": 0.965, + "step": 3191 + }, + { + "epoch": 0.7760758570386579, + "grad_norm": 20.375, + "learning_rate": 1.7083038844567931e-06, + "loss": 1.0624, + "step": 3192 + }, + { + "epoch": 0.7763189885728179, + "grad_norm": 20.25, + "learning_rate": 1.7078542413416547e-06, + "loss": 1.0174, + "step": 3193 + }, + { + "epoch": 0.7765621201069779, + "grad_norm": 12.125, + "learning_rate": 1.7074045297922924e-06, + "loss": 0.6654, + "step": 3194 + }, + { + "epoch": 0.7768052516411379, + "grad_norm": 17.375, + "learning_rate": 1.7069547498759231e-06, + "loss": 0.8047, + "step": 3195 + }, + { + "epoch": 0.7770483831752978, + "grad_norm": 18.125, + "learning_rate": 1.706504901659775e-06, + "loss": 0.7833, + "step": 3196 + }, + { + "epoch": 0.7772915147094578, + "grad_norm": 16.625, + "learning_rate": 1.706054985211085e-06, + "loss": 0.676, + "step": 3197 + }, + { + "epoch": 0.7775346462436178, + "grad_norm": 27.125, + "learning_rate": 1.7056050005971008e-06, + "loss": 1.1959, + "step": 3198 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 17.875, + "learning_rate": 1.7051549478850816e-06, + "loss": 0.9621, + "step": 3199 + }, + { + "epoch": 0.7780209093119378, + "grad_norm": 19.25, + "learning_rate": 1.7047048271422937e-06, + "loss": 1.1471, + "step": 3200 + }, + { + "epoch": 0.7782640408460978, + "grad_norm": 20.375, + "learning_rate": 1.7042546384360171e-06, + "loss": 0.7744, + "step": 3201 + }, + { + "epoch": 0.7785071723802577, + "grad_norm": 15.3125, + "learning_rate": 1.7038043818335389e-06, + "loss": 0.691, + "step": 3202 + }, + { + "epoch": 0.7787503039144177, + "grad_norm": 19.25, + "learning_rate": 1.7033540574021587e-06, + "loss": 0.6704, + "step": 3203 + }, + { + "epoch": 0.7789934354485777, + "grad_norm": 19.0, + "learning_rate": 1.7029036652091846e-06, + "loss": 1.4227, + "step": 3204 + }, + { + "epoch": 0.7792365669827377, + "grad_norm": 16.625, + "learning_rate": 1.7024532053219362e-06, + "loss": 0.6215, + "step": 3205 + }, + { + "epoch": 0.7794796985168977, + "grad_norm": 16.25, + "learning_rate": 1.7020026778077423e-06, + "loss": 0.7057, + "step": 3206 + }, + { + "epoch": 0.7797228300510576, + "grad_norm": 24.75, + "learning_rate": 1.7015520827339415e-06, + "loss": 0.8641, + "step": 3207 + }, + { + "epoch": 0.7799659615852176, + "grad_norm": 16.125, + "learning_rate": 1.7011014201678832e-06, + "loss": 1.3396, + "step": 3208 + }, + { + "epoch": 0.7802090931193776, + "grad_norm": 17.0, + "learning_rate": 1.7006506901769271e-06, + "loss": 0.9634, + "step": 3209 + }, + { + "epoch": 0.7804522246535376, + "grad_norm": 15.1875, + "learning_rate": 1.7001998928284423e-06, + "loss": 0.627, + "step": 3210 + }, + { + "epoch": 0.7806953561876976, + "grad_norm": 22.5, + "learning_rate": 1.6997490281898082e-06, + "loss": 0.8008, + "step": 3211 + }, + { + "epoch": 0.7809384877218575, + "grad_norm": 19.625, + "learning_rate": 1.6992980963284144e-06, + "loss": 0.8307, + "step": 3212 + }, + { + "epoch": 0.7811816192560175, + "grad_norm": 16.375, + "learning_rate": 1.6988470973116611e-06, + "loss": 0.7118, + "step": 3213 + }, + { + "epoch": 0.7814247507901775, + "grad_norm": 20.875, + "learning_rate": 1.6983960312069566e-06, + "loss": 0.92, + "step": 3214 + }, + { + "epoch": 0.7816678823243375, + "grad_norm": 22.0, + "learning_rate": 1.6979448980817212e-06, + "loss": 1.061, + "step": 3215 + }, + { + "epoch": 0.7819110138584975, + "grad_norm": 18.75, + "learning_rate": 1.6974936980033846e-06, + "loss": 0.8675, + "step": 3216 + }, + { + "epoch": 0.7821541453926574, + "grad_norm": 19.625, + "learning_rate": 1.6970424310393863e-06, + "loss": 0.705, + "step": 3217 + }, + { + "epoch": 0.7823972769268174, + "grad_norm": 18.875, + "learning_rate": 1.6965910972571763e-06, + "loss": 0.833, + "step": 3218 + }, + { + "epoch": 0.7826404084609774, + "grad_norm": 23.5, + "learning_rate": 1.6961396967242133e-06, + "loss": 1.1624, + "step": 3219 + }, + { + "epoch": 0.7828835399951374, + "grad_norm": 20.875, + "learning_rate": 1.6956882295079683e-06, + "loss": 1.2196, + "step": 3220 + }, + { + "epoch": 0.7831266715292974, + "grad_norm": 19.625, + "learning_rate": 1.6952366956759195e-06, + "loss": 1.1915, + "step": 3221 + }, + { + "epoch": 0.7833698030634574, + "grad_norm": 16.875, + "learning_rate": 1.6947850952955572e-06, + "loss": 0.7576, + "step": 3222 + }, + { + "epoch": 0.7836129345976173, + "grad_norm": 18.75, + "learning_rate": 1.6943334284343807e-06, + "loss": 0.648, + "step": 3223 + }, + { + "epoch": 0.7838560661317773, + "grad_norm": 23.875, + "learning_rate": 1.6938816951598993e-06, + "loss": 0.8941, + "step": 3224 + }, + { + "epoch": 0.7840991976659373, + "grad_norm": 29.125, + "learning_rate": 1.6934298955396331e-06, + "loss": 1.1015, + "step": 3225 + }, + { + "epoch": 0.7843423292000973, + "grad_norm": 21.125, + "learning_rate": 1.6929780296411099e-06, + "loss": 0.9174, + "step": 3226 + }, + { + "epoch": 0.7845854607342573, + "grad_norm": 23.25, + "learning_rate": 1.69252609753187e-06, + "loss": 0.6624, + "step": 3227 + }, + { + "epoch": 0.7848285922684172, + "grad_norm": 17.625, + "learning_rate": 1.692074099279462e-06, + "loss": 0.7137, + "step": 3228 + }, + { + "epoch": 0.7850717238025772, + "grad_norm": 21.375, + "learning_rate": 1.6916220349514451e-06, + "loss": 1.0116, + "step": 3229 + }, + { + "epoch": 0.7853148553367372, + "grad_norm": 21.5, + "learning_rate": 1.6911699046153884e-06, + "loss": 0.8142, + "step": 3230 + }, + { + "epoch": 0.7855579868708972, + "grad_norm": 20.875, + "learning_rate": 1.6907177083388693e-06, + "loss": 0.8175, + "step": 3231 + }, + { + "epoch": 0.7858011184050572, + "grad_norm": 21.125, + "learning_rate": 1.690265446189478e-06, + "loss": 0.7351, + "step": 3232 + }, + { + "epoch": 0.7860442499392171, + "grad_norm": 26.875, + "learning_rate": 1.6898131182348118e-06, + "loss": 1.2597, + "step": 3233 + }, + { + "epoch": 0.7862873814733771, + "grad_norm": 22.5, + "learning_rate": 1.6893607245424792e-06, + "loss": 0.9505, + "step": 3234 + }, + { + "epoch": 0.7865305130075371, + "grad_norm": 19.375, + "learning_rate": 1.6889082651800983e-06, + "loss": 0.994, + "step": 3235 + }, + { + "epoch": 0.7867736445416971, + "grad_norm": 41.0, + "learning_rate": 1.6884557402152971e-06, + "loss": 0.8545, + "step": 3236 + }, + { + "epoch": 0.7870167760758571, + "grad_norm": 29.25, + "learning_rate": 1.6880031497157133e-06, + "loss": 0.4975, + "step": 3237 + }, + { + "epoch": 0.7872599076100171, + "grad_norm": 22.75, + "learning_rate": 1.687550493748994e-06, + "loss": 1.159, + "step": 3238 + }, + { + "epoch": 0.787503039144177, + "grad_norm": 28.625, + "learning_rate": 1.6870977723827963e-06, + "loss": 1.4826, + "step": 3239 + }, + { + "epoch": 0.787746170678337, + "grad_norm": 28.125, + "learning_rate": 1.686644985684788e-06, + "loss": 1.3861, + "step": 3240 + }, + { + "epoch": 0.787989302212497, + "grad_norm": 21.25, + "learning_rate": 1.6861921337226453e-06, + "loss": 1.2006, + "step": 3241 + }, + { + "epoch": 0.788232433746657, + "grad_norm": 25.5, + "learning_rate": 1.685739216564055e-06, + "loss": 1.059, + "step": 3242 + }, + { + "epoch": 0.788475565280817, + "grad_norm": 14.5625, + "learning_rate": 1.6852862342767132e-06, + "loss": 0.6043, + "step": 3243 + }, + { + "epoch": 0.7887186968149769, + "grad_norm": 16.75, + "learning_rate": 1.6848331869283263e-06, + "loss": 0.6907, + "step": 3244 + }, + { + "epoch": 0.7889618283491369, + "grad_norm": 17.375, + "learning_rate": 1.6843800745866095e-06, + "loss": 0.6402, + "step": 3245 + }, + { + "epoch": 0.7892049598832969, + "grad_norm": 17.125, + "learning_rate": 1.6839268973192888e-06, + "loss": 0.814, + "step": 3246 + }, + { + "epoch": 0.7894480914174569, + "grad_norm": 18.375, + "learning_rate": 1.683473655194099e-06, + "loss": 0.4058, + "step": 3247 + }, + { + "epoch": 0.7896912229516169, + "grad_norm": 30.75, + "learning_rate": 1.683020348278785e-06, + "loss": 1.2858, + "step": 3248 + }, + { + "epoch": 0.7899343544857768, + "grad_norm": 19.75, + "learning_rate": 1.6825669766411015e-06, + "loss": 1.006, + "step": 3249 + }, + { + "epoch": 0.7901774860199368, + "grad_norm": 22.25, + "learning_rate": 1.6821135403488126e-06, + "loss": 0.8786, + "step": 3250 + }, + { + "epoch": 0.7904206175540968, + "grad_norm": 26.125, + "learning_rate": 1.6816600394696924e-06, + "loss": 1.0818, + "step": 3251 + }, + { + "epoch": 0.7906637490882568, + "grad_norm": 27.75, + "learning_rate": 1.681206474071524e-06, + "loss": 1.0326, + "step": 3252 + }, + { + "epoch": 0.7909068806224168, + "grad_norm": 21.75, + "learning_rate": 1.680752844222101e-06, + "loss": 1.0456, + "step": 3253 + }, + { + "epoch": 0.7911500121565767, + "grad_norm": 23.625, + "learning_rate": 1.680299149989226e-06, + "loss": 0.6997, + "step": 3254 + }, + { + "epoch": 0.7913931436907367, + "grad_norm": 29.25, + "learning_rate": 1.6798453914407115e-06, + "loss": 0.9938, + "step": 3255 + }, + { + "epoch": 0.7916362752248967, + "grad_norm": 22.5, + "learning_rate": 1.6793915686443798e-06, + "loss": 0.9355, + "step": 3256 + }, + { + "epoch": 0.7918794067590567, + "grad_norm": 17.5, + "learning_rate": 1.6789376816680622e-06, + "loss": 0.5805, + "step": 3257 + }, + { + "epoch": 0.7921225382932167, + "grad_norm": 24.25, + "learning_rate": 1.6784837305796e-06, + "loss": 1.3705, + "step": 3258 + }, + { + "epoch": 0.7923656698273767, + "grad_norm": 19.5, + "learning_rate": 1.6780297154468442e-06, + "loss": 1.0128, + "step": 3259 + }, + { + "epoch": 0.7926088013615366, + "grad_norm": 13.4375, + "learning_rate": 1.677575636337655e-06, + "loss": 0.3846, + "step": 3260 + }, + { + "epoch": 0.7928519328956966, + "grad_norm": 25.0, + "learning_rate": 1.6771214933199027e-06, + "loss": 1.0039, + "step": 3261 + }, + { + "epoch": 0.7930950644298566, + "grad_norm": 21.0, + "learning_rate": 1.6766672864614658e-06, + "loss": 1.1159, + "step": 3262 + }, + { + "epoch": 0.7933381959640166, + "grad_norm": 20.5, + "learning_rate": 1.676213015830235e-06, + "loss": 1.0886, + "step": 3263 + }, + { + "epoch": 0.7935813274981766, + "grad_norm": 18.75, + "learning_rate": 1.6757586814941078e-06, + "loss": 1.0525, + "step": 3264 + }, + { + "epoch": 0.7938244590323364, + "grad_norm": 19.0, + "learning_rate": 1.6753042835209924e-06, + "loss": 1.0886, + "step": 3265 + }, + { + "epoch": 0.7940675905664965, + "grad_norm": 16.625, + "learning_rate": 1.6748498219788067e-06, + "loss": 0.6404, + "step": 3266 + }, + { + "epoch": 0.7943107221006565, + "grad_norm": 18.875, + "learning_rate": 1.6743952969354777e-06, + "loss": 0.972, + "step": 3267 + }, + { + "epoch": 0.7945538536348165, + "grad_norm": 16.25, + "learning_rate": 1.673940708458942e-06, + "loss": 0.5333, + "step": 3268 + }, + { + "epoch": 0.7947969851689765, + "grad_norm": 22.25, + "learning_rate": 1.6734860566171454e-06, + "loss": 0.8507, + "step": 3269 + }, + { + "epoch": 0.7950401167031363, + "grad_norm": 15.5625, + "learning_rate": 1.6730313414780442e-06, + "loss": 0.6042, + "step": 3270 + }, + { + "epoch": 0.7952832482372963, + "grad_norm": 16.0, + "learning_rate": 1.672576563109603e-06, + "loss": 0.6339, + "step": 3271 + }, + { + "epoch": 0.7955263797714563, + "grad_norm": 14.4375, + "learning_rate": 1.6721217215797961e-06, + "loss": 0.7245, + "step": 3272 + }, + { + "epoch": 0.7957695113056164, + "grad_norm": 18.125, + "learning_rate": 1.6716668169566074e-06, + "loss": 0.7123, + "step": 3273 + }, + { + "epoch": 0.7960126428397764, + "grad_norm": 19.25, + "learning_rate": 1.671211849308031e-06, + "loss": 0.8693, + "step": 3274 + }, + { + "epoch": 0.7962557743739364, + "grad_norm": 23.25, + "learning_rate": 1.670756818702069e-06, + "loss": 0.9725, + "step": 3275 + }, + { + "epoch": 0.7964989059080962, + "grad_norm": 19.25, + "learning_rate": 1.6703017252067336e-06, + "loss": 1.1398, + "step": 3276 + }, + { + "epoch": 0.7967420374422562, + "grad_norm": 23.625, + "learning_rate": 1.6698465688900462e-06, + "loss": 0.9758, + "step": 3277 + }, + { + "epoch": 0.7969851689764162, + "grad_norm": 21.75, + "learning_rate": 1.6693913498200383e-06, + "loss": 0.9211, + "step": 3278 + }, + { + "epoch": 0.7972283005105762, + "grad_norm": 16.5, + "learning_rate": 1.6689360680647504e-06, + "loss": 0.589, + "step": 3279 + }, + { + "epoch": 0.7974714320447363, + "grad_norm": 20.875, + "learning_rate": 1.6684807236922318e-06, + "loss": 0.9694, + "step": 3280 + }, + { + "epoch": 0.7977145635788961, + "grad_norm": 14.0, + "learning_rate": 1.6680253167705409e-06, + "loss": 0.4407, + "step": 3281 + }, + { + "epoch": 0.7979576951130561, + "grad_norm": 16.0, + "learning_rate": 1.6675698473677473e-06, + "loss": 0.7154, + "step": 3282 + }, + { + "epoch": 0.7982008266472161, + "grad_norm": 14.125, + "learning_rate": 1.6671143155519286e-06, + "loss": 0.6744, + "step": 3283 + }, + { + "epoch": 0.7984439581813761, + "grad_norm": 14.125, + "learning_rate": 1.6666587213911714e-06, + "loss": 0.4512, + "step": 3284 + }, + { + "epoch": 0.7986870897155361, + "grad_norm": 21.25, + "learning_rate": 1.6662030649535725e-06, + "loss": 1.0703, + "step": 3285 + }, + { + "epoch": 0.798930221249696, + "grad_norm": 15.3125, + "learning_rate": 1.665747346307237e-06, + "loss": 0.6652, + "step": 3286 + }, + { + "epoch": 0.799173352783856, + "grad_norm": 18.5, + "learning_rate": 1.665291565520281e-06, + "loss": 0.7123, + "step": 3287 + }, + { + "epoch": 0.799416484318016, + "grad_norm": 18.0, + "learning_rate": 1.6648357226608281e-06, + "loss": 0.6371, + "step": 3288 + }, + { + "epoch": 0.799659615852176, + "grad_norm": 20.125, + "learning_rate": 1.6643798177970113e-06, + "loss": 0.9583, + "step": 3289 + }, + { + "epoch": 0.799902747386336, + "grad_norm": 18.5, + "learning_rate": 1.663923850996975e-06, + "loss": 1.0215, + "step": 3290 + }, + { + "epoch": 0.8001458789204959, + "grad_norm": 19.125, + "learning_rate": 1.66346782232887e-06, + "loss": 0.9595, + "step": 3291 + }, + { + "epoch": 0.8003890104546559, + "grad_norm": 17.875, + "learning_rate": 1.6630117318608585e-06, + "loss": 0.9341, + "step": 3292 + }, + { + "epoch": 0.8006321419888159, + "grad_norm": 11.1875, + "learning_rate": 1.6625555796611102e-06, + "loss": 0.2916, + "step": 3293 + }, + { + "epoch": 0.8008752735229759, + "grad_norm": 19.5, + "learning_rate": 1.6620993657978054e-06, + "loss": 0.9743, + "step": 3294 + }, + { + "epoch": 0.8011184050571359, + "grad_norm": 23.375, + "learning_rate": 1.6616430903391337e-06, + "loss": 1.0148, + "step": 3295 + }, + { + "epoch": 0.8013615365912959, + "grad_norm": 22.875, + "learning_rate": 1.6611867533532921e-06, + "loss": 1.2657, + "step": 3296 + }, + { + "epoch": 0.8016046681254558, + "grad_norm": 20.0, + "learning_rate": 1.6607303549084897e-06, + "loss": 1.0109, + "step": 3297 + }, + { + "epoch": 0.8018477996596158, + "grad_norm": 29.625, + "learning_rate": 1.6602738950729417e-06, + "loss": 1.0593, + "step": 3298 + }, + { + "epoch": 0.8020909311937758, + "grad_norm": 17.625, + "learning_rate": 1.6598173739148744e-06, + "loss": 0.6808, + "step": 3299 + }, + { + "epoch": 0.8023340627279358, + "grad_norm": 15.1875, + "learning_rate": 1.6593607915025227e-06, + "loss": 0.5933, + "step": 3300 + }, + { + "epoch": 0.8025771942620958, + "grad_norm": 14.625, + "learning_rate": 1.658904147904131e-06, + "loss": 0.5634, + "step": 3301 + }, + { + "epoch": 0.8028203257962557, + "grad_norm": 14.75, + "learning_rate": 1.6584474431879527e-06, + "loss": 0.6136, + "step": 3302 + }, + { + "epoch": 0.8030634573304157, + "grad_norm": 18.875, + "learning_rate": 1.6579906774222493e-06, + "loss": 0.7261, + "step": 3303 + }, + { + "epoch": 0.8033065888645757, + "grad_norm": 16.875, + "learning_rate": 1.657533850675293e-06, + "loss": 0.5733, + "step": 3304 + }, + { + "epoch": 0.8035497203987357, + "grad_norm": 23.375, + "learning_rate": 1.6570769630153643e-06, + "loss": 0.7423, + "step": 3305 + }, + { + "epoch": 0.8037928519328957, + "grad_norm": 17.375, + "learning_rate": 1.6566200145107534e-06, + "loss": 0.5861, + "step": 3306 + }, + { + "epoch": 0.8040359834670556, + "grad_norm": 20.625, + "learning_rate": 1.6561630052297586e-06, + "loss": 0.9351, + "step": 3307 + }, + { + "epoch": 0.8042791150012156, + "grad_norm": 23.375, + "learning_rate": 1.655705935240688e-06, + "loss": 1.3245, + "step": 3308 + }, + { + "epoch": 0.8045222465353756, + "grad_norm": 16.75, + "learning_rate": 1.6552488046118588e-06, + "loss": 0.7572, + "step": 3309 + }, + { + "epoch": 0.8047653780695356, + "grad_norm": 15.5, + "learning_rate": 1.6547916134115964e-06, + "loss": 0.4335, + "step": 3310 + }, + { + "epoch": 0.8050085096036956, + "grad_norm": 15.5625, + "learning_rate": 1.6543343617082364e-06, + "loss": 0.6441, + "step": 3311 + }, + { + "epoch": 0.8052516411378556, + "grad_norm": 16.75, + "learning_rate": 1.653877049570123e-06, + "loss": 0.798, + "step": 3312 + }, + { + "epoch": 0.8054947726720155, + "grad_norm": 18.375, + "learning_rate": 1.6534196770656097e-06, + "loss": 0.6102, + "step": 3313 + }, + { + "epoch": 0.8057379042061755, + "grad_norm": 20.875, + "learning_rate": 1.6529622442630583e-06, + "loss": 1.1213, + "step": 3314 + }, + { + "epoch": 0.8059810357403355, + "grad_norm": 20.625, + "learning_rate": 1.6525047512308398e-06, + "loss": 0.8487, + "step": 3315 + }, + { + "epoch": 0.8062241672744955, + "grad_norm": 61.5, + "learning_rate": 1.6520471980373348e-06, + "loss": 0.8042, + "step": 3316 + }, + { + "epoch": 0.8064672988086555, + "grad_norm": 17.75, + "learning_rate": 1.6515895847509325e-06, + "loss": 0.9871, + "step": 3317 + }, + { + "epoch": 0.8067104303428154, + "grad_norm": 17.5, + "learning_rate": 1.6511319114400308e-06, + "loss": 0.6412, + "step": 3318 + }, + { + "epoch": 0.8069535618769754, + "grad_norm": 20.125, + "learning_rate": 1.6506741781730379e-06, + "loss": 0.6683, + "step": 3319 + }, + { + "epoch": 0.8071966934111354, + "grad_norm": 18.0, + "learning_rate": 1.6502163850183683e-06, + "loss": 0.94, + "step": 3320 + }, + { + "epoch": 0.8074398249452954, + "grad_norm": 38.5, + "learning_rate": 1.6497585320444487e-06, + "loss": 1.5201, + "step": 3321 + }, + { + "epoch": 0.8076829564794554, + "grad_norm": 21.625, + "learning_rate": 1.649300619319712e-06, + "loss": 0.8041, + "step": 3322 + }, + { + "epoch": 0.8079260880136153, + "grad_norm": 27.875, + "learning_rate": 1.6488426469126017e-06, + "loss": 0.9246, + "step": 3323 + }, + { + "epoch": 0.8081692195477753, + "grad_norm": 17.125, + "learning_rate": 1.6483846148915698e-06, + "loss": 0.7319, + "step": 3324 + }, + { + "epoch": 0.8084123510819353, + "grad_norm": 19.25, + "learning_rate": 1.6479265233250763e-06, + "loss": 1.2679, + "step": 3325 + }, + { + "epoch": 0.8086554826160953, + "grad_norm": 22.125, + "learning_rate": 1.647468372281592e-06, + "loss": 0.7942, + "step": 3326 + }, + { + "epoch": 0.8088986141502553, + "grad_norm": 25.125, + "learning_rate": 1.6470101618295946e-06, + "loss": 1.0763, + "step": 3327 + }, + { + "epoch": 0.8091417456844152, + "grad_norm": 24.125, + "learning_rate": 1.6465518920375723e-06, + "loss": 1.0571, + "step": 3328 + }, + { + "epoch": 0.8093848772185752, + "grad_norm": 17.625, + "learning_rate": 1.6460935629740207e-06, + "loss": 0.8949, + "step": 3329 + }, + { + "epoch": 0.8096280087527352, + "grad_norm": 20.25, + "learning_rate": 1.6456351747074454e-06, + "loss": 1.1852, + "step": 3330 + }, + { + "epoch": 0.8098711402868952, + "grad_norm": 18.625, + "learning_rate": 1.6451767273063605e-06, + "loss": 0.7703, + "step": 3331 + }, + { + "epoch": 0.8101142718210552, + "grad_norm": 19.0, + "learning_rate": 1.6447182208392887e-06, + "loss": 1.0443, + "step": 3332 + }, + { + "epoch": 0.8103574033552152, + "grad_norm": 16.5, + "learning_rate": 1.644259655374762e-06, + "loss": 0.7272, + "step": 3333 + }, + { + "epoch": 0.8106005348893751, + "grad_norm": 25.125, + "learning_rate": 1.6438010309813202e-06, + "loss": 1.2095, + "step": 3334 + }, + { + "epoch": 0.8108436664235351, + "grad_norm": 16.0, + "learning_rate": 1.6433423477275134e-06, + "loss": 0.9695, + "step": 3335 + }, + { + "epoch": 0.8110867979576951, + "grad_norm": 25.375, + "learning_rate": 1.6428836056818995e-06, + "loss": 1.247, + "step": 3336 + }, + { + "epoch": 0.8113299294918551, + "grad_norm": 18.625, + "learning_rate": 1.6424248049130453e-06, + "loss": 1.0522, + "step": 3337 + }, + { + "epoch": 0.8115730610260151, + "grad_norm": 20.0, + "learning_rate": 1.641965945489527e-06, + "loss": 0.7952, + "step": 3338 + }, + { + "epoch": 0.811816192560175, + "grad_norm": 20.625, + "learning_rate": 1.641507027479928e-06, + "loss": 0.8989, + "step": 3339 + }, + { + "epoch": 0.812059324094335, + "grad_norm": 16.5, + "learning_rate": 1.6410480509528427e-06, + "loss": 0.5721, + "step": 3340 + }, + { + "epoch": 0.812302455628495, + "grad_norm": 21.5, + "learning_rate": 1.6405890159768722e-06, + "loss": 1.0475, + "step": 3341 + }, + { + "epoch": 0.812545587162655, + "grad_norm": 19.125, + "learning_rate": 1.640129922620628e-06, + "loss": 1.0032, + "step": 3342 + }, + { + "epoch": 0.812788718696815, + "grad_norm": 12.5, + "learning_rate": 1.6396707709527287e-06, + "loss": 0.3883, + "step": 3343 + }, + { + "epoch": 0.8130318502309749, + "grad_norm": 20.75, + "learning_rate": 1.639211561041803e-06, + "loss": 0.6252, + "step": 3344 + }, + { + "epoch": 0.8132749817651349, + "grad_norm": 17.25, + "learning_rate": 1.6387522929564874e-06, + "loss": 0.5771, + "step": 3345 + }, + { + "epoch": 0.8135181132992949, + "grad_norm": 40.5, + "learning_rate": 1.6382929667654278e-06, + "loss": 1.5006, + "step": 3346 + }, + { + "epoch": 0.8137612448334549, + "grad_norm": 17.75, + "learning_rate": 1.6378335825372786e-06, + "loss": 0.7383, + "step": 3347 + }, + { + "epoch": 0.8140043763676149, + "grad_norm": 20.125, + "learning_rate": 1.6373741403407018e-06, + "loss": 1.3033, + "step": 3348 + }, + { + "epoch": 0.8142475079017749, + "grad_norm": 16.625, + "learning_rate": 1.6369146402443698e-06, + "loss": 0.6044, + "step": 3349 + }, + { + "epoch": 0.8144906394359348, + "grad_norm": 17.25, + "learning_rate": 1.6364550823169625e-06, + "loss": 0.704, + "step": 3350 + }, + { + "epoch": 0.8147337709700948, + "grad_norm": 19.125, + "learning_rate": 1.6359954666271688e-06, + "loss": 1.004, + "step": 3351 + }, + { + "epoch": 0.8149769025042548, + "grad_norm": 13.9375, + "learning_rate": 1.6355357932436863e-06, + "loss": 0.4525, + "step": 3352 + }, + { + "epoch": 0.8152200340384148, + "grad_norm": 22.5, + "learning_rate": 1.635076062235221e-06, + "loss": 1.1442, + "step": 3353 + }, + { + "epoch": 0.8154631655725748, + "grad_norm": 25.625, + "learning_rate": 1.6346162736704878e-06, + "loss": 1.3611, + "step": 3354 + }, + { + "epoch": 0.8157062971067347, + "grad_norm": 14.8125, + "learning_rate": 1.6341564276182097e-06, + "loss": 0.8538, + "step": 3355 + }, + { + "epoch": 0.8159494286408947, + "grad_norm": 18.125, + "learning_rate": 1.6336965241471193e-06, + "loss": 0.6753, + "step": 3356 + }, + { + "epoch": 0.8161925601750547, + "grad_norm": 14.25, + "learning_rate": 1.6332365633259568e-06, + "loss": 0.7703, + "step": 3357 + }, + { + "epoch": 0.8164356917092147, + "grad_norm": 15.0, + "learning_rate": 1.6327765452234706e-06, + "loss": 0.4715, + "step": 3358 + }, + { + "epoch": 0.8166788232433747, + "grad_norm": 16.5, + "learning_rate": 1.6323164699084193e-06, + "loss": 0.6636, + "step": 3359 + }, + { + "epoch": 0.8169219547775346, + "grad_norm": 25.25, + "learning_rate": 1.6318563374495686e-06, + "loss": 1.0865, + "step": 3360 + }, + { + "epoch": 0.8171650863116946, + "grad_norm": 17.125, + "learning_rate": 1.6313961479156935e-06, + "loss": 0.5912, + "step": 3361 + }, + { + "epoch": 0.8174082178458546, + "grad_norm": 14.3125, + "learning_rate": 1.6309359013755772e-06, + "loss": 0.2972, + "step": 3362 + }, + { + "epoch": 0.8176513493800146, + "grad_norm": 18.875, + "learning_rate": 1.630475597898011e-06, + "loss": 0.9035, + "step": 3363 + }, + { + "epoch": 0.8178944809141746, + "grad_norm": 19.75, + "learning_rate": 1.6300152375517964e-06, + "loss": 0.9052, + "step": 3364 + }, + { + "epoch": 0.8181376124483345, + "grad_norm": 13.5625, + "learning_rate": 1.629554820405741e-06, + "loss": 0.411, + "step": 3365 + }, + { + "epoch": 0.8183807439824945, + "grad_norm": 24.25, + "learning_rate": 1.6290943465286623e-06, + "loss": 0.6659, + "step": 3366 + }, + { + "epoch": 0.8186238755166545, + "grad_norm": 22.5, + "learning_rate": 1.6286338159893867e-06, + "loss": 0.7502, + "step": 3367 + }, + { + "epoch": 0.8188670070508145, + "grad_norm": 22.875, + "learning_rate": 1.6281732288567482e-06, + "loss": 0.9068, + "step": 3368 + }, + { + "epoch": 0.8191101385849745, + "grad_norm": 22.125, + "learning_rate": 1.6277125851995892e-06, + "loss": 1.0115, + "step": 3369 + }, + { + "epoch": 0.8193532701191345, + "grad_norm": 23.0, + "learning_rate": 1.6272518850867609e-06, + "loss": 0.9395, + "step": 3370 + }, + { + "epoch": 0.8195964016532944, + "grad_norm": 23.25, + "learning_rate": 1.6267911285871233e-06, + "loss": 0.9708, + "step": 3371 + }, + { + "epoch": 0.8198395331874544, + "grad_norm": 22.0, + "learning_rate": 1.6263303157695438e-06, + "loss": 1.1156, + "step": 3372 + }, + { + "epoch": 0.8200826647216144, + "grad_norm": 18.625, + "learning_rate": 1.625869446702899e-06, + "loss": 0.7308, + "step": 3373 + }, + { + "epoch": 0.8203257962557744, + "grad_norm": 18.375, + "learning_rate": 1.6254085214560743e-06, + "loss": 0.7755, + "step": 3374 + }, + { + "epoch": 0.8205689277899344, + "grad_norm": 17.75, + "learning_rate": 1.6249475400979625e-06, + "loss": 0.8015, + "step": 3375 + }, + { + "epoch": 0.8208120593240943, + "grad_norm": 18.625, + "learning_rate": 1.6244865026974654e-06, + "loss": 0.7887, + "step": 3376 + }, + { + "epoch": 0.8210551908582543, + "grad_norm": 16.125, + "learning_rate": 1.6240254093234925e-06, + "loss": 0.7635, + "step": 3377 + }, + { + "epoch": 0.8212983223924143, + "grad_norm": 14.75, + "learning_rate": 1.6235642600449628e-06, + "loss": 0.5193, + "step": 3378 + }, + { + "epoch": 0.8215414539265743, + "grad_norm": 21.125, + "learning_rate": 1.6231030549308024e-06, + "loss": 0.6491, + "step": 3379 + }, + { + "epoch": 0.8217845854607343, + "grad_norm": 21.0, + "learning_rate": 1.622641794049947e-06, + "loss": 0.8781, + "step": 3380 + }, + { + "epoch": 0.8220277169948942, + "grad_norm": 17.625, + "learning_rate": 1.6221804774713397e-06, + "loss": 1.0416, + "step": 3381 + }, + { + "epoch": 0.8222708485290542, + "grad_norm": 17.625, + "learning_rate": 1.6217191052639323e-06, + "loss": 0.6293, + "step": 3382 + }, + { + "epoch": 0.8225139800632142, + "grad_norm": 16.875, + "learning_rate": 1.6212576774966848e-06, + "loss": 0.6833, + "step": 3383 + }, + { + "epoch": 0.8227571115973742, + "grad_norm": 22.75, + "learning_rate": 1.6207961942385655e-06, + "loss": 1.1186, + "step": 3384 + }, + { + "epoch": 0.8230002431315342, + "grad_norm": 16.25, + "learning_rate": 1.620334655558551e-06, + "loss": 0.8758, + "step": 3385 + }, + { + "epoch": 0.8232433746656942, + "grad_norm": 17.75, + "learning_rate": 1.6198730615256267e-06, + "loss": 0.9285, + "step": 3386 + }, + { + "epoch": 0.8234865061998541, + "grad_norm": 17.75, + "learning_rate": 1.6194114122087852e-06, + "loss": 0.6273, + "step": 3387 + }, + { + "epoch": 0.8237296377340141, + "grad_norm": 20.25, + "learning_rate": 1.6189497076770282e-06, + "loss": 0.8995, + "step": 3388 + }, + { + "epoch": 0.8239727692681741, + "grad_norm": 19.0, + "learning_rate": 1.618487947999365e-06, + "loss": 0.6926, + "step": 3389 + }, + { + "epoch": 0.8242159008023341, + "grad_norm": 16.875, + "learning_rate": 1.6180261332448146e-06, + "loss": 0.6152, + "step": 3390 + }, + { + "epoch": 0.8244590323364941, + "grad_norm": 22.25, + "learning_rate": 1.6175642634824025e-06, + "loss": 1.3543, + "step": 3391 + }, + { + "epoch": 0.824702163870654, + "grad_norm": 21.25, + "learning_rate": 1.6171023387811627e-06, + "loss": 1.1115, + "step": 3392 + }, + { + "epoch": 0.824945295404814, + "grad_norm": 18.75, + "learning_rate": 1.6166403592101384e-06, + "loss": 0.8612, + "step": 3393 + }, + { + "epoch": 0.825188426938974, + "grad_norm": 16.875, + "learning_rate": 1.6161783248383805e-06, + "loss": 0.5647, + "step": 3394 + }, + { + "epoch": 0.825431558473134, + "grad_norm": 18.5, + "learning_rate": 1.6157162357349482e-06, + "loss": 0.7841, + "step": 3395 + }, + { + "epoch": 0.825674690007294, + "grad_norm": 16.625, + "learning_rate": 1.6152540919689077e-06, + "loss": 0.5308, + "step": 3396 + }, + { + "epoch": 0.8259178215414539, + "grad_norm": 30.5, + "learning_rate": 1.6147918936093355e-06, + "loss": 1.1225, + "step": 3397 + }, + { + "epoch": 0.8261609530756139, + "grad_norm": 21.75, + "learning_rate": 1.6143296407253142e-06, + "loss": 0.9388, + "step": 3398 + }, + { + "epoch": 0.8264040846097739, + "grad_norm": 27.5, + "learning_rate": 1.613867333385936e-06, + "loss": 1.1364, + "step": 3399 + }, + { + "epoch": 0.8266472161439339, + "grad_norm": 18.5, + "learning_rate": 1.613404971660301e-06, + "loss": 0.8135, + "step": 3400 + }, + { + "epoch": 0.8268903476780939, + "grad_norm": 14.625, + "learning_rate": 1.612942555617516e-06, + "loss": 0.5177, + "step": 3401 + }, + { + "epoch": 0.8271334792122538, + "grad_norm": 21.75, + "learning_rate": 1.6124800853266986e-06, + "loss": 1.094, + "step": 3402 + }, + { + "epoch": 0.8273766107464138, + "grad_norm": 24.625, + "learning_rate": 1.6120175608569718e-06, + "loss": 0.786, + "step": 3403 + }, + { + "epoch": 0.8276197422805738, + "grad_norm": 20.125, + "learning_rate": 1.6115549822774684e-06, + "loss": 0.6718, + "step": 3404 + }, + { + "epoch": 0.8278628738147338, + "grad_norm": 20.875, + "learning_rate": 1.6110923496573283e-06, + "loss": 0.68, + "step": 3405 + }, + { + "epoch": 0.8281060053488938, + "grad_norm": 21.0, + "learning_rate": 1.6106296630657005e-06, + "loss": 1.1493, + "step": 3406 + }, + { + "epoch": 0.8283491368830538, + "grad_norm": 20.75, + "learning_rate": 1.6101669225717417e-06, + "loss": 0.554, + "step": 3407 + }, + { + "epoch": 0.8285922684172137, + "grad_norm": 23.25, + "learning_rate": 1.6097041282446152e-06, + "loss": 1.284, + "step": 3408 + }, + { + "epoch": 0.8288353999513737, + "grad_norm": 19.875, + "learning_rate": 1.6092412801534949e-06, + "loss": 0.8646, + "step": 3409 + }, + { + "epoch": 0.8290785314855337, + "grad_norm": 16.75, + "learning_rate": 1.6087783783675611e-06, + "loss": 0.7641, + "step": 3410 + }, + { + "epoch": 0.8293216630196937, + "grad_norm": 24.5, + "learning_rate": 1.6083154229560022e-06, + "loss": 1.0882, + "step": 3411 + }, + { + "epoch": 0.8295647945538537, + "grad_norm": 15.1875, + "learning_rate": 1.607852413988015e-06, + "loss": 0.4787, + "step": 3412 + }, + { + "epoch": 0.8298079260880136, + "grad_norm": 17.625, + "learning_rate": 1.607389351532804e-06, + "loss": 0.78, + "step": 3413 + }, + { + "epoch": 0.8300510576221736, + "grad_norm": 24.5, + "learning_rate": 1.6069262356595827e-06, + "loss": 0.9327, + "step": 3414 + }, + { + "epoch": 0.8302941891563336, + "grad_norm": 23.625, + "learning_rate": 1.6064630664375705e-06, + "loss": 1.0338, + "step": 3415 + }, + { + "epoch": 0.8305373206904936, + "grad_norm": 20.875, + "learning_rate": 1.6059998439359967e-06, + "loss": 0.9083, + "step": 3416 + }, + { + "epoch": 0.8307804522246536, + "grad_norm": 20.375, + "learning_rate": 1.6055365682240985e-06, + "loss": 1.0087, + "step": 3417 + }, + { + "epoch": 0.8310235837588135, + "grad_norm": 20.875, + "learning_rate": 1.6050732393711193e-06, + "loss": 0.9165, + "step": 3418 + }, + { + "epoch": 0.8312667152929735, + "grad_norm": 28.0, + "learning_rate": 1.6046098574463126e-06, + "loss": 1.3364, + "step": 3419 + }, + { + "epoch": 0.8315098468271335, + "grad_norm": 13.5625, + "learning_rate": 1.6041464225189376e-06, + "loss": 0.5851, + "step": 3420 + }, + { + "epoch": 0.8317529783612935, + "grad_norm": 16.25, + "learning_rate": 1.603682934658264e-06, + "loss": 0.3832, + "step": 3421 + }, + { + "epoch": 0.8319961098954535, + "grad_norm": 21.5, + "learning_rate": 1.6032193939335676e-06, + "loss": 0.7287, + "step": 3422 + }, + { + "epoch": 0.8322392414296135, + "grad_norm": 21.875, + "learning_rate": 1.6027558004141323e-06, + "loss": 0.9834, + "step": 3423 + }, + { + "epoch": 0.8324823729637734, + "grad_norm": 19.375, + "learning_rate": 1.6022921541692501e-06, + "loss": 1.0242, + "step": 3424 + }, + { + "epoch": 0.8327255044979334, + "grad_norm": 19.375, + "learning_rate": 1.6018284552682215e-06, + "loss": 0.9596, + "step": 3425 + }, + { + "epoch": 0.8329686360320934, + "grad_norm": 21.125, + "learning_rate": 1.6013647037803539e-06, + "loss": 0.8035, + "step": 3426 + }, + { + "epoch": 0.8332117675662534, + "grad_norm": 20.75, + "learning_rate": 1.6009008997749631e-06, + "loss": 1.0939, + "step": 3427 + }, + { + "epoch": 0.8334548991004134, + "grad_norm": 16.625, + "learning_rate": 1.600437043321372e-06, + "loss": 0.4163, + "step": 3428 + }, + { + "epoch": 0.8336980306345733, + "grad_norm": 23.625, + "learning_rate": 1.5999731344889132e-06, + "loss": 1.03, + "step": 3429 + }, + { + "epoch": 0.8339411621687333, + "grad_norm": 15.3125, + "learning_rate": 1.599509173346925e-06, + "loss": 0.7969, + "step": 3430 + }, + { + "epoch": 0.8341842937028933, + "grad_norm": 17.875, + "learning_rate": 1.599045159964755e-06, + "loss": 1.0562, + "step": 3431 + }, + { + "epoch": 0.8344274252370533, + "grad_norm": 32.5, + "learning_rate": 1.598581094411757e-06, + "loss": 0.8758, + "step": 3432 + }, + { + "epoch": 0.8346705567712133, + "grad_norm": 24.0, + "learning_rate": 1.598116976757294e-06, + "loss": 1.3562, + "step": 3433 + }, + { + "epoch": 0.8349136883053732, + "grad_norm": 18.125, + "learning_rate": 1.5976528070707376e-06, + "loss": 0.7141, + "step": 3434 + }, + { + "epoch": 0.8351568198395332, + "grad_norm": 18.75, + "learning_rate": 1.5971885854214642e-06, + "loss": 0.9906, + "step": 3435 + }, + { + "epoch": 0.8353999513736932, + "grad_norm": 18.75, + "learning_rate": 1.596724311878861e-06, + "loss": 0.7147, + "step": 3436 + }, + { + "epoch": 0.8356430829078532, + "grad_norm": 20.25, + "learning_rate": 1.596259986512321e-06, + "loss": 0.7191, + "step": 3437 + }, + { + "epoch": 0.8358862144420132, + "grad_norm": 22.875, + "learning_rate": 1.5957956093912459e-06, + "loss": 1.1478, + "step": 3438 + }, + { + "epoch": 0.836129345976173, + "grad_norm": 31.5, + "learning_rate": 1.5953311805850448e-06, + "loss": 0.8481, + "step": 3439 + }, + { + "epoch": 0.8363724775103331, + "grad_norm": 16.375, + "learning_rate": 1.5948667001631352e-06, + "loss": 0.921, + "step": 3440 + }, + { + "epoch": 0.8366156090444931, + "grad_norm": 22.375, + "learning_rate": 1.594402168194941e-06, + "loss": 0.8647, + "step": 3441 + }, + { + "epoch": 0.8368587405786531, + "grad_norm": 21.5, + "learning_rate": 1.5939375847498944e-06, + "loss": 1.1712, + "step": 3442 + }, + { + "epoch": 0.8371018721128131, + "grad_norm": 21.625, + "learning_rate": 1.5934729498974362e-06, + "loss": 0.8327, + "step": 3443 + }, + { + "epoch": 0.8373450036469731, + "grad_norm": 23.375, + "learning_rate": 1.5930082637070132e-06, + "loss": 1.1919, + "step": 3444 + }, + { + "epoch": 0.837588135181133, + "grad_norm": 20.875, + "learning_rate": 1.5925435262480815e-06, + "loss": 0.7159, + "step": 3445 + }, + { + "epoch": 0.837831266715293, + "grad_norm": 17.0, + "learning_rate": 1.592078737590104e-06, + "loss": 0.8424, + "step": 3446 + }, + { + "epoch": 0.838074398249453, + "grad_norm": 17.625, + "learning_rate": 1.5916138978025509e-06, + "loss": 0.6072, + "step": 3447 + }, + { + "epoch": 0.838317529783613, + "grad_norm": 19.625, + "learning_rate": 1.591149006954901e-06, + "loss": 0.6674, + "step": 3448 + }, + { + "epoch": 0.838560661317773, + "grad_norm": 31.375, + "learning_rate": 1.5906840651166402e-06, + "loss": 1.2384, + "step": 3449 + }, + { + "epoch": 0.8388037928519329, + "grad_norm": 15.9375, + "learning_rate": 1.5902190723572622e-06, + "loss": 0.7587, + "step": 3450 + }, + { + "epoch": 0.8390469243860929, + "grad_norm": 22.75, + "learning_rate": 1.589754028746268e-06, + "loss": 1.1475, + "step": 3451 + }, + { + "epoch": 0.8392900559202529, + "grad_norm": 16.75, + "learning_rate": 1.5892889343531662e-06, + "loss": 0.665, + "step": 3452 + }, + { + "epoch": 0.8395331874544129, + "grad_norm": 29.5, + "learning_rate": 1.588823789247474e-06, + "loss": 1.2101, + "step": 3453 + }, + { + "epoch": 0.8397763189885729, + "grad_norm": 50.75, + "learning_rate": 1.588358593498714e-06, + "loss": 0.9499, + "step": 3454 + }, + { + "epoch": 0.8400194505227327, + "grad_norm": 15.375, + "learning_rate": 1.5878933471764192e-06, + "loss": 0.7118, + "step": 3455 + }, + { + "epoch": 0.8402625820568927, + "grad_norm": 17.5, + "learning_rate": 1.5874280503501278e-06, + "loss": 0.7053, + "step": 3456 + }, + { + "epoch": 0.8405057135910527, + "grad_norm": 19.25, + "learning_rate": 1.5869627030893867e-06, + "loss": 1.09, + "step": 3457 + }, + { + "epoch": 0.8407488451252128, + "grad_norm": 17.625, + "learning_rate": 1.5864973054637504e-06, + "loss": 1.0151, + "step": 3458 + }, + { + "epoch": 0.8409919766593728, + "grad_norm": 20.625, + "learning_rate": 1.5860318575427793e-06, + "loss": 0.9082, + "step": 3459 + }, + { + "epoch": 0.8412351081935328, + "grad_norm": 15.25, + "learning_rate": 1.5855663593960446e-06, + "loss": 0.7381, + "step": 3460 + }, + { + "epoch": 0.8414782397276926, + "grad_norm": 25.5, + "learning_rate": 1.585100811093122e-06, + "loss": 1.1652, + "step": 3461 + }, + { + "epoch": 0.8417213712618526, + "grad_norm": 21.75, + "learning_rate": 1.5846352127035952e-06, + "loss": 1.0499, + "step": 3462 + }, + { + "epoch": 0.8419645027960126, + "grad_norm": 22.625, + "learning_rate": 1.584169564297057e-06, + "loss": 0.8066, + "step": 3463 + }, + { + "epoch": 0.8422076343301726, + "grad_norm": 17.875, + "learning_rate": 1.5837038659431059e-06, + "loss": 0.79, + "step": 3464 + }, + { + "epoch": 0.8424507658643327, + "grad_norm": 17.375, + "learning_rate": 1.583238117711349e-06, + "loss": 0.822, + "step": 3465 + }, + { + "epoch": 0.8426938973984925, + "grad_norm": 17.375, + "learning_rate": 1.5827723196713998e-06, + "loss": 0.814, + "step": 3466 + }, + { + "epoch": 0.8429370289326525, + "grad_norm": 17.625, + "learning_rate": 1.5823064718928807e-06, + "loss": 0.6785, + "step": 3467 + }, + { + "epoch": 0.8431801604668125, + "grad_norm": 18.75, + "learning_rate": 1.58184057444542e-06, + "loss": 0.6204, + "step": 3468 + }, + { + "epoch": 0.8434232920009725, + "grad_norm": 19.75, + "learning_rate": 1.5813746273986541e-06, + "loss": 1.026, + "step": 3469 + }, + { + "epoch": 0.8436664235351325, + "grad_norm": 16.875, + "learning_rate": 1.5809086308222273e-06, + "loss": 0.8074, + "step": 3470 + }, + { + "epoch": 0.8439095550692924, + "grad_norm": 20.75, + "learning_rate": 1.5804425847857908e-06, + "loss": 0.9195, + "step": 3471 + }, + { + "epoch": 0.8441526866034524, + "grad_norm": 21.25, + "learning_rate": 1.5799764893590033e-06, + "loss": 1.2829, + "step": 3472 + }, + { + "epoch": 0.8443958181376124, + "grad_norm": 20.5, + "learning_rate": 1.5795103446115302e-06, + "loss": 0.7941, + "step": 3473 + }, + { + "epoch": 0.8446389496717724, + "grad_norm": 15.9375, + "learning_rate": 1.5790441506130453e-06, + "loss": 0.7508, + "step": 3474 + }, + { + "epoch": 0.8448820812059324, + "grad_norm": 18.75, + "learning_rate": 1.5785779074332292e-06, + "loss": 1.1244, + "step": 3475 + }, + { + "epoch": 0.8451252127400923, + "grad_norm": 19.25, + "learning_rate": 1.5781116151417703e-06, + "loss": 0.7926, + "step": 3476 + }, + { + "epoch": 0.8453683442742523, + "grad_norm": 24.625, + "learning_rate": 1.5776452738083637e-06, + "loss": 0.9534, + "step": 3477 + }, + { + "epoch": 0.8456114758084123, + "grad_norm": 17.625, + "learning_rate": 1.5771788835027122e-06, + "loss": 0.9546, + "step": 3478 + }, + { + "epoch": 0.8458546073425723, + "grad_norm": 16.875, + "learning_rate": 1.5767124442945264e-06, + "loss": 0.7319, + "step": 3479 + }, + { + "epoch": 0.8460977388767323, + "grad_norm": 15.125, + "learning_rate": 1.576245956253523e-06, + "loss": 0.7444, + "step": 3480 + }, + { + "epoch": 0.8463408704108923, + "grad_norm": 20.0, + "learning_rate": 1.575779419449427e-06, + "loss": 0.9531, + "step": 3481 + }, + { + "epoch": 0.8465840019450522, + "grad_norm": 20.875, + "learning_rate": 1.5753128339519702e-06, + "loss": 0.9928, + "step": 3482 + }, + { + "epoch": 0.8468271334792122, + "grad_norm": 20.5, + "learning_rate": 1.574846199830892e-06, + "loss": 0.7358, + "step": 3483 + }, + { + "epoch": 0.8470702650133722, + "grad_norm": 18.0, + "learning_rate": 1.5743795171559392e-06, + "loss": 0.78, + "step": 3484 + }, + { + "epoch": 0.8473133965475322, + "grad_norm": 17.0, + "learning_rate": 1.5739127859968652e-06, + "loss": 0.5908, + "step": 3485 + }, + { + "epoch": 0.8475565280816922, + "grad_norm": 20.875, + "learning_rate": 1.5734460064234314e-06, + "loss": 0.9047, + "step": 3486 + }, + { + "epoch": 0.8477996596158521, + "grad_norm": 15.6875, + "learning_rate": 1.5729791785054056e-06, + "loss": 0.5851, + "step": 3487 + }, + { + "epoch": 0.8480427911500121, + "grad_norm": 14.75, + "learning_rate": 1.5725123023125633e-06, + "loss": 0.9931, + "step": 3488 + }, + { + "epoch": 0.8482859226841721, + "grad_norm": 17.0, + "learning_rate": 1.572045377914688e-06, + "loss": 0.7318, + "step": 3489 + }, + { + "epoch": 0.8485290542183321, + "grad_norm": 17.875, + "learning_rate": 1.5715784053815687e-06, + "loss": 0.7543, + "step": 3490 + }, + { + "epoch": 0.8487721857524921, + "grad_norm": 17.25, + "learning_rate": 1.5711113847830029e-06, + "loss": 0.6945, + "step": 3491 + }, + { + "epoch": 0.849015317286652, + "grad_norm": 21.5, + "learning_rate": 1.5706443161887948e-06, + "loss": 0.6763, + "step": 3492 + }, + { + "epoch": 0.849258448820812, + "grad_norm": 22.625, + "learning_rate": 1.570177199668756e-06, + "loss": 1.0924, + "step": 3493 + }, + { + "epoch": 0.849501580354972, + "grad_norm": 23.125, + "learning_rate": 1.569710035292705e-06, + "loss": 0.9922, + "step": 3494 + }, + { + "epoch": 0.849744711889132, + "grad_norm": 24.125, + "learning_rate": 1.5692428231304676e-06, + "loss": 0.8303, + "step": 3495 + }, + { + "epoch": 0.849987843423292, + "grad_norm": 25.75, + "learning_rate": 1.5687755632518772e-06, + "loss": 0.9958, + "step": 3496 + }, + { + "epoch": 0.850230974957452, + "grad_norm": 14.5625, + "learning_rate": 1.5683082557267728e-06, + "loss": 0.3681, + "step": 3497 + }, + { + "epoch": 0.8504741064916119, + "grad_norm": 25.25, + "learning_rate": 1.567840900625003e-06, + "loss": 0.8429, + "step": 3498 + }, + { + "epoch": 0.8507172380257719, + "grad_norm": 23.75, + "learning_rate": 1.5673734980164204e-06, + "loss": 0.8081, + "step": 3499 + }, + { + "epoch": 0.8509603695599319, + "grad_norm": 23.0, + "learning_rate": 1.5669060479708878e-06, + "loss": 0.9996, + "step": 3500 + }, + { + "epoch": 0.8512035010940919, + "grad_norm": 19.125, + "learning_rate": 1.566438550558273e-06, + "loss": 0.906, + "step": 3501 + }, + { + "epoch": 0.8514466326282519, + "grad_norm": 16.375, + "learning_rate": 1.5659710058484518e-06, + "loss": 0.3945, + "step": 3502 + }, + { + "epoch": 0.8516897641624118, + "grad_norm": 21.5, + "learning_rate": 1.5655034139113072e-06, + "loss": 0.812, + "step": 3503 + }, + { + "epoch": 0.8519328956965718, + "grad_norm": 15.0625, + "learning_rate": 1.5650357748167278e-06, + "loss": 0.8199, + "step": 3504 + }, + { + "epoch": 0.8521760272307318, + "grad_norm": 23.0, + "learning_rate": 1.5645680886346112e-06, + "loss": 1.5766, + "step": 3505 + }, + { + "epoch": 0.8524191587648918, + "grad_norm": 18.0, + "learning_rate": 1.564100355434861e-06, + "loss": 0.7244, + "step": 3506 + }, + { + "epoch": 0.8526622902990518, + "grad_norm": 15.25, + "learning_rate": 1.563632575287388e-06, + "loss": 0.6229, + "step": 3507 + }, + { + "epoch": 0.8529054218332117, + "grad_norm": 19.75, + "learning_rate": 1.56316474826211e-06, + "loss": 0.9417, + "step": 3508 + }, + { + "epoch": 0.8531485533673717, + "grad_norm": 20.75, + "learning_rate": 1.5626968744289516e-06, + "loss": 0.8819, + "step": 3509 + }, + { + "epoch": 0.8533916849015317, + "grad_norm": 20.0, + "learning_rate": 1.5622289538578453e-06, + "loss": 0.9293, + "step": 3510 + }, + { + "epoch": 0.8536348164356917, + "grad_norm": 18.875, + "learning_rate": 1.5617609866187291e-06, + "loss": 0.8597, + "step": 3511 + }, + { + "epoch": 0.8538779479698517, + "grad_norm": 16.875, + "learning_rate": 1.5612929727815494e-06, + "loss": 0.6089, + "step": 3512 + }, + { + "epoch": 0.8541210795040116, + "grad_norm": 17.5, + "learning_rate": 1.5608249124162586e-06, + "loss": 0.7872, + "step": 3513 + }, + { + "epoch": 0.8543642110381716, + "grad_norm": 19.875, + "learning_rate": 1.5603568055928164e-06, + "loss": 0.9658, + "step": 3514 + }, + { + "epoch": 0.8546073425723316, + "grad_norm": 20.375, + "learning_rate": 1.5598886523811898e-06, + "loss": 0.8721, + "step": 3515 + }, + { + "epoch": 0.8548504741064916, + "grad_norm": 18.625, + "learning_rate": 1.559420452851352e-06, + "loss": 0.7256, + "step": 3516 + }, + { + "epoch": 0.8550936056406516, + "grad_norm": 19.75, + "learning_rate": 1.5589522070732838e-06, + "loss": 0.8953, + "step": 3517 + }, + { + "epoch": 0.8553367371748116, + "grad_norm": 16.875, + "learning_rate": 1.558483915116972e-06, + "loss": 0.6124, + "step": 3518 + }, + { + "epoch": 0.8555798687089715, + "grad_norm": 14.5, + "learning_rate": 1.5580155770524119e-06, + "loss": 0.4481, + "step": 3519 + }, + { + "epoch": 0.8558230002431315, + "grad_norm": 17.25, + "learning_rate": 1.557547192949604e-06, + "loss": 0.9589, + "step": 3520 + }, + { + "epoch": 0.8560661317772915, + "grad_norm": 21.375, + "learning_rate": 1.5570787628785563e-06, + "loss": 0.9213, + "step": 3521 + }, + { + "epoch": 0.8563092633114515, + "grad_norm": 20.5, + "learning_rate": 1.5566102869092847e-06, + "loss": 0.661, + "step": 3522 + }, + { + "epoch": 0.8565523948456115, + "grad_norm": 23.0, + "learning_rate": 1.5561417651118098e-06, + "loss": 0.9228, + "step": 3523 + }, + { + "epoch": 0.8567955263797714, + "grad_norm": 16.625, + "learning_rate": 1.5556731975561613e-06, + "loss": 0.3625, + "step": 3524 + }, + { + "epoch": 0.8570386579139314, + "grad_norm": 19.25, + "learning_rate": 1.5552045843123737e-06, + "loss": 1.1483, + "step": 3525 + }, + { + "epoch": 0.8572817894480914, + "grad_norm": 19.625, + "learning_rate": 1.5547359254504903e-06, + "loss": 0.6586, + "step": 3526 + }, + { + "epoch": 0.8575249209822514, + "grad_norm": 23.125, + "learning_rate": 1.5542672210405603e-06, + "loss": 0.9238, + "step": 3527 + }, + { + "epoch": 0.8577680525164114, + "grad_norm": 21.875, + "learning_rate": 1.5537984711526382e-06, + "loss": 0.9914, + "step": 3528 + }, + { + "epoch": 0.8580111840505713, + "grad_norm": 20.5, + "learning_rate": 1.5533296758567884e-06, + "loss": 0.9804, + "step": 3529 + }, + { + "epoch": 0.8582543155847313, + "grad_norm": 24.875, + "learning_rate": 1.5528608352230798e-06, + "loss": 1.1042, + "step": 3530 + }, + { + "epoch": 0.8584974471188913, + "grad_norm": 16.875, + "learning_rate": 1.5523919493215888e-06, + "loss": 0.88, + "step": 3531 + }, + { + "epoch": 0.8587405786530513, + "grad_norm": 21.75, + "learning_rate": 1.5519230182223984e-06, + "loss": 0.8651, + "step": 3532 + }, + { + "epoch": 0.8589837101872113, + "grad_norm": 19.875, + "learning_rate": 1.5514540419955986e-06, + "loss": 0.7751, + "step": 3533 + }, + { + "epoch": 0.8592268417213713, + "grad_norm": 17.5, + "learning_rate": 1.550985020711286e-06, + "loss": 0.6555, + "step": 3534 + }, + { + "epoch": 0.8594699732555312, + "grad_norm": 18.375, + "learning_rate": 1.550515954439564e-06, + "loss": 0.829, + "step": 3535 + }, + { + "epoch": 0.8597131047896912, + "grad_norm": 19.375, + "learning_rate": 1.5500468432505422e-06, + "loss": 0.7958, + "step": 3536 + }, + { + "epoch": 0.8599562363238512, + "grad_norm": 19.375, + "learning_rate": 1.5495776872143379e-06, + "loss": 0.8569, + "step": 3537 + }, + { + "epoch": 0.8601993678580112, + "grad_norm": 18.25, + "learning_rate": 1.5491084864010741e-06, + "loss": 0.6053, + "step": 3538 + }, + { + "epoch": 0.8604424993921712, + "grad_norm": 18.125, + "learning_rate": 1.5486392408808818e-06, + "loss": 0.7797, + "step": 3539 + }, + { + "epoch": 0.8606856309263311, + "grad_norm": 19.375, + "learning_rate": 1.5481699507238965e-06, + "loss": 0.7783, + "step": 3540 + }, + { + "epoch": 0.8609287624604911, + "grad_norm": 18.25, + "learning_rate": 1.5477006160002631e-06, + "loss": 0.6464, + "step": 3541 + }, + { + "epoch": 0.8611718939946511, + "grad_norm": 20.875, + "learning_rate": 1.547231236780131e-06, + "loss": 0.7141, + "step": 3542 + }, + { + "epoch": 0.8614150255288111, + "grad_norm": 21.875, + "learning_rate": 1.546761813133657e-06, + "loss": 0.877, + "step": 3543 + }, + { + "epoch": 0.8616581570629711, + "grad_norm": 24.125, + "learning_rate": 1.5462923451310049e-06, + "loss": 1.0813, + "step": 3544 + }, + { + "epoch": 0.861901288597131, + "grad_norm": 17.75, + "learning_rate": 1.5458228328423447e-06, + "loss": 0.6834, + "step": 3545 + }, + { + "epoch": 0.862144420131291, + "grad_norm": 13.5625, + "learning_rate": 1.545353276337853e-06, + "loss": 0.4675, + "step": 3546 + }, + { + "epoch": 0.862387551665451, + "grad_norm": 18.125, + "learning_rate": 1.5448836756877135e-06, + "loss": 0.8901, + "step": 3547 + }, + { + "epoch": 0.862630683199611, + "grad_norm": 21.625, + "learning_rate": 1.5444140309621153e-06, + "loss": 1.0051, + "step": 3548 + }, + { + "epoch": 0.862873814733771, + "grad_norm": 19.5, + "learning_rate": 1.5439443422312562e-06, + "loss": 0.8683, + "step": 3549 + }, + { + "epoch": 0.8631169462679309, + "grad_norm": 15.8125, + "learning_rate": 1.543474609565338e-06, + "loss": 0.7164, + "step": 3550 + }, + { + "epoch": 0.8633600778020909, + "grad_norm": 17.75, + "learning_rate": 1.5430048330345712e-06, + "loss": 0.5809, + "step": 3551 + }, + { + "epoch": 0.8636032093362509, + "grad_norm": 18.125, + "learning_rate": 1.5425350127091716e-06, + "loss": 0.6418, + "step": 3552 + }, + { + "epoch": 0.8638463408704109, + "grad_norm": 22.0, + "learning_rate": 1.5420651486593624e-06, + "loss": 0.9831, + "step": 3553 + }, + { + "epoch": 0.8640894724045709, + "grad_norm": 25.25, + "learning_rate": 1.5415952409553721e-06, + "loss": 0.9321, + "step": 3554 + }, + { + "epoch": 0.8643326039387309, + "grad_norm": 25.125, + "learning_rate": 1.5411252896674369e-06, + "loss": 0.6299, + "step": 3555 + }, + { + "epoch": 0.8645757354728908, + "grad_norm": 24.25, + "learning_rate": 1.5406552948658e-06, + "loss": 1.4371, + "step": 3556 + }, + { + "epoch": 0.8648188670070508, + "grad_norm": 23.0, + "learning_rate": 1.540185256620709e-06, + "loss": 1.1024, + "step": 3557 + }, + { + "epoch": 0.8650619985412108, + "grad_norm": 19.125, + "learning_rate": 1.53971517500242e-06, + "loss": 0.7464, + "step": 3558 + }, + { + "epoch": 0.8653051300753708, + "grad_norm": 20.625, + "learning_rate": 1.539245050081194e-06, + "loss": 0.9117, + "step": 3559 + }, + { + "epoch": 0.8655482616095308, + "grad_norm": 22.125, + "learning_rate": 1.5387748819273001e-06, + "loss": 0.8153, + "step": 3560 + }, + { + "epoch": 0.8657913931436907, + "grad_norm": 21.25, + "learning_rate": 1.5383046706110133e-06, + "loss": 1.0534, + "step": 3561 + }, + { + "epoch": 0.8660345246778507, + "grad_norm": 14.6875, + "learning_rate": 1.5378344162026137e-06, + "loss": 0.3717, + "step": 3562 + }, + { + "epoch": 0.8662776562120107, + "grad_norm": 16.375, + "learning_rate": 1.5373641187723898e-06, + "loss": 0.666, + "step": 3563 + }, + { + "epoch": 0.8665207877461707, + "grad_norm": 19.25, + "learning_rate": 1.5368937783906352e-06, + "loss": 0.7635, + "step": 3564 + }, + { + "epoch": 0.8667639192803307, + "grad_norm": 20.75, + "learning_rate": 1.5364233951276505e-06, + "loss": 0.6799, + "step": 3565 + }, + { + "epoch": 0.8670070508144906, + "grad_norm": 82.0, + "learning_rate": 1.5359529690537431e-06, + "loss": 1.3364, + "step": 3566 + }, + { + "epoch": 0.8672501823486506, + "grad_norm": 23.875, + "learning_rate": 1.5354825002392254e-06, + "loss": 1.0862, + "step": 3567 + }, + { + "epoch": 0.8674933138828106, + "grad_norm": 21.5, + "learning_rate": 1.535011988754418e-06, + "loss": 1.0549, + "step": 3568 + }, + { + "epoch": 0.8677364454169706, + "grad_norm": 19.25, + "learning_rate": 1.5345414346696463e-06, + "loss": 0.6873, + "step": 3569 + }, + { + "epoch": 0.8679795769511306, + "grad_norm": 17.125, + "learning_rate": 1.5340708380552436e-06, + "loss": 0.7122, + "step": 3570 + }, + { + "epoch": 0.8682227084852906, + "grad_norm": 17.0, + "learning_rate": 1.5336001989815472e-06, + "loss": 0.655, + "step": 3571 + }, + { + "epoch": 0.8684658400194505, + "grad_norm": 19.75, + "learning_rate": 1.5331295175189034e-06, + "loss": 0.7531, + "step": 3572 + }, + { + "epoch": 0.8687089715536105, + "grad_norm": 17.375, + "learning_rate": 1.5326587937376635e-06, + "loss": 0.7692, + "step": 3573 + }, + { + "epoch": 0.8689521030877705, + "grad_norm": 22.5, + "learning_rate": 1.5321880277081852e-06, + "loss": 0.8734, + "step": 3574 + }, + { + "epoch": 0.8691952346219305, + "grad_norm": 18.875, + "learning_rate": 1.5317172195008326e-06, + "loss": 0.583, + "step": 3575 + }, + { + "epoch": 0.8694383661560905, + "grad_norm": 17.25, + "learning_rate": 1.531246369185976e-06, + "loss": 0.7171, + "step": 3576 + }, + { + "epoch": 0.8696814976902504, + "grad_norm": 19.375, + "learning_rate": 1.5307754768339922e-06, + "loss": 0.9413, + "step": 3577 + }, + { + "epoch": 0.8699246292244104, + "grad_norm": 20.625, + "learning_rate": 1.5303045425152643e-06, + "loss": 0.9347, + "step": 3578 + }, + { + "epoch": 0.8701677607585704, + "grad_norm": 15.9375, + "learning_rate": 1.5298335663001814e-06, + "loss": 0.7473, + "step": 3579 + }, + { + "epoch": 0.8704108922927304, + "grad_norm": 23.25, + "learning_rate": 1.5293625482591396e-06, + "loss": 1.1312, + "step": 3580 + }, + { + "epoch": 0.8706540238268904, + "grad_norm": 18.625, + "learning_rate": 1.52889148846254e-06, + "loss": 0.5924, + "step": 3581 + }, + { + "epoch": 0.8708971553610503, + "grad_norm": 21.75, + "learning_rate": 1.5284203869807906e-06, + "loss": 0.9332, + "step": 3582 + }, + { + "epoch": 0.8711402868952103, + "grad_norm": 20.875, + "learning_rate": 1.5279492438843058e-06, + "loss": 0.7122, + "step": 3583 + }, + { + "epoch": 0.8713834184293703, + "grad_norm": 20.5, + "learning_rate": 1.5274780592435064e-06, + "loss": 1.2327, + "step": 3584 + }, + { + "epoch": 0.8716265499635303, + "grad_norm": 40.75, + "learning_rate": 1.527006833128819e-06, + "loss": 0.9345, + "step": 3585 + }, + { + "epoch": 0.8718696814976903, + "grad_norm": 33.75, + "learning_rate": 1.5265355656106757e-06, + "loss": 1.377, + "step": 3586 + }, + { + "epoch": 0.8721128130318502, + "grad_norm": 18.125, + "learning_rate": 1.526064256759517e-06, + "loss": 1.1876, + "step": 3587 + }, + { + "epoch": 0.8723559445660102, + "grad_norm": 16.75, + "learning_rate": 1.5255929066457868e-06, + "loss": 1.1715, + "step": 3588 + }, + { + "epoch": 0.8725990761001702, + "grad_norm": 17.25, + "learning_rate": 1.525121515339937e-06, + "loss": 0.5743, + "step": 3589 + }, + { + "epoch": 0.8728422076343302, + "grad_norm": 18.125, + "learning_rate": 1.5246500829124253e-06, + "loss": 0.9771, + "step": 3590 + }, + { + "epoch": 0.8730853391684902, + "grad_norm": 14.6875, + "learning_rate": 1.5241786094337151e-06, + "loss": 0.6867, + "step": 3591 + }, + { + "epoch": 0.8733284707026502, + "grad_norm": 16.125, + "learning_rate": 1.5237070949742772e-06, + "loss": 0.5928, + "step": 3592 + }, + { + "epoch": 0.8735716022368101, + "grad_norm": 14.9375, + "learning_rate": 1.5232355396045864e-06, + "loss": 0.447, + "step": 3593 + }, + { + "epoch": 0.8738147337709701, + "grad_norm": 28.375, + "learning_rate": 1.5227639433951252e-06, + "loss": 0.8632, + "step": 3594 + }, + { + "epoch": 0.8740578653051301, + "grad_norm": 20.25, + "learning_rate": 1.5222923064163822e-06, + "loss": 0.8428, + "step": 3595 + }, + { + "epoch": 0.8743009968392901, + "grad_norm": 23.0, + "learning_rate": 1.521820628738851e-06, + "loss": 1.3935, + "step": 3596 + }, + { + "epoch": 0.8745441283734501, + "grad_norm": 19.375, + "learning_rate": 1.5213489104330328e-06, + "loss": 0.6194, + "step": 3597 + }, + { + "epoch": 0.87478725990761, + "grad_norm": 18.75, + "learning_rate": 1.5208771515694329e-06, + "loss": 0.7012, + "step": 3598 + }, + { + "epoch": 0.87503039144177, + "grad_norm": 16.75, + "learning_rate": 1.520405352218565e-06, + "loss": 0.681, + "step": 3599 + }, + { + "epoch": 0.87527352297593, + "grad_norm": 19.5, + "learning_rate": 1.519933512450947e-06, + "loss": 0.7969, + "step": 3600 + }, + { + "epoch": 0.87551665451009, + "grad_norm": 16.5, + "learning_rate": 1.5194616323371036e-06, + "loss": 0.6389, + "step": 3601 + }, + { + "epoch": 0.87575978604425, + "grad_norm": 19.75, + "learning_rate": 1.5189897119475654e-06, + "loss": 1.2287, + "step": 3602 + }, + { + "epoch": 0.8760029175784099, + "grad_norm": 23.875, + "learning_rate": 1.5185177513528693e-06, + "loss": 1.3549, + "step": 3603 + }, + { + "epoch": 0.8762460491125699, + "grad_norm": 18.125, + "learning_rate": 1.518045750623558e-06, + "loss": 1.167, + "step": 3604 + }, + { + "epoch": 0.8764891806467299, + "grad_norm": 24.875, + "learning_rate": 1.5175737098301792e-06, + "loss": 1.3455, + "step": 3605 + }, + { + "epoch": 0.8767323121808899, + "grad_norm": 22.625, + "learning_rate": 1.517101629043289e-06, + "loss": 0.9083, + "step": 3606 + }, + { + "epoch": 0.8769754437150499, + "grad_norm": 17.5, + "learning_rate": 1.5166295083334473e-06, + "loss": 0.8156, + "step": 3607 + }, + { + "epoch": 0.8772185752492099, + "grad_norm": 16.375, + "learning_rate": 1.5161573477712205e-06, + "loss": 0.5163, + "step": 3608 + }, + { + "epoch": 0.8774617067833698, + "grad_norm": 22.375, + "learning_rate": 1.5156851474271815e-06, + "loss": 1.2113, + "step": 3609 + }, + { + "epoch": 0.8777048383175298, + "grad_norm": 18.125, + "learning_rate": 1.5152129073719085e-06, + "loss": 0.9579, + "step": 3610 + }, + { + "epoch": 0.8779479698516898, + "grad_norm": 16.875, + "learning_rate": 1.5147406276759865e-06, + "loss": 0.4668, + "step": 3611 + }, + { + "epoch": 0.8781911013858498, + "grad_norm": 19.125, + "learning_rate": 1.514268308410005e-06, + "loss": 0.5061, + "step": 3612 + }, + { + "epoch": 0.8784342329200098, + "grad_norm": 17.125, + "learning_rate": 1.5137959496445612e-06, + "loss": 1.0777, + "step": 3613 + }, + { + "epoch": 0.8786773644541697, + "grad_norm": 17.625, + "learning_rate": 1.5133235514502564e-06, + "loss": 0.7726, + "step": 3614 + }, + { + "epoch": 0.8789204959883297, + "grad_norm": 15.1875, + "learning_rate": 1.5128511138976992e-06, + "loss": 0.7334, + "step": 3615 + }, + { + "epoch": 0.8791636275224897, + "grad_norm": 16.75, + "learning_rate": 1.5123786370575038e-06, + "loss": 0.5958, + "step": 3616 + }, + { + "epoch": 0.8794067590566497, + "grad_norm": 18.25, + "learning_rate": 1.5119061210002892e-06, + "loss": 0.8657, + "step": 3617 + }, + { + "epoch": 0.8796498905908097, + "grad_norm": 19.0, + "learning_rate": 1.5114335657966816e-06, + "loss": 1.0357, + "step": 3618 + }, + { + "epoch": 0.8798930221249696, + "grad_norm": 20.375, + "learning_rate": 1.5109609715173127e-06, + "loss": 0.795, + "step": 3619 + }, + { + "epoch": 0.8801361536591296, + "grad_norm": 22.875, + "learning_rate": 1.5104883382328195e-06, + "loss": 0.9602, + "step": 3620 + }, + { + "epoch": 0.8803792851932896, + "grad_norm": 18.75, + "learning_rate": 1.5100156660138454e-06, + "loss": 0.9176, + "step": 3621 + }, + { + "epoch": 0.8806224167274496, + "grad_norm": 18.125, + "learning_rate": 1.5095429549310392e-06, + "loss": 0.5905, + "step": 3622 + }, + { + "epoch": 0.8808655482616096, + "grad_norm": 18.75, + "learning_rate": 1.5090702050550562e-06, + "loss": 0.6135, + "step": 3623 + }, + { + "epoch": 0.8811086797957695, + "grad_norm": 23.0, + "learning_rate": 1.5085974164565567e-06, + "loss": 1.1195, + "step": 3624 + }, + { + "epoch": 0.8813518113299295, + "grad_norm": 15.5625, + "learning_rate": 1.5081245892062072e-06, + "loss": 0.6659, + "step": 3625 + }, + { + "epoch": 0.8815949428640895, + "grad_norm": 17.25, + "learning_rate": 1.5076517233746796e-06, + "loss": 0.8656, + "step": 3626 + }, + { + "epoch": 0.8818380743982495, + "grad_norm": 20.625, + "learning_rate": 1.5071788190326521e-06, + "loss": 0.7428, + "step": 3627 + }, + { + "epoch": 0.8820812059324095, + "grad_norm": 27.0, + "learning_rate": 1.506705876250809e-06, + "loss": 1.2332, + "step": 3628 + }, + { + "epoch": 0.8823243374665695, + "grad_norm": 32.75, + "learning_rate": 1.5062328950998386e-06, + "loss": 0.8189, + "step": 3629 + }, + { + "epoch": 0.8825674690007294, + "grad_norm": 30.375, + "learning_rate": 1.5057598756504373e-06, + "loss": 1.1142, + "step": 3630 + }, + { + "epoch": 0.8828106005348894, + "grad_norm": 33.5, + "learning_rate": 1.5052868179733054e-06, + "loss": 0.918, + "step": 3631 + }, + { + "epoch": 0.8830537320690494, + "grad_norm": 26.125, + "learning_rate": 1.5048137221391493e-06, + "loss": 1.3675, + "step": 3632 + }, + { + "epoch": 0.8832968636032094, + "grad_norm": 21.625, + "learning_rate": 1.5043405882186819e-06, + "loss": 0.8361, + "step": 3633 + }, + { + "epoch": 0.8835399951373694, + "grad_norm": 18.875, + "learning_rate": 1.5038674162826205e-06, + "loss": 0.5911, + "step": 3634 + }, + { + "epoch": 0.8837831266715293, + "grad_norm": 18.75, + "learning_rate": 1.50339420640169e-06, + "loss": 0.7616, + "step": 3635 + }, + { + "epoch": 0.8840262582056893, + "grad_norm": 22.875, + "learning_rate": 1.5029209586466184e-06, + "loss": 1.0386, + "step": 3636 + }, + { + "epoch": 0.8842693897398493, + "grad_norm": 23.0, + "learning_rate": 1.502447673088142e-06, + "loss": 0.999, + "step": 3637 + }, + { + "epoch": 0.8845125212740093, + "grad_norm": 25.875, + "learning_rate": 1.5019743497970008e-06, + "loss": 0.8509, + "step": 3638 + }, + { + "epoch": 0.8847556528081693, + "grad_norm": 18.25, + "learning_rate": 1.5015009888439408e-06, + "loss": 0.5531, + "step": 3639 + }, + { + "epoch": 0.8849987843423291, + "grad_norm": 23.375, + "learning_rate": 1.5010275902997148e-06, + "loss": 0.7159, + "step": 3640 + }, + { + "epoch": 0.8852419158764891, + "grad_norm": 12.75, + "learning_rate": 1.5005541542350802e-06, + "loss": 0.3923, + "step": 3641 + }, + { + "epoch": 0.8854850474106492, + "grad_norm": 20.125, + "learning_rate": 1.5000806807207999e-06, + "loss": 0.7973, + "step": 3642 + }, + { + "epoch": 0.8857281789448092, + "grad_norm": 17.0, + "learning_rate": 1.499607169827643e-06, + "loss": 1.0972, + "step": 3643 + }, + { + "epoch": 0.8859713104789692, + "grad_norm": 18.75, + "learning_rate": 1.4991336216263833e-06, + "loss": 0.7585, + "step": 3644 + }, + { + "epoch": 0.8862144420131292, + "grad_norm": 19.625, + "learning_rate": 1.4986600361878012e-06, + "loss": 1.1922, + "step": 3645 + }, + { + "epoch": 0.886457573547289, + "grad_norm": 26.375, + "learning_rate": 1.4981864135826823e-06, + "loss": 1.1931, + "step": 3646 + }, + { + "epoch": 0.886700705081449, + "grad_norm": 19.5, + "learning_rate": 1.497712753881818e-06, + "loss": 0.8558, + "step": 3647 + }, + { + "epoch": 0.886943836615609, + "grad_norm": 16.875, + "learning_rate": 1.4972390571560035e-06, + "loss": 0.6759, + "step": 3648 + }, + { + "epoch": 0.887186968149769, + "grad_norm": 20.0, + "learning_rate": 1.496765323476043e-06, + "loss": 1.4714, + "step": 3649 + }, + { + "epoch": 0.887430099683929, + "grad_norm": 18.25, + "learning_rate": 1.4962915529127426e-06, + "loss": 0.779, + "step": 3650 + }, + { + "epoch": 0.8876732312180889, + "grad_norm": 15.5625, + "learning_rate": 1.495817745536916e-06, + "loss": 0.7506, + "step": 3651 + }, + { + "epoch": 0.8879163627522489, + "grad_norm": 17.75, + "learning_rate": 1.495343901419382e-06, + "loss": 0.7888, + "step": 3652 + }, + { + "epoch": 0.888159494286409, + "grad_norm": 19.75, + "learning_rate": 1.4948700206309645e-06, + "loss": 0.9544, + "step": 3653 + }, + { + "epoch": 0.888402625820569, + "grad_norm": 17.375, + "learning_rate": 1.4943961032424939e-06, + "loss": 0.877, + "step": 3654 + }, + { + "epoch": 0.888645757354729, + "grad_norm": 17.5, + "learning_rate": 1.4939221493248043e-06, + "loss": 0.6839, + "step": 3655 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 18.0, + "learning_rate": 1.493448158948737e-06, + "loss": 0.6713, + "step": 3656 + }, + { + "epoch": 0.8891320204230488, + "grad_norm": 18.875, + "learning_rate": 1.4929741321851376e-06, + "loss": 1.162, + "step": 3657 + }, + { + "epoch": 0.8893751519572088, + "grad_norm": 20.75, + "learning_rate": 1.492500069104858e-06, + "loss": 0.7053, + "step": 3658 + }, + { + "epoch": 0.8896182834913688, + "grad_norm": 13.5, + "learning_rate": 1.492025969778755e-06, + "loss": 0.5691, + "step": 3659 + }, + { + "epoch": 0.8898614150255288, + "grad_norm": 20.25, + "learning_rate": 1.4915518342776909e-06, + "loss": 1.1944, + "step": 3660 + }, + { + "epoch": 0.8901045465596887, + "grad_norm": 24.875, + "learning_rate": 1.4910776626725336e-06, + "loss": 1.1712, + "step": 3661 + }, + { + "epoch": 0.8903476780938487, + "grad_norm": 15.1875, + "learning_rate": 1.4906034550341559e-06, + "loss": 0.7042, + "step": 3662 + }, + { + "epoch": 0.8905908096280087, + "grad_norm": 18.875, + "learning_rate": 1.4901292114334362e-06, + "loss": 0.9218, + "step": 3663 + }, + { + "epoch": 0.8908339411621687, + "grad_norm": 23.5, + "learning_rate": 1.489654931941259e-06, + "loss": 0.8784, + "step": 3664 + }, + { + "epoch": 0.8910770726963287, + "grad_norm": 28.625, + "learning_rate": 1.4891806166285131e-06, + "loss": 1.4625, + "step": 3665 + }, + { + "epoch": 0.8913202042304887, + "grad_norm": 17.5, + "learning_rate": 1.4887062655660933e-06, + "loss": 1.1436, + "step": 3666 + }, + { + "epoch": 0.8915633357646486, + "grad_norm": 22.25, + "learning_rate": 1.4882318788248996e-06, + "loss": 0.7973, + "step": 3667 + }, + { + "epoch": 0.8918064672988086, + "grad_norm": 20.125, + "learning_rate": 1.487757456475837e-06, + "loss": 0.7844, + "step": 3668 + }, + { + "epoch": 0.8920495988329686, + "grad_norm": 17.75, + "learning_rate": 1.4872829985898167e-06, + "loss": 0.9402, + "step": 3669 + }, + { + "epoch": 0.8922927303671286, + "grad_norm": 21.0, + "learning_rate": 1.4868085052377538e-06, + "loss": 0.7525, + "step": 3670 + }, + { + "epoch": 0.8925358619012886, + "grad_norm": 25.625, + "learning_rate": 1.4863339764905698e-06, + "loss": 1.0096, + "step": 3671 + }, + { + "epoch": 0.8927789934354485, + "grad_norm": 19.625, + "learning_rate": 1.485859412419192e-06, + "loss": 0.5815, + "step": 3672 + }, + { + "epoch": 0.8930221249696085, + "grad_norm": 23.875, + "learning_rate": 1.4853848130945514e-06, + "loss": 0.9197, + "step": 3673 + }, + { + "epoch": 0.8932652565037685, + "grad_norm": 20.5, + "learning_rate": 1.4849101785875853e-06, + "loss": 0.8862, + "step": 3674 + }, + { + "epoch": 0.8935083880379285, + "grad_norm": 16.625, + "learning_rate": 1.4844355089692352e-06, + "loss": 0.6091, + "step": 3675 + }, + { + "epoch": 0.8937515195720885, + "grad_norm": 20.125, + "learning_rate": 1.4839608043104503e-06, + "loss": 1.1555, + "step": 3676 + }, + { + "epoch": 0.8939946511062484, + "grad_norm": 16.75, + "learning_rate": 1.4834860646821825e-06, + "loss": 0.6236, + "step": 3677 + }, + { + "epoch": 0.8942377826404084, + "grad_norm": 24.25, + "learning_rate": 1.4830112901553898e-06, + "loss": 1.389, + "step": 3678 + }, + { + "epoch": 0.8944809141745684, + "grad_norm": 17.75, + "learning_rate": 1.482536480801035e-06, + "loss": 0.6193, + "step": 3679 + }, + { + "epoch": 0.8947240457087284, + "grad_norm": 21.375, + "learning_rate": 1.4820616366900878e-06, + "loss": 0.6483, + "step": 3680 + }, + { + "epoch": 0.8949671772428884, + "grad_norm": 19.875, + "learning_rate": 1.481586757893521e-06, + "loss": 0.8801, + "step": 3681 + }, + { + "epoch": 0.8952103087770484, + "grad_norm": 21.5, + "learning_rate": 1.4811118444823133e-06, + "loss": 0.6479, + "step": 3682 + }, + { + "epoch": 0.8954534403112083, + "grad_norm": 22.0, + "learning_rate": 1.4806368965274492e-06, + "loss": 1.0339, + "step": 3683 + }, + { + "epoch": 0.8956965718453683, + "grad_norm": 18.25, + "learning_rate": 1.4801619140999176e-06, + "loss": 0.9632, + "step": 3684 + }, + { + "epoch": 0.8959397033795283, + "grad_norm": 15.625, + "learning_rate": 1.4796868972707132e-06, + "loss": 0.695, + "step": 3685 + }, + { + "epoch": 0.8961828349136883, + "grad_norm": 26.5, + "learning_rate": 1.479211846110835e-06, + "loss": 0.8159, + "step": 3686 + }, + { + "epoch": 0.8964259664478483, + "grad_norm": 22.125, + "learning_rate": 1.4787367606912872e-06, + "loss": 1.1078, + "step": 3687 + }, + { + "epoch": 0.8966690979820082, + "grad_norm": 18.375, + "learning_rate": 1.478261641083081e-06, + "loss": 0.6567, + "step": 3688 + }, + { + "epoch": 0.8969122295161682, + "grad_norm": 20.25, + "learning_rate": 1.47778648735723e-06, + "loss": 0.7557, + "step": 3689 + }, + { + "epoch": 0.8971553610503282, + "grad_norm": 16.0, + "learning_rate": 1.4773112995847543e-06, + "loss": 0.7756, + "step": 3690 + }, + { + "epoch": 0.8973984925844882, + "grad_norm": 17.5, + "learning_rate": 1.4768360778366791e-06, + "loss": 0.5595, + "step": 3691 + }, + { + "epoch": 0.8976416241186482, + "grad_norm": 19.375, + "learning_rate": 1.4763608221840346e-06, + "loss": 1.1363, + "step": 3692 + }, + { + "epoch": 0.8978847556528081, + "grad_norm": 18.0, + "learning_rate": 1.475885532697856e-06, + "loss": 0.909, + "step": 3693 + }, + { + "epoch": 0.8981278871869681, + "grad_norm": 21.625, + "learning_rate": 1.4754102094491826e-06, + "loss": 1.0297, + "step": 3694 + }, + { + "epoch": 0.8983710187211281, + "grad_norm": 34.25, + "learning_rate": 1.4749348525090611e-06, + "loss": 0.9239, + "step": 3695 + }, + { + "epoch": 0.8986141502552881, + "grad_norm": 22.0, + "learning_rate": 1.4744594619485412e-06, + "loss": 1.225, + "step": 3696 + }, + { + "epoch": 0.8988572817894481, + "grad_norm": 13.9375, + "learning_rate": 1.4739840378386782e-06, + "loss": 0.3779, + "step": 3697 + }, + { + "epoch": 0.899100413323608, + "grad_norm": 19.875, + "learning_rate": 1.473508580250532e-06, + "loss": 0.8047, + "step": 3698 + }, + { + "epoch": 0.899343544857768, + "grad_norm": 28.625, + "learning_rate": 1.4730330892551684e-06, + "loss": 0.9387, + "step": 3699 + }, + { + "epoch": 0.899586676391928, + "grad_norm": 20.375, + "learning_rate": 1.4725575649236578e-06, + "loss": 0.5998, + "step": 3700 + }, + { + "epoch": 0.899829807926088, + "grad_norm": 16.125, + "learning_rate": 1.4720820073270755e-06, + "loss": 0.6035, + "step": 3701 + }, + { + "epoch": 0.900072939460248, + "grad_norm": 28.0, + "learning_rate": 1.4716064165365018e-06, + "loss": 0.7062, + "step": 3702 + }, + { + "epoch": 0.900316070994408, + "grad_norm": 15.75, + "learning_rate": 1.4711307926230216e-06, + "loss": 0.7638, + "step": 3703 + }, + { + "epoch": 0.9005592025285679, + "grad_norm": 17.625, + "learning_rate": 1.4706551356577253e-06, + "loss": 0.7957, + "step": 3704 + }, + { + "epoch": 0.9008023340627279, + "grad_norm": 17.25, + "learning_rate": 1.4701794457117088e-06, + "loss": 0.6963, + "step": 3705 + }, + { + "epoch": 0.9010454655968879, + "grad_norm": 20.875, + "learning_rate": 1.4697037228560706e-06, + "loss": 0.5563, + "step": 3706 + }, + { + "epoch": 0.9012885971310479, + "grad_norm": 24.375, + "learning_rate": 1.4692279671619173e-06, + "loss": 1.057, + "step": 3707 + }, + { + "epoch": 0.9015317286652079, + "grad_norm": 15.75, + "learning_rate": 1.4687521787003577e-06, + "loss": 0.8086, + "step": 3708 + }, + { + "epoch": 0.9017748601993678, + "grad_norm": 20.625, + "learning_rate": 1.4682763575425078e-06, + "loss": 0.876, + "step": 3709 + }, + { + "epoch": 0.9020179917335278, + "grad_norm": 21.375, + "learning_rate": 1.4678005037594853e-06, + "loss": 0.7953, + "step": 3710 + }, + { + "epoch": 0.9022611232676878, + "grad_norm": 17.75, + "learning_rate": 1.4673246174224166e-06, + "loss": 0.7017, + "step": 3711 + }, + { + "epoch": 0.9025042548018478, + "grad_norm": 17.125, + "learning_rate": 1.4668486986024305e-06, + "loss": 0.7379, + "step": 3712 + }, + { + "epoch": 0.9027473863360078, + "grad_norm": 19.375, + "learning_rate": 1.4663727473706612e-06, + "loss": 0.6913, + "step": 3713 + }, + { + "epoch": 0.9029905178701677, + "grad_norm": 22.0, + "learning_rate": 1.465896763798248e-06, + "loss": 0.8068, + "step": 3714 + }, + { + "epoch": 0.9032336494043277, + "grad_norm": 23.375, + "learning_rate": 1.4654207479563349e-06, + "loss": 1.0214, + "step": 3715 + }, + { + "epoch": 0.9034767809384877, + "grad_norm": 16.125, + "learning_rate": 1.4649446999160701e-06, + "loss": 1.1374, + "step": 3716 + }, + { + "epoch": 0.9037199124726477, + "grad_norm": 17.25, + "learning_rate": 1.464468619748608e-06, + "loss": 0.6149, + "step": 3717 + }, + { + "epoch": 0.9039630440068077, + "grad_norm": 20.625, + "learning_rate": 1.4639925075251065e-06, + "loss": 0.5648, + "step": 3718 + }, + { + "epoch": 0.9042061755409677, + "grad_norm": 16.375, + "learning_rate": 1.4635163633167296e-06, + "loss": 0.8324, + "step": 3719 + }, + { + "epoch": 0.9044493070751276, + "grad_norm": 21.5, + "learning_rate": 1.463040187194644e-06, + "loss": 0.8693, + "step": 3720 + }, + { + "epoch": 0.9046924386092876, + "grad_norm": 23.875, + "learning_rate": 1.4625639792300233e-06, + "loss": 1.0723, + "step": 3721 + }, + { + "epoch": 0.9049355701434476, + "grad_norm": 16.25, + "learning_rate": 1.4620877394940447e-06, + "loss": 0.7044, + "step": 3722 + }, + { + "epoch": 0.9051787016776076, + "grad_norm": 23.75, + "learning_rate": 1.4616114680578905e-06, + "loss": 0.8572, + "step": 3723 + }, + { + "epoch": 0.9054218332117676, + "grad_norm": 16.125, + "learning_rate": 1.4611351649927482e-06, + "loss": 0.5652, + "step": 3724 + }, + { + "epoch": 0.9056649647459275, + "grad_norm": 15.625, + "learning_rate": 1.4606588303698082e-06, + "loss": 0.3811, + "step": 3725 + }, + { + "epoch": 0.9059080962800875, + "grad_norm": 19.625, + "learning_rate": 1.4601824642602688e-06, + "loss": 1.3879, + "step": 3726 + }, + { + "epoch": 0.9061512278142475, + "grad_norm": 21.375, + "learning_rate": 1.4597060667353296e-06, + "loss": 0.9602, + "step": 3727 + }, + { + "epoch": 0.9063943593484075, + "grad_norm": 19.5, + "learning_rate": 1.4592296378661968e-06, + "loss": 1.0221, + "step": 3728 + }, + { + "epoch": 0.9066374908825675, + "grad_norm": 25.125, + "learning_rate": 1.4587531777240814e-06, + "loss": 1.2204, + "step": 3729 + }, + { + "epoch": 0.9068806224167274, + "grad_norm": 18.875, + "learning_rate": 1.4582766863801984e-06, + "loss": 0.971, + "step": 3730 + }, + { + "epoch": 0.9071237539508874, + "grad_norm": 15.75, + "learning_rate": 1.4578001639057676e-06, + "loss": 0.602, + "step": 3731 + }, + { + "epoch": 0.9073668854850474, + "grad_norm": 19.5, + "learning_rate": 1.4573236103720132e-06, + "loss": 0.8516, + "step": 3732 + }, + { + "epoch": 0.9076100170192074, + "grad_norm": 16.5, + "learning_rate": 1.4568470258501646e-06, + "loss": 0.5341, + "step": 3733 + }, + { + "epoch": 0.9078531485533674, + "grad_norm": 18.5, + "learning_rate": 1.4563704104114557e-06, + "loss": 0.7416, + "step": 3734 + }, + { + "epoch": 0.9080962800875274, + "grad_norm": 18.125, + "learning_rate": 1.4558937641271248e-06, + "loss": 0.6536, + "step": 3735 + }, + { + "epoch": 0.9083394116216873, + "grad_norm": 22.75, + "learning_rate": 1.455417087068415e-06, + "loss": 0.8411, + "step": 3736 + }, + { + "epoch": 0.9085825431558473, + "grad_norm": 18.0, + "learning_rate": 1.4549403793065737e-06, + "loss": 0.8491, + "step": 3737 + }, + { + "epoch": 0.9088256746900073, + "grad_norm": 17.875, + "learning_rate": 1.4544636409128538e-06, + "loss": 0.8147, + "step": 3738 + }, + { + "epoch": 0.9090688062241673, + "grad_norm": 18.625, + "learning_rate": 1.453986871958511e-06, + "loss": 0.6442, + "step": 3739 + }, + { + "epoch": 0.9093119377583273, + "grad_norm": 24.625, + "learning_rate": 1.4535100725148072e-06, + "loss": 0.7537, + "step": 3740 + }, + { + "epoch": 0.9095550692924872, + "grad_norm": 19.0, + "learning_rate": 1.4530332426530086e-06, + "loss": 0.6678, + "step": 3741 + }, + { + "epoch": 0.9097982008266472, + "grad_norm": 15.9375, + "learning_rate": 1.452556382444385e-06, + "loss": 1.1911, + "step": 3742 + }, + { + "epoch": 0.9100413323608072, + "grad_norm": 19.75, + "learning_rate": 1.4520794919602125e-06, + "loss": 1.2636, + "step": 3743 + }, + { + "epoch": 0.9102844638949672, + "grad_norm": 24.125, + "learning_rate": 1.4516025712717692e-06, + "loss": 0.9072, + "step": 3744 + }, + { + "epoch": 0.9105275954291272, + "grad_norm": 24.25, + "learning_rate": 1.4511256204503403e-06, + "loss": 0.6314, + "step": 3745 + }, + { + "epoch": 0.9107707269632871, + "grad_norm": 20.625, + "learning_rate": 1.4506486395672134e-06, + "loss": 0.6908, + "step": 3746 + }, + { + "epoch": 0.9110138584974471, + "grad_norm": 22.375, + "learning_rate": 1.4501716286936824e-06, + "loss": 1.0111, + "step": 3747 + }, + { + "epoch": 0.9112569900316071, + "grad_norm": 19.125, + "learning_rate": 1.449694587901044e-06, + "loss": 0.7819, + "step": 3748 + }, + { + "epoch": 0.9115001215657671, + "grad_norm": 21.0, + "learning_rate": 1.4492175172606006e-06, + "loss": 0.5429, + "step": 3749 + }, + { + "epoch": 0.9117432530999271, + "grad_norm": 21.75, + "learning_rate": 1.4487404168436593e-06, + "loss": 1.2505, + "step": 3750 + }, + { + "epoch": 0.911986384634087, + "grad_norm": 22.875, + "learning_rate": 1.44826328672153e-06, + "loss": 0.6566, + "step": 3751 + }, + { + "epoch": 0.912229516168247, + "grad_norm": 20.125, + "learning_rate": 1.447786126965528e-06, + "loss": 1.0915, + "step": 3752 + }, + { + "epoch": 0.912472647702407, + "grad_norm": 21.75, + "learning_rate": 1.4473089376469737e-06, + "loss": 1.2058, + "step": 3753 + }, + { + "epoch": 0.912715779236567, + "grad_norm": 17.375, + "learning_rate": 1.446831718837191e-06, + "loss": 0.8974, + "step": 3754 + }, + { + "epoch": 0.912958910770727, + "grad_norm": 15.4375, + "learning_rate": 1.4463544706075088e-06, + "loss": 0.5681, + "step": 3755 + }, + { + "epoch": 0.913202042304887, + "grad_norm": 23.375, + "learning_rate": 1.4458771930292592e-06, + "loss": 0.84, + "step": 3756 + }, + { + "epoch": 0.9134451738390469, + "grad_norm": 22.875, + "learning_rate": 1.4453998861737808e-06, + "loss": 1.123, + "step": 3757 + }, + { + "epoch": 0.9136883053732069, + "grad_norm": 20.5, + "learning_rate": 1.4449225501124146e-06, + "loss": 1.0275, + "step": 3758 + }, + { + "epoch": 0.9139314369073669, + "grad_norm": 18.75, + "learning_rate": 1.4444451849165067e-06, + "loss": 0.8495, + "step": 3759 + }, + { + "epoch": 0.9141745684415269, + "grad_norm": 18.75, + "learning_rate": 1.4439677906574076e-06, + "loss": 0.754, + "step": 3760 + }, + { + "epoch": 0.9144176999756869, + "grad_norm": 22.375, + "learning_rate": 1.4434903674064726e-06, + "loss": 0.7724, + "step": 3761 + }, + { + "epoch": 0.9146608315098468, + "grad_norm": 17.25, + "learning_rate": 1.4430129152350605e-06, + "loss": 0.496, + "step": 3762 + }, + { + "epoch": 0.9149039630440068, + "grad_norm": 24.0, + "learning_rate": 1.4425354342145346e-06, + "loss": 0.9878, + "step": 3763 + }, + { + "epoch": 0.9151470945781668, + "grad_norm": 15.75, + "learning_rate": 1.4420579244162627e-06, + "loss": 0.8022, + "step": 3764 + }, + { + "epoch": 0.9153902261123268, + "grad_norm": 20.75, + "learning_rate": 1.4415803859116173e-06, + "loss": 0.7064, + "step": 3765 + }, + { + "epoch": 0.9156333576464868, + "grad_norm": 18.5, + "learning_rate": 1.441102818771974e-06, + "loss": 0.8631, + "step": 3766 + }, + { + "epoch": 0.9158764891806467, + "grad_norm": 17.375, + "learning_rate": 1.4406252230687148e-06, + "loss": 1.3142, + "step": 3767 + }, + { + "epoch": 0.9161196207148067, + "grad_norm": 24.125, + "learning_rate": 1.440147598873223e-06, + "loss": 0.9144, + "step": 3768 + }, + { + "epoch": 0.9163627522489667, + "grad_norm": 17.75, + "learning_rate": 1.4396699462568894e-06, + "loss": 0.5567, + "step": 3769 + }, + { + "epoch": 0.9166058837831267, + "grad_norm": 14.5625, + "learning_rate": 1.439192265291106e-06, + "loss": 0.43, + "step": 3770 + }, + { + "epoch": 0.9168490153172867, + "grad_norm": 16.5, + "learning_rate": 1.4387145560472712e-06, + "loss": 1.0951, + "step": 3771 + }, + { + "epoch": 0.9170921468514467, + "grad_norm": 20.75, + "learning_rate": 1.4382368185967868e-06, + "loss": 0.6864, + "step": 3772 + }, + { + "epoch": 0.9173352783856066, + "grad_norm": 21.25, + "learning_rate": 1.4377590530110591e-06, + "loss": 1.1724, + "step": 3773 + }, + { + "epoch": 0.9175784099197666, + "grad_norm": 19.75, + "learning_rate": 1.4372812593614983e-06, + "loss": 1.2391, + "step": 3774 + }, + { + "epoch": 0.9178215414539266, + "grad_norm": 18.625, + "learning_rate": 1.4368034377195183e-06, + "loss": 0.8745, + "step": 3775 + }, + { + "epoch": 0.9180646729880866, + "grad_norm": 20.5, + "learning_rate": 1.4363255881565389e-06, + "loss": 0.8104, + "step": 3776 + }, + { + "epoch": 0.9183078045222466, + "grad_norm": 20.625, + "learning_rate": 1.435847710743982e-06, + "loss": 0.8274, + "step": 3777 + }, + { + "epoch": 0.9185509360564065, + "grad_norm": 14.75, + "learning_rate": 1.435369805553275e-06, + "loss": 0.7391, + "step": 3778 + }, + { + "epoch": 0.9187940675905665, + "grad_norm": 19.625, + "learning_rate": 1.4348918726558495e-06, + "loss": 1.0393, + "step": 3779 + }, + { + "epoch": 0.9190371991247265, + "grad_norm": 21.0, + "learning_rate": 1.4344139121231402e-06, + "loss": 0.9398, + "step": 3780 + }, + { + "epoch": 0.9192803306588865, + "grad_norm": 14.8125, + "learning_rate": 1.433935924026587e-06, + "loss": 0.7216, + "step": 3781 + }, + { + "epoch": 0.9195234621930465, + "grad_norm": 18.0, + "learning_rate": 1.433457908437633e-06, + "loss": 0.9834, + "step": 3782 + }, + { + "epoch": 0.9197665937272064, + "grad_norm": 22.25, + "learning_rate": 1.432979865427726e-06, + "loss": 1.0621, + "step": 3783 + }, + { + "epoch": 0.9200097252613664, + "grad_norm": 18.625, + "learning_rate": 1.4325017950683182e-06, + "loss": 0.7211, + "step": 3784 + }, + { + "epoch": 0.9202528567955264, + "grad_norm": 25.875, + "learning_rate": 1.4320236974308652e-06, + "loss": 1.1432, + "step": 3785 + }, + { + "epoch": 0.9204959883296864, + "grad_norm": 21.125, + "learning_rate": 1.431545572586827e-06, + "loss": 0.7333, + "step": 3786 + }, + { + "epoch": 0.9207391198638464, + "grad_norm": 23.625, + "learning_rate": 1.4310674206076675e-06, + "loss": 1.1414, + "step": 3787 + }, + { + "epoch": 0.9209822513980063, + "grad_norm": 20.5, + "learning_rate": 1.4305892415648549e-06, + "loss": 0.8259, + "step": 3788 + }, + { + "epoch": 0.9212253829321663, + "grad_norm": 19.5, + "learning_rate": 1.4301110355298612e-06, + "loss": 1.0309, + "step": 3789 + }, + { + "epoch": 0.9214685144663263, + "grad_norm": 17.125, + "learning_rate": 1.4296328025741626e-06, + "loss": 0.8037, + "step": 3790 + }, + { + "epoch": 0.9217116460004863, + "grad_norm": 16.5, + "learning_rate": 1.4291545427692394e-06, + "loss": 1.0646, + "step": 3791 + }, + { + "epoch": 0.9219547775346463, + "grad_norm": 20.0, + "learning_rate": 1.4286762561865756e-06, + "loss": 0.8686, + "step": 3792 + }, + { + "epoch": 0.9221979090688063, + "grad_norm": 28.125, + "learning_rate": 1.4281979428976594e-06, + "loss": 1.1802, + "step": 3793 + }, + { + "epoch": 0.9224410406029662, + "grad_norm": 15.8125, + "learning_rate": 1.4277196029739831e-06, + "loss": 0.7507, + "step": 3794 + }, + { + "epoch": 0.9226841721371262, + "grad_norm": 18.5, + "learning_rate": 1.427241236487043e-06, + "loss": 1.3671, + "step": 3795 + }, + { + "epoch": 0.9229273036712862, + "grad_norm": 21.25, + "learning_rate": 1.4267628435083388e-06, + "loss": 0.7674, + "step": 3796 + }, + { + "epoch": 0.9231704352054462, + "grad_norm": 22.375, + "learning_rate": 1.4262844241093749e-06, + "loss": 1.1878, + "step": 3797 + }, + { + "epoch": 0.9234135667396062, + "grad_norm": 14.1875, + "learning_rate": 1.4258059783616596e-06, + "loss": 0.7436, + "step": 3798 + }, + { + "epoch": 0.9236566982737661, + "grad_norm": 20.875, + "learning_rate": 1.4253275063367038e-06, + "loss": 0.6861, + "step": 3799 + }, + { + "epoch": 0.9238998298079261, + "grad_norm": 18.625, + "learning_rate": 1.4248490081060248e-06, + "loss": 1.0555, + "step": 3800 + }, + { + "epoch": 0.9241429613420861, + "grad_norm": 15.25, + "learning_rate": 1.4243704837411418e-06, + "loss": 0.5213, + "step": 3801 + }, + { + "epoch": 0.9243860928762461, + "grad_norm": 19.375, + "learning_rate": 1.4238919333135778e-06, + "loss": 0.7359, + "step": 3802 + }, + { + "epoch": 0.9246292244104061, + "grad_norm": 19.625, + "learning_rate": 1.423413356894862e-06, + "loss": 0.8722, + "step": 3803 + }, + { + "epoch": 0.924872355944566, + "grad_norm": 20.5, + "learning_rate": 1.4229347545565248e-06, + "loss": 0.9464, + "step": 3804 + }, + { + "epoch": 0.925115487478726, + "grad_norm": 18.125, + "learning_rate": 1.422456126370102e-06, + "loss": 0.777, + "step": 3805 + }, + { + "epoch": 0.925358619012886, + "grad_norm": 17.75, + "learning_rate": 1.4219774724071322e-06, + "loss": 0.8119, + "step": 3806 + }, + { + "epoch": 0.925601750547046, + "grad_norm": 16.25, + "learning_rate": 1.4214987927391594e-06, + "loss": 0.8855, + "step": 3807 + }, + { + "epoch": 0.925844882081206, + "grad_norm": 18.625, + "learning_rate": 1.42102008743773e-06, + "loss": 0.804, + "step": 3808 + }, + { + "epoch": 0.926088013615366, + "grad_norm": 16.625, + "learning_rate": 1.420541356574395e-06, + "loss": 0.4214, + "step": 3809 + }, + { + "epoch": 0.9263311451495259, + "grad_norm": 22.25, + "learning_rate": 1.4200626002207089e-06, + "loss": 0.8928, + "step": 3810 + }, + { + "epoch": 0.9265742766836859, + "grad_norm": 36.5, + "learning_rate": 1.41958381844823e-06, + "loss": 1.06, + "step": 3811 + }, + { + "epoch": 0.9268174082178459, + "grad_norm": 23.0, + "learning_rate": 1.419105011328521e-06, + "loss": 1.194, + "step": 3812 + }, + { + "epoch": 0.9270605397520059, + "grad_norm": 19.625, + "learning_rate": 1.4186261789331471e-06, + "loss": 0.8113, + "step": 3813 + }, + { + "epoch": 0.9273036712861659, + "grad_norm": 15.5625, + "learning_rate": 1.4181473213336783e-06, + "loss": 0.7907, + "step": 3814 + }, + { + "epoch": 0.9275468028203258, + "grad_norm": 23.125, + "learning_rate": 1.4176684386016886e-06, + "loss": 0.9357, + "step": 3815 + }, + { + "epoch": 0.9277899343544858, + "grad_norm": 24.25, + "learning_rate": 1.417189530808755e-06, + "loss": 0.8202, + "step": 3816 + }, + { + "epoch": 0.9280330658886458, + "grad_norm": 22.375, + "learning_rate": 1.416710598026459e-06, + "loss": 0.9124, + "step": 3817 + }, + { + "epoch": 0.9282761974228058, + "grad_norm": 19.125, + "learning_rate": 1.416231640326384e-06, + "loss": 0.8411, + "step": 3818 + }, + { + "epoch": 0.9285193289569658, + "grad_norm": 16.5, + "learning_rate": 1.41575265778012e-06, + "loss": 0.5253, + "step": 3819 + }, + { + "epoch": 0.9287624604911257, + "grad_norm": 19.375, + "learning_rate": 1.4152736504592587e-06, + "loss": 0.9743, + "step": 3820 + }, + { + "epoch": 0.9290055920252857, + "grad_norm": 18.875, + "learning_rate": 1.4147946184353958e-06, + "loss": 0.4579, + "step": 3821 + }, + { + "epoch": 0.9292487235594457, + "grad_norm": 23.25, + "learning_rate": 1.414315561780131e-06, + "loss": 0.8227, + "step": 3822 + }, + { + "epoch": 0.9294918550936057, + "grad_norm": 18.875, + "learning_rate": 1.4138364805650679e-06, + "loss": 0.8655, + "step": 3823 + }, + { + "epoch": 0.9297349866277657, + "grad_norm": 22.375, + "learning_rate": 1.4133573748618135e-06, + "loss": 0.9211, + "step": 3824 + }, + { + "epoch": 0.9299781181619255, + "grad_norm": 19.0, + "learning_rate": 1.4128782447419775e-06, + "loss": 0.7798, + "step": 3825 + }, + { + "epoch": 0.9302212496960856, + "grad_norm": 19.625, + "learning_rate": 1.4123990902771747e-06, + "loss": 1.0926, + "step": 3826 + }, + { + "epoch": 0.9304643812302456, + "grad_norm": 13.0625, + "learning_rate": 1.411919911539024e-06, + "loss": 0.392, + "step": 3827 + }, + { + "epoch": 0.9307075127644056, + "grad_norm": 36.0, + "learning_rate": 1.4114407085991456e-06, + "loss": 0.8685, + "step": 3828 + }, + { + "epoch": 0.9309506442985656, + "grad_norm": 24.0, + "learning_rate": 1.4109614815291648e-06, + "loss": 0.9956, + "step": 3829 + }, + { + "epoch": 0.9311937758327256, + "grad_norm": 22.75, + "learning_rate": 1.410482230400711e-06, + "loss": 1.1699, + "step": 3830 + }, + { + "epoch": 0.9314369073668854, + "grad_norm": 15.6875, + "learning_rate": 1.410002955285416e-06, + "loss": 0.5616, + "step": 3831 + }, + { + "epoch": 0.9316800389010454, + "grad_norm": 19.375, + "learning_rate": 1.4095236562549167e-06, + "loss": 0.9048, + "step": 3832 + }, + { + "epoch": 0.9319231704352055, + "grad_norm": 25.0, + "learning_rate": 1.409044333380851e-06, + "loss": 0.8924, + "step": 3833 + }, + { + "epoch": 0.9321663019693655, + "grad_norm": 18.5, + "learning_rate": 1.4085649867348635e-06, + "loss": 0.9992, + "step": 3834 + }, + { + "epoch": 0.9324094335035255, + "grad_norm": 18.5, + "learning_rate": 1.4080856163886001e-06, + "loss": 0.7677, + "step": 3835 + }, + { + "epoch": 0.9326525650376853, + "grad_norm": 21.5, + "learning_rate": 1.407606222413711e-06, + "loss": 0.9993, + "step": 3836 + }, + { + "epoch": 0.9328956965718453, + "grad_norm": 18.125, + "learning_rate": 1.4071268048818499e-06, + "loss": 0.9306, + "step": 3837 + }, + { + "epoch": 0.9331388281060053, + "grad_norm": 22.625, + "learning_rate": 1.406647363864674e-06, + "loss": 1.0934, + "step": 3838 + }, + { + "epoch": 0.9333819596401653, + "grad_norm": 18.5, + "learning_rate": 1.4061678994338449e-06, + "loss": 0.744, + "step": 3839 + }, + { + "epoch": 0.9336250911743253, + "grad_norm": 18.75, + "learning_rate": 1.4056884116610255e-06, + "loss": 1.2328, + "step": 3840 + }, + { + "epoch": 0.9338682227084852, + "grad_norm": 21.0, + "learning_rate": 1.4052089006178843e-06, + "loss": 0.9657, + "step": 3841 + }, + { + "epoch": 0.9341113542426452, + "grad_norm": 20.0, + "learning_rate": 1.4047293663760922e-06, + "loss": 0.7684, + "step": 3842 + }, + { + "epoch": 0.9343544857768052, + "grad_norm": 15.5625, + "learning_rate": 1.4042498090073243e-06, + "loss": 0.5175, + "step": 3843 + }, + { + "epoch": 0.9345976173109652, + "grad_norm": 17.375, + "learning_rate": 1.4037702285832586e-06, + "loss": 0.9267, + "step": 3844 + }, + { + "epoch": 0.9348407488451252, + "grad_norm": 13.1875, + "learning_rate": 1.403290625175576e-06, + "loss": 0.3068, + "step": 3845 + }, + { + "epoch": 0.9350838803792852, + "grad_norm": 21.25, + "learning_rate": 1.4028109988559624e-06, + "loss": 1.2203, + "step": 3846 + }, + { + "epoch": 0.9353270119134451, + "grad_norm": 23.75, + "learning_rate": 1.4023313496961059e-06, + "loss": 1.0322, + "step": 3847 + }, + { + "epoch": 0.9355701434476051, + "grad_norm": 18.875, + "learning_rate": 1.401851677767698e-06, + "loss": 0.6709, + "step": 3848 + }, + { + "epoch": 0.9358132749817651, + "grad_norm": 22.875, + "learning_rate": 1.4013719831424347e-06, + "loss": 0.9684, + "step": 3849 + }, + { + "epoch": 0.9360564065159251, + "grad_norm": 20.625, + "learning_rate": 1.4008922658920138e-06, + "loss": 0.8002, + "step": 3850 + }, + { + "epoch": 0.9362995380500851, + "grad_norm": 20.125, + "learning_rate": 1.4004125260881383e-06, + "loss": 1.0745, + "step": 3851 + }, + { + "epoch": 0.936542669584245, + "grad_norm": 12.8125, + "learning_rate": 1.3999327638025128e-06, + "loss": 0.4899, + "step": 3852 + }, + { + "epoch": 0.936785801118405, + "grad_norm": 17.375, + "learning_rate": 1.3994529791068461e-06, + "loss": 0.5899, + "step": 3853 + }, + { + "epoch": 0.937028932652565, + "grad_norm": 19.125, + "learning_rate": 1.398973172072851e-06, + "loss": 0.694, + "step": 3854 + }, + { + "epoch": 0.937272064186725, + "grad_norm": 24.5, + "learning_rate": 1.3984933427722419e-06, + "loss": 0.9549, + "step": 3855 + }, + { + "epoch": 0.937515195720885, + "grad_norm": 19.0, + "learning_rate": 1.3980134912767385e-06, + "loss": 0.899, + "step": 3856 + }, + { + "epoch": 0.9377583272550449, + "grad_norm": 16.5, + "learning_rate": 1.3975336176580625e-06, + "loss": 0.9902, + "step": 3857 + }, + { + "epoch": 0.9380014587892049, + "grad_norm": 21.25, + "learning_rate": 1.3970537219879395e-06, + "loss": 0.5704, + "step": 3858 + }, + { + "epoch": 0.9382445903233649, + "grad_norm": 17.25, + "learning_rate": 1.396573804338098e-06, + "loss": 0.6479, + "step": 3859 + }, + { + "epoch": 0.9384877218575249, + "grad_norm": 16.625, + "learning_rate": 1.3960938647802699e-06, + "loss": 1.0088, + "step": 3860 + }, + { + "epoch": 0.9387308533916849, + "grad_norm": 16.875, + "learning_rate": 1.3956139033861904e-06, + "loss": 0.8491, + "step": 3861 + }, + { + "epoch": 0.9389739849258448, + "grad_norm": 17.0, + "learning_rate": 1.3951339202275984e-06, + "loss": 0.7275, + "step": 3862 + }, + { + "epoch": 0.9392171164600048, + "grad_norm": 16.625, + "learning_rate": 1.394653915376236e-06, + "loss": 0.8649, + "step": 3863 + }, + { + "epoch": 0.9394602479941648, + "grad_norm": 19.125, + "learning_rate": 1.3941738889038467e-06, + "loss": 0.7899, + "step": 3864 + }, + { + "epoch": 0.9397033795283248, + "grad_norm": 24.875, + "learning_rate": 1.3936938408821804e-06, + "loss": 1.1052, + "step": 3865 + }, + { + "epoch": 0.9399465110624848, + "grad_norm": 16.125, + "learning_rate": 1.3932137713829878e-06, + "loss": 0.974, + "step": 3866 + }, + { + "epoch": 0.9401896425966448, + "grad_norm": 16.25, + "learning_rate": 1.3927336804780235e-06, + "loss": 0.5647, + "step": 3867 + }, + { + "epoch": 0.9404327741308047, + "grad_norm": 15.125, + "learning_rate": 1.3922535682390453e-06, + "loss": 0.678, + "step": 3868 + }, + { + "epoch": 0.9406759056649647, + "grad_norm": 20.625, + "learning_rate": 1.391773434737815e-06, + "loss": 0.7141, + "step": 3869 + }, + { + "epoch": 0.9409190371991247, + "grad_norm": 21.5, + "learning_rate": 1.3912932800460965e-06, + "loss": 0.9183, + "step": 3870 + }, + { + "epoch": 0.9411621687332847, + "grad_norm": 15.625, + "learning_rate": 1.3908131042356568e-06, + "loss": 0.6303, + "step": 3871 + }, + { + "epoch": 0.9414053002674447, + "grad_norm": 18.0, + "learning_rate": 1.3903329073782668e-06, + "loss": 0.7484, + "step": 3872 + }, + { + "epoch": 0.9416484318016046, + "grad_norm": 19.625, + "learning_rate": 1.3898526895457e-06, + "loss": 0.9081, + "step": 3873 + }, + { + "epoch": 0.9418915633357646, + "grad_norm": 20.5, + "learning_rate": 1.3893724508097334e-06, + "loss": 1.2226, + "step": 3874 + }, + { + "epoch": 0.9421346948699246, + "grad_norm": 17.75, + "learning_rate": 1.3888921912421473e-06, + "loss": 0.7871, + "step": 3875 + }, + { + "epoch": 0.9423778264040846, + "grad_norm": 16.875, + "learning_rate": 1.388411910914724e-06, + "loss": 0.8181, + "step": 3876 + }, + { + "epoch": 0.9426209579382446, + "grad_norm": 22.625, + "learning_rate": 1.3879316098992507e-06, + "loss": 0.9265, + "step": 3877 + }, + { + "epoch": 0.9428640894724045, + "grad_norm": 15.875, + "learning_rate": 1.3874512882675156e-06, + "loss": 0.4322, + "step": 3878 + }, + { + "epoch": 0.9431072210065645, + "grad_norm": 17.75, + "learning_rate": 1.386970946091312e-06, + "loss": 0.6555, + "step": 3879 + }, + { + "epoch": 0.9433503525407245, + "grad_norm": 20.125, + "learning_rate": 1.3864905834424348e-06, + "loss": 0.8817, + "step": 3880 + }, + { + "epoch": 0.9435934840748845, + "grad_norm": 19.0, + "learning_rate": 1.3860102003926827e-06, + "loss": 0.8455, + "step": 3881 + }, + { + "epoch": 0.9438366156090445, + "grad_norm": 17.125, + "learning_rate": 1.3855297970138571e-06, + "loss": 0.6743, + "step": 3882 + }, + { + "epoch": 0.9440797471432045, + "grad_norm": 21.5, + "learning_rate": 1.3850493733777622e-06, + "loss": 1.0024, + "step": 3883 + }, + { + "epoch": 0.9443228786773644, + "grad_norm": 20.875, + "learning_rate": 1.384568929556207e-06, + "loss": 0.8406, + "step": 3884 + }, + { + "epoch": 0.9445660102115244, + "grad_norm": 18.5, + "learning_rate": 1.3840884656210007e-06, + "loss": 1.0866, + "step": 3885 + }, + { + "epoch": 0.9448091417456844, + "grad_norm": 17.875, + "learning_rate": 1.3836079816439575e-06, + "loss": 0.8666, + "step": 3886 + }, + { + "epoch": 0.9450522732798444, + "grad_norm": 19.625, + "learning_rate": 1.3831274776968936e-06, + "loss": 0.9899, + "step": 3887 + }, + { + "epoch": 0.9452954048140044, + "grad_norm": 21.125, + "learning_rate": 1.3826469538516292e-06, + "loss": 1.2078, + "step": 3888 + }, + { + "epoch": 0.9455385363481643, + "grad_norm": 17.375, + "learning_rate": 1.382166410179987e-06, + "loss": 0.8049, + "step": 3889 + }, + { + "epoch": 0.9457816678823243, + "grad_norm": 24.125, + "learning_rate": 1.381685846753792e-06, + "loss": 0.9659, + "step": 3890 + }, + { + "epoch": 0.9460247994164843, + "grad_norm": 16.25, + "learning_rate": 1.3812052636448728e-06, + "loss": 0.4699, + "step": 3891 + }, + { + "epoch": 0.9462679309506443, + "grad_norm": 17.625, + "learning_rate": 1.380724660925061e-06, + "loss": 0.905, + "step": 3892 + }, + { + "epoch": 0.9465110624848043, + "grad_norm": 19.0, + "learning_rate": 1.3802440386661908e-06, + "loss": 0.5628, + "step": 3893 + }, + { + "epoch": 0.9467541940189642, + "grad_norm": 16.5, + "learning_rate": 1.3797633969401e-06, + "loss": 0.7007, + "step": 3894 + }, + { + "epoch": 0.9469973255531242, + "grad_norm": 19.375, + "learning_rate": 1.3792827358186277e-06, + "loss": 1.1727, + "step": 3895 + }, + { + "epoch": 0.9472404570872842, + "grad_norm": 16.125, + "learning_rate": 1.3788020553736186e-06, + "loss": 0.7185, + "step": 3896 + }, + { + "epoch": 0.9474835886214442, + "grad_norm": 14.4375, + "learning_rate": 1.3783213556769177e-06, + "loss": 0.4945, + "step": 3897 + }, + { + "epoch": 0.9477267201556042, + "grad_norm": 18.0, + "learning_rate": 1.3778406368003735e-06, + "loss": 0.5671, + "step": 3898 + }, + { + "epoch": 0.9479698516897641, + "grad_norm": 25.25, + "learning_rate": 1.3773598988158386e-06, + "loss": 1.0576, + "step": 3899 + }, + { + "epoch": 0.9482129832239241, + "grad_norm": 22.5, + "learning_rate": 1.3768791417951671e-06, + "loss": 0.6711, + "step": 3900 + }, + { + "epoch": 0.9484561147580841, + "grad_norm": 20.0, + "learning_rate": 1.3763983658102168e-06, + "loss": 0.7188, + "step": 3901 + }, + { + "epoch": 0.9486992462922441, + "grad_norm": 23.375, + "learning_rate": 1.3759175709328476e-06, + "loss": 1.1597, + "step": 3902 + }, + { + "epoch": 0.9489423778264041, + "grad_norm": 18.625, + "learning_rate": 1.3754367572349225e-06, + "loss": 0.7496, + "step": 3903 + }, + { + "epoch": 0.9491855093605641, + "grad_norm": 20.875, + "learning_rate": 1.374955924788308e-06, + "loss": 0.664, + "step": 3904 + }, + { + "epoch": 0.949428640894724, + "grad_norm": 17.25, + "learning_rate": 1.3744750736648724e-06, + "loss": 0.7426, + "step": 3905 + }, + { + "epoch": 0.949671772428884, + "grad_norm": 15.875, + "learning_rate": 1.3739942039364876e-06, + "loss": 0.5051, + "step": 3906 + }, + { + "epoch": 0.949914903963044, + "grad_norm": 22.625, + "learning_rate": 1.3735133156750268e-06, + "loss": 1.3829, + "step": 3907 + }, + { + "epoch": 0.950158035497204, + "grad_norm": 23.125, + "learning_rate": 1.3730324089523683e-06, + "loss": 0.8434, + "step": 3908 + }, + { + "epoch": 0.950401167031364, + "grad_norm": 20.625, + "learning_rate": 1.3725514838403914e-06, + "loss": 0.5742, + "step": 3909 + }, + { + "epoch": 0.9506442985655239, + "grad_norm": 23.0, + "learning_rate": 1.3720705404109787e-06, + "loss": 1.0885, + "step": 3910 + }, + { + "epoch": 0.9508874300996839, + "grad_norm": 25.375, + "learning_rate": 1.3715895787360155e-06, + "loss": 1.2471, + "step": 3911 + }, + { + "epoch": 0.9511305616338439, + "grad_norm": 19.25, + "learning_rate": 1.3711085988873898e-06, + "loss": 0.6364, + "step": 3912 + }, + { + "epoch": 0.9513736931680039, + "grad_norm": 18.5, + "learning_rate": 1.3706276009369925e-06, + "loss": 0.5563, + "step": 3913 + }, + { + "epoch": 0.9516168247021639, + "grad_norm": 17.75, + "learning_rate": 1.3701465849567167e-06, + "loss": 0.7277, + "step": 3914 + }, + { + "epoch": 0.9518599562363238, + "grad_norm": 30.875, + "learning_rate": 1.3696655510184592e-06, + "loss": 1.1632, + "step": 3915 + }, + { + "epoch": 0.9521030877704838, + "grad_norm": 16.375, + "learning_rate": 1.369184499194118e-06, + "loss": 0.7922, + "step": 3916 + }, + { + "epoch": 0.9523462193046438, + "grad_norm": 22.125, + "learning_rate": 1.3687034295555951e-06, + "loss": 1.055, + "step": 3917 + }, + { + "epoch": 0.9525893508388038, + "grad_norm": 21.375, + "learning_rate": 1.3682223421747948e-06, + "loss": 0.8957, + "step": 3918 + }, + { + "epoch": 0.9528324823729638, + "grad_norm": 27.625, + "learning_rate": 1.3677412371236232e-06, + "loss": 0.9606, + "step": 3919 + }, + { + "epoch": 0.9530756139071238, + "grad_norm": 14.9375, + "learning_rate": 1.367260114473991e-06, + "loss": 0.4122, + "step": 3920 + }, + { + "epoch": 0.9533187454412837, + "grad_norm": 17.0, + "learning_rate": 1.3667789742978089e-06, + "loss": 0.7189, + "step": 3921 + }, + { + "epoch": 0.9535618769754437, + "grad_norm": 16.75, + "learning_rate": 1.3662978166669924e-06, + "loss": 0.6071, + "step": 3922 + }, + { + "epoch": 0.9538050085096037, + "grad_norm": 22.0, + "learning_rate": 1.3658166416534588e-06, + "loss": 0.8416, + "step": 3923 + }, + { + "epoch": 0.9540481400437637, + "grad_norm": 20.25, + "learning_rate": 1.3653354493291276e-06, + "loss": 0.9223, + "step": 3924 + }, + { + "epoch": 0.9542912715779237, + "grad_norm": 14.75, + "learning_rate": 1.364854239765922e-06, + "loss": 0.494, + "step": 3925 + }, + { + "epoch": 0.9545344031120836, + "grad_norm": 21.875, + "learning_rate": 1.3643730130357662e-06, + "loss": 0.9353, + "step": 3926 + }, + { + "epoch": 0.9547775346462436, + "grad_norm": 18.5, + "learning_rate": 1.3638917692105888e-06, + "loss": 0.7108, + "step": 3927 + }, + { + "epoch": 0.9550206661804036, + "grad_norm": 22.375, + "learning_rate": 1.3634105083623191e-06, + "loss": 0.9413, + "step": 3928 + }, + { + "epoch": 0.9552637977145636, + "grad_norm": 34.0, + "learning_rate": 1.3629292305628905e-06, + "loss": 0.3229, + "step": 3929 + }, + { + "epoch": 0.9555069292487236, + "grad_norm": 18.5, + "learning_rate": 1.362447935884238e-06, + "loss": 1.3861, + "step": 3930 + }, + { + "epoch": 0.9557500607828835, + "grad_norm": 19.375, + "learning_rate": 1.3619666243982993e-06, + "loss": 0.6782, + "step": 3931 + }, + { + "epoch": 0.9559931923170435, + "grad_norm": 16.875, + "learning_rate": 1.361485296177015e-06, + "loss": 1.1356, + "step": 3932 + }, + { + "epoch": 0.9562363238512035, + "grad_norm": 18.75, + "learning_rate": 1.3610039512923278e-06, + "loss": 0.9059, + "step": 3933 + }, + { + "epoch": 0.9564794553853635, + "grad_norm": 18.25, + "learning_rate": 1.3605225898161828e-06, + "loss": 0.7756, + "step": 3934 + }, + { + "epoch": 0.9567225869195235, + "grad_norm": 17.125, + "learning_rate": 1.360041211820528e-06, + "loss": 0.6717, + "step": 3935 + }, + { + "epoch": 0.9569657184536834, + "grad_norm": 16.0, + "learning_rate": 1.3595598173773137e-06, + "loss": 0.621, + "step": 3936 + }, + { + "epoch": 0.9572088499878434, + "grad_norm": 36.25, + "learning_rate": 1.3590784065584927e-06, + "loss": 1.2804, + "step": 3937 + }, + { + "epoch": 0.9574519815220034, + "grad_norm": 21.5, + "learning_rate": 1.3585969794360197e-06, + "loss": 0.6779, + "step": 3938 + }, + { + "epoch": 0.9576951130561634, + "grad_norm": 19.375, + "learning_rate": 1.3581155360818526e-06, + "loss": 1.0433, + "step": 3939 + }, + { + "epoch": 0.9579382445903234, + "grad_norm": 20.5, + "learning_rate": 1.3576340765679516e-06, + "loss": 1.1591, + "step": 3940 + }, + { + "epoch": 0.9581813761244834, + "grad_norm": 14.9375, + "learning_rate": 1.3571526009662784e-06, + "loss": 0.6295, + "step": 3941 + }, + { + "epoch": 0.9584245076586433, + "grad_norm": 15.625, + "learning_rate": 1.356671109348799e-06, + "loss": 0.7017, + "step": 3942 + }, + { + "epoch": 0.9586676391928033, + "grad_norm": 23.875, + "learning_rate": 1.3561896017874799e-06, + "loss": 0.8728, + "step": 3943 + }, + { + "epoch": 0.9589107707269633, + "grad_norm": 41.75, + "learning_rate": 1.355708078354291e-06, + "loss": 0.951, + "step": 3944 + }, + { + "epoch": 0.9591539022611233, + "grad_norm": 23.125, + "learning_rate": 1.3552265391212038e-06, + "loss": 1.0707, + "step": 3945 + }, + { + "epoch": 0.9593970337952833, + "grad_norm": 19.625, + "learning_rate": 1.3547449841601935e-06, + "loss": 1.3283, + "step": 3946 + }, + { + "epoch": 0.9596401653294432, + "grad_norm": 21.0, + "learning_rate": 1.354263413543236e-06, + "loss": 0.6235, + "step": 3947 + }, + { + "epoch": 0.9598832968636032, + "grad_norm": 21.125, + "learning_rate": 1.3537818273423103e-06, + "loss": 0.6245, + "step": 3948 + }, + { + "epoch": 0.9601264283977632, + "grad_norm": 25.5, + "learning_rate": 1.3533002256293987e-06, + "loss": 0.8943, + "step": 3949 + }, + { + "epoch": 0.9603695599319232, + "grad_norm": 19.75, + "learning_rate": 1.352818608476484e-06, + "loss": 0.9066, + "step": 3950 + }, + { + "epoch": 0.9606126914660832, + "grad_norm": 18.0, + "learning_rate": 1.3523369759555526e-06, + "loss": 0.6345, + "step": 3951 + }, + { + "epoch": 0.9608558230002431, + "grad_norm": 20.625, + "learning_rate": 1.3518553281385929e-06, + "loss": 1.0054, + "step": 3952 + }, + { + "epoch": 0.9610989545344031, + "grad_norm": 18.625, + "learning_rate": 1.3513736650975947e-06, + "loss": 0.3696, + "step": 3953 + }, + { + "epoch": 0.9613420860685631, + "grad_norm": 21.25, + "learning_rate": 1.3508919869045522e-06, + "loss": 0.9926, + "step": 3954 + }, + { + "epoch": 0.9615852176027231, + "grad_norm": 20.625, + "learning_rate": 1.3504102936314594e-06, + "loss": 1.0307, + "step": 3955 + }, + { + "epoch": 0.9618283491368831, + "grad_norm": 25.75, + "learning_rate": 1.3499285853503146e-06, + "loss": 1.0777, + "step": 3956 + }, + { + "epoch": 0.9620714806710431, + "grad_norm": 17.25, + "learning_rate": 1.349446862133116e-06, + "loss": 0.9713, + "step": 3957 + }, + { + "epoch": 0.962314612205203, + "grad_norm": 19.75, + "learning_rate": 1.348965124051867e-06, + "loss": 0.9016, + "step": 3958 + }, + { + "epoch": 0.962557743739363, + "grad_norm": 19.375, + "learning_rate": 1.348483371178571e-06, + "loss": 0.6962, + "step": 3959 + }, + { + "epoch": 0.962800875273523, + "grad_norm": 22.375, + "learning_rate": 1.3480016035852342e-06, + "loss": 1.2241, + "step": 3960 + }, + { + "epoch": 0.963044006807683, + "grad_norm": 18.125, + "learning_rate": 1.3475198213438651e-06, + "loss": 0.7728, + "step": 3961 + }, + { + "epoch": 0.963287138341843, + "grad_norm": 21.375, + "learning_rate": 1.3470380245264744e-06, + "loss": 0.9239, + "step": 3962 + }, + { + "epoch": 0.9635302698760029, + "grad_norm": 26.75, + "learning_rate": 1.3465562132050752e-06, + "loss": 0.9932, + "step": 3963 + }, + { + "epoch": 0.9637734014101629, + "grad_norm": 21.125, + "learning_rate": 1.3460743874516823e-06, + "loss": 0.8634, + "step": 3964 + }, + { + "epoch": 0.9640165329443229, + "grad_norm": 22.375, + "learning_rate": 1.3455925473383128e-06, + "loss": 1.2086, + "step": 3965 + }, + { + "epoch": 0.9642596644784829, + "grad_norm": 18.0, + "learning_rate": 1.3451106929369864e-06, + "loss": 0.6385, + "step": 3966 + }, + { + "epoch": 0.9645027960126429, + "grad_norm": 15.0625, + "learning_rate": 1.3446288243197242e-06, + "loss": 0.7051, + "step": 3967 + }, + { + "epoch": 0.9647459275468028, + "grad_norm": 24.875, + "learning_rate": 1.3441469415585501e-06, + "loss": 1.1945, + "step": 3968 + }, + { + "epoch": 0.9649890590809628, + "grad_norm": 19.5, + "learning_rate": 1.3436650447254892e-06, + "loss": 0.7287, + "step": 3969 + }, + { + "epoch": 0.9652321906151228, + "grad_norm": 16.875, + "learning_rate": 1.3431831338925699e-06, + "loss": 0.6114, + "step": 3970 + }, + { + "epoch": 0.9654753221492828, + "grad_norm": 15.4375, + "learning_rate": 1.3427012091318224e-06, + "loss": 0.8383, + "step": 3971 + }, + { + "epoch": 0.9657184536834428, + "grad_norm": 17.875, + "learning_rate": 1.3422192705152773e-06, + "loss": 0.697, + "step": 3972 + }, + { + "epoch": 0.9659615852176027, + "grad_norm": 26.0, + "learning_rate": 1.3417373181149704e-06, + "loss": 0.8018, + "step": 3973 + }, + { + "epoch": 0.9662047167517627, + "grad_norm": 26.875, + "learning_rate": 1.3412553520029365e-06, + "loss": 1.0745, + "step": 3974 + }, + { + "epoch": 0.9664478482859227, + "grad_norm": 20.625, + "learning_rate": 1.3407733722512144e-06, + "loss": 1.014, + "step": 3975 + }, + { + "epoch": 0.9666909798200827, + "grad_norm": 29.125, + "learning_rate": 1.3402913789318436e-06, + "loss": 1.9196, + "step": 3976 + }, + { + "epoch": 0.9669341113542427, + "grad_norm": 25.875, + "learning_rate": 1.3398093721168672e-06, + "loss": 0.949, + "step": 3977 + }, + { + "epoch": 0.9671772428884027, + "grad_norm": 23.5, + "learning_rate": 1.3393273518783292e-06, + "loss": 1.3465, + "step": 3978 + }, + { + "epoch": 0.9674203744225626, + "grad_norm": 27.5, + "learning_rate": 1.3388453182882757e-06, + "loss": 0.9278, + "step": 3979 + }, + { + "epoch": 0.9676635059567226, + "grad_norm": 40.5, + "learning_rate": 1.3383632714187547e-06, + "loss": 1.2588, + "step": 3980 + }, + { + "epoch": 0.9679066374908826, + "grad_norm": 18.0, + "learning_rate": 1.3378812113418168e-06, + "loss": 0.7095, + "step": 3981 + }, + { + "epoch": 0.9681497690250426, + "grad_norm": 20.125, + "learning_rate": 1.3373991381295142e-06, + "loss": 1.1938, + "step": 3982 + }, + { + "epoch": 0.9683929005592026, + "grad_norm": 19.875, + "learning_rate": 1.3369170518539013e-06, + "loss": 0.9127, + "step": 3983 + }, + { + "epoch": 0.9686360320933625, + "grad_norm": 15.5, + "learning_rate": 1.3364349525870332e-06, + "loss": 0.596, + "step": 3984 + }, + { + "epoch": 0.9688791636275225, + "grad_norm": 18.125, + "learning_rate": 1.3359528404009691e-06, + "loss": 0.7775, + "step": 3985 + }, + { + "epoch": 0.9691222951616825, + "grad_norm": 24.5, + "learning_rate": 1.3354707153677685e-06, + "loss": 0.8741, + "step": 3986 + }, + { + "epoch": 0.9693654266958425, + "grad_norm": 18.25, + "learning_rate": 1.334988577559493e-06, + "loss": 0.9123, + "step": 3987 + }, + { + "epoch": 0.9696085582300025, + "grad_norm": 15.0625, + "learning_rate": 1.3345064270482072e-06, + "loss": 0.5815, + "step": 3988 + }, + { + "epoch": 0.9698516897641624, + "grad_norm": 16.625, + "learning_rate": 1.3340242639059764e-06, + "loss": 0.8815, + "step": 3989 + }, + { + "epoch": 0.9700948212983224, + "grad_norm": 14.625, + "learning_rate": 1.3335420882048683e-06, + "loss": 0.9273, + "step": 3990 + }, + { + "epoch": 0.9703379528324824, + "grad_norm": 17.125, + "learning_rate": 1.3330599000169519e-06, + "loss": 0.6381, + "step": 3991 + }, + { + "epoch": 0.9705810843666424, + "grad_norm": 36.5, + "learning_rate": 1.3325776994142991e-06, + "loss": 1.6005, + "step": 3992 + }, + { + "epoch": 0.9708242159008024, + "grad_norm": 19.75, + "learning_rate": 1.3320954864689831e-06, + "loss": 0.9846, + "step": 3993 + }, + { + "epoch": 0.9710673474349624, + "grad_norm": 19.625, + "learning_rate": 1.3316132612530786e-06, + "loss": 0.9429, + "step": 3994 + }, + { + "epoch": 0.9713104789691223, + "grad_norm": 22.625, + "learning_rate": 1.3311310238386626e-06, + "loss": 0.8868, + "step": 3995 + }, + { + "epoch": 0.9715536105032823, + "grad_norm": 18.875, + "learning_rate": 1.3306487742978142e-06, + "loss": 0.6734, + "step": 3996 + }, + { + "epoch": 0.9717967420374423, + "grad_norm": 21.75, + "learning_rate": 1.3301665127026137e-06, + "loss": 1.0677, + "step": 3997 + }, + { + "epoch": 0.9720398735716023, + "grad_norm": 15.9375, + "learning_rate": 1.329684239125143e-06, + "loss": 0.7513, + "step": 3998 + }, + { + "epoch": 0.9722830051057623, + "grad_norm": 19.0, + "learning_rate": 1.3292019536374866e-06, + "loss": 1.0038, + "step": 3999 + }, + { + "epoch": 0.9725261366399222, + "grad_norm": 16.875, + "learning_rate": 1.3287196563117308e-06, + "loss": 0.9442, + "step": 4000 + }, + { + "epoch": 0.9727692681740822, + "grad_norm": 27.625, + "learning_rate": 1.3282373472199623e-06, + "loss": 0.8517, + "step": 4001 + }, + { + "epoch": 0.9730123997082422, + "grad_norm": 19.75, + "learning_rate": 1.3277550264342714e-06, + "loss": 1.0273, + "step": 4002 + }, + { + "epoch": 0.9732555312424022, + "grad_norm": 18.125, + "learning_rate": 1.3272726940267485e-06, + "loss": 0.5411, + "step": 4003 + }, + { + "epoch": 0.9734986627765622, + "grad_norm": 19.875, + "learning_rate": 1.3267903500694875e-06, + "loss": 0.998, + "step": 4004 + }, + { + "epoch": 0.973741794310722, + "grad_norm": 24.125, + "learning_rate": 1.3263079946345822e-06, + "loss": 0.9674, + "step": 4005 + }, + { + "epoch": 0.973984925844882, + "grad_norm": 21.875, + "learning_rate": 1.3258256277941291e-06, + "loss": 1.068, + "step": 4006 + }, + { + "epoch": 0.9742280573790421, + "grad_norm": 15.875, + "learning_rate": 1.3253432496202267e-06, + "loss": 0.667, + "step": 4007 + }, + { + "epoch": 0.9744711889132021, + "grad_norm": 15.5625, + "learning_rate": 1.3248608601849741e-06, + "loss": 0.3742, + "step": 4008 + }, + { + "epoch": 0.9747143204473621, + "grad_norm": 20.875, + "learning_rate": 1.3243784595604733e-06, + "loss": 0.6048, + "step": 4009 + }, + { + "epoch": 0.974957451981522, + "grad_norm": 22.75, + "learning_rate": 1.323896047818827e-06, + "loss": 1.2635, + "step": 4010 + }, + { + "epoch": 0.975200583515682, + "grad_norm": 17.5, + "learning_rate": 1.3234136250321403e-06, + "loss": 0.6713, + "step": 4011 + }, + { + "epoch": 0.975443715049842, + "grad_norm": 19.375, + "learning_rate": 1.3229311912725193e-06, + "loss": 0.5918, + "step": 4012 + }, + { + "epoch": 0.975686846584002, + "grad_norm": 16.25, + "learning_rate": 1.322448746612072e-06, + "loss": 0.7757, + "step": 4013 + }, + { + "epoch": 0.975929978118162, + "grad_norm": 20.0, + "learning_rate": 1.321966291122909e-06, + "loss": 0.7835, + "step": 4014 + }, + { + "epoch": 0.976173109652322, + "grad_norm": 17.5, + "learning_rate": 1.3214838248771396e-06, + "loss": 1.0936, + "step": 4015 + }, + { + "epoch": 0.9764162411864818, + "grad_norm": 21.0, + "learning_rate": 1.3210013479468791e-06, + "loss": 0.9025, + "step": 4016 + }, + { + "epoch": 0.9766593727206418, + "grad_norm": 23.5, + "learning_rate": 1.3205188604042407e-06, + "loss": 1.0179, + "step": 4017 + }, + { + "epoch": 0.9769025042548019, + "grad_norm": 19.125, + "learning_rate": 1.3200363623213406e-06, + "loss": 0.9663, + "step": 4018 + }, + { + "epoch": 0.9771456357889619, + "grad_norm": 21.5, + "learning_rate": 1.3195538537702965e-06, + "loss": 0.9284, + "step": 4019 + }, + { + "epoch": 0.9773887673231219, + "grad_norm": 16.5, + "learning_rate": 1.319071334823228e-06, + "loss": 0.912, + "step": 4020 + }, + { + "epoch": 0.9776318988572817, + "grad_norm": 19.625, + "learning_rate": 1.3185888055522556e-06, + "loss": 0.7986, + "step": 4021 + }, + { + "epoch": 0.9778750303914417, + "grad_norm": 24.25, + "learning_rate": 1.3181062660295013e-06, + "loss": 1.0828, + "step": 4022 + }, + { + "epoch": 0.9781181619256017, + "grad_norm": 25.0, + "learning_rate": 1.3176237163270893e-06, + "loss": 0.8634, + "step": 4023 + }, + { + "epoch": 0.9783612934597617, + "grad_norm": 14.75, + "learning_rate": 1.3171411565171452e-06, + "loss": 0.7959, + "step": 4024 + }, + { + "epoch": 0.9786044249939218, + "grad_norm": 20.625, + "learning_rate": 1.3166585866717953e-06, + "loss": 0.7384, + "step": 4025 + }, + { + "epoch": 0.9788475565280816, + "grad_norm": 17.125, + "learning_rate": 1.3161760068631691e-06, + "loss": 0.7498, + "step": 4026 + }, + { + "epoch": 0.9790906880622416, + "grad_norm": 22.0, + "learning_rate": 1.315693417163395e-06, + "loss": 0.7234, + "step": 4027 + }, + { + "epoch": 0.9793338195964016, + "grad_norm": 17.0, + "learning_rate": 1.315210817644606e-06, + "loss": 0.9886, + "step": 4028 + }, + { + "epoch": 0.9795769511305616, + "grad_norm": 18.0, + "learning_rate": 1.3147282083789337e-06, + "loss": 0.9663, + "step": 4029 + }, + { + "epoch": 0.9798200826647216, + "grad_norm": 15.5, + "learning_rate": 1.3142455894385125e-06, + "loss": 0.4486, + "step": 4030 + }, + { + "epoch": 0.9800632141988816, + "grad_norm": 15.75, + "learning_rate": 1.3137629608954785e-06, + "loss": 0.8873, + "step": 4031 + }, + { + "epoch": 0.9803063457330415, + "grad_norm": 22.375, + "learning_rate": 1.3132803228219688e-06, + "loss": 0.794, + "step": 4032 + }, + { + "epoch": 0.9805494772672015, + "grad_norm": 13.9375, + "learning_rate": 1.3127976752901222e-06, + "loss": 0.6404, + "step": 4033 + }, + { + "epoch": 0.9807926088013615, + "grad_norm": 17.125, + "learning_rate": 1.312315018372078e-06, + "loss": 0.9471, + "step": 4034 + }, + { + "epoch": 0.9810357403355215, + "grad_norm": 21.0, + "learning_rate": 1.3118323521399787e-06, + "loss": 1.1323, + "step": 4035 + }, + { + "epoch": 0.9812788718696815, + "grad_norm": 12.6875, + "learning_rate": 1.3113496766659661e-06, + "loss": 0.3732, + "step": 4036 + }, + { + "epoch": 0.9815220034038414, + "grad_norm": 21.125, + "learning_rate": 1.3108669920221848e-06, + "loss": 0.6477, + "step": 4037 + }, + { + "epoch": 0.9817651349380014, + "grad_norm": 18.0, + "learning_rate": 1.3103842982807802e-06, + "loss": 0.7884, + "step": 4038 + }, + { + "epoch": 0.9820082664721614, + "grad_norm": 22.25, + "learning_rate": 1.3099015955138997e-06, + "loss": 0.7812, + "step": 4039 + }, + { + "epoch": 0.9822513980063214, + "grad_norm": 17.25, + "learning_rate": 1.3094188837936912e-06, + "loss": 0.9805, + "step": 4040 + }, + { + "epoch": 0.9824945295404814, + "grad_norm": 18.75, + "learning_rate": 1.3089361631923043e-06, + "loss": 0.8195, + "step": 4041 + }, + { + "epoch": 0.9827376610746413, + "grad_norm": 21.75, + "learning_rate": 1.3084534337818896e-06, + "loss": 1.1097, + "step": 4042 + }, + { + "epoch": 0.9829807926088013, + "grad_norm": 28.125, + "learning_rate": 1.3079706956345997e-06, + "loss": 1.3946, + "step": 4043 + }, + { + "epoch": 0.9832239241429613, + "grad_norm": 23.125, + "learning_rate": 1.3074879488225883e-06, + "loss": 0.8835, + "step": 4044 + }, + { + "epoch": 0.9834670556771213, + "grad_norm": 23.125, + "learning_rate": 1.3070051934180106e-06, + "loss": 0.9037, + "step": 4045 + }, + { + "epoch": 0.9837101872112813, + "grad_norm": 17.0, + "learning_rate": 1.3065224294930213e-06, + "loss": 0.6628, + "step": 4046 + }, + { + "epoch": 0.9839533187454412, + "grad_norm": 18.75, + "learning_rate": 1.3060396571197794e-06, + "loss": 0.8511, + "step": 4047 + }, + { + "epoch": 0.9841964502796012, + "grad_norm": 17.75, + "learning_rate": 1.3055568763704425e-06, + "loss": 0.9845, + "step": 4048 + }, + { + "epoch": 0.9844395818137612, + "grad_norm": 17.5, + "learning_rate": 1.3050740873171714e-06, + "loss": 0.873, + "step": 4049 + }, + { + "epoch": 0.9846827133479212, + "grad_norm": 23.375, + "learning_rate": 1.3045912900321264e-06, + "loss": 1.1688, + "step": 4050 + }, + { + "epoch": 0.9849258448820812, + "grad_norm": 18.5, + "learning_rate": 1.3041084845874705e-06, + "loss": 0.71, + "step": 4051 + }, + { + "epoch": 0.9851689764162412, + "grad_norm": 19.5, + "learning_rate": 1.303625671055367e-06, + "loss": 0.6832, + "step": 4052 + }, + { + "epoch": 0.9854121079504011, + "grad_norm": 18.75, + "learning_rate": 1.3031428495079807e-06, + "loss": 0.6501, + "step": 4053 + }, + { + "epoch": 0.9856552394845611, + "grad_norm": 31.5, + "learning_rate": 1.302660020017478e-06, + "loss": 0.9338, + "step": 4054 + }, + { + "epoch": 0.9858983710187211, + "grad_norm": 24.875, + "learning_rate": 1.3021771826560256e-06, + "loss": 0.9302, + "step": 4055 + }, + { + "epoch": 0.9861415025528811, + "grad_norm": 15.9375, + "learning_rate": 1.3016943374957922e-06, + "loss": 1.0072, + "step": 4056 + }, + { + "epoch": 0.9863846340870411, + "grad_norm": 20.875, + "learning_rate": 1.301211484608947e-06, + "loss": 0.7759, + "step": 4057 + }, + { + "epoch": 0.986627765621201, + "grad_norm": 20.5, + "learning_rate": 1.3007286240676614e-06, + "loss": 0.9501, + "step": 4058 + }, + { + "epoch": 0.986870897155361, + "grad_norm": 12.625, + "learning_rate": 1.300245755944107e-06, + "loss": 0.356, + "step": 4059 + }, + { + "epoch": 0.987114028689521, + "grad_norm": 19.75, + "learning_rate": 1.2997628803104563e-06, + "loss": 0.8057, + "step": 4060 + }, + { + "epoch": 0.987357160223681, + "grad_norm": 29.75, + "learning_rate": 1.2992799972388836e-06, + "loss": 0.9597, + "step": 4061 + }, + { + "epoch": 0.987600291757841, + "grad_norm": 18.75, + "learning_rate": 1.2987971068015643e-06, + "loss": 0.9781, + "step": 4062 + }, + { + "epoch": 0.9878434232920009, + "grad_norm": 14.625, + "learning_rate": 1.2983142090706744e-06, + "loss": 0.5611, + "step": 4063 + }, + { + "epoch": 0.9880865548261609, + "grad_norm": 22.125, + "learning_rate": 1.297831304118392e-06, + "loss": 1.1952, + "step": 4064 + }, + { + "epoch": 0.9883296863603209, + "grad_norm": 25.75, + "learning_rate": 1.2973483920168948e-06, + "loss": 1.1019, + "step": 4065 + }, + { + "epoch": 0.9885728178944809, + "grad_norm": 18.375, + "learning_rate": 1.2968654728383629e-06, + "loss": 0.9886, + "step": 4066 + }, + { + "epoch": 0.9888159494286409, + "grad_norm": 16.75, + "learning_rate": 1.2963825466549765e-06, + "loss": 0.7126, + "step": 4067 + }, + { + "epoch": 0.9890590809628009, + "grad_norm": 18.875, + "learning_rate": 1.2958996135389174e-06, + "loss": 0.6975, + "step": 4068 + }, + { + "epoch": 0.9893022124969608, + "grad_norm": 28.125, + "learning_rate": 1.2954166735623682e-06, + "loss": 1.2519, + "step": 4069 + }, + { + "epoch": 0.9895453440311208, + "grad_norm": 16.625, + "learning_rate": 1.294933726797513e-06, + "loss": 0.7895, + "step": 4070 + }, + { + "epoch": 0.9897884755652808, + "grad_norm": 14.5, + "learning_rate": 1.2944507733165367e-06, + "loss": 0.8475, + "step": 4071 + }, + { + "epoch": 0.9900316070994408, + "grad_norm": 63.5, + "learning_rate": 1.293967813191624e-06, + "loss": 1.0635, + "step": 4072 + }, + { + "epoch": 0.9902747386336008, + "grad_norm": 20.875, + "learning_rate": 1.2934848464949625e-06, + "loss": 0.9305, + "step": 4073 + }, + { + "epoch": 0.9905178701677607, + "grad_norm": 20.25, + "learning_rate": 1.29300187329874e-06, + "loss": 0.7422, + "step": 4074 + }, + { + "epoch": 0.9907610017019207, + "grad_norm": 17.5, + "learning_rate": 1.2925188936751443e-06, + "loss": 1.0064, + "step": 4075 + }, + { + "epoch": 0.9910041332360807, + "grad_norm": 17.875, + "learning_rate": 1.2920359076963663e-06, + "loss": 0.786, + "step": 4076 + }, + { + "epoch": 0.9912472647702407, + "grad_norm": 25.125, + "learning_rate": 1.291552915434595e-06, + "loss": 0.7397, + "step": 4077 + }, + { + "epoch": 0.9914903963044007, + "grad_norm": 17.375, + "learning_rate": 1.2910699169620235e-06, + "loss": 0.6564, + "step": 4078 + }, + { + "epoch": 0.9917335278385606, + "grad_norm": 20.375, + "learning_rate": 1.2905869123508435e-06, + "loss": 0.6919, + "step": 4079 + }, + { + "epoch": 0.9919766593727206, + "grad_norm": 14.6875, + "learning_rate": 1.290103901673248e-06, + "loss": 0.3727, + "step": 4080 + }, + { + "epoch": 0.9922197909068806, + "grad_norm": 16.75, + "learning_rate": 1.2896208850014325e-06, + "loss": 0.8732, + "step": 4081 + }, + { + "epoch": 0.9924629224410406, + "grad_norm": 19.625, + "learning_rate": 1.2891378624075912e-06, + "loss": 0.8371, + "step": 4082 + }, + { + "epoch": 0.9927060539752006, + "grad_norm": 17.0, + "learning_rate": 1.2886548339639205e-06, + "loss": 0.646, + "step": 4083 + }, + { + "epoch": 0.9929491855093605, + "grad_norm": 13.8125, + "learning_rate": 1.288171799742617e-06, + "loss": 0.6926, + "step": 4084 + }, + { + "epoch": 0.9931923170435205, + "grad_norm": 18.375, + "learning_rate": 1.287688759815879e-06, + "loss": 0.4883, + "step": 4085 + }, + { + "epoch": 0.9934354485776805, + "grad_norm": 22.5, + "learning_rate": 1.2872057142559049e-06, + "loss": 0.7492, + "step": 4086 + }, + { + "epoch": 0.9936785801118405, + "grad_norm": 18.5, + "learning_rate": 1.2867226631348943e-06, + "loss": 0.9716, + "step": 4087 + }, + { + "epoch": 0.9939217116460005, + "grad_norm": 20.0, + "learning_rate": 1.2862396065250473e-06, + "loss": 1.0586, + "step": 4088 + }, + { + "epoch": 0.9941648431801605, + "grad_norm": 14.5625, + "learning_rate": 1.285756544498565e-06, + "loss": 0.5721, + "step": 4089 + }, + { + "epoch": 0.9944079747143204, + "grad_norm": 22.375, + "learning_rate": 1.2852734771276504e-06, + "loss": 0.9043, + "step": 4090 + }, + { + "epoch": 0.9946511062484804, + "grad_norm": 15.4375, + "learning_rate": 1.284790404484505e-06, + "loss": 0.4759, + "step": 4091 + }, + { + "epoch": 0.9948942377826404, + "grad_norm": 21.0, + "learning_rate": 1.2843073266413323e-06, + "loss": 1.0459, + "step": 4092 + }, + { + "epoch": 0.9951373693168004, + "grad_norm": 20.875, + "learning_rate": 1.2838242436703377e-06, + "loss": 1.2038, + "step": 4093 + }, + { + "epoch": 0.9953805008509604, + "grad_norm": 23.375, + "learning_rate": 1.2833411556437255e-06, + "loss": 0.9746, + "step": 4094 + }, + { + "epoch": 0.9956236323851203, + "grad_norm": 14.9375, + "learning_rate": 1.2828580626337024e-06, + "loss": 0.5975, + "step": 4095 + }, + { + "epoch": 0.9958667639192803, + "grad_norm": 19.875, + "learning_rate": 1.2823749647124733e-06, + "loss": 0.7218, + "step": 4096 + }, + { + "epoch": 0.9961098954534403, + "grad_norm": 16.875, + "learning_rate": 1.2818918619522471e-06, + "loss": 0.5233, + "step": 4097 + }, + { + "epoch": 0.9963530269876003, + "grad_norm": 21.5, + "learning_rate": 1.2814087544252316e-06, + "loss": 0.8869, + "step": 4098 + }, + { + "epoch": 0.9965961585217603, + "grad_norm": 27.875, + "learning_rate": 1.2809256422036351e-06, + "loss": 0.9395, + "step": 4099 + }, + { + "epoch": 0.9968392900559202, + "grad_norm": 20.375, + "learning_rate": 1.2804425253596672e-06, + "loss": 0.9283, + "step": 4100 + }, + { + "epoch": 0.9970824215900802, + "grad_norm": 18.0, + "learning_rate": 1.279959403965538e-06, + "loss": 0.5885, + "step": 4101 + }, + { + "epoch": 0.9973255531242402, + "grad_norm": 21.125, + "learning_rate": 1.2794762780934588e-06, + "loss": 0.929, + "step": 4102 + }, + { + "epoch": 0.9975686846584002, + "grad_norm": 16.75, + "learning_rate": 1.2789931478156406e-06, + "loss": 0.8388, + "step": 4103 + }, + { + "epoch": 0.9978118161925602, + "grad_norm": 16.75, + "learning_rate": 1.2785100132042954e-06, + "loss": 0.6561, + "step": 4104 + }, + { + "epoch": 0.9980549477267202, + "grad_norm": 18.125, + "learning_rate": 1.2780268743316369e-06, + "loss": 0.4373, + "step": 4105 + }, + { + "epoch": 0.9982980792608801, + "grad_norm": 25.625, + "learning_rate": 1.2775437312698776e-06, + "loss": 1.1276, + "step": 4106 + }, + { + "epoch": 0.9985412107950401, + "grad_norm": 23.25, + "learning_rate": 1.277060584091232e-06, + "loss": 1.1631, + "step": 4107 + }, + { + "epoch": 0.9987843423292001, + "grad_norm": 13.0, + "learning_rate": 1.2765774328679147e-06, + "loss": 0.3814, + "step": 4108 + }, + { + "epoch": 0.9990274738633601, + "grad_norm": 21.875, + "learning_rate": 1.2760942776721414e-06, + "loss": 1.1216, + "step": 4109 + }, + { + "epoch": 0.9992706053975201, + "grad_norm": 15.75, + "learning_rate": 1.2756111185761277e-06, + "loss": 0.5756, + "step": 4110 + }, + { + "epoch": 0.99951373693168, + "grad_norm": 17.75, + "learning_rate": 1.2751279556520893e-06, + "loss": 0.7563, + "step": 4111 + }, + { + "epoch": 0.99975686846584, + "grad_norm": 16.375, + "learning_rate": 1.2746447889722446e-06, + "loss": 0.8834, + "step": 4112 + }, + { + "epoch": 1.0, + "grad_norm": 17.875, + "learning_rate": 1.2741616186088103e-06, + "loss": 0.9585, + "step": 4113 + }, + { + "epoch": 1.0, + "eval_loss": 1.1157740354537964, + "eval_runtime": 98.4299, + "eval_samples_per_second": 5.689, + "eval_steps_per_second": 5.689, + "step": 4113 + }, + { + "epoch": 1.00024313153416, + "grad_norm": 16.375, + "learning_rate": 1.273678444634005e-06, + "loss": 0.6687, + "step": 4114 } ], "logging_steps": 1, @@ -14425,7 +28832,7 @@ "attributes": {} } }, - "total_flos": 1.157726307530834e+18, + "total_flos": 2.315452615061668e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null