diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 2.968063872255489, + "epoch": 3.952428476380572, "eval_steps": 500, - "global_step": 2253, + "global_step": 3004, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -15778,6 +15778,5263 @@ "learning_rate": 7.368764016743712e-07, "loss": 0.7304, "step": 2253 + }, + { + "epoch": 2.97, + "grad_norm": 2.734375, + "learning_rate": 7.350176020987585e-07, + "loss": 0.7597, + "step": 2254 + }, + { + "epoch": 2.97, + "grad_norm": 2.84375, + "learning_rate": 7.331607458081169e-07, + "loss": 0.7793, + "step": 2255 + }, + { + "epoch": 2.97, + "grad_norm": 2.6875, + "learning_rate": 7.313058348468866e-07, + "loss": 0.7716, + "step": 2256 + }, + { + "epoch": 2.97, + "grad_norm": 2.65625, + "learning_rate": 7.29452871257367e-07, + "loss": 0.7296, + "step": 2257 + }, + { + "epoch": 2.97, + "grad_norm": 2.75, + "learning_rate": 7.276018570797119e-07, + "loss": 0.766, + "step": 2258 + }, + { + "epoch": 2.98, + "grad_norm": 2.671875, + "learning_rate": 7.257527943519299e-07, + "loss": 0.7454, + "step": 2259 + }, + { + "epoch": 2.98, + "grad_norm": 2.65625, + "learning_rate": 7.239056851098785e-07, + "loss": 0.7622, + "step": 2260 + }, + { + "epoch": 2.98, + "grad_norm": 2.75, + "learning_rate": 7.220605313872672e-07, + "loss": 0.7468, + "step": 2261 + }, + { + "epoch": 2.98, + "grad_norm": 2.65625, + "learning_rate": 7.202173352156511e-07, + "loss": 0.7373, + "step": 2262 + }, + { + "epoch": 2.98, + "grad_norm": 2.625, + "learning_rate": 7.183760986244307e-07, + "loss": 0.7259, + "step": 2263 + }, + { + "epoch": 2.98, + "grad_norm": 2.703125, + "learning_rate": 7.16536823640848e-07, + "loss": 0.755, + "step": 2264 + }, + { + "epoch": 2.98, + "grad_norm": 2.6875, + "learning_rate": 7.146995122899861e-07, + "loss": 0.7958, + "step": 2265 + }, + { + "epoch": 2.99, + "grad_norm": 2.703125, + "learning_rate": 7.128641665947658e-07, + "loss": 0.7552, + "step": 2266 + }, + { + "epoch": 2.99, + "grad_norm": 2.78125, + "learning_rate": 7.110307885759435e-07, + "loss": 0.7518, + "step": 2267 + }, + { + "epoch": 2.99, + "grad_norm": 2.703125, + "learning_rate": 7.091993802521094e-07, + "loss": 0.7552, + "step": 2268 + }, + { + "epoch": 2.99, + "grad_norm": 2.703125, + "learning_rate": 7.073699436396847e-07, + "loss": 0.7494, + "step": 2269 + }, + { + "epoch": 2.99, + "grad_norm": 2.734375, + "learning_rate": 7.055424807529199e-07, + "loss": 0.7877, + "step": 2270 + }, + { + "epoch": 2.99, + "grad_norm": 2.6875, + "learning_rate": 7.037169936038935e-07, + "loss": 0.7652, + "step": 2271 + }, + { + "epoch": 2.99, + "grad_norm": 2.734375, + "learning_rate": 7.018934842025058e-07, + "loss": 0.75, + "step": 2272 + }, + { + "epoch": 2.99, + "grad_norm": 2.6875, + "learning_rate": 7.000719545564819e-07, + "loss": 0.738, + "step": 2273 + }, + { + "epoch": 3.0, + "grad_norm": 2.703125, + "learning_rate": 6.982524066713658e-07, + "loss": 0.7532, + "step": 2274 + }, + { + "epoch": 3.0, + "grad_norm": 2.75, + "learning_rate": 6.964348425505222e-07, + "loss": 0.7637, + "step": 2275 + }, + { + "epoch": 3.0, + "grad_norm": 2.6875, + "learning_rate": 6.946192641951285e-07, + "loss": 0.7558, + "step": 2276 + }, + { + "epoch": 3.0, + "grad_norm": 2.703125, + "learning_rate": 6.928056736041783e-07, + "loss": 0.7313, + "step": 2277 + }, + { + "epoch": 3.0, + "grad_norm": 2.703125, + "learning_rate": 6.90994072774473e-07, + "loss": 0.7377, + "step": 2278 + }, + { + "epoch": 3.0, + "grad_norm": 2.75, + "learning_rate": 6.891844637006268e-07, + "loss": 0.733, + "step": 2279 + }, + { + "epoch": 3.0, + "grad_norm": 2.609375, + "learning_rate": 6.873768483750595e-07, + "loss": 0.7361, + "step": 2280 + }, + { + "epoch": 3.01, + "grad_norm": 2.6875, + "learning_rate": 6.855712287879957e-07, + "loss": 0.7741, + "step": 2281 + }, + { + "epoch": 3.01, + "grad_norm": 2.75, + "learning_rate": 6.837676069274632e-07, + "loss": 0.7451, + "step": 2282 + }, + { + "epoch": 3.01, + "grad_norm": 2.734375, + "learning_rate": 6.819659847792895e-07, + "loss": 0.7632, + "step": 2283 + }, + { + "epoch": 3.01, + "grad_norm": 2.75, + "learning_rate": 6.801663643271012e-07, + "loss": 0.765, + "step": 2284 + }, + { + "epoch": 3.01, + "grad_norm": 2.71875, + "learning_rate": 6.783687475523199e-07, + "loss": 0.7495, + "step": 2285 + }, + { + "epoch": 3.01, + "grad_norm": 2.765625, + "learning_rate": 6.765731364341622e-07, + "loss": 0.7506, + "step": 2286 + }, + { + "epoch": 3.01, + "grad_norm": 2.703125, + "learning_rate": 6.747795329496359e-07, + "loss": 0.7544, + "step": 2287 + }, + { + "epoch": 3.01, + "grad_norm": 2.703125, + "learning_rate": 6.729879390735391e-07, + "loss": 0.7334, + "step": 2288 + }, + { + "epoch": 3.0, + "grad_norm": 2.859375, + "learning_rate": 6.711983567784552e-07, + "loss": 0.7458, + "step": 2289 + }, + { + "epoch": 3.0, + "grad_norm": 2.734375, + "learning_rate": 6.69410788034755e-07, + "loss": 0.736, + "step": 2290 + }, + { + "epoch": 3.0, + "grad_norm": 2.8125, + "learning_rate": 6.676252348105916e-07, + "loss": 0.7523, + "step": 2291 + }, + { + "epoch": 3.0, + "grad_norm": 2.9375, + "learning_rate": 6.658416990718986e-07, + "loss": 0.743, + "step": 2292 + }, + { + "epoch": 3.01, + "grad_norm": 2.671875, + "learning_rate": 6.640601827823878e-07, + "loss": 0.7135, + "step": 2293 + }, + { + "epoch": 3.01, + "grad_norm": 2.859375, + "learning_rate": 6.622806879035512e-07, + "loss": 0.7385, + "step": 2294 + }, + { + "epoch": 3.01, + "grad_norm": 2.828125, + "learning_rate": 6.605032163946496e-07, + "loss": 0.7395, + "step": 2295 + }, + { + "epoch": 3.01, + "grad_norm": 2.875, + "learning_rate": 6.587277702127196e-07, + "loss": 0.7255, + "step": 2296 + }, + { + "epoch": 3.01, + "grad_norm": 2.84375, + "learning_rate": 6.56954351312567e-07, + "loss": 0.754, + "step": 2297 + }, + { + "epoch": 3.01, + "grad_norm": 2.6875, + "learning_rate": 6.551829616467656e-07, + "loss": 0.7245, + "step": 2298 + }, + { + "epoch": 3.01, + "grad_norm": 2.75, + "learning_rate": 6.534136031656552e-07, + "loss": 0.7448, + "step": 2299 + }, + { + "epoch": 3.02, + "grad_norm": 2.6875, + "learning_rate": 6.516462778173383e-07, + "loss": 0.7134, + "step": 2300 + }, + { + "epoch": 3.02, + "grad_norm": 2.75, + "learning_rate": 6.498809875476799e-07, + "loss": 0.7394, + "step": 2301 + }, + { + "epoch": 3.02, + "grad_norm": 2.8125, + "learning_rate": 6.481177343003043e-07, + "loss": 0.7496, + "step": 2302 + }, + { + "epoch": 3.02, + "grad_norm": 2.828125, + "learning_rate": 6.463565200165919e-07, + "loss": 0.75, + "step": 2303 + }, + { + "epoch": 3.02, + "grad_norm": 2.78125, + "learning_rate": 6.445973466356794e-07, + "loss": 0.7282, + "step": 2304 + }, + { + "epoch": 3.02, + "grad_norm": 2.6875, + "learning_rate": 6.428402160944558e-07, + "loss": 0.7266, + "step": 2305 + }, + { + "epoch": 3.02, + "grad_norm": 2.8125, + "learning_rate": 6.410851303275617e-07, + "loss": 0.7538, + "step": 2306 + }, + { + "epoch": 3.02, + "grad_norm": 2.703125, + "learning_rate": 6.393320912673842e-07, + "loss": 0.7137, + "step": 2307 + }, + { + "epoch": 3.03, + "grad_norm": 2.671875, + "learning_rate": 6.375811008440591e-07, + "loss": 0.7237, + "step": 2308 + }, + { + "epoch": 3.03, + "grad_norm": 2.90625, + "learning_rate": 6.358321609854659e-07, + "loss": 0.7391, + "step": 2309 + }, + { + "epoch": 3.03, + "grad_norm": 2.84375, + "learning_rate": 6.340852736172265e-07, + "loss": 0.7325, + "step": 2310 + }, + { + "epoch": 3.03, + "grad_norm": 2.78125, + "learning_rate": 6.323404406627018e-07, + "loss": 0.7259, + "step": 2311 + }, + { + "epoch": 3.03, + "grad_norm": 2.75, + "learning_rate": 6.305976640429945e-07, + "loss": 0.7299, + "step": 2312 + }, + { + "epoch": 3.03, + "grad_norm": 2.796875, + "learning_rate": 6.288569456769377e-07, + "loss": 0.7378, + "step": 2313 + }, + { + "epoch": 3.03, + "grad_norm": 2.75, + "learning_rate": 6.271182874811024e-07, + "loss": 0.7302, + "step": 2314 + }, + { + "epoch": 3.04, + "grad_norm": 2.796875, + "learning_rate": 6.253816913697896e-07, + "loss": 0.7195, + "step": 2315 + }, + { + "epoch": 3.04, + "grad_norm": 2.65625, + "learning_rate": 6.236471592550305e-07, + "loss": 0.7367, + "step": 2316 + }, + { + "epoch": 3.04, + "grad_norm": 2.8125, + "learning_rate": 6.219146930465838e-07, + "loss": 0.7477, + "step": 2317 + }, + { + "epoch": 3.04, + "grad_norm": 2.71875, + "learning_rate": 6.201842946519343e-07, + "loss": 0.7284, + "step": 2318 + }, + { + "epoch": 3.04, + "grad_norm": 2.84375, + "learning_rate": 6.184559659762865e-07, + "loss": 0.741, + "step": 2319 + }, + { + "epoch": 3.04, + "grad_norm": 2.71875, + "learning_rate": 6.167297089225713e-07, + "loss": 0.7472, + "step": 2320 + }, + { + "epoch": 3.04, + "grad_norm": 2.734375, + "learning_rate": 6.150055253914353e-07, + "loss": 0.724, + "step": 2321 + }, + { + "epoch": 3.04, + "grad_norm": 2.75, + "learning_rate": 6.132834172812432e-07, + "loss": 0.7345, + "step": 2322 + }, + { + "epoch": 3.05, + "grad_norm": 2.765625, + "learning_rate": 6.115633864880743e-07, + "loss": 0.7521, + "step": 2323 + }, + { + "epoch": 3.05, + "grad_norm": 2.796875, + "learning_rate": 6.098454349057218e-07, + "loss": 0.7718, + "step": 2324 + }, + { + "epoch": 3.05, + "grad_norm": 2.84375, + "learning_rate": 6.081295644256868e-07, + "loss": 0.7365, + "step": 2325 + }, + { + "epoch": 3.05, + "grad_norm": 2.75, + "learning_rate": 6.064157769371823e-07, + "loss": 0.7374, + "step": 2326 + }, + { + "epoch": 3.05, + "grad_norm": 2.65625, + "learning_rate": 6.04704074327126e-07, + "loss": 0.7485, + "step": 2327 + }, + { + "epoch": 3.05, + "grad_norm": 2.75, + "learning_rate": 6.029944584801408e-07, + "loss": 0.7654, + "step": 2328 + }, + { + "epoch": 3.05, + "grad_norm": 2.78125, + "learning_rate": 6.012869312785513e-07, + "loss": 0.7542, + "step": 2329 + }, + { + "epoch": 3.06, + "grad_norm": 2.75, + "learning_rate": 5.995814946023848e-07, + "loss": 0.7427, + "step": 2330 + }, + { + "epoch": 3.06, + "grad_norm": 2.78125, + "learning_rate": 5.978781503293634e-07, + "loss": 0.7387, + "step": 2331 + }, + { + "epoch": 3.06, + "grad_norm": 2.78125, + "learning_rate": 5.961769003349077e-07, + "loss": 0.7572, + "step": 2332 + }, + { + "epoch": 3.06, + "grad_norm": 2.8125, + "learning_rate": 5.944777464921317e-07, + "loss": 0.7625, + "step": 2333 + }, + { + "epoch": 3.06, + "grad_norm": 2.671875, + "learning_rate": 5.92780690671842e-07, + "loss": 0.7181, + "step": 2334 + }, + { + "epoch": 3.06, + "grad_norm": 2.65625, + "learning_rate": 5.910857347425345e-07, + "loss": 0.7116, + "step": 2335 + }, + { + "epoch": 3.06, + "grad_norm": 2.765625, + "learning_rate": 5.893928805703943e-07, + "loss": 0.7554, + "step": 2336 + }, + { + "epoch": 3.06, + "grad_norm": 2.828125, + "learning_rate": 5.877021300192895e-07, + "loss": 0.7872, + "step": 2337 + }, + { + "epoch": 3.07, + "grad_norm": 2.734375, + "learning_rate": 5.860134849507765e-07, + "loss": 0.7269, + "step": 2338 + }, + { + "epoch": 3.07, + "grad_norm": 2.78125, + "learning_rate": 5.843269472240901e-07, + "loss": 0.7573, + "step": 2339 + }, + { + "epoch": 3.07, + "grad_norm": 2.75, + "learning_rate": 5.82642518696146e-07, + "loss": 0.735, + "step": 2340 + }, + { + "epoch": 3.07, + "grad_norm": 2.78125, + "learning_rate": 5.809602012215385e-07, + "loss": 0.75, + "step": 2341 + }, + { + "epoch": 3.07, + "grad_norm": 2.71875, + "learning_rate": 5.79279996652535e-07, + "loss": 0.6935, + "step": 2342 + }, + { + "epoch": 3.07, + "grad_norm": 2.796875, + "learning_rate": 5.776019068390792e-07, + "loss": 0.7396, + "step": 2343 + }, + { + "epoch": 3.07, + "grad_norm": 2.71875, + "learning_rate": 5.759259336287851e-07, + "loss": 0.7488, + "step": 2344 + }, + { + "epoch": 3.08, + "grad_norm": 2.78125, + "learning_rate": 5.742520788669367e-07, + "loss": 0.7453, + "step": 2345 + }, + { + "epoch": 3.08, + "grad_norm": 2.78125, + "learning_rate": 5.725803443964853e-07, + "loss": 0.7617, + "step": 2346 + }, + { + "epoch": 3.08, + "grad_norm": 2.78125, + "learning_rate": 5.709107320580479e-07, + "loss": 0.7488, + "step": 2347 + }, + { + "epoch": 3.08, + "grad_norm": 2.6875, + "learning_rate": 5.692432436899051e-07, + "loss": 0.7188, + "step": 2348 + }, + { + "epoch": 3.08, + "grad_norm": 2.734375, + "learning_rate": 5.675778811279983e-07, + "loss": 0.7247, + "step": 2349 + }, + { + "epoch": 3.08, + "grad_norm": 2.765625, + "learning_rate": 5.659146462059292e-07, + "loss": 0.7439, + "step": 2350 + }, + { + "epoch": 3.08, + "grad_norm": 2.671875, + "learning_rate": 5.642535407549565e-07, + "loss": 0.7298, + "step": 2351 + }, + { + "epoch": 3.08, + "grad_norm": 2.71875, + "learning_rate": 5.625945666039942e-07, + "loss": 0.7119, + "step": 2352 + }, + { + "epoch": 3.09, + "grad_norm": 2.640625, + "learning_rate": 5.609377255796106e-07, + "loss": 0.7445, + "step": 2353 + }, + { + "epoch": 3.09, + "grad_norm": 2.734375, + "learning_rate": 5.592830195060234e-07, + "loss": 0.7476, + "step": 2354 + }, + { + "epoch": 3.09, + "grad_norm": 2.65625, + "learning_rate": 5.576304502051011e-07, + "loss": 0.7004, + "step": 2355 + }, + { + "epoch": 3.09, + "grad_norm": 2.734375, + "learning_rate": 5.559800194963591e-07, + "loss": 0.7178, + "step": 2356 + }, + { + "epoch": 3.09, + "grad_norm": 2.71875, + "learning_rate": 5.543317291969597e-07, + "loss": 0.7292, + "step": 2357 + }, + { + "epoch": 3.09, + "grad_norm": 2.8125, + "learning_rate": 5.526855811217064e-07, + "loss": 0.7513, + "step": 2358 + }, + { + "epoch": 3.09, + "grad_norm": 2.734375, + "learning_rate": 5.510415770830457e-07, + "loss": 0.7337, + "step": 2359 + }, + { + "epoch": 3.1, + "grad_norm": 2.78125, + "learning_rate": 5.493997188910613e-07, + "loss": 0.7513, + "step": 2360 + }, + { + "epoch": 3.1, + "grad_norm": 2.6875, + "learning_rate": 5.477600083534764e-07, + "loss": 0.7437, + "step": 2361 + }, + { + "epoch": 3.1, + "grad_norm": 2.75, + "learning_rate": 5.46122447275649e-07, + "loss": 0.741, + "step": 2362 + }, + { + "epoch": 3.1, + "grad_norm": 2.78125, + "learning_rate": 5.444870374605704e-07, + "loss": 0.7225, + "step": 2363 + }, + { + "epoch": 3.1, + "grad_norm": 2.8125, + "learning_rate": 5.428537807088627e-07, + "loss": 0.6944, + "step": 2364 + }, + { + "epoch": 3.1, + "grad_norm": 2.75, + "learning_rate": 5.412226788187788e-07, + "loss": 0.7508, + "step": 2365 + }, + { + "epoch": 3.1, + "grad_norm": 2.75, + "learning_rate": 5.395937335861975e-07, + "loss": 0.725, + "step": 2366 + }, + { + "epoch": 3.1, + "grad_norm": 2.75, + "learning_rate": 5.379669468046245e-07, + "loss": 0.7428, + "step": 2367 + }, + { + "epoch": 3.11, + "grad_norm": 2.734375, + "learning_rate": 5.363423202651876e-07, + "loss": 0.7218, + "step": 2368 + }, + { + "epoch": 3.11, + "grad_norm": 2.765625, + "learning_rate": 5.347198557566374e-07, + "loss": 0.7236, + "step": 2369 + }, + { + "epoch": 3.11, + "grad_norm": 2.71875, + "learning_rate": 5.330995550653434e-07, + "loss": 0.7174, + "step": 2370 + }, + { + "epoch": 3.11, + "grad_norm": 2.703125, + "learning_rate": 5.314814199752935e-07, + "loss": 0.7339, + "step": 2371 + }, + { + "epoch": 3.11, + "grad_norm": 2.78125, + "learning_rate": 5.298654522680888e-07, + "loss": 0.7511, + "step": 2372 + }, + { + "epoch": 3.11, + "grad_norm": 2.890625, + "learning_rate": 5.282516537229468e-07, + "loss": 0.7366, + "step": 2373 + }, + { + "epoch": 3.11, + "grad_norm": 2.796875, + "learning_rate": 5.266400261166951e-07, + "loss": 0.755, + "step": 2374 + }, + { + "epoch": 3.12, + "grad_norm": 2.765625, + "learning_rate": 5.250305712237724e-07, + "loss": 0.7411, + "step": 2375 + }, + { + "epoch": 3.12, + "grad_norm": 2.796875, + "learning_rate": 5.234232908162243e-07, + "loss": 0.7681, + "step": 2376 + }, + { + "epoch": 3.12, + "grad_norm": 2.765625, + "learning_rate": 5.218181866637029e-07, + "loss": 0.7097, + "step": 2377 + }, + { + "epoch": 3.12, + "grad_norm": 2.78125, + "learning_rate": 5.202152605334623e-07, + "loss": 0.7398, + "step": 2378 + }, + { + "epoch": 3.12, + "grad_norm": 2.625, + "learning_rate": 5.186145141903606e-07, + "loss": 0.7076, + "step": 2379 + }, + { + "epoch": 3.12, + "grad_norm": 2.828125, + "learning_rate": 5.170159493968549e-07, + "loss": 0.7502, + "step": 2380 + }, + { + "epoch": 3.12, + "grad_norm": 2.78125, + "learning_rate": 5.154195679130012e-07, + "loss": 0.7569, + "step": 2381 + }, + { + "epoch": 3.12, + "grad_norm": 2.921875, + "learning_rate": 5.138253714964509e-07, + "loss": 0.7543, + "step": 2382 + }, + { + "epoch": 3.13, + "grad_norm": 2.90625, + "learning_rate": 5.122333619024494e-07, + "loss": 0.7527, + "step": 2383 + }, + { + "epoch": 3.13, + "grad_norm": 2.8125, + "learning_rate": 5.106435408838354e-07, + "loss": 0.7354, + "step": 2384 + }, + { + "epoch": 3.13, + "grad_norm": 2.703125, + "learning_rate": 5.090559101910369e-07, + "loss": 0.7404, + "step": 2385 + }, + { + "epoch": 3.13, + "grad_norm": 2.8125, + "learning_rate": 5.074704715720711e-07, + "loss": 0.7498, + "step": 2386 + }, + { + "epoch": 3.13, + "grad_norm": 2.828125, + "learning_rate": 5.058872267725407e-07, + "loss": 0.7304, + "step": 2387 + }, + { + "epoch": 3.13, + "grad_norm": 2.8125, + "learning_rate": 5.043061775356336e-07, + "loss": 0.7505, + "step": 2388 + }, + { + "epoch": 3.13, + "grad_norm": 2.765625, + "learning_rate": 5.027273256021212e-07, + "loss": 0.7263, + "step": 2389 + }, + { + "epoch": 3.14, + "grad_norm": 2.8125, + "learning_rate": 5.011506727103536e-07, + "loss": 0.7544, + "step": 2390 + }, + { + "epoch": 3.14, + "grad_norm": 2.75, + "learning_rate": 4.995762205962607e-07, + "loss": 0.756, + "step": 2391 + }, + { + "epoch": 3.14, + "grad_norm": 2.71875, + "learning_rate": 4.980039709933492e-07, + "loss": 0.7044, + "step": 2392 + }, + { + "epoch": 3.14, + "grad_norm": 2.6875, + "learning_rate": 4.964339256327022e-07, + "loss": 0.7123, + "step": 2393 + }, + { + "epoch": 3.14, + "grad_norm": 2.71875, + "learning_rate": 4.948660862429741e-07, + "loss": 0.7342, + "step": 2394 + }, + { + "epoch": 3.14, + "grad_norm": 2.765625, + "learning_rate": 4.933004545503917e-07, + "loss": 0.7532, + "step": 2395 + }, + { + "epoch": 3.14, + "grad_norm": 2.734375, + "learning_rate": 4.917370322787488e-07, + "loss": 0.7435, + "step": 2396 + }, + { + "epoch": 3.14, + "grad_norm": 2.8125, + "learning_rate": 4.901758211494089e-07, + "loss": 0.7251, + "step": 2397 + }, + { + "epoch": 3.15, + "grad_norm": 2.859375, + "learning_rate": 4.886168228813007e-07, + "loss": 0.7306, + "step": 2398 + }, + { + "epoch": 3.15, + "grad_norm": 2.6875, + "learning_rate": 4.870600391909156e-07, + "loss": 0.7276, + "step": 2399 + }, + { + "epoch": 3.15, + "grad_norm": 2.6875, + "learning_rate": 4.855054717923072e-07, + "loss": 0.7128, + "step": 2400 + }, + { + "epoch": 3.15, + "grad_norm": 2.78125, + "learning_rate": 4.839531223970889e-07, + "loss": 0.7291, + "step": 2401 + }, + { + "epoch": 3.15, + "grad_norm": 2.71875, + "learning_rate": 4.824029927144321e-07, + "loss": 0.7587, + "step": 2402 + }, + { + "epoch": 3.15, + "grad_norm": 2.84375, + "learning_rate": 4.80855084451064e-07, + "loss": 0.7431, + "step": 2403 + }, + { + "epoch": 3.15, + "grad_norm": 2.6875, + "learning_rate": 4.793093993112663e-07, + "loss": 0.7269, + "step": 2404 + }, + { + "epoch": 3.16, + "grad_norm": 2.796875, + "learning_rate": 4.777659389968728e-07, + "loss": 0.7415, + "step": 2405 + }, + { + "epoch": 3.16, + "grad_norm": 2.859375, + "learning_rate": 4.76224705207268e-07, + "loss": 0.7523, + "step": 2406 + }, + { + "epoch": 3.16, + "grad_norm": 2.71875, + "learning_rate": 4.7468569963938365e-07, + "loss": 0.7234, + "step": 2407 + }, + { + "epoch": 3.16, + "grad_norm": 2.734375, + "learning_rate": 4.731489239876999e-07, + "loss": 0.7303, + "step": 2408 + }, + { + "epoch": 3.16, + "grad_norm": 2.734375, + "learning_rate": 4.71614379944241e-07, + "loss": 0.7389, + "step": 2409 + }, + { + "epoch": 3.16, + "grad_norm": 2.8125, + "learning_rate": 4.700820691985739e-07, + "loss": 0.7597, + "step": 2410 + }, + { + "epoch": 3.16, + "grad_norm": 2.703125, + "learning_rate": 4.685519934378066e-07, + "loss": 0.7248, + "step": 2411 + }, + { + "epoch": 3.16, + "grad_norm": 2.734375, + "learning_rate": 4.670241543465881e-07, + "loss": 0.7269, + "step": 2412 + }, + { + "epoch": 3.17, + "grad_norm": 2.75, + "learning_rate": 4.654985536071019e-07, + "loss": 0.7445, + "step": 2413 + }, + { + "epoch": 3.17, + "grad_norm": 2.734375, + "learning_rate": 4.639751928990688e-07, + "loss": 0.7453, + "step": 2414 + }, + { + "epoch": 3.17, + "grad_norm": 2.734375, + "learning_rate": 4.6245407389974293e-07, + "loss": 0.7385, + "step": 2415 + }, + { + "epoch": 3.17, + "grad_norm": 2.734375, + "learning_rate": 4.6093519828391025e-07, + "loss": 0.7419, + "step": 2416 + }, + { + "epoch": 3.17, + "grad_norm": 2.765625, + "learning_rate": 4.594185677238866e-07, + "loss": 0.7306, + "step": 2417 + }, + { + "epoch": 3.17, + "grad_norm": 2.78125, + "learning_rate": 4.5790418388951596e-07, + "loss": 0.7604, + "step": 2418 + }, + { + "epoch": 3.17, + "grad_norm": 2.6875, + "learning_rate": 4.5639204844816904e-07, + "loss": 0.7203, + "step": 2419 + }, + { + "epoch": 3.18, + "grad_norm": 2.796875, + "learning_rate": 4.5488216306474044e-07, + "loss": 0.7542, + "step": 2420 + }, + { + "epoch": 3.18, + "grad_norm": 2.796875, + "learning_rate": 4.5337452940164757e-07, + "loss": 0.7379, + "step": 2421 + }, + { + "epoch": 3.18, + "grad_norm": 2.703125, + "learning_rate": 4.51869149118829e-07, + "loss": 0.7314, + "step": 2422 + }, + { + "epoch": 3.18, + "grad_norm": 2.703125, + "learning_rate": 4.5036602387374143e-07, + "loss": 0.7354, + "step": 2423 + }, + { + "epoch": 3.18, + "grad_norm": 2.71875, + "learning_rate": 4.488651553213605e-07, + "loss": 0.7207, + "step": 2424 + }, + { + "epoch": 3.18, + "grad_norm": 2.78125, + "learning_rate": 4.4736654511417456e-07, + "loss": 0.7549, + "step": 2425 + }, + { + "epoch": 3.18, + "grad_norm": 2.78125, + "learning_rate": 4.458701949021871e-07, + "loss": 0.733, + "step": 2426 + }, + { + "epoch": 3.18, + "grad_norm": 2.75, + "learning_rate": 4.443761063329133e-07, + "loss": 0.7572, + "step": 2427 + }, + { + "epoch": 3.19, + "grad_norm": 2.640625, + "learning_rate": 4.428842810513784e-07, + "loss": 0.7242, + "step": 2428 + }, + { + "epoch": 3.19, + "grad_norm": 2.6875, + "learning_rate": 4.4139472070011374e-07, + "loss": 0.7304, + "step": 2429 + }, + { + "epoch": 3.19, + "grad_norm": 2.796875, + "learning_rate": 4.399074269191614e-07, + "loss": 0.7434, + "step": 2430 + }, + { + "epoch": 3.19, + "grad_norm": 2.796875, + "learning_rate": 4.3842240134606225e-07, + "loss": 0.7429, + "step": 2431 + }, + { + "epoch": 3.19, + "grad_norm": 2.78125, + "learning_rate": 4.369396456158639e-07, + "loss": 0.7203, + "step": 2432 + }, + { + "epoch": 3.19, + "grad_norm": 2.671875, + "learning_rate": 4.3545916136111284e-07, + "loss": 0.7554, + "step": 2433 + }, + { + "epoch": 3.19, + "grad_norm": 2.78125, + "learning_rate": 4.3398095021185557e-07, + "loss": 0.7229, + "step": 2434 + }, + { + "epoch": 3.2, + "grad_norm": 2.78125, + "learning_rate": 4.325050137956355e-07, + "loss": 0.731, + "step": 2435 + }, + { + "epoch": 3.2, + "grad_norm": 2.828125, + "learning_rate": 4.3103135373749216e-07, + "loss": 0.7836, + "step": 2436 + }, + { + "epoch": 3.2, + "grad_norm": 2.8125, + "learning_rate": 4.2955997165995634e-07, + "loss": 0.7388, + "step": 2437 + }, + { + "epoch": 3.2, + "grad_norm": 2.859375, + "learning_rate": 4.2809086918305403e-07, + "loss": 0.7573, + "step": 2438 + }, + { + "epoch": 3.2, + "grad_norm": 2.84375, + "learning_rate": 4.266240479242997e-07, + "loss": 0.7542, + "step": 2439 + }, + { + "epoch": 3.2, + "grad_norm": 2.703125, + "learning_rate": 4.251595094986957e-07, + "loss": 0.7314, + "step": 2440 + }, + { + "epoch": 3.2, + "grad_norm": 2.796875, + "learning_rate": 4.236972555187319e-07, + "loss": 0.7696, + "step": 2441 + }, + { + "epoch": 3.2, + "grad_norm": 2.75, + "learning_rate": 4.222372875943831e-07, + "loss": 0.7625, + "step": 2442 + }, + { + "epoch": 3.21, + "grad_norm": 2.796875, + "learning_rate": 4.207796073331052e-07, + "loss": 0.754, + "step": 2443 + }, + { + "epoch": 3.21, + "grad_norm": 2.703125, + "learning_rate": 4.1932421633983726e-07, + "loss": 0.7264, + "step": 2444 + }, + { + "epoch": 3.21, + "grad_norm": 2.8125, + "learning_rate": 4.178711162169974e-07, + "loss": 0.7295, + "step": 2445 + }, + { + "epoch": 3.21, + "grad_norm": 2.8125, + "learning_rate": 4.1642030856448104e-07, + "loss": 0.7379, + "step": 2446 + }, + { + "epoch": 3.21, + "grad_norm": 2.75, + "learning_rate": 4.1497179497965915e-07, + "loss": 0.7712, + "step": 2447 + }, + { + "epoch": 3.21, + "grad_norm": 2.734375, + "learning_rate": 4.1352557705737957e-07, + "loss": 0.7512, + "step": 2448 + }, + { + "epoch": 3.21, + "grad_norm": 2.6875, + "learning_rate": 4.120816563899585e-07, + "loss": 0.7678, + "step": 2449 + }, + { + "epoch": 3.22, + "grad_norm": 2.734375, + "learning_rate": 4.1064003456718517e-07, + "loss": 0.7376, + "step": 2450 + }, + { + "epoch": 3.22, + "grad_norm": 2.75, + "learning_rate": 4.092007131763176e-07, + "loss": 0.7255, + "step": 2451 + }, + { + "epoch": 3.22, + "grad_norm": 2.671875, + "learning_rate": 4.077636938020807e-07, + "loss": 0.7056, + "step": 2452 + }, + { + "epoch": 3.22, + "grad_norm": 2.671875, + "learning_rate": 4.063289780266652e-07, + "loss": 0.7429, + "step": 2453 + }, + { + "epoch": 3.22, + "grad_norm": 2.859375, + "learning_rate": 4.048965674297245e-07, + "loss": 0.7441, + "step": 2454 + }, + { + "epoch": 3.22, + "grad_norm": 2.640625, + "learning_rate": 4.034664635883742e-07, + "loss": 0.6863, + "step": 2455 + }, + { + "epoch": 3.22, + "grad_norm": 2.6875, + "learning_rate": 4.0203866807719176e-07, + "loss": 0.7189, + "step": 2456 + }, + { + "epoch": 3.22, + "grad_norm": 2.8125, + "learning_rate": 4.0061318246821105e-07, + "loss": 0.7478, + "step": 2457 + }, + { + "epoch": 3.23, + "grad_norm": 2.78125, + "learning_rate": 3.991900083309241e-07, + "loss": 0.7417, + "step": 2458 + }, + { + "epoch": 3.23, + "grad_norm": 2.71875, + "learning_rate": 3.9776914723227753e-07, + "loss": 0.7149, + "step": 2459 + }, + { + "epoch": 3.23, + "grad_norm": 2.765625, + "learning_rate": 3.9635060073666993e-07, + "loss": 0.7243, + "step": 2460 + }, + { + "epoch": 3.23, + "grad_norm": 2.75, + "learning_rate": 3.9493437040595327e-07, + "loss": 0.7261, + "step": 2461 + }, + { + "epoch": 3.23, + "grad_norm": 2.890625, + "learning_rate": 3.935204577994289e-07, + "loss": 0.7668, + "step": 2462 + }, + { + "epoch": 3.23, + "grad_norm": 2.71875, + "learning_rate": 3.921088644738458e-07, + "loss": 0.7382, + "step": 2463 + }, + { + "epoch": 3.23, + "grad_norm": 2.75, + "learning_rate": 3.906995919833997e-07, + "loss": 0.743, + "step": 2464 + }, + { + "epoch": 3.24, + "grad_norm": 2.78125, + "learning_rate": 3.892926418797313e-07, + "loss": 0.7242, + "step": 2465 + }, + { + "epoch": 3.24, + "grad_norm": 2.765625, + "learning_rate": 3.878880157119241e-07, + "loss": 0.7465, + "step": 2466 + }, + { + "epoch": 3.24, + "grad_norm": 2.8125, + "learning_rate": 3.8648571502650253e-07, + "loss": 0.741, + "step": 2467 + }, + { + "epoch": 3.24, + "grad_norm": 2.75, + "learning_rate": 3.8508574136743085e-07, + "loss": 0.7638, + "step": 2468 + }, + { + "epoch": 3.24, + "grad_norm": 2.84375, + "learning_rate": 3.8368809627611135e-07, + "loss": 0.7375, + "step": 2469 + }, + { + "epoch": 3.24, + "grad_norm": 2.828125, + "learning_rate": 3.8229278129138293e-07, + "loss": 0.7364, + "step": 2470 + }, + { + "epoch": 3.24, + "grad_norm": 2.90625, + "learning_rate": 3.8089979794951825e-07, + "loss": 0.7208, + "step": 2471 + }, + { + "epoch": 3.24, + "grad_norm": 2.84375, + "learning_rate": 3.795091477842225e-07, + "loss": 0.7375, + "step": 2472 + }, + { + "epoch": 3.25, + "grad_norm": 2.765625, + "learning_rate": 3.7812083232663306e-07, + "loss": 0.7208, + "step": 2473 + }, + { + "epoch": 3.25, + "grad_norm": 2.859375, + "learning_rate": 3.7673485310531525e-07, + "loss": 0.7518, + "step": 2474 + }, + { + "epoch": 3.25, + "grad_norm": 2.796875, + "learning_rate": 3.753512116462643e-07, + "loss": 0.7345, + "step": 2475 + }, + { + "epoch": 3.25, + "grad_norm": 2.859375, + "learning_rate": 3.739699094729002e-07, + "loss": 0.7367, + "step": 2476 + }, + { + "epoch": 3.25, + "grad_norm": 2.75, + "learning_rate": 3.7259094810606745e-07, + "loss": 0.7352, + "step": 2477 + }, + { + "epoch": 3.25, + "grad_norm": 2.734375, + "learning_rate": 3.7121432906403244e-07, + "loss": 0.7244, + "step": 2478 + }, + { + "epoch": 3.25, + "grad_norm": 2.640625, + "learning_rate": 3.6984005386248385e-07, + "loss": 0.7126, + "step": 2479 + }, + { + "epoch": 3.26, + "grad_norm": 2.796875, + "learning_rate": 3.684681240145296e-07, + "loss": 0.7566, + "step": 2480 + }, + { + "epoch": 3.26, + "grad_norm": 2.6875, + "learning_rate": 3.6709854103069457e-07, + "loss": 0.7272, + "step": 2481 + }, + { + "epoch": 3.26, + "grad_norm": 2.796875, + "learning_rate": 3.6573130641892053e-07, + "loss": 0.7387, + "step": 2482 + }, + { + "epoch": 3.26, + "grad_norm": 2.8125, + "learning_rate": 3.6436642168456325e-07, + "loss": 0.7194, + "step": 2483 + }, + { + "epoch": 3.26, + "grad_norm": 2.859375, + "learning_rate": 3.63003888330391e-07, + "loss": 0.7669, + "step": 2484 + }, + { + "epoch": 3.26, + "grad_norm": 2.71875, + "learning_rate": 3.6164370785658346e-07, + "loss": 0.7403, + "step": 2485 + }, + { + "epoch": 3.26, + "grad_norm": 2.734375, + "learning_rate": 3.602858817607299e-07, + "loss": 0.739, + "step": 2486 + }, + { + "epoch": 3.26, + "grad_norm": 2.75, + "learning_rate": 3.5893041153782697e-07, + "loss": 0.7241, + "step": 2487 + }, + { + "epoch": 3.27, + "grad_norm": 2.75, + "learning_rate": 3.575772986802775e-07, + "loss": 0.7532, + "step": 2488 + }, + { + "epoch": 3.27, + "grad_norm": 2.75, + "learning_rate": 3.562265446778898e-07, + "loss": 0.7488, + "step": 2489 + }, + { + "epoch": 3.27, + "grad_norm": 2.75, + "learning_rate": 3.5487815101787253e-07, + "loss": 0.7359, + "step": 2490 + }, + { + "epoch": 3.27, + "grad_norm": 2.78125, + "learning_rate": 3.5353211918483813e-07, + "loss": 0.7444, + "step": 2491 + }, + { + "epoch": 3.27, + "grad_norm": 2.8125, + "learning_rate": 3.52188450660797e-07, + "loss": 0.7389, + "step": 2492 + }, + { + "epoch": 3.27, + "grad_norm": 2.71875, + "learning_rate": 3.50847146925159e-07, + "loss": 0.7365, + "step": 2493 + }, + { + "epoch": 3.27, + "grad_norm": 2.828125, + "learning_rate": 3.4950820945472945e-07, + "loss": 0.714, + "step": 2494 + }, + { + "epoch": 3.28, + "grad_norm": 2.78125, + "learning_rate": 3.481716397237084e-07, + "loss": 0.7675, + "step": 2495 + }, + { + "epoch": 3.28, + "grad_norm": 2.75, + "learning_rate": 3.46837439203688e-07, + "loss": 0.7285, + "step": 2496 + }, + { + "epoch": 3.28, + "grad_norm": 2.75, + "learning_rate": 3.455056093636533e-07, + "loss": 0.7343, + "step": 2497 + }, + { + "epoch": 3.28, + "grad_norm": 2.734375, + "learning_rate": 3.441761516699788e-07, + "loss": 0.7157, + "step": 2498 + }, + { + "epoch": 3.28, + "grad_norm": 2.703125, + "learning_rate": 3.4284906758642683e-07, + "loss": 0.726, + "step": 2499 + }, + { + "epoch": 3.28, + "grad_norm": 2.84375, + "learning_rate": 3.4152435857414676e-07, + "loss": 0.7455, + "step": 2500 + }, + { + "epoch": 3.28, + "grad_norm": 2.828125, + "learning_rate": 3.4020202609167245e-07, + "loss": 0.7319, + "step": 2501 + }, + { + "epoch": 3.28, + "grad_norm": 2.765625, + "learning_rate": 3.3888207159492177e-07, + "loss": 0.7075, + "step": 2502 + }, + { + "epoch": 3.29, + "grad_norm": 2.78125, + "learning_rate": 3.375644965371941e-07, + "loss": 0.7237, + "step": 2503 + }, + { + "epoch": 3.29, + "grad_norm": 2.984375, + "learning_rate": 3.362493023691685e-07, + "loss": 0.738, + "step": 2504 + }, + { + "epoch": 3.29, + "grad_norm": 2.734375, + "learning_rate": 3.3493649053890325e-07, + "loss": 0.7032, + "step": 2505 + }, + { + "epoch": 3.29, + "grad_norm": 2.8125, + "learning_rate": 3.3362606249183446e-07, + "loss": 0.744, + "step": 2506 + }, + { + "epoch": 3.29, + "grad_norm": 2.8125, + "learning_rate": 3.323180196707709e-07, + "loss": 0.7664, + "step": 2507 + }, + { + "epoch": 3.29, + "grad_norm": 2.765625, + "learning_rate": 3.3101236351589764e-07, + "loss": 0.7668, + "step": 2508 + }, + { + "epoch": 3.29, + "grad_norm": 2.78125, + "learning_rate": 3.297090954647714e-07, + "loss": 0.7308, + "step": 2509 + }, + { + "epoch": 3.3, + "grad_norm": 2.765625, + "learning_rate": 3.2840821695231867e-07, + "loss": 0.7502, + "step": 2510 + }, + { + "epoch": 3.3, + "grad_norm": 2.75, + "learning_rate": 3.271097294108369e-07, + "loss": 0.7618, + "step": 2511 + }, + { + "epoch": 3.3, + "grad_norm": 2.734375, + "learning_rate": 3.2581363426998966e-07, + "loss": 0.7359, + "step": 2512 + }, + { + "epoch": 3.3, + "grad_norm": 2.859375, + "learning_rate": 3.245199329568055e-07, + "loss": 0.7454, + "step": 2513 + }, + { + "epoch": 3.3, + "grad_norm": 2.796875, + "learning_rate": 3.2322862689567885e-07, + "loss": 0.7373, + "step": 2514 + }, + { + "epoch": 3.3, + "grad_norm": 2.8125, + "learning_rate": 3.219397175083669e-07, + "loss": 0.7672, + "step": 2515 + }, + { + "epoch": 3.3, + "grad_norm": 2.703125, + "learning_rate": 3.206532062139875e-07, + "loss": 0.7409, + "step": 2516 + }, + { + "epoch": 3.3, + "grad_norm": 2.71875, + "learning_rate": 3.193690944290179e-07, + "loss": 0.7164, + "step": 2517 + }, + { + "epoch": 3.31, + "grad_norm": 2.78125, + "learning_rate": 3.18087383567294e-07, + "loss": 0.7414, + "step": 2518 + }, + { + "epoch": 3.31, + "grad_norm": 2.6875, + "learning_rate": 3.168080750400082e-07, + "loss": 0.7373, + "step": 2519 + }, + { + "epoch": 3.31, + "grad_norm": 2.75, + "learning_rate": 3.1553117025570763e-07, + "loss": 0.7407, + "step": 2520 + }, + { + "epoch": 3.31, + "grad_norm": 2.765625, + "learning_rate": 3.142566706202929e-07, + "loss": 0.7237, + "step": 2521 + }, + { + "epoch": 3.31, + "grad_norm": 2.71875, + "learning_rate": 3.129845775370163e-07, + "loss": 0.7437, + "step": 2522 + }, + { + "epoch": 3.31, + "grad_norm": 2.765625, + "learning_rate": 3.117148924064811e-07, + "loss": 0.7347, + "step": 2523 + }, + { + "epoch": 3.31, + "grad_norm": 2.6875, + "learning_rate": 3.1044761662663933e-07, + "loss": 0.7156, + "step": 2524 + }, + { + "epoch": 3.32, + "grad_norm": 2.796875, + "learning_rate": 3.091827515927884e-07, + "loss": 0.7528, + "step": 2525 + }, + { + "epoch": 3.32, + "grad_norm": 2.75, + "learning_rate": 3.079202986975741e-07, + "loss": 0.7298, + "step": 2526 + }, + { + "epoch": 3.32, + "grad_norm": 2.828125, + "learning_rate": 3.0666025933098475e-07, + "loss": 0.7426, + "step": 2527 + }, + { + "epoch": 3.32, + "grad_norm": 2.765625, + "learning_rate": 3.054026348803521e-07, + "loss": 0.7318, + "step": 2528 + }, + { + "epoch": 3.32, + "grad_norm": 2.71875, + "learning_rate": 3.041474267303479e-07, + "loss": 0.7163, + "step": 2529 + }, + { + "epoch": 3.32, + "grad_norm": 2.828125, + "learning_rate": 3.0289463626298585e-07, + "loss": 0.7589, + "step": 2530 + }, + { + "epoch": 3.32, + "grad_norm": 2.765625, + "learning_rate": 3.016442648576151e-07, + "loss": 0.7362, + "step": 2531 + }, + { + "epoch": 3.32, + "grad_norm": 2.84375, + "learning_rate": 3.003963138909224e-07, + "loss": 0.7456, + "step": 2532 + }, + { + "epoch": 3.33, + "grad_norm": 2.71875, + "learning_rate": 2.991507847369299e-07, + "loss": 0.7276, + "step": 2533 + }, + { + "epoch": 3.33, + "grad_norm": 2.734375, + "learning_rate": 2.9790767876699293e-07, + "loss": 0.755, + "step": 2534 + }, + { + "epoch": 3.33, + "grad_norm": 2.84375, + "learning_rate": 2.9666699734979875e-07, + "loss": 0.7558, + "step": 2535 + }, + { + "epoch": 3.33, + "grad_norm": 2.75, + "learning_rate": 2.9542874185136545e-07, + "loss": 0.7497, + "step": 2536 + }, + { + "epoch": 3.33, + "grad_norm": 2.796875, + "learning_rate": 2.941929136350399e-07, + "loss": 0.7451, + "step": 2537 + }, + { + "epoch": 3.33, + "grad_norm": 2.796875, + "learning_rate": 2.929595140614963e-07, + "loss": 0.7497, + "step": 2538 + }, + { + "epoch": 3.33, + "grad_norm": 2.875, + "learning_rate": 2.917285444887355e-07, + "loss": 0.7591, + "step": 2539 + }, + { + "epoch": 3.33, + "grad_norm": 2.703125, + "learning_rate": 2.9050000627208195e-07, + "loss": 0.7396, + "step": 2540 + }, + { + "epoch": 3.34, + "grad_norm": 2.6875, + "learning_rate": 2.89273900764184e-07, + "loss": 0.7328, + "step": 2541 + }, + { + "epoch": 3.34, + "grad_norm": 2.78125, + "learning_rate": 2.880502293150117e-07, + "loss": 0.7459, + "step": 2542 + }, + { + "epoch": 3.34, + "grad_norm": 2.703125, + "learning_rate": 2.8682899327185293e-07, + "loss": 0.7186, + "step": 2543 + }, + { + "epoch": 3.34, + "grad_norm": 2.765625, + "learning_rate": 2.856101939793171e-07, + "loss": 0.729, + "step": 2544 + }, + { + "epoch": 3.34, + "grad_norm": 2.8125, + "learning_rate": 2.843938327793291e-07, + "loss": 0.7485, + "step": 2545 + }, + { + "epoch": 3.34, + "grad_norm": 2.6875, + "learning_rate": 2.8317991101112966e-07, + "loss": 0.7478, + "step": 2546 + }, + { + "epoch": 3.34, + "grad_norm": 2.828125, + "learning_rate": 2.819684300112732e-07, + "loss": 0.7725, + "step": 2547 + }, + { + "epoch": 3.35, + "grad_norm": 2.734375, + "learning_rate": 2.8075939111362915e-07, + "loss": 0.7284, + "step": 2548 + }, + { + "epoch": 3.35, + "grad_norm": 2.75, + "learning_rate": 2.7955279564937433e-07, + "loss": 0.7398, + "step": 2549 + }, + { + "epoch": 3.35, + "grad_norm": 2.875, + "learning_rate": 2.783486449469977e-07, + "loss": 0.7423, + "step": 2550 + }, + { + "epoch": 3.35, + "grad_norm": 2.75, + "learning_rate": 2.7714694033229656e-07, + "loss": 0.7381, + "step": 2551 + }, + { + "epoch": 3.35, + "grad_norm": 2.703125, + "learning_rate": 2.759476831283742e-07, + "loss": 0.7243, + "step": 2552 + }, + { + "epoch": 3.35, + "grad_norm": 2.765625, + "learning_rate": 2.747508746556396e-07, + "loss": 0.765, + "step": 2553 + }, + { + "epoch": 3.35, + "grad_norm": 2.9375, + "learning_rate": 2.7355651623180574e-07, + "loss": 0.7317, + "step": 2554 + }, + { + "epoch": 3.35, + "grad_norm": 2.796875, + "learning_rate": 2.723646091718868e-07, + "loss": 0.7361, + "step": 2555 + }, + { + "epoch": 3.36, + "grad_norm": 2.71875, + "learning_rate": 2.711751547882005e-07, + "loss": 0.7174, + "step": 2556 + }, + { + "epoch": 3.36, + "grad_norm": 2.765625, + "learning_rate": 2.699881543903618e-07, + "loss": 0.7314, + "step": 2557 + }, + { + "epoch": 3.36, + "grad_norm": 2.703125, + "learning_rate": 2.6880360928528497e-07, + "loss": 0.7426, + "step": 2558 + }, + { + "epoch": 3.36, + "grad_norm": 2.953125, + "learning_rate": 2.676215207771804e-07, + "loss": 0.7566, + "step": 2559 + }, + { + "epoch": 3.36, + "grad_norm": 2.84375, + "learning_rate": 2.6644189016755415e-07, + "loss": 0.7516, + "step": 2560 + }, + { + "epoch": 3.36, + "grad_norm": 2.84375, + "learning_rate": 2.652647187552054e-07, + "loss": 0.7347, + "step": 2561 + }, + { + "epoch": 3.36, + "grad_norm": 2.734375, + "learning_rate": 2.640900078362263e-07, + "loss": 0.7097, + "step": 2562 + }, + { + "epoch": 3.37, + "grad_norm": 2.734375, + "learning_rate": 2.629177587039997e-07, + "loss": 0.7516, + "step": 2563 + }, + { + "epoch": 3.37, + "grad_norm": 2.796875, + "learning_rate": 2.617479726491981e-07, + "loss": 0.7262, + "step": 2564 + }, + { + "epoch": 3.37, + "grad_norm": 2.796875, + "learning_rate": 2.605806509597819e-07, + "loss": 0.7511, + "step": 2565 + }, + { + "epoch": 3.37, + "grad_norm": 2.8125, + "learning_rate": 2.5941579492099853e-07, + "loss": 0.742, + "step": 2566 + }, + { + "epoch": 3.37, + "grad_norm": 2.71875, + "learning_rate": 2.5825340581538043e-07, + "loss": 0.7357, + "step": 2567 + }, + { + "epoch": 3.37, + "grad_norm": 2.8125, + "learning_rate": 2.5709348492274384e-07, + "loss": 0.7173, + "step": 2568 + }, + { + "epoch": 3.37, + "grad_norm": 2.796875, + "learning_rate": 2.559360335201874e-07, + "loss": 0.7532, + "step": 2569 + }, + { + "epoch": 3.37, + "grad_norm": 2.8125, + "learning_rate": 2.547810528820913e-07, + "loss": 0.7346, + "step": 2570 + }, + { + "epoch": 3.38, + "grad_norm": 2.765625, + "learning_rate": 2.536285442801148e-07, + "loss": 0.7426, + "step": 2571 + }, + { + "epoch": 3.38, + "grad_norm": 2.84375, + "learning_rate": 2.524785089831955e-07, + "loss": 0.7343, + "step": 2572 + }, + { + "epoch": 3.38, + "grad_norm": 2.75, + "learning_rate": 2.513309482575471e-07, + "loss": 0.7296, + "step": 2573 + }, + { + "epoch": 3.38, + "grad_norm": 2.8125, + "learning_rate": 2.5018586336666094e-07, + "loss": 0.7495, + "step": 2574 + }, + { + "epoch": 3.38, + "grad_norm": 2.75, + "learning_rate": 2.490432555713002e-07, + "loss": 0.7513, + "step": 2575 + }, + { + "epoch": 3.38, + "grad_norm": 2.796875, + "learning_rate": 2.47903126129502e-07, + "loss": 0.7371, + "step": 2576 + }, + { + "epoch": 3.38, + "grad_norm": 2.859375, + "learning_rate": 2.467654762965743e-07, + "loss": 0.7392, + "step": 2577 + }, + { + "epoch": 3.39, + "grad_norm": 2.84375, + "learning_rate": 2.456303073250943e-07, + "loss": 0.7331, + "step": 2578 + }, + { + "epoch": 3.39, + "grad_norm": 2.75, + "learning_rate": 2.4449762046490885e-07, + "loss": 0.7307, + "step": 2579 + }, + { + "epoch": 3.39, + "grad_norm": 2.8125, + "learning_rate": 2.4336741696313113e-07, + "loss": 0.7557, + "step": 2580 + }, + { + "epoch": 3.39, + "grad_norm": 2.75, + "learning_rate": 2.422396980641406e-07, + "loss": 0.722, + "step": 2581 + }, + { + "epoch": 3.39, + "grad_norm": 2.8125, + "learning_rate": 2.411144650095809e-07, + "loss": 0.7376, + "step": 2582 + }, + { + "epoch": 3.39, + "grad_norm": 2.734375, + "learning_rate": 2.3999171903835853e-07, + "loss": 0.7393, + "step": 2583 + }, + { + "epoch": 3.39, + "grad_norm": 2.734375, + "learning_rate": 2.388714613866422e-07, + "loss": 0.7223, + "step": 2584 + }, + { + "epoch": 3.39, + "grad_norm": 2.765625, + "learning_rate": 2.377536932878602e-07, + "loss": 0.7306, + "step": 2585 + }, + { + "epoch": 3.4, + "grad_norm": 2.75, + "learning_rate": 2.3663841597270032e-07, + "loss": 0.7416, + "step": 2586 + }, + { + "epoch": 3.4, + "grad_norm": 2.65625, + "learning_rate": 2.3552563066910756e-07, + "loss": 0.7443, + "step": 2587 + }, + { + "epoch": 3.4, + "grad_norm": 2.890625, + "learning_rate": 2.3441533860228376e-07, + "loss": 0.7527, + "step": 2588 + }, + { + "epoch": 3.4, + "grad_norm": 2.765625, + "learning_rate": 2.3330754099468528e-07, + "loss": 0.7381, + "step": 2589 + }, + { + "epoch": 3.4, + "grad_norm": 2.703125, + "learning_rate": 2.3220223906602113e-07, + "loss": 0.7393, + "step": 2590 + }, + { + "epoch": 3.4, + "grad_norm": 2.75, + "learning_rate": 2.3109943403325397e-07, + "loss": 0.7519, + "step": 2591 + }, + { + "epoch": 3.4, + "grad_norm": 2.78125, + "learning_rate": 2.299991271105964e-07, + "loss": 0.717, + "step": 2592 + }, + { + "epoch": 3.41, + "grad_norm": 2.703125, + "learning_rate": 2.289013195095116e-07, + "loss": 0.7463, + "step": 2593 + }, + { + "epoch": 3.41, + "grad_norm": 2.78125, + "learning_rate": 2.2780601243870964e-07, + "loss": 0.7392, + "step": 2594 + }, + { + "epoch": 3.41, + "grad_norm": 2.78125, + "learning_rate": 2.2671320710414861e-07, + "loss": 0.7651, + "step": 2595 + }, + { + "epoch": 3.41, + "grad_norm": 2.859375, + "learning_rate": 2.2562290470903082e-07, + "loss": 0.73, + "step": 2596 + }, + { + "epoch": 3.41, + "grad_norm": 2.75, + "learning_rate": 2.2453510645380344e-07, + "loss": 0.7186, + "step": 2597 + }, + { + "epoch": 3.41, + "grad_norm": 2.734375, + "learning_rate": 2.2344981353615737e-07, + "loss": 0.704, + "step": 2598 + }, + { + "epoch": 3.41, + "grad_norm": 2.796875, + "learning_rate": 2.2236702715102366e-07, + "loss": 0.7348, + "step": 2599 + }, + { + "epoch": 3.41, + "grad_norm": 2.671875, + "learning_rate": 2.212867484905745e-07, + "loss": 0.7304, + "step": 2600 + }, + { + "epoch": 3.42, + "grad_norm": 2.828125, + "learning_rate": 2.2020897874422076e-07, + "loss": 0.7573, + "step": 2601 + }, + { + "epoch": 3.42, + "grad_norm": 2.8125, + "learning_rate": 2.191337190986112e-07, + "loss": 0.7553, + "step": 2602 + }, + { + "epoch": 3.42, + "grad_norm": 2.734375, + "learning_rate": 2.180609707376302e-07, + "loss": 0.7278, + "step": 2603 + }, + { + "epoch": 3.42, + "grad_norm": 2.921875, + "learning_rate": 2.16990734842398e-07, + "loss": 0.7378, + "step": 2604 + }, + { + "epoch": 3.42, + "grad_norm": 2.703125, + "learning_rate": 2.1592301259126787e-07, + "loss": 0.718, + "step": 2605 + }, + { + "epoch": 3.42, + "grad_norm": 2.734375, + "learning_rate": 2.148578051598263e-07, + "loss": 0.7515, + "step": 2606 + }, + { + "epoch": 3.42, + "grad_norm": 2.765625, + "learning_rate": 2.1379511372089018e-07, + "loss": 0.7162, + "step": 2607 + }, + { + "epoch": 3.43, + "grad_norm": 2.78125, + "learning_rate": 2.1273493944450634e-07, + "loss": 0.7593, + "step": 2608 + }, + { + "epoch": 3.43, + "grad_norm": 2.78125, + "learning_rate": 2.1167728349795025e-07, + "loss": 0.7483, + "step": 2609 + }, + { + "epoch": 3.43, + "grad_norm": 2.75, + "learning_rate": 2.1062214704572426e-07, + "loss": 0.7329, + "step": 2610 + }, + { + "epoch": 3.43, + "grad_norm": 2.765625, + "learning_rate": 2.09569531249558e-07, + "loss": 0.73, + "step": 2611 + }, + { + "epoch": 3.43, + "grad_norm": 2.78125, + "learning_rate": 2.085194372684049e-07, + "loss": 0.7422, + "step": 2612 + }, + { + "epoch": 3.43, + "grad_norm": 2.65625, + "learning_rate": 2.0747186625844183e-07, + "loss": 0.7077, + "step": 2613 + }, + { + "epoch": 3.43, + "grad_norm": 2.734375, + "learning_rate": 2.06426819373067e-07, + "loss": 0.7269, + "step": 2614 + }, + { + "epoch": 3.43, + "grad_norm": 2.78125, + "learning_rate": 2.0538429776290114e-07, + "loss": 0.7389, + "step": 2615 + }, + { + "epoch": 3.44, + "grad_norm": 2.71875, + "learning_rate": 2.0434430257578325e-07, + "loss": 0.7308, + "step": 2616 + }, + { + "epoch": 3.44, + "grad_norm": 2.84375, + "learning_rate": 2.0330683495677182e-07, + "loss": 0.7502, + "step": 2617 + }, + { + "epoch": 3.44, + "grad_norm": 2.734375, + "learning_rate": 2.0227189604814124e-07, + "loss": 0.7286, + "step": 2618 + }, + { + "epoch": 3.44, + "grad_norm": 2.796875, + "learning_rate": 2.0123948698938272e-07, + "loss": 0.7527, + "step": 2619 + }, + { + "epoch": 3.44, + "grad_norm": 2.75, + "learning_rate": 2.0020960891720147e-07, + "loss": 0.7329, + "step": 2620 + }, + { + "epoch": 3.44, + "grad_norm": 2.734375, + "learning_rate": 1.991822629655163e-07, + "loss": 0.7064, + "step": 2621 + }, + { + "epoch": 3.44, + "grad_norm": 2.734375, + "learning_rate": 1.981574502654579e-07, + "loss": 0.7558, + "step": 2622 + }, + { + "epoch": 3.45, + "grad_norm": 2.78125, + "learning_rate": 1.971351719453679e-07, + "loss": 0.7569, + "step": 2623 + }, + { + "epoch": 3.45, + "grad_norm": 2.71875, + "learning_rate": 1.9611542913079817e-07, + "loss": 0.7268, + "step": 2624 + }, + { + "epoch": 3.45, + "grad_norm": 2.78125, + "learning_rate": 1.9509822294450725e-07, + "loss": 0.7576, + "step": 2625 + }, + { + "epoch": 3.45, + "grad_norm": 2.828125, + "learning_rate": 1.9408355450646234e-07, + "loss": 0.7543, + "step": 2626 + }, + { + "epoch": 3.45, + "grad_norm": 2.75, + "learning_rate": 1.9307142493383585e-07, + "loss": 0.7103, + "step": 2627 + }, + { + "epoch": 3.45, + "grad_norm": 2.734375, + "learning_rate": 1.9206183534100474e-07, + "loss": 0.7038, + "step": 2628 + }, + { + "epoch": 3.45, + "grad_norm": 2.796875, + "learning_rate": 1.9105478683955042e-07, + "loss": 0.7318, + "step": 2629 + }, + { + "epoch": 3.45, + "grad_norm": 2.703125, + "learning_rate": 1.9005028053825592e-07, + "loss": 0.7094, + "step": 2630 + }, + { + "epoch": 3.46, + "grad_norm": 2.75, + "learning_rate": 1.8904831754310376e-07, + "loss": 0.7539, + "step": 2631 + }, + { + "epoch": 3.46, + "grad_norm": 2.78125, + "learning_rate": 1.8804889895727872e-07, + "loss": 0.7186, + "step": 2632 + }, + { + "epoch": 3.46, + "grad_norm": 2.75, + "learning_rate": 1.8705202588116227e-07, + "loss": 0.744, + "step": 2633 + }, + { + "epoch": 3.46, + "grad_norm": 2.9375, + "learning_rate": 1.8605769941233421e-07, + "loss": 0.7349, + "step": 2634 + }, + { + "epoch": 3.46, + "grad_norm": 2.71875, + "learning_rate": 1.850659206455699e-07, + "loss": 0.7295, + "step": 2635 + }, + { + "epoch": 3.46, + "grad_norm": 2.703125, + "learning_rate": 1.8407669067284028e-07, + "loss": 0.7094, + "step": 2636 + }, + { + "epoch": 3.46, + "grad_norm": 2.953125, + "learning_rate": 1.8309001058330944e-07, + "loss": 0.7257, + "step": 2637 + }, + { + "epoch": 3.47, + "grad_norm": 2.921875, + "learning_rate": 1.821058814633339e-07, + "loss": 0.7217, + "step": 2638 + }, + { + "epoch": 3.47, + "grad_norm": 2.765625, + "learning_rate": 1.8112430439646194e-07, + "loss": 0.7139, + "step": 2639 + }, + { + "epoch": 3.47, + "grad_norm": 2.828125, + "learning_rate": 1.8014528046343183e-07, + "loss": 0.7362, + "step": 2640 + }, + { + "epoch": 3.47, + "grad_norm": 2.765625, + "learning_rate": 1.7916881074217102e-07, + "loss": 0.7666, + "step": 2641 + }, + { + "epoch": 3.47, + "grad_norm": 2.96875, + "learning_rate": 1.7819489630779423e-07, + "loss": 0.7341, + "step": 2642 + }, + { + "epoch": 3.47, + "grad_norm": 2.71875, + "learning_rate": 1.7722353823260262e-07, + "loss": 0.7503, + "step": 2643 + }, + { + "epoch": 3.47, + "grad_norm": 2.75, + "learning_rate": 1.762547375860832e-07, + "loss": 0.7234, + "step": 2644 + }, + { + "epoch": 3.47, + "grad_norm": 2.734375, + "learning_rate": 1.752884954349074e-07, + "loss": 0.759, + "step": 2645 + }, + { + "epoch": 3.48, + "grad_norm": 2.75, + "learning_rate": 1.7432481284292923e-07, + "loss": 0.7275, + "step": 2646 + }, + { + "epoch": 3.48, + "grad_norm": 2.75, + "learning_rate": 1.7336369087118443e-07, + "loss": 0.7429, + "step": 2647 + }, + { + "epoch": 3.48, + "grad_norm": 2.78125, + "learning_rate": 1.7240513057789065e-07, + "loss": 0.7283, + "step": 2648 + }, + { + "epoch": 3.48, + "grad_norm": 2.75, + "learning_rate": 1.7144913301844313e-07, + "loss": 0.7485, + "step": 2649 + }, + { + "epoch": 3.48, + "grad_norm": 2.78125, + "learning_rate": 1.7049569924541653e-07, + "loss": 0.7214, + "step": 2650 + }, + { + "epoch": 3.48, + "grad_norm": 2.71875, + "learning_rate": 1.695448303085634e-07, + "loss": 0.7184, + "step": 2651 + }, + { + "epoch": 3.48, + "grad_norm": 2.75, + "learning_rate": 1.6859652725481095e-07, + "loss": 0.7651, + "step": 2652 + }, + { + "epoch": 3.49, + "grad_norm": 2.734375, + "learning_rate": 1.6765079112826266e-07, + "loss": 0.7198, + "step": 2653 + }, + { + "epoch": 3.49, + "grad_norm": 2.78125, + "learning_rate": 1.6670762297019444e-07, + "loss": 0.7473, + "step": 2654 + }, + { + "epoch": 3.49, + "grad_norm": 2.71875, + "learning_rate": 1.657670238190559e-07, + "loss": 0.7111, + "step": 2655 + }, + { + "epoch": 3.49, + "grad_norm": 2.765625, + "learning_rate": 1.6482899471046726e-07, + "loss": 0.7149, + "step": 2656 + }, + { + "epoch": 3.49, + "grad_norm": 2.890625, + "learning_rate": 1.6389353667721984e-07, + "loss": 0.7794, + "step": 2657 + }, + { + "epoch": 3.49, + "grad_norm": 2.78125, + "learning_rate": 1.629606507492737e-07, + "loss": 0.751, + "step": 2658 + }, + { + "epoch": 3.49, + "grad_norm": 2.75, + "learning_rate": 1.6203033795375717e-07, + "loss": 0.739, + "step": 2659 + }, + { + "epoch": 3.49, + "grad_norm": 2.75, + "learning_rate": 1.6110259931496564e-07, + "loss": 0.7302, + "step": 2660 + }, + { + "epoch": 3.5, + "grad_norm": 2.75, + "learning_rate": 1.6017743585435897e-07, + "loss": 0.7629, + "step": 2661 + }, + { + "epoch": 3.5, + "grad_norm": 2.796875, + "learning_rate": 1.5925484859056372e-07, + "loss": 0.7414, + "step": 2662 + }, + { + "epoch": 3.5, + "grad_norm": 2.890625, + "learning_rate": 1.583348385393685e-07, + "loss": 0.7227, + "step": 2663 + }, + { + "epoch": 3.5, + "grad_norm": 2.703125, + "learning_rate": 1.5741740671372548e-07, + "loss": 0.7169, + "step": 2664 + }, + { + "epoch": 3.5, + "grad_norm": 2.765625, + "learning_rate": 1.5650255412374638e-07, + "loss": 0.7348, + "step": 2665 + }, + { + "epoch": 3.5, + "grad_norm": 2.703125, + "learning_rate": 1.555902817767063e-07, + "loss": 0.71, + "step": 2666 + }, + { + "epoch": 3.5, + "grad_norm": 2.75, + "learning_rate": 1.5468059067703571e-07, + "loss": 0.7191, + "step": 2667 + }, + { + "epoch": 3.51, + "grad_norm": 2.78125, + "learning_rate": 1.5377348182632536e-07, + "loss": 0.7435, + "step": 2668 + }, + { + "epoch": 3.51, + "grad_norm": 2.75, + "learning_rate": 1.528689562233221e-07, + "loss": 0.7107, + "step": 2669 + }, + { + "epoch": 3.51, + "grad_norm": 2.796875, + "learning_rate": 1.5196701486392906e-07, + "loss": 0.7216, + "step": 2670 + }, + { + "epoch": 3.51, + "grad_norm": 2.75, + "learning_rate": 1.5106765874120382e-07, + "loss": 0.7617, + "step": 2671 + }, + { + "epoch": 3.51, + "grad_norm": 2.75, + "learning_rate": 1.501708888453579e-07, + "loss": 0.7454, + "step": 2672 + }, + { + "epoch": 3.51, + "grad_norm": 2.859375, + "learning_rate": 1.4927670616375345e-07, + "loss": 0.7424, + "step": 2673 + }, + { + "epoch": 3.51, + "grad_norm": 3.09375, + "learning_rate": 1.4838511168090707e-07, + "loss": 0.7219, + "step": 2674 + }, + { + "epoch": 3.51, + "grad_norm": 2.828125, + "learning_rate": 1.4749610637848383e-07, + "loss": 0.7569, + "step": 2675 + }, + { + "epoch": 3.52, + "grad_norm": 2.828125, + "learning_rate": 1.4660969123529795e-07, + "loss": 0.7219, + "step": 2676 + }, + { + "epoch": 3.52, + "grad_norm": 2.796875, + "learning_rate": 1.4572586722731292e-07, + "loss": 0.7286, + "step": 2677 + }, + { + "epoch": 3.52, + "grad_norm": 2.75, + "learning_rate": 1.4484463532763809e-07, + "loss": 0.729, + "step": 2678 + }, + { + "epoch": 3.52, + "grad_norm": 2.65625, + "learning_rate": 1.4396599650652959e-07, + "loss": 0.7401, + "step": 2679 + }, + { + "epoch": 3.52, + "grad_norm": 2.71875, + "learning_rate": 1.4308995173138828e-07, + "loss": 0.7234, + "step": 2680 + }, + { + "epoch": 3.52, + "grad_norm": 2.828125, + "learning_rate": 1.4221650196675901e-07, + "loss": 0.7358, + "step": 2681 + }, + { + "epoch": 3.52, + "grad_norm": 2.796875, + "learning_rate": 1.4134564817432972e-07, + "loss": 0.7345, + "step": 2682 + }, + { + "epoch": 3.53, + "grad_norm": 2.890625, + "learning_rate": 1.4047739131292953e-07, + "loss": 0.758, + "step": 2683 + }, + { + "epoch": 3.53, + "grad_norm": 2.796875, + "learning_rate": 1.3961173233852876e-07, + "loss": 0.7417, + "step": 2684 + }, + { + "epoch": 3.53, + "grad_norm": 2.78125, + "learning_rate": 1.3874867220423721e-07, + "loss": 0.72, + "step": 2685 + }, + { + "epoch": 3.53, + "grad_norm": 2.84375, + "learning_rate": 1.3788821186030338e-07, + "loss": 0.7787, + "step": 2686 + }, + { + "epoch": 3.53, + "grad_norm": 2.703125, + "learning_rate": 1.370303522541133e-07, + "loss": 0.7291, + "step": 2687 + }, + { + "epoch": 3.53, + "grad_norm": 2.71875, + "learning_rate": 1.3617509433018928e-07, + "loss": 0.7344, + "step": 2688 + }, + { + "epoch": 3.53, + "grad_norm": 2.71875, + "learning_rate": 1.3532243903018993e-07, + "loss": 0.7444, + "step": 2689 + }, + { + "epoch": 3.53, + "grad_norm": 2.703125, + "learning_rate": 1.3447238729290713e-07, + "loss": 0.7318, + "step": 2690 + }, + { + "epoch": 3.54, + "grad_norm": 2.703125, + "learning_rate": 1.3362494005426662e-07, + "loss": 0.7339, + "step": 2691 + }, + { + "epoch": 3.54, + "grad_norm": 2.78125, + "learning_rate": 1.3278009824732763e-07, + "loss": 0.748, + "step": 2692 + }, + { + "epoch": 3.54, + "grad_norm": 2.8125, + "learning_rate": 1.3193786280227912e-07, + "loss": 0.7342, + "step": 2693 + }, + { + "epoch": 3.54, + "grad_norm": 2.8125, + "learning_rate": 1.310982346464415e-07, + "loss": 0.7375, + "step": 2694 + }, + { + "epoch": 3.54, + "grad_norm": 2.796875, + "learning_rate": 1.3026121470426394e-07, + "loss": 0.7616, + "step": 2695 + }, + { + "epoch": 3.54, + "grad_norm": 2.6875, + "learning_rate": 1.2942680389732398e-07, + "loss": 0.7177, + "step": 2696 + }, + { + "epoch": 3.54, + "grad_norm": 2.65625, + "learning_rate": 1.2859500314432616e-07, + "loss": 0.7198, + "step": 2697 + }, + { + "epoch": 3.55, + "grad_norm": 2.84375, + "learning_rate": 1.2776581336110234e-07, + "loss": 0.7457, + "step": 2698 + }, + { + "epoch": 3.55, + "grad_norm": 2.796875, + "learning_rate": 1.269392354606086e-07, + "loss": 0.7488, + "step": 2699 + }, + { + "epoch": 3.55, + "grad_norm": 2.828125, + "learning_rate": 1.2611527035292558e-07, + "loss": 0.7396, + "step": 2700 + }, + { + "epoch": 3.55, + "grad_norm": 2.84375, + "learning_rate": 1.2529391894525733e-07, + "loss": 0.7652, + "step": 2701 + }, + { + "epoch": 3.55, + "grad_norm": 2.84375, + "learning_rate": 1.2447518214193017e-07, + "loss": 0.7386, + "step": 2702 + }, + { + "epoch": 3.55, + "grad_norm": 2.859375, + "learning_rate": 1.2365906084439133e-07, + "loss": 0.7597, + "step": 2703 + }, + { + "epoch": 3.55, + "grad_norm": 2.78125, + "learning_rate": 1.2284555595120901e-07, + "loss": 0.7328, + "step": 2704 + }, + { + "epoch": 3.55, + "grad_norm": 2.75, + "learning_rate": 1.2203466835806977e-07, + "loss": 0.7354, + "step": 2705 + }, + { + "epoch": 3.56, + "grad_norm": 2.734375, + "learning_rate": 1.2122639895777887e-07, + "loss": 0.7442, + "step": 2706 + }, + { + "epoch": 3.56, + "grad_norm": 2.71875, + "learning_rate": 1.2042074864025977e-07, + "loss": 0.7365, + "step": 2707 + }, + { + "epoch": 3.56, + "grad_norm": 2.84375, + "learning_rate": 1.1961771829255037e-07, + "loss": 0.7599, + "step": 2708 + }, + { + "epoch": 3.56, + "grad_norm": 2.75, + "learning_rate": 1.1881730879880565e-07, + "loss": 0.751, + "step": 2709 + }, + { + "epoch": 3.56, + "grad_norm": 2.671875, + "learning_rate": 1.1801952104029347e-07, + "loss": 0.713, + "step": 2710 + }, + { + "epoch": 3.56, + "grad_norm": 2.78125, + "learning_rate": 1.1722435589539699e-07, + "loss": 0.7472, + "step": 2711 + }, + { + "epoch": 3.56, + "grad_norm": 2.734375, + "learning_rate": 1.1643181423961037e-07, + "loss": 0.7608, + "step": 2712 + }, + { + "epoch": 3.57, + "grad_norm": 2.796875, + "learning_rate": 1.1564189694554001e-07, + "loss": 0.7604, + "step": 2713 + }, + { + "epoch": 3.57, + "grad_norm": 2.796875, + "learning_rate": 1.1485460488290163e-07, + "loss": 0.7267, + "step": 2714 + }, + { + "epoch": 3.57, + "grad_norm": 2.734375, + "learning_rate": 1.140699389185218e-07, + "loss": 0.7258, + "step": 2715 + }, + { + "epoch": 3.57, + "grad_norm": 2.734375, + "learning_rate": 1.1328789991633532e-07, + "loss": 0.7486, + "step": 2716 + }, + { + "epoch": 3.57, + "grad_norm": 2.890625, + "learning_rate": 1.1250848873738452e-07, + "loss": 0.7492, + "step": 2717 + }, + { + "epoch": 3.57, + "grad_norm": 2.765625, + "learning_rate": 1.1173170623981822e-07, + "loss": 0.7401, + "step": 2718 + }, + { + "epoch": 3.57, + "grad_norm": 2.765625, + "learning_rate": 1.1095755327889146e-07, + "loss": 0.7704, + "step": 2719 + }, + { + "epoch": 3.57, + "grad_norm": 2.703125, + "learning_rate": 1.1018603070696409e-07, + "loss": 0.7402, + "step": 2720 + }, + { + "epoch": 3.58, + "grad_norm": 2.71875, + "learning_rate": 1.094171393734994e-07, + "loss": 0.7158, + "step": 2721 + }, + { + "epoch": 3.58, + "grad_norm": 2.8125, + "learning_rate": 1.0865088012506408e-07, + "loss": 0.7411, + "step": 2722 + }, + { + "epoch": 3.58, + "grad_norm": 2.734375, + "learning_rate": 1.078872538053266e-07, + "loss": 0.7364, + "step": 2723 + }, + { + "epoch": 3.58, + "grad_norm": 2.84375, + "learning_rate": 1.0712626125505665e-07, + "loss": 0.7643, + "step": 2724 + }, + { + "epoch": 3.58, + "grad_norm": 2.78125, + "learning_rate": 1.0636790331212399e-07, + "loss": 0.727, + "step": 2725 + }, + { + "epoch": 3.58, + "grad_norm": 2.75, + "learning_rate": 1.0561218081149711e-07, + "loss": 0.7175, + "step": 2726 + }, + { + "epoch": 3.58, + "grad_norm": 2.875, + "learning_rate": 1.0485909458524402e-07, + "loss": 0.7593, + "step": 2727 + }, + { + "epoch": 3.59, + "grad_norm": 2.796875, + "learning_rate": 1.0410864546252841e-07, + "loss": 0.7395, + "step": 2728 + }, + { + "epoch": 3.59, + "grad_norm": 2.59375, + "learning_rate": 1.0336083426961296e-07, + "loss": 0.6924, + "step": 2729 + }, + { + "epoch": 3.59, + "grad_norm": 2.859375, + "learning_rate": 1.0261566182985378e-07, + "loss": 0.7466, + "step": 2730 + }, + { + "epoch": 3.59, + "grad_norm": 2.75, + "learning_rate": 1.0187312896370233e-07, + "loss": 0.727, + "step": 2731 + }, + { + "epoch": 3.59, + "grad_norm": 2.796875, + "learning_rate": 1.0113323648870354e-07, + "loss": 0.7462, + "step": 2732 + }, + { + "epoch": 3.59, + "grad_norm": 2.859375, + "learning_rate": 1.0039598521949578e-07, + "loss": 0.7598, + "step": 2733 + }, + { + "epoch": 3.59, + "grad_norm": 2.796875, + "learning_rate": 9.966137596780945e-08, + "loss": 0.7507, + "step": 2734 + }, + { + "epoch": 3.59, + "grad_norm": 2.75, + "learning_rate": 9.892940954246532e-08, + "loss": 0.7347, + "step": 2735 + }, + { + "epoch": 3.6, + "grad_norm": 2.8125, + "learning_rate": 9.820008674937459e-08, + "loss": 0.7634, + "step": 2736 + }, + { + "epoch": 3.6, + "grad_norm": 2.78125, + "learning_rate": 9.747340839153852e-08, + "loss": 0.7517, + "step": 2737 + }, + { + "epoch": 3.6, + "grad_norm": 2.75, + "learning_rate": 9.674937526904571e-08, + "loss": 0.7389, + "step": 2738 + }, + { + "epoch": 3.6, + "grad_norm": 2.828125, + "learning_rate": 9.602798817907294e-08, + "loss": 0.7393, + "step": 2739 + }, + { + "epoch": 3.6, + "grad_norm": 2.796875, + "learning_rate": 9.530924791588319e-08, + "loss": 0.7612, + "step": 2740 + }, + { + "epoch": 3.6, + "grad_norm": 2.734375, + "learning_rate": 9.459315527082624e-08, + "loss": 0.7261, + "step": 2741 + }, + { + "epoch": 3.6, + "grad_norm": 2.78125, + "learning_rate": 9.387971103233556e-08, + "loss": 0.7272, + "step": 2742 + }, + { + "epoch": 3.61, + "grad_norm": 2.703125, + "learning_rate": 9.316891598592892e-08, + "loss": 0.7141, + "step": 2743 + }, + { + "epoch": 3.61, + "grad_norm": 2.765625, + "learning_rate": 9.24607709142078e-08, + "loss": 0.7343, + "step": 2744 + }, + { + "epoch": 3.61, + "grad_norm": 2.78125, + "learning_rate": 9.175527659685573e-08, + "loss": 0.7291, + "step": 2745 + }, + { + "epoch": 3.61, + "grad_norm": 2.703125, + "learning_rate": 9.10524338106375e-08, + "loss": 0.6963, + "step": 2746 + }, + { + "epoch": 3.61, + "grad_norm": 2.734375, + "learning_rate": 9.035224332939879e-08, + "loss": 0.7264, + "step": 2747 + }, + { + "epoch": 3.61, + "grad_norm": 2.828125, + "learning_rate": 8.965470592406544e-08, + "loss": 0.758, + "step": 2748 + }, + { + "epoch": 3.61, + "grad_norm": 2.734375, + "learning_rate": 8.895982236264117e-08, + "loss": 0.7443, + "step": 2749 + }, + { + "epoch": 3.61, + "grad_norm": 2.78125, + "learning_rate": 8.826759341020813e-08, + "loss": 0.7427, + "step": 2750 + }, + { + "epoch": 3.62, + "grad_norm": 2.84375, + "learning_rate": 8.757801982892583e-08, + "loss": 0.7344, + "step": 2751 + }, + { + "epoch": 3.62, + "grad_norm": 2.75, + "learning_rate": 8.689110237803056e-08, + "loss": 0.7442, + "step": 2752 + }, + { + "epoch": 3.62, + "grad_norm": 2.765625, + "learning_rate": 8.620684181383343e-08, + "loss": 0.7205, + "step": 2753 + }, + { + "epoch": 3.62, + "grad_norm": 2.765625, + "learning_rate": 8.552523888972097e-08, + "loss": 0.7302, + "step": 2754 + }, + { + "epoch": 3.62, + "grad_norm": 2.75, + "learning_rate": 8.48462943561526e-08, + "loss": 0.745, + "step": 2755 + }, + { + "epoch": 3.62, + "grad_norm": 2.75, + "learning_rate": 8.417000896066201e-08, + "loss": 0.7418, + "step": 2756 + }, + { + "epoch": 3.62, + "grad_norm": 2.734375, + "learning_rate": 8.349638344785388e-08, + "loss": 0.7604, + "step": 2757 + }, + { + "epoch": 3.63, + "grad_norm": 2.84375, + "learning_rate": 8.282541855940546e-08, + "loss": 0.7529, + "step": 2758 + }, + { + "epoch": 3.63, + "grad_norm": 2.796875, + "learning_rate": 8.21571150340636e-08, + "loss": 0.7441, + "step": 2759 + }, + { + "epoch": 3.63, + "grad_norm": 2.796875, + "learning_rate": 8.149147360764609e-08, + "loss": 0.7247, + "step": 2760 + }, + { + "epoch": 3.63, + "grad_norm": 2.796875, + "learning_rate": 8.082849501303836e-08, + "loss": 0.7345, + "step": 2761 + }, + { + "epoch": 3.63, + "grad_norm": 2.71875, + "learning_rate": 8.016817998019455e-08, + "loss": 0.7331, + "step": 2762 + }, + { + "epoch": 3.63, + "grad_norm": 2.75, + "learning_rate": 7.951052923613644e-08, + "loss": 0.726, + "step": 2763 + }, + { + "epoch": 3.63, + "grad_norm": 2.78125, + "learning_rate": 7.885554350495206e-08, + "loss": 0.7535, + "step": 2764 + }, + { + "epoch": 3.63, + "grad_norm": 2.78125, + "learning_rate": 7.820322350779536e-08, + "loss": 0.7317, + "step": 2765 + }, + { + "epoch": 3.64, + "grad_norm": 2.6875, + "learning_rate": 7.755356996288571e-08, + "loss": 0.7013, + "step": 2766 + }, + { + "epoch": 3.64, + "grad_norm": 2.78125, + "learning_rate": 7.69065835855054e-08, + "loss": 0.7384, + "step": 2767 + }, + { + "epoch": 3.64, + "grad_norm": 2.71875, + "learning_rate": 7.626226508800128e-08, + "loss": 0.7478, + "step": 2768 + }, + { + "epoch": 3.64, + "grad_norm": 2.828125, + "learning_rate": 7.562061517978198e-08, + "loss": 0.7591, + "step": 2769 + }, + { + "epoch": 3.64, + "grad_norm": 2.796875, + "learning_rate": 7.498163456731878e-08, + "loss": 0.7609, + "step": 2770 + }, + { + "epoch": 3.64, + "grad_norm": 2.78125, + "learning_rate": 7.434532395414335e-08, + "loss": 0.7136, + "step": 2771 + }, + { + "epoch": 3.64, + "grad_norm": 2.765625, + "learning_rate": 7.371168404084806e-08, + "loss": 0.7357, + "step": 2772 + }, + { + "epoch": 3.65, + "grad_norm": 2.78125, + "learning_rate": 7.308071552508455e-08, + "loss": 0.7532, + "step": 2773 + }, + { + "epoch": 3.65, + "grad_norm": 2.765625, + "learning_rate": 7.245241910156297e-08, + "loss": 0.7328, + "step": 2774 + }, + { + "epoch": 3.65, + "grad_norm": 2.765625, + "learning_rate": 7.18267954620519e-08, + "loss": 0.7677, + "step": 2775 + }, + { + "epoch": 3.65, + "grad_norm": 2.78125, + "learning_rate": 7.120384529537672e-08, + "loss": 0.7534, + "step": 2776 + }, + { + "epoch": 3.65, + "grad_norm": 2.859375, + "learning_rate": 7.058356928741988e-08, + "loss": 0.7279, + "step": 2777 + }, + { + "epoch": 3.65, + "grad_norm": 2.703125, + "learning_rate": 6.996596812111872e-08, + "loss": 0.7258, + "step": 2778 + }, + { + "epoch": 3.65, + "grad_norm": 2.765625, + "learning_rate": 6.935104247646596e-08, + "loss": 0.7477, + "step": 2779 + }, + { + "epoch": 3.65, + "grad_norm": 2.703125, + "learning_rate": 6.873879303050863e-08, + "loss": 0.7258, + "step": 2780 + }, + { + "epoch": 3.66, + "grad_norm": 2.859375, + "learning_rate": 6.812922045734666e-08, + "loss": 0.7537, + "step": 2781 + }, + { + "epoch": 3.66, + "grad_norm": 2.796875, + "learning_rate": 6.752232542813319e-08, + "loss": 0.7369, + "step": 2782 + }, + { + "epoch": 3.66, + "grad_norm": 2.78125, + "learning_rate": 6.691810861107318e-08, + "loss": 0.7418, + "step": 2783 + }, + { + "epoch": 3.66, + "grad_norm": 2.78125, + "learning_rate": 6.631657067142306e-08, + "loss": 0.7394, + "step": 2784 + }, + { + "epoch": 3.66, + "grad_norm": 2.75, + "learning_rate": 6.571771227148943e-08, + "loss": 0.7529, + "step": 2785 + }, + { + "epoch": 3.66, + "grad_norm": 2.765625, + "learning_rate": 6.512153407062849e-08, + "loss": 0.7551, + "step": 2786 + }, + { + "epoch": 3.66, + "grad_norm": 2.828125, + "learning_rate": 6.452803672524571e-08, + "loss": 0.746, + "step": 2787 + }, + { + "epoch": 3.67, + "grad_norm": 2.78125, + "learning_rate": 6.393722088879534e-08, + "loss": 0.7208, + "step": 2788 + }, + { + "epoch": 3.67, + "grad_norm": 2.8125, + "learning_rate": 6.334908721177868e-08, + "loss": 0.7459, + "step": 2789 + }, + { + "epoch": 3.67, + "grad_norm": 2.734375, + "learning_rate": 6.276363634174359e-08, + "loss": 0.7333, + "step": 2790 + }, + { + "epoch": 3.67, + "grad_norm": 2.8125, + "learning_rate": 6.218086892328446e-08, + "loss": 0.7524, + "step": 2791 + }, + { + "epoch": 3.67, + "grad_norm": 2.75, + "learning_rate": 6.160078559804189e-08, + "loss": 0.7351, + "step": 2792 + }, + { + "epoch": 3.67, + "grad_norm": 2.75, + "learning_rate": 6.102338700470028e-08, + "loss": 0.7482, + "step": 2793 + }, + { + "epoch": 3.67, + "grad_norm": 2.78125, + "learning_rate": 6.044867377898806e-08, + "loss": 0.7403, + "step": 2794 + }, + { + "epoch": 3.67, + "grad_norm": 2.765625, + "learning_rate": 5.987664655367792e-08, + "loss": 0.7417, + "step": 2795 + }, + { + "epoch": 3.68, + "grad_norm": 2.671875, + "learning_rate": 5.930730595858414e-08, + "loss": 0.7322, + "step": 2796 + }, + { + "epoch": 3.68, + "grad_norm": 2.828125, + "learning_rate": 5.8740652620563334e-08, + "loss": 0.7684, + "step": 2797 + }, + { + "epoch": 3.68, + "grad_norm": 2.734375, + "learning_rate": 5.817668716351338e-08, + "loss": 0.7392, + "step": 2798 + }, + { + "epoch": 3.68, + "grad_norm": 2.796875, + "learning_rate": 5.7615410208373415e-08, + "loss": 0.7334, + "step": 2799 + }, + { + "epoch": 3.68, + "grad_norm": 2.75, + "learning_rate": 5.7056822373121324e-08, + "loss": 0.7448, + "step": 2800 + }, + { + "epoch": 3.68, + "grad_norm": 2.734375, + "learning_rate": 5.650092427277487e-08, + "loss": 0.7345, + "step": 2801 + }, + { + "epoch": 3.68, + "grad_norm": 2.796875, + "learning_rate": 5.594771651939057e-08, + "loss": 0.7373, + "step": 2802 + }, + { + "epoch": 3.68, + "grad_norm": 2.71875, + "learning_rate": 5.5397199722062034e-08, + "loss": 0.7372, + "step": 2803 + }, + { + "epoch": 3.69, + "grad_norm": 2.796875, + "learning_rate": 5.484937448692079e-08, + "loss": 0.7445, + "step": 2804 + }, + { + "epoch": 3.69, + "grad_norm": 2.71875, + "learning_rate": 5.430424141713464e-08, + "loss": 0.7263, + "step": 2805 + }, + { + "epoch": 3.69, + "grad_norm": 2.78125, + "learning_rate": 5.3761801112907356e-08, + "loss": 0.7268, + "step": 2806 + }, + { + "epoch": 3.69, + "grad_norm": 2.84375, + "learning_rate": 5.3222054171477574e-08, + "loss": 0.7446, + "step": 2807 + }, + { + "epoch": 3.69, + "grad_norm": 2.828125, + "learning_rate": 5.268500118711883e-08, + "loss": 0.7415, + "step": 2808 + }, + { + "epoch": 3.69, + "grad_norm": 2.71875, + "learning_rate": 5.2150642751138124e-08, + "loss": 0.7326, + "step": 2809 + }, + { + "epoch": 3.69, + "grad_norm": 2.765625, + "learning_rate": 5.16189794518765e-08, + "loss": 0.7352, + "step": 2810 + }, + { + "epoch": 3.7, + "grad_norm": 2.765625, + "learning_rate": 5.10900118747068e-08, + "loss": 0.7468, + "step": 2811 + }, + { + "epoch": 3.7, + "grad_norm": 2.8125, + "learning_rate": 5.0563740602034284e-08, + "loss": 0.7509, + "step": 2812 + }, + { + "epoch": 3.7, + "grad_norm": 2.734375, + "learning_rate": 5.004016621329544e-08, + "loss": 0.7191, + "step": 2813 + }, + { + "epoch": 3.7, + "grad_norm": 2.671875, + "learning_rate": 4.9519289284956916e-08, + "loss": 0.7324, + "step": 2814 + }, + { + "epoch": 3.7, + "grad_norm": 2.796875, + "learning_rate": 4.900111039051636e-08, + "loss": 0.7422, + "step": 2815 + }, + { + "epoch": 3.7, + "grad_norm": 2.796875, + "learning_rate": 4.848563010049989e-08, + "loss": 0.7299, + "step": 2816 + }, + { + "epoch": 3.7, + "grad_norm": 2.78125, + "learning_rate": 4.797284898246296e-08, + "loss": 0.7403, + "step": 2817 + }, + { + "epoch": 3.7, + "grad_norm": 2.84375, + "learning_rate": 4.746276760098867e-08, + "loss": 0.7352, + "step": 2818 + }, + { + "epoch": 3.71, + "grad_norm": 2.765625, + "learning_rate": 4.695538651768861e-08, + "loss": 0.7482, + "step": 2819 + }, + { + "epoch": 3.71, + "grad_norm": 2.734375, + "learning_rate": 4.6450706291200354e-08, + "loss": 0.7571, + "step": 2820 + }, + { + "epoch": 3.71, + "grad_norm": 2.8125, + "learning_rate": 4.594872747718804e-08, + "loss": 0.752, + "step": 2821 + }, + { + "epoch": 3.71, + "grad_norm": 2.890625, + "learning_rate": 4.54494506283415e-08, + "loss": 0.7472, + "step": 2822 + }, + { + "epoch": 3.71, + "grad_norm": 2.734375, + "learning_rate": 4.4952876294376e-08, + "loss": 0.7175, + "step": 2823 + }, + { + "epoch": 3.71, + "grad_norm": 2.75, + "learning_rate": 4.44590050220306e-08, + "loss": 0.7253, + "step": 2824 + }, + { + "epoch": 3.71, + "grad_norm": 2.796875, + "learning_rate": 4.396783735506893e-08, + "loss": 0.7376, + "step": 2825 + }, + { + "epoch": 3.72, + "grad_norm": 2.734375, + "learning_rate": 4.3479373834277026e-08, + "loss": 0.7321, + "step": 2826 + }, + { + "epoch": 3.72, + "grad_norm": 2.71875, + "learning_rate": 4.299361499746441e-08, + "loss": 0.739, + "step": 2827 + }, + { + "epoch": 3.72, + "grad_norm": 2.75, + "learning_rate": 4.2510561379462136e-08, + "loss": 0.7638, + "step": 2828 + }, + { + "epoch": 3.72, + "grad_norm": 2.765625, + "learning_rate": 4.2030213512123384e-08, + "loss": 0.7394, + "step": 2829 + }, + { + "epoch": 3.72, + "grad_norm": 2.796875, + "learning_rate": 4.155257192432205e-08, + "loss": 0.7281, + "step": 2830 + }, + { + "epoch": 3.72, + "grad_norm": 2.765625, + "learning_rate": 4.1077637141951624e-08, + "loss": 0.7392, + "step": 2831 + }, + { + "epoch": 3.72, + "grad_norm": 2.875, + "learning_rate": 4.060540968792631e-08, + "loss": 0.7199, + "step": 2832 + }, + { + "epoch": 3.72, + "grad_norm": 2.71875, + "learning_rate": 4.013589008217883e-08, + "loss": 0.7199, + "step": 2833 + }, + { + "epoch": 3.73, + "grad_norm": 2.765625, + "learning_rate": 3.9669078841660655e-08, + "loss": 0.7228, + "step": 2834 + }, + { + "epoch": 3.73, + "grad_norm": 2.71875, + "learning_rate": 3.92049764803415e-08, + "loss": 0.7264, + "step": 2835 + }, + { + "epoch": 3.73, + "grad_norm": 2.765625, + "learning_rate": 3.874358350920843e-08, + "loss": 0.7295, + "step": 2836 + }, + { + "epoch": 3.73, + "grad_norm": 2.75, + "learning_rate": 3.828490043626537e-08, + "loss": 0.7348, + "step": 2837 + }, + { + "epoch": 3.73, + "grad_norm": 2.71875, + "learning_rate": 3.782892776653252e-08, + "loss": 0.7246, + "step": 2838 + }, + { + "epoch": 3.73, + "grad_norm": 2.78125, + "learning_rate": 3.7375666002046064e-08, + "loss": 0.7531, + "step": 2839 + }, + { + "epoch": 3.73, + "grad_norm": 2.84375, + "learning_rate": 3.6925115641857356e-08, + "loss": 0.7359, + "step": 2840 + }, + { + "epoch": 3.74, + "grad_norm": 2.765625, + "learning_rate": 3.6477277182032365e-08, + "loss": 0.7443, + "step": 2841 + }, + { + "epoch": 3.74, + "grad_norm": 2.78125, + "learning_rate": 3.603215111565139e-08, + "loss": 0.7426, + "step": 2842 + }, + { + "epoch": 3.74, + "grad_norm": 2.75, + "learning_rate": 3.5589737932807674e-08, + "loss": 0.7321, + "step": 2843 + }, + { + "epoch": 3.74, + "grad_norm": 2.734375, + "learning_rate": 3.515003812060824e-08, + "loss": 0.7242, + "step": 2844 + }, + { + "epoch": 3.74, + "grad_norm": 2.8125, + "learning_rate": 3.47130521631725e-08, + "loss": 0.7508, + "step": 2845 + }, + { + "epoch": 3.74, + "grad_norm": 2.703125, + "learning_rate": 3.427878054163142e-08, + "loss": 0.7402, + "step": 2846 + }, + { + "epoch": 3.74, + "grad_norm": 2.703125, + "learning_rate": 3.384722373412835e-08, + "loss": 0.7309, + "step": 2847 + }, + { + "epoch": 3.74, + "grad_norm": 2.734375, + "learning_rate": 3.341838221581656e-08, + "loss": 0.7722, + "step": 2848 + }, + { + "epoch": 3.75, + "grad_norm": 2.78125, + "learning_rate": 3.2992256458860006e-08, + "loss": 0.74, + "step": 2849 + }, + { + "epoch": 3.75, + "grad_norm": 2.765625, + "learning_rate": 3.2568846932432565e-08, + "loss": 0.7351, + "step": 2850 + }, + { + "epoch": 3.75, + "grad_norm": 2.671875, + "learning_rate": 3.2148154102717985e-08, + "loss": 0.7087, + "step": 2851 + }, + { + "epoch": 3.75, + "grad_norm": 2.78125, + "learning_rate": 3.1730178432908266e-08, + "loss": 0.7378, + "step": 2852 + }, + { + "epoch": 3.75, + "grad_norm": 2.71875, + "learning_rate": 3.1314920383203886e-08, + "loss": 0.7254, + "step": 2853 + }, + { + "epoch": 3.75, + "grad_norm": 3.0625, + "learning_rate": 3.090238041081328e-08, + "loss": 0.7478, + "step": 2854 + }, + { + "epoch": 3.75, + "grad_norm": 2.765625, + "learning_rate": 3.049255896995201e-08, + "loss": 0.7333, + "step": 2855 + }, + { + "epoch": 3.76, + "grad_norm": 2.890625, + "learning_rate": 3.008545651184275e-08, + "loss": 0.7478, + "step": 2856 + }, + { + "epoch": 3.76, + "grad_norm": 2.71875, + "learning_rate": 2.9681073484714438e-08, + "loss": 0.7359, + "step": 2857 + }, + { + "epoch": 3.76, + "grad_norm": 2.859375, + "learning_rate": 2.9279410333801773e-08, + "loss": 0.7365, + "step": 2858 + }, + { + "epoch": 3.76, + "grad_norm": 2.75, + "learning_rate": 2.88804675013446e-08, + "loss": 0.7288, + "step": 2859 + }, + { + "epoch": 3.76, + "grad_norm": 2.765625, + "learning_rate": 2.848424542658823e-08, + "loss": 0.7377, + "step": 2860 + }, + { + "epoch": 3.76, + "grad_norm": 2.71875, + "learning_rate": 2.8090744545782034e-08, + "loss": 0.7067, + "step": 2861 + }, + { + "epoch": 3.76, + "grad_norm": 2.78125, + "learning_rate": 2.769996529217861e-08, + "loss": 0.7117, + "step": 2862 + }, + { + "epoch": 3.76, + "grad_norm": 2.78125, + "learning_rate": 2.7311908096035454e-08, + "loss": 0.7122, + "step": 2863 + }, + { + "epoch": 3.77, + "grad_norm": 2.6875, + "learning_rate": 2.6926573384611353e-08, + "loss": 0.7102, + "step": 2864 + }, + { + "epoch": 3.77, + "grad_norm": 2.75, + "learning_rate": 2.6543961582169153e-08, + "loss": 0.7175, + "step": 2865 + }, + { + "epoch": 3.77, + "grad_norm": 2.734375, + "learning_rate": 2.6164073109972986e-08, + "loss": 0.7483, + "step": 2866 + }, + { + "epoch": 3.77, + "grad_norm": 2.71875, + "learning_rate": 2.5786908386288e-08, + "loss": 0.7433, + "step": 2867 + }, + { + "epoch": 3.77, + "grad_norm": 2.796875, + "learning_rate": 2.5412467826381182e-08, + "loss": 0.7559, + "step": 2868 + }, + { + "epoch": 3.77, + "grad_norm": 2.765625, + "learning_rate": 2.504075184251997e-08, + "loss": 0.7435, + "step": 2869 + }, + { + "epoch": 3.77, + "grad_norm": 2.78125, + "learning_rate": 2.4671760843971992e-08, + "loss": 0.7434, + "step": 2870 + }, + { + "epoch": 3.78, + "grad_norm": 2.734375, + "learning_rate": 2.4305495237004762e-08, + "loss": 0.7279, + "step": 2871 + }, + { + "epoch": 3.78, + "grad_norm": 2.703125, + "learning_rate": 2.3941955424884312e-08, + "loss": 0.7393, + "step": 2872 + }, + { + "epoch": 3.78, + "grad_norm": 2.765625, + "learning_rate": 2.3581141807876573e-08, + "loss": 0.7192, + "step": 2873 + }, + { + "epoch": 3.78, + "grad_norm": 2.8125, + "learning_rate": 2.322305478324516e-08, + "loss": 0.731, + "step": 2874 + }, + { + "epoch": 3.78, + "grad_norm": 2.828125, + "learning_rate": 2.2867694745251634e-08, + "loss": 0.7385, + "step": 2875 + }, + { + "epoch": 3.78, + "grad_norm": 2.765625, + "learning_rate": 2.251506208515608e-08, + "loss": 0.7166, + "step": 2876 + }, + { + "epoch": 3.78, + "grad_norm": 2.8125, + "learning_rate": 2.2165157191214314e-08, + "loss": 0.7436, + "step": 2877 + }, + { + "epoch": 3.78, + "grad_norm": 2.796875, + "learning_rate": 2.1817980448679553e-08, + "loss": 0.728, + "step": 2878 + }, + { + "epoch": 3.79, + "grad_norm": 2.78125, + "learning_rate": 2.147353223980103e-08, + "loss": 0.7318, + "step": 2879 + }, + { + "epoch": 3.79, + "grad_norm": 2.71875, + "learning_rate": 2.1131812943823994e-08, + "loss": 0.7152, + "step": 2880 + }, + { + "epoch": 3.79, + "grad_norm": 2.796875, + "learning_rate": 2.0792822936989142e-08, + "loss": 0.7667, + "step": 2881 + }, + { + "epoch": 3.79, + "grad_norm": 2.78125, + "learning_rate": 2.0456562592532093e-08, + "loss": 0.7233, + "step": 2882 + }, + { + "epoch": 3.79, + "grad_norm": 2.796875, + "learning_rate": 2.0123032280682242e-08, + "loss": 0.7389, + "step": 2883 + }, + { + "epoch": 3.79, + "grad_norm": 2.765625, + "learning_rate": 1.979223236866501e-08, + "loss": 0.7327, + "step": 2884 + }, + { + "epoch": 3.79, + "grad_norm": 2.65625, + "learning_rate": 1.946416322069794e-08, + "loss": 0.733, + "step": 2885 + }, + { + "epoch": 3.8, + "grad_norm": 2.78125, + "learning_rate": 1.9138825197992096e-08, + "loss": 0.7327, + "step": 2886 + }, + { + "epoch": 3.8, + "grad_norm": 2.734375, + "learning_rate": 1.8816218658752605e-08, + "loss": 0.7271, + "step": 2887 + }, + { + "epoch": 3.8, + "grad_norm": 2.765625, + "learning_rate": 1.8496343958175898e-08, + "loss": 0.7392, + "step": 2888 + }, + { + "epoch": 3.8, + "grad_norm": 2.796875, + "learning_rate": 1.8179201448451368e-08, + "loss": 0.769, + "step": 2889 + }, + { + "epoch": 3.8, + "grad_norm": 2.765625, + "learning_rate": 1.7864791478760245e-08, + "loss": 0.7105, + "step": 2890 + }, + { + "epoch": 3.8, + "grad_norm": 2.78125, + "learning_rate": 1.755311439527424e-08, + "loss": 0.737, + "step": 2891 + }, + { + "epoch": 3.8, + "grad_norm": 2.8125, + "learning_rate": 1.7244170541157455e-08, + "loss": 0.749, + "step": 2892 + }, + { + "epoch": 3.8, + "grad_norm": 2.703125, + "learning_rate": 1.6937960256563914e-08, + "loss": 0.7221, + "step": 2893 + }, + { + "epoch": 3.81, + "grad_norm": 2.828125, + "learning_rate": 1.663448387863753e-08, + "loss": 0.7467, + "step": 2894 + }, + { + "epoch": 3.81, + "grad_norm": 2.796875, + "learning_rate": 1.6333741741512977e-08, + "loss": 0.7574, + "step": 2895 + }, + { + "epoch": 3.81, + "grad_norm": 2.765625, + "learning_rate": 1.603573417631371e-08, + "loss": 0.7659, + "step": 2896 + }, + { + "epoch": 3.81, + "grad_norm": 2.765625, + "learning_rate": 1.5740461511152827e-08, + "loss": 0.7434, + "step": 2897 + }, + { + "epoch": 3.81, + "grad_norm": 2.875, + "learning_rate": 1.5447924071131937e-08, + "loss": 0.7636, + "step": 2898 + }, + { + "epoch": 3.81, + "grad_norm": 2.8125, + "learning_rate": 1.5158122178341727e-08, + "loss": 0.7282, + "step": 2899 + }, + { + "epoch": 3.81, + "grad_norm": 2.765625, + "learning_rate": 1.4871056151860296e-08, + "loss": 0.7234, + "step": 2900 + }, + { + "epoch": 3.82, + "grad_norm": 2.796875, + "learning_rate": 1.4586726307753985e-08, + "loss": 0.7264, + "step": 2901 + }, + { + "epoch": 3.82, + "grad_norm": 2.859375, + "learning_rate": 1.4305132959075706e-08, + "loss": 0.7561, + "step": 2902 + }, + { + "epoch": 3.82, + "grad_norm": 2.796875, + "learning_rate": 1.4026276415866624e-08, + "loss": 0.7512, + "step": 2903 + }, + { + "epoch": 3.82, + "grad_norm": 2.78125, + "learning_rate": 1.3750156985153361e-08, + "loss": 0.7299, + "step": 2904 + }, + { + "epoch": 3.82, + "grad_norm": 2.78125, + "learning_rate": 1.3476774970950236e-08, + "loss": 0.7348, + "step": 2905 + }, + { + "epoch": 3.82, + "grad_norm": 2.734375, + "learning_rate": 1.3206130674256746e-08, + "loss": 0.7432, + "step": 2906 + }, + { + "epoch": 3.82, + "grad_norm": 2.84375, + "learning_rate": 1.293822439305814e-08, + "loss": 0.7636, + "step": 2907 + }, + { + "epoch": 3.82, + "grad_norm": 2.765625, + "learning_rate": 1.2673056422325413e-08, + "loss": 0.7549, + "step": 2908 + }, + { + "epoch": 3.83, + "grad_norm": 2.828125, + "learning_rate": 1.2410627054013913e-08, + "loss": 0.772, + "step": 2909 + }, + { + "epoch": 3.83, + "grad_norm": 2.796875, + "learning_rate": 1.2150936577065008e-08, + "loss": 0.766, + "step": 2910 + }, + { + "epoch": 3.83, + "grad_norm": 2.703125, + "learning_rate": 1.1893985277403041e-08, + "loss": 0.704, + "step": 2911 + }, + { + "epoch": 3.83, + "grad_norm": 2.84375, + "learning_rate": 1.1639773437937263e-08, + "loss": 0.7266, + "step": 2912 + }, + { + "epoch": 3.83, + "grad_norm": 2.796875, + "learning_rate": 1.1388301338561004e-08, + "loss": 0.7554, + "step": 2913 + }, + { + "epoch": 3.83, + "grad_norm": 2.703125, + "learning_rate": 1.1139569256150285e-08, + "loss": 0.7383, + "step": 2914 + }, + { + "epoch": 3.83, + "grad_norm": 2.796875, + "learning_rate": 1.0893577464564654e-08, + "loss": 0.7462, + "step": 2915 + }, + { + "epoch": 3.84, + "grad_norm": 2.796875, + "learning_rate": 1.0650326234646347e-08, + "loss": 0.7399, + "step": 2916 + }, + { + "epoch": 3.84, + "grad_norm": 2.84375, + "learning_rate": 1.0409815834221126e-08, + "loss": 0.7402, + "step": 2917 + }, + { + "epoch": 3.84, + "grad_norm": 2.75, + "learning_rate": 1.0172046528095502e-08, + "loss": 0.7301, + "step": 2918 + }, + { + "epoch": 3.84, + "grad_norm": 2.671875, + "learning_rate": 9.937018578059232e-09, + "loss": 0.6891, + "step": 2919 + }, + { + "epoch": 3.84, + "grad_norm": 2.71875, + "learning_rate": 9.704732242883374e-09, + "loss": 0.7348, + "step": 2920 + }, + { + "epoch": 3.84, + "grad_norm": 2.75, + "learning_rate": 9.475187778320017e-09, + "loss": 0.7366, + "step": 2921 + }, + { + "epoch": 3.84, + "grad_norm": 2.796875, + "learning_rate": 9.248385437103102e-09, + "loss": 0.7531, + "step": 2922 + }, + { + "epoch": 3.84, + "grad_norm": 2.8125, + "learning_rate": 9.024325468946771e-09, + "loss": 0.7097, + "step": 2923 + }, + { + "epoch": 3.85, + "grad_norm": 2.8125, + "learning_rate": 8.803008120546186e-09, + "loss": 0.7508, + "step": 2924 + }, + { + "epoch": 3.85, + "grad_norm": 2.921875, + "learning_rate": 8.584433635576706e-09, + "loss": 0.7541, + "step": 2925 + }, + { + "epoch": 3.85, + "grad_norm": 2.75, + "learning_rate": 8.368602254693603e-09, + "loss": 0.7324, + "step": 2926 + }, + { + "epoch": 3.85, + "grad_norm": 2.6875, + "learning_rate": 8.155514215532067e-09, + "loss": 0.7129, + "step": 2927 + }, + { + "epoch": 3.85, + "grad_norm": 2.734375, + "learning_rate": 7.945169752706927e-09, + "loss": 0.7325, + "step": 2928 + }, + { + "epoch": 3.85, + "grad_norm": 2.765625, + "learning_rate": 7.737569097811814e-09, + "loss": 0.7265, + "step": 2929 + }, + { + "epoch": 3.85, + "grad_norm": 2.703125, + "learning_rate": 7.532712479420279e-09, + "loss": 0.7415, + "step": 2930 + }, + { + "epoch": 3.86, + "grad_norm": 2.78125, + "learning_rate": 7.330600123083287e-09, + "loss": 0.7386, + "step": 2931 + }, + { + "epoch": 3.86, + "grad_norm": 2.875, + "learning_rate": 7.131232251331721e-09, + "loss": 0.7384, + "step": 2932 + }, + { + "epoch": 3.86, + "grad_norm": 2.75, + "learning_rate": 6.9346090836733275e-09, + "loss": 0.7664, + "step": 2933 + }, + { + "epoch": 3.86, + "grad_norm": 2.796875, + "learning_rate": 6.740730836595211e-09, + "loss": 0.7617, + "step": 2934 + }, + { + "epoch": 3.86, + "grad_norm": 2.828125, + "learning_rate": 6.5495977235613405e-09, + "loss": 0.7337, + "step": 2935 + }, + { + "epoch": 3.86, + "grad_norm": 2.765625, + "learning_rate": 6.361209955013659e-09, + "loss": 0.7311, + "step": 2936 + }, + { + "epoch": 3.86, + "grad_norm": 2.734375, + "learning_rate": 6.1755677383712465e-09, + "loss": 0.735, + "step": 2937 + }, + { + "epoch": 3.86, + "grad_norm": 2.78125, + "learning_rate": 5.992671278030327e-09, + "loss": 0.7274, + "step": 2938 + }, + { + "epoch": 3.87, + "grad_norm": 2.765625, + "learning_rate": 5.812520775363984e-09, + "loss": 0.7217, + "step": 2939 + }, + { + "epoch": 3.87, + "grad_norm": 2.828125, + "learning_rate": 5.635116428722165e-09, + "loss": 0.7365, + "step": 2940 + }, + { + "epoch": 3.87, + "grad_norm": 2.859375, + "learning_rate": 5.460458433430571e-09, + "loss": 0.7187, + "step": 2941 + }, + { + "epoch": 3.87, + "grad_norm": 2.78125, + "learning_rate": 5.288546981791764e-09, + "loss": 0.7244, + "step": 2942 + }, + { + "epoch": 3.87, + "grad_norm": 2.78125, + "learning_rate": 5.1193822630840605e-09, + "loss": 0.7478, + "step": 2943 + }, + { + "epoch": 3.87, + "grad_norm": 2.78125, + "learning_rate": 4.952964463561805e-09, + "loss": 0.763, + "step": 2944 + }, + { + "epoch": 3.87, + "grad_norm": 2.671875, + "learning_rate": 4.789293766454262e-09, + "loss": 0.7294, + "step": 2945 + }, + { + "epoch": 3.88, + "grad_norm": 2.78125, + "learning_rate": 4.628370351967004e-09, + "loss": 0.7239, + "step": 2946 + }, + { + "epoch": 3.88, + "grad_norm": 2.703125, + "learning_rate": 4.47019439727997e-09, + "loss": 0.7102, + "step": 2947 + }, + { + "epoch": 3.88, + "grad_norm": 2.796875, + "learning_rate": 4.314766076548571e-09, + "loss": 0.719, + "step": 2948 + }, + { + "epoch": 3.88, + "grad_norm": 2.875, + "learning_rate": 4.162085560902585e-09, + "loss": 0.7664, + "step": 2949 + }, + { + "epoch": 3.88, + "grad_norm": 2.75, + "learning_rate": 4.012153018446984e-09, + "loss": 0.7484, + "step": 2950 + }, + { + "epoch": 3.88, + "grad_norm": 2.921875, + "learning_rate": 3.864968614260834e-09, + "loss": 0.7535, + "step": 2951 + }, + { + "epoch": 3.88, + "grad_norm": 2.75, + "learning_rate": 3.720532510397279e-09, + "loss": 0.7329, + "step": 2952 + }, + { + "epoch": 3.88, + "grad_norm": 2.765625, + "learning_rate": 3.5788448658838347e-09, + "loss": 0.7396, + "step": 2953 + }, + { + "epoch": 3.89, + "grad_norm": 2.75, + "learning_rate": 3.4399058367215466e-09, + "loss": 0.7397, + "step": 2954 + }, + { + "epoch": 3.89, + "grad_norm": 2.796875, + "learning_rate": 3.3037155758855466e-09, + "loss": 0.7373, + "step": 2955 + }, + { + "epoch": 3.89, + "grad_norm": 2.765625, + "learning_rate": 3.170274233324222e-09, + "loss": 0.7425, + "step": 2956 + }, + { + "epoch": 3.89, + "grad_norm": 2.703125, + "learning_rate": 3.039581955959492e-09, + "loss": 0.7203, + "step": 2957 + }, + { + "epoch": 3.89, + "grad_norm": 2.734375, + "learning_rate": 2.9116388876868075e-09, + "loss": 0.7311, + "step": 2958 + }, + { + "epoch": 3.89, + "grad_norm": 2.828125, + "learning_rate": 2.7864451693740412e-09, + "loss": 0.7643, + "step": 2959 + }, + { + "epoch": 3.89, + "grad_norm": 2.75, + "learning_rate": 2.6640009388620434e-09, + "loss": 0.753, + "step": 2960 + }, + { + "epoch": 3.9, + "grad_norm": 2.78125, + "learning_rate": 2.544306330965196e-09, + "loss": 0.7482, + "step": 2961 + }, + { + "epoch": 3.9, + "grad_norm": 2.75, + "learning_rate": 2.4273614774691923e-09, + "loss": 0.7422, + "step": 2962 + }, + { + "epoch": 3.9, + "grad_norm": 2.734375, + "learning_rate": 2.3131665071338127e-09, + "loss": 0.7334, + "step": 2963 + }, + { + "epoch": 3.9, + "grad_norm": 2.71875, + "learning_rate": 2.2017215456895947e-09, + "loss": 0.7389, + "step": 2964 + }, + { + "epoch": 3.9, + "grad_norm": 2.734375, + "learning_rate": 2.093026715840052e-09, + "loss": 0.7329, + "step": 2965 + }, + { + "epoch": 3.9, + "grad_norm": 2.71875, + "learning_rate": 1.987082137260843e-09, + "loss": 0.7374, + "step": 2966 + }, + { + "epoch": 3.9, + "grad_norm": 2.78125, + "learning_rate": 1.883887926598937e-09, + "loss": 0.7191, + "step": 2967 + }, + { + "epoch": 3.9, + "grad_norm": 2.859375, + "learning_rate": 1.7834441974740047e-09, + "loss": 0.7467, + "step": 2968 + }, + { + "epoch": 3.91, + "grad_norm": 2.703125, + "learning_rate": 1.6857510604764704e-09, + "loss": 0.7067, + "step": 2969 + }, + { + "epoch": 3.91, + "grad_norm": 2.828125, + "learning_rate": 1.590808623168627e-09, + "loss": 0.7633, + "step": 2970 + }, + { + "epoch": 3.91, + "grad_norm": 2.8125, + "learning_rate": 1.4986169900840785e-09, + "loss": 0.7312, + "step": 2971 + }, + { + "epoch": 3.91, + "grad_norm": 2.6875, + "learning_rate": 1.4091762627282956e-09, + "loss": 0.7373, + "step": 2972 + }, + { + "epoch": 3.91, + "grad_norm": 2.828125, + "learning_rate": 1.3224865395775054e-09, + "loss": 0.7671, + "step": 2973 + }, + { + "epoch": 3.91, + "grad_norm": 2.765625, + "learning_rate": 1.2385479160784141e-09, + "loss": 0.7335, + "step": 2974 + }, + { + "epoch": 3.91, + "grad_norm": 2.765625, + "learning_rate": 1.1573604846501496e-09, + "loss": 0.7544, + "step": 2975 + }, + { + "epoch": 3.92, + "grad_norm": 2.828125, + "learning_rate": 1.0789243346812083e-09, + "loss": 0.7404, + "step": 2976 + }, + { + "epoch": 3.92, + "grad_norm": 2.78125, + "learning_rate": 1.0032395525316762e-09, + "loss": 0.7736, + "step": 2977 + }, + { + "epoch": 3.92, + "grad_norm": 2.796875, + "learning_rate": 9.303062215323955e-10, + "loss": 0.7172, + "step": 2978 + }, + { + "epoch": 3.92, + "grad_norm": 2.921875, + "learning_rate": 8.601244219846872e-10, + "loss": 0.746, + "step": 2979 + }, + { + "epoch": 3.92, + "grad_norm": 2.859375, + "learning_rate": 7.926942311597962e-10, + "loss": 0.7433, + "step": 2980 + }, + { + "epoch": 3.92, + "grad_norm": 2.796875, + "learning_rate": 7.280157233002794e-10, + "loss": 0.7349, + "step": 2981 + }, + { + "epoch": 3.92, + "grad_norm": 2.703125, + "learning_rate": 6.660889696186168e-10, + "loss": 0.742, + "step": 2982 + }, + { + "epoch": 3.92, + "grad_norm": 2.8125, + "learning_rate": 6.069140382972128e-10, + "loss": 0.7588, + "step": 2983 + }, + { + "epoch": 3.93, + "grad_norm": 2.875, + "learning_rate": 5.504909944895054e-10, + "loss": 0.7558, + "step": 2984 + }, + { + "epoch": 3.93, + "grad_norm": 2.796875, + "learning_rate": 4.968199003177466e-10, + "loss": 0.7512, + "step": 2985 + }, + { + "epoch": 3.93, + "grad_norm": 2.796875, + "learning_rate": 4.4590081487577706e-10, + "loss": 0.7457, + "step": 2986 + }, + { + "epoch": 3.93, + "grad_norm": 2.890625, + "learning_rate": 3.9773379422625156e-10, + "loss": 0.7603, + "step": 2987 + }, + { + "epoch": 3.93, + "grad_norm": 2.828125, + "learning_rate": 3.5231889140230347e-10, + "loss": 0.7217, + "step": 2988 + }, + { + "epoch": 3.93, + "grad_norm": 2.734375, + "learning_rate": 3.0965615640671243e-10, + "loss": 0.7448, + "step": 2989 + }, + { + "epoch": 3.93, + "grad_norm": 2.71875, + "learning_rate": 2.697456362119044e-10, + "loss": 0.7266, + "step": 2990 + }, + { + "epoch": 3.94, + "grad_norm": 2.75, + "learning_rate": 2.3258737476050674e-10, + "loss": 0.7257, + "step": 2991 + }, + { + "epoch": 3.94, + "grad_norm": 2.78125, + "learning_rate": 1.9818141296451544e-10, + "loss": 0.7411, + "step": 2992 + }, + { + "epoch": 3.94, + "grad_norm": 2.84375, + "learning_rate": 1.6652778870557274e-10, + "loss": 0.7428, + "step": 2993 + }, + { + "epoch": 3.94, + "grad_norm": 2.796875, + "learning_rate": 1.3762653683496718e-10, + "loss": 0.7325, + "step": 2994 + }, + { + "epoch": 3.94, + "grad_norm": 2.796875, + "learning_rate": 1.1147768917391111e-10, + "loss": 0.7328, + "step": 2995 + }, + { + "epoch": 3.94, + "grad_norm": 2.765625, + "learning_rate": 8.808127451270798e-11, + "loss": 0.7472, + "step": 2996 + }, + { + "epoch": 3.94, + "grad_norm": 2.765625, + "learning_rate": 6.743731861130753e-11, + "loss": 0.7252, + "step": 2997 + }, + { + "epoch": 3.94, + "grad_norm": 2.6875, + "learning_rate": 4.954584419930575e-11, + "loss": 0.7301, + "step": 2998 + }, + { + "epoch": 3.95, + "grad_norm": 2.71875, + "learning_rate": 3.440687097538975e-11, + "loss": 0.7206, + "step": 2999 + }, + { + "epoch": 3.95, + "grad_norm": 2.734375, + "learning_rate": 2.202041560789292e-11, + "loss": 0.7362, + "step": 3000 + }, + { + "epoch": 3.95, + "grad_norm": 2.828125, + "learning_rate": 1.2386491734794891e-11, + "loss": 0.7393, + "step": 3001 + }, + { + "epoch": 3.95, + "grad_norm": 2.796875, + "learning_rate": 5.505109963166444e-12, + "loss": 0.7456, + "step": 3002 + }, + { + "epoch": 3.95, + "grad_norm": 2.875, + "learning_rate": 1.376277869724607e-12, + "loss": 0.7549, + "step": 3003 + }, + { + "epoch": 3.95, + "grad_norm": 2.65625, + "learning_rate": 0.0, + "loss": 0.7152, + "step": 3004 } ], "logging_steps": 1, @@ -15785,7 +21042,7 @@ "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 751, - "total_flos": 2.519778018662233e+19, + "total_flos": 3.359704024882977e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null