diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14983 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999531550100716, + "eval_steps": 500, + "global_step": 10673, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 9.368997985665433e-05, + "grad_norm": 59.75, + "learning_rate": 1.8726591760299626e-07, + "loss": 18.5002, + "step": 1 + }, + { + "epoch": 0.00046844989928327167, + "grad_norm": 61.75, + "learning_rate": 9.363295880149814e-07, + "loss": 18.7036, + "step": 5 + }, + { + "epoch": 0.0009368997985665433, + "grad_norm": 70.0, + "learning_rate": 1.8726591760299627e-06, + "loss": 17.5667, + "step": 10 + }, + { + "epoch": 0.001405349697849815, + "grad_norm": 62.0, + "learning_rate": 2.808988764044944e-06, + "loss": 18.7767, + "step": 15 + }, + { + "epoch": 0.0018737995971330867, + "grad_norm": 60.75, + "learning_rate": 3.7453183520599255e-06, + "loss": 18.3964, + "step": 20 + }, + { + "epoch": 0.0023422494964163583, + "grad_norm": 57.5, + "learning_rate": 4.6816479400749066e-06, + "loss": 17.2933, + "step": 25 + }, + { + "epoch": 0.00281069939569963, + "grad_norm": 57.0, + "learning_rate": 5.617977528089888e-06, + "loss": 16.126, + "step": 30 + }, + { + "epoch": 0.0032791492949829017, + "grad_norm": 50.75, + "learning_rate": 6.554307116104869e-06, + "loss": 16.2407, + "step": 35 + }, + { + "epoch": 0.0037475991942661734, + "grad_norm": 51.25, + "learning_rate": 7.490636704119851e-06, + "loss": 15.0747, + "step": 40 + }, + { + "epoch": 0.004216049093549445, + "grad_norm": 39.75, + "learning_rate": 8.426966292134832e-06, + "loss": 13.6143, + "step": 45 + }, + { + "epoch": 0.004684498992832717, + "grad_norm": 36.5, + "learning_rate": 9.363295880149813e-06, + "loss": 12.2398, + "step": 50 + }, + { + "epoch": 0.005152948892115988, + "grad_norm": 36.75, + "learning_rate": 1.0299625468164795e-05, + "loss": 10.2035, + "step": 55 + }, + { + "epoch": 0.00562139879139926, + "grad_norm": 36.5, + "learning_rate": 1.1235955056179776e-05, + "loss": 10.1946, + "step": 60 + }, + { + "epoch": 0.006089848690682531, + "grad_norm": 26.0, + "learning_rate": 1.2172284644194758e-05, + "loss": 8.0871, + "step": 65 + }, + { + "epoch": 0.006558298589965803, + "grad_norm": 19.5, + "learning_rate": 1.3108614232209737e-05, + "loss": 7.436, + "step": 70 + }, + { + "epoch": 0.007026748489249075, + "grad_norm": 11.25, + "learning_rate": 1.4044943820224721e-05, + "loss": 6.5085, + "step": 75 + }, + { + "epoch": 0.007495198388532347, + "grad_norm": 9.875, + "learning_rate": 1.4981273408239702e-05, + "loss": 5.8836, + "step": 80 + }, + { + "epoch": 0.007963648287815619, + "grad_norm": 7.375, + "learning_rate": 1.591760299625468e-05, + "loss": 5.2192, + "step": 85 + }, + { + "epoch": 0.00843209818709889, + "grad_norm": 3.640625, + "learning_rate": 1.6853932584269665e-05, + "loss": 4.9297, + "step": 90 + }, + { + "epoch": 0.008900548086382161, + "grad_norm": 3.234375, + "learning_rate": 1.7790262172284646e-05, + "loss": 4.1697, + "step": 95 + }, + { + "epoch": 0.009368997985665433, + "grad_norm": 2.390625, + "learning_rate": 1.8726591760299626e-05, + "loss": 4.0899, + "step": 100 + }, + { + "epoch": 0.009837447884948705, + "grad_norm": 2.484375, + "learning_rate": 1.9662921348314607e-05, + "loss": 4.0412, + "step": 105 + }, + { + "epoch": 0.010305897784231976, + "grad_norm": 2.15625, + "learning_rate": 2.059925093632959e-05, + "loss": 3.6698, + "step": 110 + }, + { + "epoch": 0.010774347683515248, + "grad_norm": 1.7265625, + "learning_rate": 2.153558052434457e-05, + "loss": 3.8425, + "step": 115 + }, + { + "epoch": 0.01124279758279852, + "grad_norm": 1.7890625, + "learning_rate": 2.2471910112359552e-05, + "loss": 3.5762, + "step": 120 + }, + { + "epoch": 0.011711247482081792, + "grad_norm": 2.078125, + "learning_rate": 2.3408239700374533e-05, + "loss": 3.8157, + "step": 125 + }, + { + "epoch": 0.012179697381365063, + "grad_norm": 2.4375, + "learning_rate": 2.4344569288389517e-05, + "loss": 3.6434, + "step": 130 + }, + { + "epoch": 0.012648147280648335, + "grad_norm": 2.546875, + "learning_rate": 2.5280898876404497e-05, + "loss": 3.3979, + "step": 135 + }, + { + "epoch": 0.013116597179931607, + "grad_norm": 2.984375, + "learning_rate": 2.6217228464419475e-05, + "loss": 3.2866, + "step": 140 + }, + { + "epoch": 0.013585047079214877, + "grad_norm": 4.78125, + "learning_rate": 2.715355805243446e-05, + "loss": 2.9153, + "step": 145 + }, + { + "epoch": 0.01405349697849815, + "grad_norm": 7.875, + "learning_rate": 2.8089887640449443e-05, + "loss": 2.8557, + "step": 150 + }, + { + "epoch": 0.014521946877781421, + "grad_norm": 7.0625, + "learning_rate": 2.902621722846442e-05, + "loss": 2.3448, + "step": 155 + }, + { + "epoch": 0.014990396777064693, + "grad_norm": 5.21875, + "learning_rate": 2.9962546816479404e-05, + "loss": 1.7144, + "step": 160 + }, + { + "epoch": 0.015458846676347964, + "grad_norm": 1.8515625, + "learning_rate": 3.089887640449438e-05, + "loss": 1.47, + "step": 165 + }, + { + "epoch": 0.015927296575631238, + "grad_norm": 3.859375, + "learning_rate": 3.183520599250936e-05, + "loss": 1.4025, + "step": 170 + }, + { + "epoch": 0.016395746474914506, + "grad_norm": 2.546875, + "learning_rate": 3.277153558052435e-05, + "loss": 1.3861, + "step": 175 + }, + { + "epoch": 0.01686419637419778, + "grad_norm": 2.171875, + "learning_rate": 3.370786516853933e-05, + "loss": 1.384, + "step": 180 + }, + { + "epoch": 0.01733264627348105, + "grad_norm": 2.5625, + "learning_rate": 3.464419475655431e-05, + "loss": 1.3646, + "step": 185 + }, + { + "epoch": 0.017801096172764323, + "grad_norm": 1.359375, + "learning_rate": 3.558052434456929e-05, + "loss": 1.368, + "step": 190 + }, + { + "epoch": 0.018269546072047595, + "grad_norm": 3.453125, + "learning_rate": 3.651685393258427e-05, + "loss": 1.2881, + "step": 195 + }, + { + "epoch": 0.018737995971330867, + "grad_norm": 4.0, + "learning_rate": 3.745318352059925e-05, + "loss": 1.278, + "step": 200 + }, + { + "epoch": 0.01920644587061414, + "grad_norm": 1.9375, + "learning_rate": 3.838951310861423e-05, + "loss": 1.3055, + "step": 205 + }, + { + "epoch": 0.01967489576989741, + "grad_norm": 1.9609375, + "learning_rate": 3.9325842696629214e-05, + "loss": 1.2798, + "step": 210 + }, + { + "epoch": 0.02014334566918068, + "grad_norm": 1.3984375, + "learning_rate": 4.0262172284644194e-05, + "loss": 1.2411, + "step": 215 + }, + { + "epoch": 0.02061179556846395, + "grad_norm": 2.265625, + "learning_rate": 4.119850187265918e-05, + "loss": 1.2734, + "step": 220 + }, + { + "epoch": 0.021080245467747224, + "grad_norm": 2.015625, + "learning_rate": 4.2134831460674156e-05, + "loss": 1.2793, + "step": 225 + }, + { + "epoch": 0.021548695367030496, + "grad_norm": 2.015625, + "learning_rate": 4.307116104868914e-05, + "loss": 1.2023, + "step": 230 + }, + { + "epoch": 0.022017145266313768, + "grad_norm": 7.3125, + "learning_rate": 4.4007490636704124e-05, + "loss": 1.2116, + "step": 235 + }, + { + "epoch": 0.02248559516559704, + "grad_norm": 3.46875, + "learning_rate": 4.4943820224719104e-05, + "loss": 1.1648, + "step": 240 + }, + { + "epoch": 0.022954045064880312, + "grad_norm": 1.375, + "learning_rate": 4.5880149812734085e-05, + "loss": 1.1978, + "step": 245 + }, + { + "epoch": 0.023422494964163584, + "grad_norm": 1.7265625, + "learning_rate": 4.6816479400749066e-05, + "loss": 1.2017, + "step": 250 + }, + { + "epoch": 0.023890944863446853, + "grad_norm": 3.8125, + "learning_rate": 4.7752808988764046e-05, + "loss": 1.1734, + "step": 255 + }, + { + "epoch": 0.024359394762730125, + "grad_norm": 3.859375, + "learning_rate": 4.8689138576779034e-05, + "loss": 1.1892, + "step": 260 + }, + { + "epoch": 0.024827844662013397, + "grad_norm": 1.6796875, + "learning_rate": 4.962546816479401e-05, + "loss": 1.3598, + "step": 265 + }, + { + "epoch": 0.02529629456129667, + "grad_norm": 4.03125, + "learning_rate": 5.0561797752808995e-05, + "loss": 1.1311, + "step": 270 + }, + { + "epoch": 0.02576474446057994, + "grad_norm": 1.96875, + "learning_rate": 5.1498127340823975e-05, + "loss": 1.1556, + "step": 275 + }, + { + "epoch": 0.026233194359863213, + "grad_norm": 3.28125, + "learning_rate": 5.243445692883895e-05, + "loss": 1.1625, + "step": 280 + }, + { + "epoch": 0.026701644259146486, + "grad_norm": 1.890625, + "learning_rate": 5.337078651685393e-05, + "loss": 1.1606, + "step": 285 + }, + { + "epoch": 0.027170094158429754, + "grad_norm": 5.625, + "learning_rate": 5.430711610486892e-05, + "loss": 1.153, + "step": 290 + }, + { + "epoch": 0.027638544057713026, + "grad_norm": 2.875, + "learning_rate": 5.52434456928839e-05, + "loss": 1.1565, + "step": 295 + }, + { + "epoch": 0.0281069939569963, + "grad_norm": 6.21875, + "learning_rate": 5.6179775280898885e-05, + "loss": 1.1315, + "step": 300 + }, + { + "epoch": 0.02857544385627957, + "grad_norm": 2.09375, + "learning_rate": 5.711610486891385e-05, + "loss": 1.1626, + "step": 305 + }, + { + "epoch": 0.029043893755562843, + "grad_norm": 5.15625, + "learning_rate": 5.805243445692884e-05, + "loss": 1.0912, + "step": 310 + }, + { + "epoch": 0.029512343654846115, + "grad_norm": 9.125, + "learning_rate": 5.898876404494382e-05, + "loss": 1.1472, + "step": 315 + }, + { + "epoch": 0.029980793554129387, + "grad_norm": 5.875, + "learning_rate": 5.992509363295881e-05, + "loss": 1.1274, + "step": 320 + }, + { + "epoch": 0.03044924345341266, + "grad_norm": 4.03125, + "learning_rate": 6.086142322097379e-05, + "loss": 1.1388, + "step": 325 + }, + { + "epoch": 0.030917693352695928, + "grad_norm": 1.90625, + "learning_rate": 6.179775280898876e-05, + "loss": 1.1032, + "step": 330 + }, + { + "epoch": 0.0313861432519792, + "grad_norm": 2.71875, + "learning_rate": 6.273408239700374e-05, + "loss": 1.1108, + "step": 335 + }, + { + "epoch": 0.031854593151262475, + "grad_norm": 1.84375, + "learning_rate": 6.367041198501872e-05, + "loss": 1.0866, + "step": 340 + }, + { + "epoch": 0.032323043050545744, + "grad_norm": 2.53125, + "learning_rate": 6.460674157303372e-05, + "loss": 1.1008, + "step": 345 + }, + { + "epoch": 0.03279149294982901, + "grad_norm": 1.9140625, + "learning_rate": 6.55430711610487e-05, + "loss": 1.1271, + "step": 350 + }, + { + "epoch": 0.03325994284911229, + "grad_norm": 2.90625, + "learning_rate": 6.647940074906367e-05, + "loss": 1.1526, + "step": 355 + }, + { + "epoch": 0.03372839274839556, + "grad_norm": 5.125, + "learning_rate": 6.741573033707866e-05, + "loss": 1.1009, + "step": 360 + }, + { + "epoch": 0.03419684264767883, + "grad_norm": 4.21875, + "learning_rate": 6.835205992509364e-05, + "loss": 1.1185, + "step": 365 + }, + { + "epoch": 0.0346652925469621, + "grad_norm": 2.078125, + "learning_rate": 6.928838951310862e-05, + "loss": 1.1078, + "step": 370 + }, + { + "epoch": 0.035133742446245377, + "grad_norm": 10.4375, + "learning_rate": 7.02247191011236e-05, + "loss": 1.0713, + "step": 375 + }, + { + "epoch": 0.035602192345528645, + "grad_norm": 2.359375, + "learning_rate": 7.116104868913858e-05, + "loss": 1.1002, + "step": 380 + }, + { + "epoch": 0.03607064224481192, + "grad_norm": 9.0625, + "learning_rate": 7.209737827715356e-05, + "loss": 1.0704, + "step": 385 + }, + { + "epoch": 0.03653909214409519, + "grad_norm": 4.5625, + "learning_rate": 7.303370786516854e-05, + "loss": 1.1168, + "step": 390 + }, + { + "epoch": 0.03700754204337846, + "grad_norm": 2.15625, + "learning_rate": 7.397003745318352e-05, + "loss": 1.0697, + "step": 395 + }, + { + "epoch": 0.037475991942661734, + "grad_norm": 1.4453125, + "learning_rate": 7.49063670411985e-05, + "loss": 1.0654, + "step": 400 + }, + { + "epoch": 0.037944441841945, + "grad_norm": 3.15625, + "learning_rate": 7.584269662921349e-05, + "loss": 1.0849, + "step": 405 + }, + { + "epoch": 0.03841289174122828, + "grad_norm": 1.5859375, + "learning_rate": 7.677902621722847e-05, + "loss": 1.0929, + "step": 410 + }, + { + "epoch": 0.038881341640511546, + "grad_norm": 4.9375, + "learning_rate": 7.771535580524345e-05, + "loss": 1.0614, + "step": 415 + }, + { + "epoch": 0.03934979153979482, + "grad_norm": 2.0625, + "learning_rate": 7.865168539325843e-05, + "loss": 1.0803, + "step": 420 + }, + { + "epoch": 0.03981824143907809, + "grad_norm": 1.7890625, + "learning_rate": 7.958801498127341e-05, + "loss": 1.0723, + "step": 425 + }, + { + "epoch": 0.04028669133836136, + "grad_norm": 5.21875, + "learning_rate": 8.052434456928839e-05, + "loss": 1.087, + "step": 430 + }, + { + "epoch": 0.040755141237644635, + "grad_norm": 3.140625, + "learning_rate": 8.146067415730337e-05, + "loss": 1.0719, + "step": 435 + }, + { + "epoch": 0.0412235911369279, + "grad_norm": 1.5078125, + "learning_rate": 8.239700374531836e-05, + "loss": 1.1398, + "step": 440 + }, + { + "epoch": 0.04169204103621118, + "grad_norm": 3.0625, + "learning_rate": 8.333333333333334e-05, + "loss": 1.0681, + "step": 445 + }, + { + "epoch": 0.04216049093549445, + "grad_norm": 7.5625, + "learning_rate": 8.426966292134831e-05, + "loss": 1.1118, + "step": 450 + }, + { + "epoch": 0.04262894083477772, + "grad_norm": 1.71875, + "learning_rate": 8.520599250936329e-05, + "loss": 1.0653, + "step": 455 + }, + { + "epoch": 0.04309739073406099, + "grad_norm": 2.609375, + "learning_rate": 8.614232209737829e-05, + "loss": 1.0697, + "step": 460 + }, + { + "epoch": 0.04356584063334426, + "grad_norm": 2.5625, + "learning_rate": 8.707865168539327e-05, + "loss": 1.0489, + "step": 465 + }, + { + "epoch": 0.044034290532627536, + "grad_norm": 1.9375, + "learning_rate": 8.801498127340825e-05, + "loss": 1.0835, + "step": 470 + }, + { + "epoch": 0.044502740431910805, + "grad_norm": 2.34375, + "learning_rate": 8.895131086142321e-05, + "loss": 1.0403, + "step": 475 + }, + { + "epoch": 0.04497119033119408, + "grad_norm": 4.03125, + "learning_rate": 8.988764044943821e-05, + "loss": 1.0366, + "step": 480 + }, + { + "epoch": 0.04543964023047735, + "grad_norm": 2.046875, + "learning_rate": 9.082397003745319e-05, + "loss": 1.0804, + "step": 485 + }, + { + "epoch": 0.045908090129760624, + "grad_norm": 6.21875, + "learning_rate": 9.176029962546817e-05, + "loss": 1.0536, + "step": 490 + }, + { + "epoch": 0.04637654002904389, + "grad_norm": 2.328125, + "learning_rate": 9.269662921348315e-05, + "loss": 1.0541, + "step": 495 + }, + { + "epoch": 0.04684498992832717, + "grad_norm": 4.875, + "learning_rate": 9.363295880149813e-05, + "loss": 1.037, + "step": 500 + }, + { + "epoch": 0.04731343982761044, + "grad_norm": 4.9375, + "learning_rate": 9.456928838951311e-05, + "loss": 1.0434, + "step": 505 + }, + { + "epoch": 0.047781889726893706, + "grad_norm": 1.6328125, + "learning_rate": 9.550561797752809e-05, + "loss": 1.0342, + "step": 510 + }, + { + "epoch": 0.04825033962617698, + "grad_norm": 1.609375, + "learning_rate": 9.644194756554307e-05, + "loss": 1.0589, + "step": 515 + }, + { + "epoch": 0.04871878952546025, + "grad_norm": 1.7109375, + "learning_rate": 9.737827715355807e-05, + "loss": 1.0573, + "step": 520 + }, + { + "epoch": 0.049187239424743526, + "grad_norm": 2.515625, + "learning_rate": 9.831460674157303e-05, + "loss": 1.0491, + "step": 525 + }, + { + "epoch": 0.049655689324026794, + "grad_norm": 3.140625, + "learning_rate": 9.925093632958801e-05, + "loss": 1.0508, + "step": 530 + }, + { + "epoch": 0.05012413922331007, + "grad_norm": 3.484375, + "learning_rate": 0.000100187265917603, + "loss": 1.0257, + "step": 535 + }, + { + "epoch": 0.05059258912259334, + "grad_norm": 6.34375, + "learning_rate": 0.00010112359550561799, + "loss": 1.0485, + "step": 540 + }, + { + "epoch": 0.05106103902187661, + "grad_norm": 5.0, + "learning_rate": 0.00010205992509363296, + "loss": 1.0738, + "step": 545 + }, + { + "epoch": 0.05152948892115988, + "grad_norm": 2.59375, + "learning_rate": 0.00010299625468164795, + "loss": 1.05, + "step": 550 + }, + { + "epoch": 0.05199793882044315, + "grad_norm": 1.6328125, + "learning_rate": 0.00010393258426966293, + "loss": 1.1056, + "step": 555 + }, + { + "epoch": 0.05246638871972643, + "grad_norm": 4.3125, + "learning_rate": 0.0001048689138576779, + "loss": 1.0542, + "step": 560 + }, + { + "epoch": 0.052934838619009696, + "grad_norm": 2.421875, + "learning_rate": 0.00010580524344569289, + "loss": 1.0249, + "step": 565 + }, + { + "epoch": 0.05340328851829297, + "grad_norm": 4.5625, + "learning_rate": 0.00010674157303370786, + "loss": 1.0088, + "step": 570 + }, + { + "epoch": 0.05387173841757624, + "grad_norm": 4.84375, + "learning_rate": 0.00010767790262172285, + "loss": 1.0343, + "step": 575 + }, + { + "epoch": 0.05434018831685951, + "grad_norm": 2.625, + "learning_rate": 0.00010861423220973783, + "loss": 1.053, + "step": 580 + }, + { + "epoch": 0.054808638216142784, + "grad_norm": 2.375, + "learning_rate": 0.0001095505617977528, + "loss": 1.0745, + "step": 585 + }, + { + "epoch": 0.05527708811542605, + "grad_norm": 2.59375, + "learning_rate": 0.0001104868913857678, + "loss": 1.0639, + "step": 590 + }, + { + "epoch": 0.05574553801470933, + "grad_norm": 13.6875, + "learning_rate": 0.00011142322097378278, + "loss": 1.0186, + "step": 595 + }, + { + "epoch": 0.0562139879139926, + "grad_norm": 1.0703125, + "learning_rate": 0.00011235955056179777, + "loss": 1.037, + "step": 600 + }, + { + "epoch": 0.05668243781327587, + "grad_norm": 1.34375, + "learning_rate": 0.00011329588014981274, + "loss": 1.0343, + "step": 605 + }, + { + "epoch": 0.05715088771255914, + "grad_norm": 1.171875, + "learning_rate": 0.0001142322097378277, + "loss": 1.0305, + "step": 610 + }, + { + "epoch": 0.05761933761184242, + "grad_norm": 1.0546875, + "learning_rate": 0.0001151685393258427, + "loss": 1.0014, + "step": 615 + }, + { + "epoch": 0.058087787511125685, + "grad_norm": 2.3125, + "learning_rate": 0.00011610486891385768, + "loss": 1.0608, + "step": 620 + }, + { + "epoch": 0.058556237410408954, + "grad_norm": 3.03125, + "learning_rate": 0.00011704119850187267, + "loss": 1.0366, + "step": 625 + }, + { + "epoch": 0.05902468730969223, + "grad_norm": 3.203125, + "learning_rate": 0.00011797752808988764, + "loss": 1.043, + "step": 630 + }, + { + "epoch": 0.0594931372089755, + "grad_norm": 4.21875, + "learning_rate": 0.00011891385767790262, + "loss": 1.0977, + "step": 635 + }, + { + "epoch": 0.059961587108258774, + "grad_norm": 2.34375, + "learning_rate": 0.00011985018726591762, + "loss": 1.0167, + "step": 640 + }, + { + "epoch": 0.06043003700754204, + "grad_norm": 1.484375, + "learning_rate": 0.00012078651685393258, + "loss": 1.0229, + "step": 645 + }, + { + "epoch": 0.06089848690682532, + "grad_norm": 1.2734375, + "learning_rate": 0.00012172284644194758, + "loss": 1.045, + "step": 650 + }, + { + "epoch": 0.06136693680610859, + "grad_norm": 1.1875, + "learning_rate": 0.00012265917602996256, + "loss": 1.0012, + "step": 655 + }, + { + "epoch": 0.061835386705391855, + "grad_norm": 1.609375, + "learning_rate": 0.00012359550561797752, + "loss": 1.0454, + "step": 660 + }, + { + "epoch": 0.06230383660467513, + "grad_norm": 0.9375, + "learning_rate": 0.00012453183520599252, + "loss": 1.0195, + "step": 665 + }, + { + "epoch": 0.0627722865039584, + "grad_norm": 1.046875, + "learning_rate": 0.00012546816479400749, + "loss": 1.0344, + "step": 670 + }, + { + "epoch": 0.06324073640324167, + "grad_norm": 2.96875, + "learning_rate": 0.00012640449438202248, + "loss": 0.9961, + "step": 675 + }, + { + "epoch": 0.06370918630252495, + "grad_norm": 2.125, + "learning_rate": 0.00012734082397003745, + "loss": 1.1804, + "step": 680 + }, + { + "epoch": 0.06417763620180822, + "grad_norm": 2.03125, + "learning_rate": 0.00012827715355805244, + "loss": 1.0536, + "step": 685 + }, + { + "epoch": 0.06464608610109149, + "grad_norm": 2.46875, + "learning_rate": 0.00012921348314606744, + "loss": 0.9982, + "step": 690 + }, + { + "epoch": 0.06511453600037476, + "grad_norm": 3.65625, + "learning_rate": 0.0001301498127340824, + "loss": 1.026, + "step": 695 + }, + { + "epoch": 0.06558298589965803, + "grad_norm": 2.0625, + "learning_rate": 0.0001310861423220974, + "loss": 1.0377, + "step": 700 + }, + { + "epoch": 0.06605143579894131, + "grad_norm": 3.3125, + "learning_rate": 0.00013202247191011236, + "loss": 1.0295, + "step": 705 + }, + { + "epoch": 0.06651988569822458, + "grad_norm": 3.1875, + "learning_rate": 0.00013295880149812733, + "loss": 1.0156, + "step": 710 + }, + { + "epoch": 0.06698833559750784, + "grad_norm": 3.0625, + "learning_rate": 0.00013389513108614233, + "loss": 0.9841, + "step": 715 + }, + { + "epoch": 0.06745678549679111, + "grad_norm": 2.8125, + "learning_rate": 0.00013483146067415732, + "loss": 1.0149, + "step": 720 + }, + { + "epoch": 0.0679252353960744, + "grad_norm": 3.78125, + "learning_rate": 0.00013576779026217231, + "loss": 1.0258, + "step": 725 + }, + { + "epoch": 0.06839368529535766, + "grad_norm": 6.15625, + "learning_rate": 0.00013670411985018728, + "loss": 1.022, + "step": 730 + }, + { + "epoch": 0.06886213519464093, + "grad_norm": 3.375, + "learning_rate": 0.00013764044943820225, + "loss": 1.0308, + "step": 735 + }, + { + "epoch": 0.0693305850939242, + "grad_norm": 1.0703125, + "learning_rate": 0.00013857677902621724, + "loss": 0.9978, + "step": 740 + }, + { + "epoch": 0.06979903499320747, + "grad_norm": 2.671875, + "learning_rate": 0.0001395131086142322, + "loss": 1.0429, + "step": 745 + }, + { + "epoch": 0.07026748489249075, + "grad_norm": 1.921875, + "learning_rate": 0.0001404494382022472, + "loss": 1.0349, + "step": 750 + }, + { + "epoch": 0.07073593479177402, + "grad_norm": 2.65625, + "learning_rate": 0.00014138576779026217, + "loss": 0.9822, + "step": 755 + }, + { + "epoch": 0.07120438469105729, + "grad_norm": 2.078125, + "learning_rate": 0.00014232209737827716, + "loss": 1.0195, + "step": 760 + }, + { + "epoch": 0.07167283459034056, + "grad_norm": 2.265625, + "learning_rate": 0.00014325842696629216, + "loss": 1.0603, + "step": 765 + }, + { + "epoch": 0.07214128448962384, + "grad_norm": 0.94140625, + "learning_rate": 0.00014419475655430713, + "loss": 1.0338, + "step": 770 + }, + { + "epoch": 0.07260973438890711, + "grad_norm": 2.0, + "learning_rate": 0.00014513108614232212, + "loss": 0.994, + "step": 775 + }, + { + "epoch": 0.07307818428819038, + "grad_norm": 1.0, + "learning_rate": 0.0001460674157303371, + "loss": 0.9962, + "step": 780 + }, + { + "epoch": 0.07354663418747365, + "grad_norm": 1.21875, + "learning_rate": 0.00014700374531835205, + "loss": 1.0253, + "step": 785 + }, + { + "epoch": 0.07401508408675692, + "grad_norm": 3.515625, + "learning_rate": 0.00014794007490636705, + "loss": 1.016, + "step": 790 + }, + { + "epoch": 0.0744835339860402, + "grad_norm": 1.21875, + "learning_rate": 0.00014887640449438202, + "loss": 1.0569, + "step": 795 + }, + { + "epoch": 0.07495198388532347, + "grad_norm": 3.75, + "learning_rate": 0.000149812734082397, + "loss": 1.0709, + "step": 800 + }, + { + "epoch": 0.07542043378460674, + "grad_norm": 4.9375, + "learning_rate": 0.000150749063670412, + "loss": 0.9931, + "step": 805 + }, + { + "epoch": 0.07588888368389, + "grad_norm": 1.171875, + "learning_rate": 0.00015168539325842697, + "loss": 1.0313, + "step": 810 + }, + { + "epoch": 0.07635733358317327, + "grad_norm": 3.265625, + "learning_rate": 0.00015262172284644197, + "loss": 1.0261, + "step": 815 + }, + { + "epoch": 0.07682578348245656, + "grad_norm": 5.0625, + "learning_rate": 0.00015355805243445693, + "loss": 1.0057, + "step": 820 + }, + { + "epoch": 0.07729423338173982, + "grad_norm": 2.265625, + "learning_rate": 0.00015449438202247193, + "loss": 1.0323, + "step": 825 + }, + { + "epoch": 0.07776268328102309, + "grad_norm": 2.046875, + "learning_rate": 0.0001554307116104869, + "loss": 1.0045, + "step": 830 + }, + { + "epoch": 0.07823113318030636, + "grad_norm": 0.765625, + "learning_rate": 0.00015636704119850186, + "loss": 1.0209, + "step": 835 + }, + { + "epoch": 0.07869958307958964, + "grad_norm": 1.046875, + "learning_rate": 0.00015730337078651685, + "loss": 1.0035, + "step": 840 + }, + { + "epoch": 0.07916803297887291, + "grad_norm": 0.83984375, + "learning_rate": 0.00015823970037453185, + "loss": 0.9975, + "step": 845 + }, + { + "epoch": 0.07963648287815618, + "grad_norm": 1.328125, + "learning_rate": 0.00015917602996254682, + "loss": 1.0231, + "step": 850 + }, + { + "epoch": 0.08010493277743945, + "grad_norm": 1.140625, + "learning_rate": 0.0001601123595505618, + "loss": 0.9831, + "step": 855 + }, + { + "epoch": 0.08057338267672272, + "grad_norm": 1.2890625, + "learning_rate": 0.00016104868913857678, + "loss": 1.0326, + "step": 860 + }, + { + "epoch": 0.081041832576006, + "grad_norm": 1.3671875, + "learning_rate": 0.00016198501872659177, + "loss": 1.0352, + "step": 865 + }, + { + "epoch": 0.08151028247528927, + "grad_norm": 0.93359375, + "learning_rate": 0.00016292134831460674, + "loss": 1.0035, + "step": 870 + }, + { + "epoch": 0.08197873237457254, + "grad_norm": 1.1796875, + "learning_rate": 0.00016385767790262173, + "loss": 0.9943, + "step": 875 + }, + { + "epoch": 0.0824471822738558, + "grad_norm": 2.15625, + "learning_rate": 0.00016479400749063673, + "loss": 0.99, + "step": 880 + }, + { + "epoch": 0.08291563217313909, + "grad_norm": 0.89453125, + "learning_rate": 0.0001657303370786517, + "loss": 1.0166, + "step": 885 + }, + { + "epoch": 0.08338408207242236, + "grad_norm": 1.7265625, + "learning_rate": 0.0001666666666666667, + "loss": 1.0082, + "step": 890 + }, + { + "epoch": 0.08385253197170563, + "grad_norm": 1.046875, + "learning_rate": 0.00016760299625468166, + "loss": 1.0033, + "step": 895 + }, + { + "epoch": 0.0843209818709889, + "grad_norm": 2.28125, + "learning_rate": 0.00016853932584269662, + "loss": 0.9985, + "step": 900 + }, + { + "epoch": 0.08478943177027216, + "grad_norm": 3.609375, + "learning_rate": 0.00016947565543071162, + "loss": 1.0126, + "step": 905 + }, + { + "epoch": 0.08525788166955545, + "grad_norm": 1.609375, + "learning_rate": 0.00017041198501872658, + "loss": 0.9762, + "step": 910 + }, + { + "epoch": 0.08572633156883872, + "grad_norm": 0.8828125, + "learning_rate": 0.00017134831460674158, + "loss": 1.0369, + "step": 915 + }, + { + "epoch": 0.08619478146812198, + "grad_norm": 1.09375, + "learning_rate": 0.00017228464419475657, + "loss": 1.0006, + "step": 920 + }, + { + "epoch": 0.08666323136740525, + "grad_norm": 1.0, + "learning_rate": 0.00017322097378277154, + "loss": 0.9776, + "step": 925 + }, + { + "epoch": 0.08713168126668852, + "grad_norm": 0.82421875, + "learning_rate": 0.00017415730337078653, + "loss": 0.9939, + "step": 930 + }, + { + "epoch": 0.0876001311659718, + "grad_norm": 1.0234375, + "learning_rate": 0.0001750936329588015, + "loss": 0.9966, + "step": 935 + }, + { + "epoch": 0.08806858106525507, + "grad_norm": 0.953125, + "learning_rate": 0.0001760299625468165, + "loss": 1.023, + "step": 940 + }, + { + "epoch": 0.08853703096453834, + "grad_norm": 1.5859375, + "learning_rate": 0.00017696629213483146, + "loss": 1.0315, + "step": 945 + }, + { + "epoch": 0.08900548086382161, + "grad_norm": 0.8359375, + "learning_rate": 0.00017790262172284643, + "loss": 0.9933, + "step": 950 + }, + { + "epoch": 0.08947393076310489, + "grad_norm": 1.0859375, + "learning_rate": 0.00017883895131086142, + "loss": 0.9714, + "step": 955 + }, + { + "epoch": 0.08994238066238816, + "grad_norm": 0.7578125, + "learning_rate": 0.00017977528089887642, + "loss": 1.0209, + "step": 960 + }, + { + "epoch": 0.09041083056167143, + "grad_norm": 0.9140625, + "learning_rate": 0.0001807116104868914, + "loss": 0.9626, + "step": 965 + }, + { + "epoch": 0.0908792804609547, + "grad_norm": 3.234375, + "learning_rate": 0.00018164794007490638, + "loss": 1.0271, + "step": 970 + }, + { + "epoch": 0.09134773036023797, + "grad_norm": 0.85546875, + "learning_rate": 0.00018258426966292135, + "loss": 0.9851, + "step": 975 + }, + { + "epoch": 0.09181618025952125, + "grad_norm": 1.0546875, + "learning_rate": 0.00018352059925093634, + "loss": 1.0166, + "step": 980 + }, + { + "epoch": 0.09228463015880452, + "grad_norm": 0.76171875, + "learning_rate": 0.0001844569288389513, + "loss": 0.9921, + "step": 985 + }, + { + "epoch": 0.09275308005808779, + "grad_norm": 1.3046875, + "learning_rate": 0.0001853932584269663, + "loss": 1.0042, + "step": 990 + }, + { + "epoch": 0.09322152995737105, + "grad_norm": 1.3203125, + "learning_rate": 0.0001863295880149813, + "loss": 0.9728, + "step": 995 + }, + { + "epoch": 0.09368997985665434, + "grad_norm": 0.96875, + "learning_rate": 0.00018726591760299626, + "loss": 1.0043, + "step": 1000 + }, + { + "epoch": 0.0941584297559376, + "grad_norm": 0.85546875, + "learning_rate": 0.00018820224719101126, + "loss": 0.9723, + "step": 1005 + }, + { + "epoch": 0.09462687965522087, + "grad_norm": 1.3515625, + "learning_rate": 0.00018913857677902622, + "loss": 1.002, + "step": 1010 + }, + { + "epoch": 0.09509532955450414, + "grad_norm": 0.8046875, + "learning_rate": 0.00019007490636704122, + "loss": 1.0008, + "step": 1015 + }, + { + "epoch": 0.09556377945378741, + "grad_norm": 0.73046875, + "learning_rate": 0.00019101123595505618, + "loss": 1.0241, + "step": 1020 + }, + { + "epoch": 0.0960322293530707, + "grad_norm": 0.9921875, + "learning_rate": 0.00019194756554307115, + "loss": 0.9787, + "step": 1025 + }, + { + "epoch": 0.09650067925235396, + "grad_norm": 2.1875, + "learning_rate": 0.00019288389513108615, + "loss": 1.0108, + "step": 1030 + }, + { + "epoch": 0.09696912915163723, + "grad_norm": 0.875, + "learning_rate": 0.00019382022471910114, + "loss": 1.0067, + "step": 1035 + }, + { + "epoch": 0.0974375790509205, + "grad_norm": 2.03125, + "learning_rate": 0.00019475655430711613, + "loss": 0.9861, + "step": 1040 + }, + { + "epoch": 0.09790602895020377, + "grad_norm": 3.90625, + "learning_rate": 0.0001956928838951311, + "loss": 1.0238, + "step": 1045 + }, + { + "epoch": 0.09837447884948705, + "grad_norm": 0.8515625, + "learning_rate": 0.00019662921348314607, + "loss": 0.9925, + "step": 1050 + }, + { + "epoch": 0.09884292874877032, + "grad_norm": 0.73828125, + "learning_rate": 0.00019756554307116106, + "loss": 0.9541, + "step": 1055 + }, + { + "epoch": 0.09931137864805359, + "grad_norm": 1.4296875, + "learning_rate": 0.00019850187265917603, + "loss": 1.1368, + "step": 1060 + }, + { + "epoch": 0.09977982854733686, + "grad_norm": 0.83984375, + "learning_rate": 0.00019943820224719102, + "loss": 0.9589, + "step": 1065 + }, + { + "epoch": 0.10024827844662014, + "grad_norm": 0.921875, + "learning_rate": 0.0001999999786038819, + "loss": 0.9608, + "step": 1070 + }, + { + "epoch": 0.10071672834590341, + "grad_norm": 2.25, + "learning_rate": 0.00019999973789765796, + "loss": 0.9608, + "step": 1075 + }, + { + "epoch": 0.10118517824518668, + "grad_norm": 1.28125, + "learning_rate": 0.0001999992297407084, + "loss": 1.0051, + "step": 1080 + }, + { + "epoch": 0.10165362814446995, + "grad_norm": 0.8203125, + "learning_rate": 0.00019999845413439226, + "loss": 1.0035, + "step": 1085 + }, + { + "epoch": 0.10212207804375321, + "grad_norm": 0.94921875, + "learning_rate": 0.00019999741108078393, + "loss": 0.982, + "step": 1090 + }, + { + "epoch": 0.1025905279430365, + "grad_norm": 2.625, + "learning_rate": 0.00019999610058267302, + "loss": 0.9796, + "step": 1095 + }, + { + "epoch": 0.10305897784231977, + "grad_norm": 1.15625, + "learning_rate": 0.00019999452264356452, + "loss": 1.0045, + "step": 1100 + }, + { + "epoch": 0.10352742774160303, + "grad_norm": 1.21875, + "learning_rate": 0.00019999267726767862, + "loss": 1.0186, + "step": 1105 + }, + { + "epoch": 0.1039958776408863, + "grad_norm": 1.703125, + "learning_rate": 0.00019999056445995083, + "loss": 0.988, + "step": 1110 + }, + { + "epoch": 0.10446432754016959, + "grad_norm": 4.34375, + "learning_rate": 0.00019998818422603187, + "loss": 1.0081, + "step": 1115 + }, + { + "epoch": 0.10493277743945285, + "grad_norm": 1.6875, + "learning_rate": 0.00019998553657228773, + "loss": 1.0179, + "step": 1120 + }, + { + "epoch": 0.10540122733873612, + "grad_norm": 1.0078125, + "learning_rate": 0.00019998262150579957, + "loss": 0.9541, + "step": 1125 + }, + { + "epoch": 0.10586967723801939, + "grad_norm": 2.625, + "learning_rate": 0.0001999794390343638, + "loss": 0.9809, + "step": 1130 + }, + { + "epoch": 0.10633812713730266, + "grad_norm": 0.77734375, + "learning_rate": 0.00019997598916649196, + "loss": 0.9732, + "step": 1135 + }, + { + "epoch": 0.10680657703658594, + "grad_norm": 0.75, + "learning_rate": 0.0001999722719114108, + "loss": 1.0038, + "step": 1140 + }, + { + "epoch": 0.10727502693586921, + "grad_norm": 1.5703125, + "learning_rate": 0.00019996828727906213, + "loss": 0.9543, + "step": 1145 + }, + { + "epoch": 0.10774347683515248, + "grad_norm": 1.90625, + "learning_rate": 0.00019996403528010296, + "loss": 0.9554, + "step": 1150 + }, + { + "epoch": 0.10821192673443575, + "grad_norm": 0.7109375, + "learning_rate": 0.00019995951592590528, + "loss": 0.9916, + "step": 1155 + }, + { + "epoch": 0.10868037663371902, + "grad_norm": 0.81640625, + "learning_rate": 0.0001999547292285562, + "loss": 1.0122, + "step": 1160 + }, + { + "epoch": 0.1091488265330023, + "grad_norm": 1.4296875, + "learning_rate": 0.00019994967520085776, + "loss": 0.9566, + "step": 1165 + }, + { + "epoch": 0.10961727643228557, + "grad_norm": 0.9296875, + "learning_rate": 0.00019994435385632707, + "loss": 0.9811, + "step": 1170 + }, + { + "epoch": 0.11008572633156884, + "grad_norm": 2.09375, + "learning_rate": 0.00019993876520919615, + "loss": 0.9731, + "step": 1175 + }, + { + "epoch": 0.1105541762308521, + "grad_norm": 1.3125, + "learning_rate": 0.00019993290927441192, + "loss": 0.9967, + "step": 1180 + }, + { + "epoch": 0.11102262613013539, + "grad_norm": 2.875, + "learning_rate": 0.0001999267860676361, + "loss": 0.9995, + "step": 1185 + }, + { + "epoch": 0.11149107602941866, + "grad_norm": 3.953125, + "learning_rate": 0.00019992039560524534, + "loss": 0.984, + "step": 1190 + }, + { + "epoch": 0.11195952592870193, + "grad_norm": 1.359375, + "learning_rate": 0.00019991373790433103, + "loss": 0.9865, + "step": 1195 + }, + { + "epoch": 0.1124279758279852, + "grad_norm": 1.609375, + "learning_rate": 0.00019990681298269927, + "loss": 0.9805, + "step": 1200 + }, + { + "epoch": 0.11289642572726846, + "grad_norm": 1.7578125, + "learning_rate": 0.00019989962085887087, + "loss": 0.9892, + "step": 1205 + }, + { + "epoch": 0.11336487562655174, + "grad_norm": 0.71875, + "learning_rate": 0.00019989216155208125, + "loss": 0.9501, + "step": 1210 + }, + { + "epoch": 0.11383332552583501, + "grad_norm": 0.79296875, + "learning_rate": 0.00019988443508228045, + "loss": 0.9623, + "step": 1215 + }, + { + "epoch": 0.11430177542511828, + "grad_norm": 0.66015625, + "learning_rate": 0.00019987644147013303, + "loss": 0.9741, + "step": 1220 + }, + { + "epoch": 0.11477022532440155, + "grad_norm": 0.69921875, + "learning_rate": 0.000199868180737018, + "loss": 0.9947, + "step": 1225 + }, + { + "epoch": 0.11523867522368483, + "grad_norm": 0.88671875, + "learning_rate": 0.00019985965290502883, + "loss": 0.9716, + "step": 1230 + }, + { + "epoch": 0.1157071251229681, + "grad_norm": 0.71484375, + "learning_rate": 0.0001998508579969733, + "loss": 0.9408, + "step": 1235 + }, + { + "epoch": 0.11617557502225137, + "grad_norm": 0.86328125, + "learning_rate": 0.0001998417960363735, + "loss": 1.018, + "step": 1240 + }, + { + "epoch": 0.11664402492153464, + "grad_norm": 0.80859375, + "learning_rate": 0.00019983246704746588, + "loss": 0.9403, + "step": 1245 + }, + { + "epoch": 0.11711247482081791, + "grad_norm": 1.078125, + "learning_rate": 0.00019982287105520087, + "loss": 0.9866, + "step": 1250 + }, + { + "epoch": 0.11758092472010119, + "grad_norm": 0.6953125, + "learning_rate": 0.00019981300808524303, + "loss": 1.0338, + "step": 1255 + }, + { + "epoch": 0.11804937461938446, + "grad_norm": 1.3671875, + "learning_rate": 0.00019980287816397113, + "loss": 1.0068, + "step": 1260 + }, + { + "epoch": 0.11851782451866773, + "grad_norm": 1.1328125, + "learning_rate": 0.00019979248131847774, + "loss": 0.969, + "step": 1265 + }, + { + "epoch": 0.118986274417951, + "grad_norm": 2.28125, + "learning_rate": 0.00019978181757656938, + "loss": 1.0264, + "step": 1270 + }, + { + "epoch": 0.11945472431723426, + "grad_norm": 1.2734375, + "learning_rate": 0.00019977088696676636, + "loss": 0.9888, + "step": 1275 + }, + { + "epoch": 0.11992317421651755, + "grad_norm": 0.71875, + "learning_rate": 0.00019975968951830276, + "loss": 0.9585, + "step": 1280 + }, + { + "epoch": 0.12039162411580082, + "grad_norm": 1.078125, + "learning_rate": 0.00019974822526112632, + "loss": 0.9908, + "step": 1285 + }, + { + "epoch": 0.12086007401508408, + "grad_norm": 0.6875, + "learning_rate": 0.00019973649422589835, + "loss": 0.9777, + "step": 1290 + }, + { + "epoch": 0.12132852391436735, + "grad_norm": 1.390625, + "learning_rate": 0.00019972449644399372, + "loss": 0.9487, + "step": 1295 + }, + { + "epoch": 0.12179697381365064, + "grad_norm": 1.265625, + "learning_rate": 0.0001997122319475006, + "loss": 1.0256, + "step": 1300 + }, + { + "epoch": 0.1222654237129339, + "grad_norm": 0.9609375, + "learning_rate": 0.0001996997007692206, + "loss": 0.9939, + "step": 1305 + }, + { + "epoch": 0.12273387361221717, + "grad_norm": 1.859375, + "learning_rate": 0.0001996869029426685, + "loss": 0.9643, + "step": 1310 + }, + { + "epoch": 0.12320232351150044, + "grad_norm": 1.484375, + "learning_rate": 0.0001996738385020723, + "loss": 0.9645, + "step": 1315 + }, + { + "epoch": 0.12367077341078371, + "grad_norm": 1.3203125, + "learning_rate": 0.00019966050748237307, + "loss": 0.9948, + "step": 1320 + }, + { + "epoch": 0.12413922331006699, + "grad_norm": 0.66015625, + "learning_rate": 0.00019964690991922473, + "loss": 0.9915, + "step": 1325 + }, + { + "epoch": 0.12460767320935026, + "grad_norm": 0.625, + "learning_rate": 0.0001996330458489942, + "loss": 0.942, + "step": 1330 + }, + { + "epoch": 0.12507612310863353, + "grad_norm": 0.9609375, + "learning_rate": 0.00019961891530876114, + "loss": 0.9838, + "step": 1335 + }, + { + "epoch": 0.1255445730079168, + "grad_norm": 0.70703125, + "learning_rate": 0.00019960451833631789, + "loss": 0.9848, + "step": 1340 + }, + { + "epoch": 0.12601302290720007, + "grad_norm": 0.76953125, + "learning_rate": 0.0001995898549701693, + "loss": 0.9989, + "step": 1345 + }, + { + "epoch": 0.12648147280648334, + "grad_norm": 0.96484375, + "learning_rate": 0.00019957492524953283, + "loss": 0.9635, + "step": 1350 + }, + { + "epoch": 0.1269499227057666, + "grad_norm": 0.91796875, + "learning_rate": 0.00019955972921433817, + "loss": 0.9777, + "step": 1355 + }, + { + "epoch": 0.1274183726050499, + "grad_norm": 0.75, + "learning_rate": 0.00019954426690522734, + "loss": 1.0064, + "step": 1360 + }, + { + "epoch": 0.12788682250433317, + "grad_norm": 0.703125, + "learning_rate": 0.00019952853836355456, + "loss": 0.9731, + "step": 1365 + }, + { + "epoch": 0.12835527240361644, + "grad_norm": 1.265625, + "learning_rate": 0.00019951254363138597, + "loss": 0.9619, + "step": 1370 + }, + { + "epoch": 0.1288237223028997, + "grad_norm": 0.91015625, + "learning_rate": 0.00019949628275149977, + "loss": 0.9523, + "step": 1375 + }, + { + "epoch": 0.12929217220218298, + "grad_norm": 0.65234375, + "learning_rate": 0.00019947975576738584, + "loss": 0.9971, + "step": 1380 + }, + { + "epoch": 0.12976062210146624, + "grad_norm": 1.84375, + "learning_rate": 0.00019946296272324593, + "loss": 0.9941, + "step": 1385 + }, + { + "epoch": 0.1302290720007495, + "grad_norm": 1.2890625, + "learning_rate": 0.00019944590366399322, + "loss": 0.9658, + "step": 1390 + }, + { + "epoch": 0.13069752190003278, + "grad_norm": 0.87890625, + "learning_rate": 0.00019942857863525237, + "loss": 0.9469, + "step": 1395 + }, + { + "epoch": 0.13116597179931605, + "grad_norm": 0.828125, + "learning_rate": 0.00019941098768335951, + "loss": 0.9712, + "step": 1400 + }, + { + "epoch": 0.13163442169859935, + "grad_norm": 1.25, + "learning_rate": 0.00019939313085536183, + "loss": 0.9528, + "step": 1405 + }, + { + "epoch": 0.13210287159788262, + "grad_norm": 0.83984375, + "learning_rate": 0.00019937500819901767, + "loss": 0.9763, + "step": 1410 + }, + { + "epoch": 0.13257132149716588, + "grad_norm": 0.73046875, + "learning_rate": 0.00019935661976279635, + "loss": 0.953, + "step": 1415 + }, + { + "epoch": 0.13303977139644915, + "grad_norm": 0.68359375, + "learning_rate": 0.00019933796559587797, + "loss": 0.982, + "step": 1420 + }, + { + "epoch": 0.13350822129573242, + "grad_norm": 1.015625, + "learning_rate": 0.00019931904574815347, + "loss": 0.9518, + "step": 1425 + }, + { + "epoch": 0.1339766711950157, + "grad_norm": 0.94921875, + "learning_rate": 0.00019929986027022412, + "loss": 0.9832, + "step": 1430 + }, + { + "epoch": 0.13444512109429896, + "grad_norm": 1.1484375, + "learning_rate": 0.00019928040921340183, + "loss": 0.9502, + "step": 1435 + }, + { + "epoch": 0.13491357099358223, + "grad_norm": 0.8671875, + "learning_rate": 0.00019926069262970875, + "loss": 0.991, + "step": 1440 + }, + { + "epoch": 0.1353820208928655, + "grad_norm": 0.75390625, + "learning_rate": 0.00019924071057187708, + "loss": 0.9673, + "step": 1445 + }, + { + "epoch": 0.1358504707921488, + "grad_norm": 1.4765625, + "learning_rate": 0.00019922046309334918, + "loss": 0.9941, + "step": 1450 + }, + { + "epoch": 0.13631892069143206, + "grad_norm": 1.015625, + "learning_rate": 0.0001991999502482772, + "loss": 0.9716, + "step": 1455 + }, + { + "epoch": 0.13678737059071533, + "grad_norm": 1.7734375, + "learning_rate": 0.00019917917209152306, + "loss": 0.9603, + "step": 1460 + }, + { + "epoch": 0.1372558204899986, + "grad_norm": 0.69140625, + "learning_rate": 0.00019915812867865823, + "loss": 0.9475, + "step": 1465 + }, + { + "epoch": 0.13772427038928187, + "grad_norm": 1.609375, + "learning_rate": 0.00019913682006596358, + "loss": 0.9458, + "step": 1470 + }, + { + "epoch": 0.13819272028856514, + "grad_norm": 0.92578125, + "learning_rate": 0.00019911524631042934, + "loss": 0.9505, + "step": 1475 + }, + { + "epoch": 0.1386611701878484, + "grad_norm": 0.7265625, + "learning_rate": 0.00019909340746975478, + "loss": 0.953, + "step": 1480 + }, + { + "epoch": 0.13912962008713167, + "grad_norm": 1.015625, + "learning_rate": 0.00019907130360234825, + "loss": 0.9743, + "step": 1485 + }, + { + "epoch": 0.13959806998641494, + "grad_norm": 1.625, + "learning_rate": 0.00019904893476732684, + "loss": 0.9259, + "step": 1490 + }, + { + "epoch": 0.14006651988569824, + "grad_norm": 1.296875, + "learning_rate": 0.0001990263010245163, + "loss": 0.9878, + "step": 1495 + }, + { + "epoch": 0.1405349697849815, + "grad_norm": 1.2109375, + "learning_rate": 0.00019900340243445092, + "loss": 0.9407, + "step": 1500 + }, + { + "epoch": 0.14100341968426477, + "grad_norm": 1.078125, + "learning_rate": 0.0001989802390583733, + "loss": 0.9487, + "step": 1505 + }, + { + "epoch": 0.14147186958354804, + "grad_norm": 1.640625, + "learning_rate": 0.00019895681095823418, + "loss": 0.9912, + "step": 1510 + }, + { + "epoch": 0.1419403194828313, + "grad_norm": 0.7734375, + "learning_rate": 0.00019893311819669244, + "loss": 0.9411, + "step": 1515 + }, + { + "epoch": 0.14240876938211458, + "grad_norm": 1.453125, + "learning_rate": 0.0001989091608371146, + "loss": 0.9697, + "step": 1520 + }, + { + "epoch": 0.14287721928139785, + "grad_norm": 0.9296875, + "learning_rate": 0.00019888493894357505, + "loss": 0.917, + "step": 1525 + }, + { + "epoch": 0.14334566918068112, + "grad_norm": 1.21875, + "learning_rate": 0.00019886045258085553, + "loss": 0.9603, + "step": 1530 + }, + { + "epoch": 0.1438141190799644, + "grad_norm": 0.73828125, + "learning_rate": 0.0001988357018144452, + "loss": 0.9652, + "step": 1535 + }, + { + "epoch": 0.14428256897924768, + "grad_norm": 1.2890625, + "learning_rate": 0.0001988106867105403, + "loss": 1.1053, + "step": 1540 + }, + { + "epoch": 0.14475101887853095, + "grad_norm": 0.90625, + "learning_rate": 0.00019878540733604414, + "loss": 1.0219, + "step": 1545 + }, + { + "epoch": 0.14521946877781422, + "grad_norm": 0.796875, + "learning_rate": 0.00019875986375856672, + "loss": 0.9356, + "step": 1550 + }, + { + "epoch": 0.1456879186770975, + "grad_norm": 0.9296875, + "learning_rate": 0.00019873405604642472, + "loss": 0.9499, + "step": 1555 + }, + { + "epoch": 0.14615636857638076, + "grad_norm": 1.0078125, + "learning_rate": 0.00019870798426864123, + "loss": 0.9407, + "step": 1560 + }, + { + "epoch": 0.14662481847566403, + "grad_norm": 0.85546875, + "learning_rate": 0.0001986816484949456, + "loss": 0.9597, + "step": 1565 + }, + { + "epoch": 0.1470932683749473, + "grad_norm": 0.96484375, + "learning_rate": 0.00019865504879577322, + "loss": 0.9411, + "step": 1570 + }, + { + "epoch": 0.14756171827423056, + "grad_norm": 3.625, + "learning_rate": 0.00019862818524226538, + "loss": 1.0596, + "step": 1575 + }, + { + "epoch": 0.14803016817351383, + "grad_norm": 1.3046875, + "learning_rate": 0.00019860105790626902, + "loss": 0.9631, + "step": 1580 + }, + { + "epoch": 0.1484986180727971, + "grad_norm": 1.4765625, + "learning_rate": 0.0001985736668603366, + "loss": 0.9317, + "step": 1585 + }, + { + "epoch": 0.1489670679720804, + "grad_norm": 1.9921875, + "learning_rate": 0.00019854601217772586, + "loss": 0.9542, + "step": 1590 + }, + { + "epoch": 0.14943551787136367, + "grad_norm": 1.625, + "learning_rate": 0.00019851809393239963, + "loss": 0.9504, + "step": 1595 + }, + { + "epoch": 0.14990396777064693, + "grad_norm": 1.8359375, + "learning_rate": 0.0001984899121990257, + "loss": 0.9589, + "step": 1600 + }, + { + "epoch": 0.1503724176699302, + "grad_norm": 0.6953125, + "learning_rate": 0.00019846146705297646, + "loss": 0.9818, + "step": 1605 + }, + { + "epoch": 0.15084086756921347, + "grad_norm": 0.9609375, + "learning_rate": 0.0001984327585703289, + "loss": 0.9372, + "step": 1610 + }, + { + "epoch": 0.15130931746849674, + "grad_norm": 0.7265625, + "learning_rate": 0.00019840378682786422, + "loss": 0.9715, + "step": 1615 + }, + { + "epoch": 0.15177776736778, + "grad_norm": 0.6640625, + "learning_rate": 0.0001983745519030678, + "loss": 0.9831, + "step": 1620 + }, + { + "epoch": 0.15224621726706328, + "grad_norm": 0.7734375, + "learning_rate": 0.0001983450538741289, + "loss": 0.9464, + "step": 1625 + }, + { + "epoch": 0.15271466716634655, + "grad_norm": 0.69921875, + "learning_rate": 0.00019831529281994032, + "loss": 0.9777, + "step": 1630 + }, + { + "epoch": 0.15318311706562984, + "grad_norm": 0.828125, + "learning_rate": 0.0001982852688200985, + "loss": 0.9461, + "step": 1635 + }, + { + "epoch": 0.1536515669649131, + "grad_norm": 0.859375, + "learning_rate": 0.00019825498195490303, + "loss": 0.9725, + "step": 1640 + }, + { + "epoch": 0.15412001686419638, + "grad_norm": 1.1640625, + "learning_rate": 0.00019822443230535654, + "loss": 0.9799, + "step": 1645 + }, + { + "epoch": 0.15458846676347965, + "grad_norm": 0.76953125, + "learning_rate": 0.00019819361995316456, + "loss": 0.9815, + "step": 1650 + }, + { + "epoch": 0.15505691666276292, + "grad_norm": 0.71875, + "learning_rate": 0.00019816254498073512, + "loss": 0.931, + "step": 1655 + }, + { + "epoch": 0.15552536656204619, + "grad_norm": 0.64453125, + "learning_rate": 0.00019813120747117868, + "loss": 0.9635, + "step": 1660 + }, + { + "epoch": 0.15599381646132945, + "grad_norm": 0.64453125, + "learning_rate": 0.00019809960750830788, + "loss": 0.9625, + "step": 1665 + }, + { + "epoch": 0.15646226636061272, + "grad_norm": 0.67578125, + "learning_rate": 0.00019806774517663726, + "loss": 0.9563, + "step": 1670 + }, + { + "epoch": 0.156930716259896, + "grad_norm": 0.78125, + "learning_rate": 0.0001980356205613831, + "loss": 0.9396, + "step": 1675 + }, + { + "epoch": 0.1573991661591793, + "grad_norm": 0.73046875, + "learning_rate": 0.0001980032337484631, + "loss": 0.9367, + "step": 1680 + }, + { + "epoch": 0.15786761605846256, + "grad_norm": 0.7265625, + "learning_rate": 0.00019797058482449626, + "loss": 0.9413, + "step": 1685 + }, + { + "epoch": 0.15833606595774583, + "grad_norm": 0.78515625, + "learning_rate": 0.0001979376738768026, + "loss": 0.9907, + "step": 1690 + }, + { + "epoch": 0.1588045158570291, + "grad_norm": 0.60546875, + "learning_rate": 0.00019790450099340298, + "loss": 0.9707, + "step": 1695 + }, + { + "epoch": 0.15927296575631236, + "grad_norm": 0.95703125, + "learning_rate": 0.0001978710662630187, + "loss": 0.9337, + "step": 1700 + }, + { + "epoch": 0.15974141565559563, + "grad_norm": 0.71875, + "learning_rate": 0.00019783736977507137, + "loss": 0.9568, + "step": 1705 + }, + { + "epoch": 0.1602098655548789, + "grad_norm": 0.65234375, + "learning_rate": 0.00019780341161968279, + "loss": 0.9621, + "step": 1710 + }, + { + "epoch": 0.16067831545416217, + "grad_norm": 0.98046875, + "learning_rate": 0.00019776919188767452, + "loss": 0.9956, + "step": 1715 + }, + { + "epoch": 0.16114676535344544, + "grad_norm": 0.640625, + "learning_rate": 0.00019773471067056772, + "loss": 0.9403, + "step": 1720 + }, + { + "epoch": 0.16161521525272873, + "grad_norm": 1.25, + "learning_rate": 0.00019769996806058291, + "loss": 0.9364, + "step": 1725 + }, + { + "epoch": 0.162083665152012, + "grad_norm": 0.85546875, + "learning_rate": 0.00019766496415063965, + "loss": 0.9565, + "step": 1730 + }, + { + "epoch": 0.16255211505129527, + "grad_norm": 0.70703125, + "learning_rate": 0.00019762969903435647, + "loss": 0.9566, + "step": 1735 + }, + { + "epoch": 0.16302056495057854, + "grad_norm": 0.6015625, + "learning_rate": 0.00019759417280605036, + "loss": 0.9432, + "step": 1740 + }, + { + "epoch": 0.1634890148498618, + "grad_norm": 0.91796875, + "learning_rate": 0.00019755838556073682, + "loss": 0.945, + "step": 1745 + }, + { + "epoch": 0.16395746474914508, + "grad_norm": 1.4296875, + "learning_rate": 0.0001975223373941292, + "loss": 0.9445, + "step": 1750 + }, + { + "epoch": 0.16442591464842835, + "grad_norm": 2.375, + "learning_rate": 0.0001974860284026389, + "loss": 0.9567, + "step": 1755 + }, + { + "epoch": 0.1648943645477116, + "grad_norm": 1.234375, + "learning_rate": 0.00019744945868337493, + "loss": 0.9604, + "step": 1760 + }, + { + "epoch": 0.16536281444699488, + "grad_norm": 0.703125, + "learning_rate": 0.0001974126283341434, + "loss": 0.9302, + "step": 1765 + }, + { + "epoch": 0.16583126434627818, + "grad_norm": 1.046875, + "learning_rate": 0.00019737553745344765, + "loss": 0.9507, + "step": 1770 + }, + { + "epoch": 0.16629971424556145, + "grad_norm": 1.125, + "learning_rate": 0.0001973381861404878, + "loss": 0.9551, + "step": 1775 + }, + { + "epoch": 0.16676816414484472, + "grad_norm": 1.4765625, + "learning_rate": 0.00019730057449516043, + "loss": 0.939, + "step": 1780 + }, + { + "epoch": 0.16723661404412798, + "grad_norm": 1.265625, + "learning_rate": 0.0001972627026180584, + "loss": 0.9205, + "step": 1785 + }, + { + "epoch": 0.16770506394341125, + "grad_norm": 0.7109375, + "learning_rate": 0.00019722457061047062, + "loss": 0.9681, + "step": 1790 + }, + { + "epoch": 0.16817351384269452, + "grad_norm": 2.171875, + "learning_rate": 0.00019718617857438173, + "loss": 0.9651, + "step": 1795 + }, + { + "epoch": 0.1686419637419778, + "grad_norm": 0.6328125, + "learning_rate": 0.0001971475266124717, + "loss": 0.9494, + "step": 1800 + }, + { + "epoch": 0.16911041364126106, + "grad_norm": 0.7109375, + "learning_rate": 0.0001971086148281158, + "loss": 0.9315, + "step": 1805 + }, + { + "epoch": 0.16957886354054433, + "grad_norm": 0.703125, + "learning_rate": 0.00019706944332538415, + "loss": 0.959, + "step": 1810 + }, + { + "epoch": 0.17004731343982762, + "grad_norm": 1.1953125, + "learning_rate": 0.00019703001220904148, + "loss": 0.9456, + "step": 1815 + }, + { + "epoch": 0.1705157633391109, + "grad_norm": 1.4453125, + "learning_rate": 0.00019699032158454686, + "loss": 0.9194, + "step": 1820 + }, + { + "epoch": 0.17098421323839416, + "grad_norm": 1.1875, + "learning_rate": 0.00019695037155805344, + "loss": 0.9861, + "step": 1825 + }, + { + "epoch": 0.17145266313767743, + "grad_norm": 0.8203125, + "learning_rate": 0.00019691016223640818, + "loss": 0.9428, + "step": 1830 + }, + { + "epoch": 0.1719211130369607, + "grad_norm": 0.66796875, + "learning_rate": 0.00019686969372715142, + "loss": 0.9658, + "step": 1835 + }, + { + "epoch": 0.17238956293624397, + "grad_norm": 0.6796875, + "learning_rate": 0.0001968289661385168, + "loss": 0.957, + "step": 1840 + }, + { + "epoch": 0.17285801283552724, + "grad_norm": 1.015625, + "learning_rate": 0.00019678797957943083, + "loss": 0.9351, + "step": 1845 + }, + { + "epoch": 0.1733264627348105, + "grad_norm": 0.9609375, + "learning_rate": 0.00019674673415951265, + "loss": 0.9439, + "step": 1850 + }, + { + "epoch": 0.17379491263409377, + "grad_norm": 1.3203125, + "learning_rate": 0.00019670522998907375, + "loss": 0.9431, + "step": 1855 + }, + { + "epoch": 0.17426336253337704, + "grad_norm": 0.8671875, + "learning_rate": 0.00019666346717911757, + "loss": 0.9579, + "step": 1860 + }, + { + "epoch": 0.17473181243266034, + "grad_norm": 1.078125, + "learning_rate": 0.00019662144584133934, + "loss": 0.9198, + "step": 1865 + }, + { + "epoch": 0.1752002623319436, + "grad_norm": 1.328125, + "learning_rate": 0.00019657916608812579, + "loss": 0.9394, + "step": 1870 + }, + { + "epoch": 0.17566871223122688, + "grad_norm": 1.5390625, + "learning_rate": 0.00019653662803255468, + "loss": 0.9744, + "step": 1875 + }, + { + "epoch": 0.17613716213051014, + "grad_norm": 1.6796875, + "learning_rate": 0.00019649383178839468, + "loss": 0.9383, + "step": 1880 + }, + { + "epoch": 0.1766056120297934, + "grad_norm": 1.609375, + "learning_rate": 0.0001964507774701049, + "loss": 0.9408, + "step": 1885 + }, + { + "epoch": 0.17707406192907668, + "grad_norm": 1.1640625, + "learning_rate": 0.00019640746519283475, + "loss": 0.9479, + "step": 1890 + }, + { + "epoch": 0.17754251182835995, + "grad_norm": 0.93359375, + "learning_rate": 0.00019636389507242356, + "loss": 0.9514, + "step": 1895 + }, + { + "epoch": 0.17801096172764322, + "grad_norm": 0.953125, + "learning_rate": 0.00019632006722540023, + "loss": 0.9277, + "step": 1900 + }, + { + "epoch": 0.1784794116269265, + "grad_norm": 0.62890625, + "learning_rate": 0.00019627598176898294, + "loss": 0.9309, + "step": 1905 + }, + { + "epoch": 0.17894786152620978, + "grad_norm": 1.4765625, + "learning_rate": 0.00019623163882107888, + "loss": 0.9363, + "step": 1910 + }, + { + "epoch": 0.17941631142549305, + "grad_norm": 1.203125, + "learning_rate": 0.00019618703850028392, + "loss": 0.9387, + "step": 1915 + }, + { + "epoch": 0.17988476132477632, + "grad_norm": 0.62890625, + "learning_rate": 0.00019614218092588225, + "loss": 0.9706, + "step": 1920 + }, + { + "epoch": 0.1803532112240596, + "grad_norm": 0.6015625, + "learning_rate": 0.00019609706621784607, + "loss": 0.9215, + "step": 1925 + }, + { + "epoch": 0.18082166112334286, + "grad_norm": 0.7265625, + "learning_rate": 0.0001960516944968353, + "loss": 0.926, + "step": 1930 + }, + { + "epoch": 0.18129011102262613, + "grad_norm": 0.65234375, + "learning_rate": 0.0001960060658841973, + "loss": 0.9295, + "step": 1935 + }, + { + "epoch": 0.1817585609219094, + "grad_norm": 0.73046875, + "learning_rate": 0.0001959601805019664, + "loss": 0.9429, + "step": 1940 + }, + { + "epoch": 0.18222701082119266, + "grad_norm": 0.67578125, + "learning_rate": 0.00019591403847286372, + "loss": 0.9609, + "step": 1945 + }, + { + "epoch": 0.18269546072047593, + "grad_norm": 0.6328125, + "learning_rate": 0.0001958676399202968, + "loss": 0.9205, + "step": 1950 + }, + { + "epoch": 0.18316391061975923, + "grad_norm": 0.609375, + "learning_rate": 0.0001958209849683592, + "loss": 0.9242, + "step": 1955 + }, + { + "epoch": 0.1836323605190425, + "grad_norm": 0.81640625, + "learning_rate": 0.0001957740737418303, + "loss": 0.9499, + "step": 1960 + }, + { + "epoch": 0.18410081041832577, + "grad_norm": 0.82421875, + "learning_rate": 0.0001957269063661748, + "loss": 0.9658, + "step": 1965 + }, + { + "epoch": 0.18456926031760904, + "grad_norm": 0.64453125, + "learning_rate": 0.00019567948296754254, + "loss": 0.9494, + "step": 1970 + }, + { + "epoch": 0.1850377102168923, + "grad_norm": 0.703125, + "learning_rate": 0.00019563180367276808, + "loss": 0.9382, + "step": 1975 + }, + { + "epoch": 0.18550616011617557, + "grad_norm": 1.1875, + "learning_rate": 0.00019558386860937034, + "loss": 0.9361, + "step": 1980 + }, + { + "epoch": 0.18597461001545884, + "grad_norm": 1.109375, + "learning_rate": 0.00019553567790555238, + "loss": 0.9412, + "step": 1985 + }, + { + "epoch": 0.1864430599147421, + "grad_norm": 0.7578125, + "learning_rate": 0.00019548723169020087, + "loss": 0.9626, + "step": 1990 + }, + { + "epoch": 0.18691150981402538, + "grad_norm": 0.82421875, + "learning_rate": 0.00019543853009288596, + "loss": 0.9385, + "step": 1995 + }, + { + "epoch": 0.18737995971330867, + "grad_norm": 0.69140625, + "learning_rate": 0.00019538957324386074, + "loss": 0.9474, + "step": 2000 + }, + { + "epoch": 0.18784840961259194, + "grad_norm": 0.7734375, + "learning_rate": 0.00019534036127406097, + "loss": 0.9497, + "step": 2005 + }, + { + "epoch": 0.1883168595118752, + "grad_norm": 0.58203125, + "learning_rate": 0.00019529089431510484, + "loss": 0.9346, + "step": 2010 + }, + { + "epoch": 0.18878530941115848, + "grad_norm": 0.69921875, + "learning_rate": 0.00019524117249929238, + "loss": 0.944, + "step": 2015 + }, + { + "epoch": 0.18925375931044175, + "grad_norm": 0.6953125, + "learning_rate": 0.00019519119595960528, + "loss": 0.9091, + "step": 2020 + }, + { + "epoch": 0.18972220920972502, + "grad_norm": 0.640625, + "learning_rate": 0.00019514096482970659, + "loss": 0.9136, + "step": 2025 + }, + { + "epoch": 0.1901906591090083, + "grad_norm": 0.6484375, + "learning_rate": 0.0001950904792439401, + "loss": 0.9716, + "step": 2030 + }, + { + "epoch": 0.19065910900829156, + "grad_norm": 0.7265625, + "learning_rate": 0.00019503973933733025, + "loss": 0.9657, + "step": 2035 + }, + { + "epoch": 0.19112755890757482, + "grad_norm": 0.6640625, + "learning_rate": 0.00019498874524558171, + "loss": 0.948, + "step": 2040 + }, + { + "epoch": 0.19159600880685812, + "grad_norm": 1.7578125, + "learning_rate": 0.00019493749710507881, + "loss": 0.9562, + "step": 2045 + }, + { + "epoch": 0.1920644587061414, + "grad_norm": 0.6875, + "learning_rate": 0.0001948859950528855, + "loss": 0.9681, + "step": 2050 + }, + { + "epoch": 0.19253290860542466, + "grad_norm": 0.62890625, + "learning_rate": 0.00019483423922674468, + "loss": 0.9417, + "step": 2055 + }, + { + "epoch": 0.19300135850470793, + "grad_norm": 0.56640625, + "learning_rate": 0.00019478222976507815, + "loss": 0.938, + "step": 2060 + }, + { + "epoch": 0.1934698084039912, + "grad_norm": 0.7734375, + "learning_rate": 0.00019472996680698588, + "loss": 0.966, + "step": 2065 + }, + { + "epoch": 0.19393825830327446, + "grad_norm": 0.703125, + "learning_rate": 0.00019467745049224592, + "loss": 0.944, + "step": 2070 + }, + { + "epoch": 0.19440670820255773, + "grad_norm": 0.703125, + "learning_rate": 0.00019462468096131388, + "loss": 0.9288, + "step": 2075 + }, + { + "epoch": 0.194875158101841, + "grad_norm": 0.73828125, + "learning_rate": 0.00019457165835532267, + "loss": 0.9343, + "step": 2080 + }, + { + "epoch": 0.19534360800112427, + "grad_norm": 0.91015625, + "learning_rate": 0.00019451838281608197, + "loss": 0.9434, + "step": 2085 + }, + { + "epoch": 0.19581205790040754, + "grad_norm": 1.203125, + "learning_rate": 0.00019446485448607796, + "loss": 0.9183, + "step": 2090 + }, + { + "epoch": 0.19628050779969083, + "grad_norm": 0.76953125, + "learning_rate": 0.00019441107350847298, + "loss": 0.9252, + "step": 2095 + }, + { + "epoch": 0.1967489576989741, + "grad_norm": 0.734375, + "learning_rate": 0.00019435704002710496, + "loss": 0.9565, + "step": 2100 + }, + { + "epoch": 0.19721740759825737, + "grad_norm": 1.6328125, + "learning_rate": 0.00019430275418648725, + "loss": 0.9324, + "step": 2105 + }, + { + "epoch": 0.19768585749754064, + "grad_norm": 1.0234375, + "learning_rate": 0.0001942482161318081, + "loss": 0.9295, + "step": 2110 + }, + { + "epoch": 0.1981543073968239, + "grad_norm": 0.55078125, + "learning_rate": 0.00019419342600893028, + "loss": 0.9554, + "step": 2115 + }, + { + "epoch": 0.19862275729610718, + "grad_norm": 0.7578125, + "learning_rate": 0.00019413838396439083, + "loss": 0.9814, + "step": 2120 + }, + { + "epoch": 0.19909120719539045, + "grad_norm": 0.5546875, + "learning_rate": 0.00019408309014540045, + "loss": 0.9376, + "step": 2125 + }, + { + "epoch": 0.19955965709467371, + "grad_norm": 0.60546875, + "learning_rate": 0.0001940275446998432, + "loss": 0.9205, + "step": 2130 + }, + { + "epoch": 0.20002810699395698, + "grad_norm": 0.78125, + "learning_rate": 0.00019397174777627623, + "loss": 0.9365, + "step": 2135 + }, + { + "epoch": 0.20049655689324028, + "grad_norm": 0.92578125, + "learning_rate": 0.00019391569952392917, + "loss": 0.9213, + "step": 2140 + }, + { + "epoch": 0.20096500679252355, + "grad_norm": 0.859375, + "learning_rate": 0.00019385940009270387, + "loss": 0.9562, + "step": 2145 + }, + { + "epoch": 0.20143345669180682, + "grad_norm": 1.203125, + "learning_rate": 0.00019380284963317398, + "loss": 0.9498, + "step": 2150 + }, + { + "epoch": 0.20190190659109009, + "grad_norm": 1.2109375, + "learning_rate": 0.00019374604829658452, + "loss": 0.9581, + "step": 2155 + }, + { + "epoch": 0.20237035649037335, + "grad_norm": 0.6640625, + "learning_rate": 0.00019368899623485147, + "loss": 0.9337, + "step": 2160 + }, + { + "epoch": 0.20283880638965662, + "grad_norm": 0.7890625, + "learning_rate": 0.00019363169360056133, + "loss": 0.9724, + "step": 2165 + }, + { + "epoch": 0.2033072562889399, + "grad_norm": 1.0390625, + "learning_rate": 0.00019357414054697089, + "loss": 0.959, + "step": 2170 + }, + { + "epoch": 0.20377570618822316, + "grad_norm": 1.7421875, + "learning_rate": 0.00019351633722800658, + "loss": 0.9478, + "step": 2175 + }, + { + "epoch": 0.20424415608750643, + "grad_norm": 0.78515625, + "learning_rate": 0.0001934582837982642, + "loss": 0.9777, + "step": 2180 + }, + { + "epoch": 0.20471260598678973, + "grad_norm": 0.67578125, + "learning_rate": 0.00019339998041300848, + "loss": 0.9455, + "step": 2185 + }, + { + "epoch": 0.205181055886073, + "grad_norm": 0.99609375, + "learning_rate": 0.00019334142722817266, + "loss": 0.9345, + "step": 2190 + }, + { + "epoch": 0.20564950578535626, + "grad_norm": 0.84765625, + "learning_rate": 0.00019328262440035802, + "loss": 0.9202, + "step": 2195 + }, + { + "epoch": 0.20611795568463953, + "grad_norm": 0.890625, + "learning_rate": 0.00019322357208683362, + "loss": 0.9401, + "step": 2200 + }, + { + "epoch": 0.2065864055839228, + "grad_norm": 0.67578125, + "learning_rate": 0.00019316427044553574, + "loss": 0.93, + "step": 2205 + }, + { + "epoch": 0.20705485548320607, + "grad_norm": 0.609375, + "learning_rate": 0.0001931047196350674, + "loss": 0.9293, + "step": 2210 + }, + { + "epoch": 0.20752330538248934, + "grad_norm": 0.75, + "learning_rate": 0.0001930449198146981, + "loss": 0.9321, + "step": 2215 + }, + { + "epoch": 0.2079917552817726, + "grad_norm": 0.67578125, + "learning_rate": 0.00019298487114436332, + "loss": 0.9288, + "step": 2220 + }, + { + "epoch": 0.20846020518105587, + "grad_norm": 1.40625, + "learning_rate": 0.00019292457378466412, + "loss": 0.9641, + "step": 2225 + }, + { + "epoch": 0.20892865508033917, + "grad_norm": 1.40625, + "learning_rate": 0.00019286402789686662, + "loss": 0.9317, + "step": 2230 + }, + { + "epoch": 0.20939710497962244, + "grad_norm": 0.64453125, + "learning_rate": 0.00019280323364290167, + "loss": 0.8955, + "step": 2235 + }, + { + "epoch": 0.2098655548789057, + "grad_norm": 0.984375, + "learning_rate": 0.00019274219118536434, + "loss": 0.9284, + "step": 2240 + }, + { + "epoch": 0.21033400477818898, + "grad_norm": 0.96875, + "learning_rate": 0.0001926809006875136, + "loss": 0.9328, + "step": 2245 + }, + { + "epoch": 0.21080245467747225, + "grad_norm": 0.71875, + "learning_rate": 0.00019261936231327172, + "loss": 0.9412, + "step": 2250 + }, + { + "epoch": 0.2112709045767555, + "grad_norm": 1.046875, + "learning_rate": 0.00019255757622722397, + "loss": 0.9362, + "step": 2255 + }, + { + "epoch": 0.21173935447603878, + "grad_norm": 0.5625, + "learning_rate": 0.00019249554259461813, + "loss": 0.9391, + "step": 2260 + }, + { + "epoch": 0.21220780437532205, + "grad_norm": 0.7109375, + "learning_rate": 0.00019243326158136406, + "loss": 0.9099, + "step": 2265 + }, + { + "epoch": 0.21267625427460532, + "grad_norm": 0.76171875, + "learning_rate": 0.00019237073335403318, + "loss": 0.9659, + "step": 2270 + }, + { + "epoch": 0.21314470417388862, + "grad_norm": 0.65625, + "learning_rate": 0.0001923079580798581, + "loss": 0.9235, + "step": 2275 + }, + { + "epoch": 0.21361315407317188, + "grad_norm": 0.6328125, + "learning_rate": 0.00019224493592673224, + "loss": 0.9115, + "step": 2280 + }, + { + "epoch": 0.21408160397245515, + "grad_norm": 0.5625, + "learning_rate": 0.00019218166706320923, + "loss": 0.945, + "step": 2285 + }, + { + "epoch": 0.21455005387173842, + "grad_norm": 0.5859375, + "learning_rate": 0.0001921181516585026, + "loss": 0.9474, + "step": 2290 + }, + { + "epoch": 0.2150185037710217, + "grad_norm": 0.6015625, + "learning_rate": 0.0001920543898824851, + "loss": 0.9415, + "step": 2295 + }, + { + "epoch": 0.21548695367030496, + "grad_norm": 0.6640625, + "learning_rate": 0.00019199038190568856, + "loss": 0.937, + "step": 2300 + }, + { + "epoch": 0.21595540356958823, + "grad_norm": 0.69140625, + "learning_rate": 0.00019192612789930324, + "loss": 0.9182, + "step": 2305 + }, + { + "epoch": 0.2164238534688715, + "grad_norm": 0.58203125, + "learning_rate": 0.0001918616280351774, + "loss": 0.9502, + "step": 2310 + }, + { + "epoch": 0.21689230336815477, + "grad_norm": 0.671875, + "learning_rate": 0.0001917968824858168, + "loss": 0.8982, + "step": 2315 + }, + { + "epoch": 0.21736075326743803, + "grad_norm": 0.8203125, + "learning_rate": 0.00019173189142438442, + "loss": 0.9086, + "step": 2320 + }, + { + "epoch": 0.21782920316672133, + "grad_norm": 0.890625, + "learning_rate": 0.00019166665502469971, + "loss": 0.9208, + "step": 2325 + }, + { + "epoch": 0.2182976530660046, + "grad_norm": 1.3828125, + "learning_rate": 0.00019160117346123836, + "loss": 0.9101, + "step": 2330 + }, + { + "epoch": 0.21876610296528787, + "grad_norm": 1.3046875, + "learning_rate": 0.00019153544690913177, + "loss": 0.9423, + "step": 2335 + }, + { + "epoch": 0.21923455286457114, + "grad_norm": 1.015625, + "learning_rate": 0.0001914694755441665, + "loss": 0.9293, + "step": 2340 + }, + { + "epoch": 0.2197030027638544, + "grad_norm": 0.58984375, + "learning_rate": 0.000191403259542784, + "loss": 0.9133, + "step": 2345 + }, + { + "epoch": 0.22017145266313767, + "grad_norm": 0.5859375, + "learning_rate": 0.0001913367990820798, + "loss": 0.9275, + "step": 2350 + }, + { + "epoch": 0.22063990256242094, + "grad_norm": 0.62109375, + "learning_rate": 0.00019127009433980342, + "loss": 0.9211, + "step": 2355 + }, + { + "epoch": 0.2211083524617042, + "grad_norm": 0.83203125, + "learning_rate": 0.00019120314549435762, + "loss": 0.9112, + "step": 2360 + }, + { + "epoch": 0.22157680236098748, + "grad_norm": 0.74609375, + "learning_rate": 0.000191135952724798, + "loss": 0.9264, + "step": 2365 + }, + { + "epoch": 0.22204525226027078, + "grad_norm": 0.671875, + "learning_rate": 0.00019106851621083267, + "loss": 0.9337, + "step": 2370 + }, + { + "epoch": 0.22251370215955404, + "grad_norm": 0.59765625, + "learning_rate": 0.0001910008361328215, + "loss": 0.9163, + "step": 2375 + }, + { + "epoch": 0.2229821520588373, + "grad_norm": 0.71484375, + "learning_rate": 0.00019093291267177582, + "loss": 0.9088, + "step": 2380 + }, + { + "epoch": 0.22345060195812058, + "grad_norm": 0.65625, + "learning_rate": 0.00019086474600935792, + "loss": 0.9099, + "step": 2385 + }, + { + "epoch": 0.22391905185740385, + "grad_norm": 0.625, + "learning_rate": 0.00019079633632788044, + "loss": 0.9336, + "step": 2390 + }, + { + "epoch": 0.22438750175668712, + "grad_norm": 0.67578125, + "learning_rate": 0.00019072768381030617, + "loss": 0.9155, + "step": 2395 + }, + { + "epoch": 0.2248559516559704, + "grad_norm": 0.8046875, + "learning_rate": 0.00019065878864024714, + "loss": 0.9177, + "step": 2400 + }, + { + "epoch": 0.22532440155525366, + "grad_norm": 0.6015625, + "learning_rate": 0.0001905896510019645, + "loss": 0.9393, + "step": 2405 + }, + { + "epoch": 0.22579285145453692, + "grad_norm": 1.0078125, + "learning_rate": 0.00019052027108036788, + "loss": 0.9375, + "step": 2410 + }, + { + "epoch": 0.22626130135382022, + "grad_norm": 0.89453125, + "learning_rate": 0.00019045064906101484, + "loss": 0.9118, + "step": 2415 + }, + { + "epoch": 0.2267297512531035, + "grad_norm": 1.4140625, + "learning_rate": 0.00019038078513011048, + "loss": 0.9439, + "step": 2420 + }, + { + "epoch": 0.22719820115238676, + "grad_norm": 1.3671875, + "learning_rate": 0.00019031067947450686, + "loss": 0.9321, + "step": 2425 + }, + { + "epoch": 0.22766665105167003, + "grad_norm": 0.5859375, + "learning_rate": 0.00019024033228170258, + "loss": 0.9307, + "step": 2430 + }, + { + "epoch": 0.2281351009509533, + "grad_norm": 0.7578125, + "learning_rate": 0.00019016974373984216, + "loss": 0.9267, + "step": 2435 + }, + { + "epoch": 0.22860355085023656, + "grad_norm": 0.953125, + "learning_rate": 0.00019009891403771572, + "loss": 0.9317, + "step": 2440 + }, + { + "epoch": 0.22907200074951983, + "grad_norm": 0.9140625, + "learning_rate": 0.0001900278433647583, + "loss": 0.9446, + "step": 2445 + }, + { + "epoch": 0.2295404506488031, + "grad_norm": 0.734375, + "learning_rate": 0.0001899565319110494, + "loss": 0.9063, + "step": 2450 + }, + { + "epoch": 0.23000890054808637, + "grad_norm": 0.6328125, + "learning_rate": 0.00018988497986731256, + "loss": 0.934, + "step": 2455 + }, + { + "epoch": 0.23047735044736967, + "grad_norm": 0.69140625, + "learning_rate": 0.00018981318742491472, + "loss": 0.926, + "step": 2460 + }, + { + "epoch": 0.23094580034665294, + "grad_norm": 0.703125, + "learning_rate": 0.00018974115477586576, + "loss": 0.9133, + "step": 2465 + }, + { + "epoch": 0.2314142502459362, + "grad_norm": 0.60546875, + "learning_rate": 0.00018966888211281808, + "loss": 0.9142, + "step": 2470 + }, + { + "epoch": 0.23188270014521947, + "grad_norm": 1.5546875, + "learning_rate": 0.00018959636962906593, + "loss": 0.9286, + "step": 2475 + }, + { + "epoch": 0.23235115004450274, + "grad_norm": 0.99609375, + "learning_rate": 0.00018952361751854496, + "loss": 0.9026, + "step": 2480 + }, + { + "epoch": 0.232819599943786, + "grad_norm": 0.76171875, + "learning_rate": 0.0001894506259758318, + "loss": 0.9356, + "step": 2485 + }, + { + "epoch": 0.23328804984306928, + "grad_norm": 0.65234375, + "learning_rate": 0.00018937739519614324, + "loss": 0.9393, + "step": 2490 + }, + { + "epoch": 0.23375649974235255, + "grad_norm": 1.1796875, + "learning_rate": 0.00018930392537533616, + "loss": 0.9234, + "step": 2495 + }, + { + "epoch": 0.23422494964163582, + "grad_norm": 0.62109375, + "learning_rate": 0.00018923021670990661, + "loss": 0.9256, + "step": 2500 + }, + { + "epoch": 0.2346933995409191, + "grad_norm": 0.62890625, + "learning_rate": 0.00018915626939698945, + "loss": 0.9491, + "step": 2505 + }, + { + "epoch": 0.23516184944020238, + "grad_norm": 0.6640625, + "learning_rate": 0.00018908208363435785, + "loss": 0.9285, + "step": 2510 + }, + { + "epoch": 0.23563029933948565, + "grad_norm": 0.65234375, + "learning_rate": 0.0001890076596204227, + "loss": 0.9372, + "step": 2515 + }, + { + "epoch": 0.23609874923876892, + "grad_norm": 1.484375, + "learning_rate": 0.00018893299755423203, + "loss": 0.9148, + "step": 2520 + }, + { + "epoch": 0.2365671991380522, + "grad_norm": 1.203125, + "learning_rate": 0.00018885809763547067, + "loss": 0.9355, + "step": 2525 + }, + { + "epoch": 0.23703564903733546, + "grad_norm": 1.1640625, + "learning_rate": 0.00018878296006445946, + "loss": 0.923, + "step": 2530 + }, + { + "epoch": 0.23750409893661872, + "grad_norm": 0.6796875, + "learning_rate": 0.00018870758504215495, + "loss": 0.9276, + "step": 2535 + }, + { + "epoch": 0.237972548835902, + "grad_norm": 0.6796875, + "learning_rate": 0.0001886319727701487, + "loss": 0.8998, + "step": 2540 + }, + { + "epoch": 0.23844099873518526, + "grad_norm": 1.0703125, + "learning_rate": 0.0001885561234506668, + "loss": 0.9627, + "step": 2545 + }, + { + "epoch": 0.23890944863446853, + "grad_norm": 0.609375, + "learning_rate": 0.00018848003728656932, + "loss": 0.9846, + "step": 2550 + }, + { + "epoch": 0.23937789853375183, + "grad_norm": 0.5859375, + "learning_rate": 0.00018840371448134984, + "loss": 0.9108, + "step": 2555 + }, + { + "epoch": 0.2398463484330351, + "grad_norm": 0.67578125, + "learning_rate": 0.00018832715523913477, + "loss": 0.9386, + "step": 2560 + }, + { + "epoch": 0.24031479833231836, + "grad_norm": 0.6015625, + "learning_rate": 0.00018825035976468288, + "loss": 0.9481, + "step": 2565 + }, + { + "epoch": 0.24078324823160163, + "grad_norm": 0.62890625, + "learning_rate": 0.0001881733282633848, + "loss": 0.9623, + "step": 2570 + }, + { + "epoch": 0.2412516981308849, + "grad_norm": 0.72265625, + "learning_rate": 0.0001880960609412623, + "loss": 0.9104, + "step": 2575 + }, + { + "epoch": 0.24172014803016817, + "grad_norm": 0.55859375, + "learning_rate": 0.00018801855800496803, + "loss": 1.0972, + "step": 2580 + }, + { + "epoch": 0.24218859792945144, + "grad_norm": 0.70703125, + "learning_rate": 0.00018794081966178465, + "loss": 0.8949, + "step": 2585 + }, + { + "epoch": 0.2426570478287347, + "grad_norm": 0.76171875, + "learning_rate": 0.0001878628461196245, + "loss": 0.9526, + "step": 2590 + }, + { + "epoch": 0.24312549772801798, + "grad_norm": 0.85546875, + "learning_rate": 0.00018778463758702886, + "loss": 0.9415, + "step": 2595 + }, + { + "epoch": 0.24359394762730127, + "grad_norm": 0.67578125, + "learning_rate": 0.00018770619427316762, + "loss": 0.894, + "step": 2600 + }, + { + "epoch": 0.24406239752658454, + "grad_norm": 0.625, + "learning_rate": 0.00018762751638783855, + "loss": 0.9079, + "step": 2605 + }, + { + "epoch": 0.2445308474258678, + "grad_norm": 0.7265625, + "learning_rate": 0.00018754860414146675, + "loss": 0.9391, + "step": 2610 + }, + { + "epoch": 0.24499929732515108, + "grad_norm": 1.0703125, + "learning_rate": 0.00018746945774510417, + "loss": 0.9077, + "step": 2615 + }, + { + "epoch": 0.24546774722443435, + "grad_norm": 1.078125, + "learning_rate": 0.00018739007741042896, + "loss": 0.9298, + "step": 2620 + }, + { + "epoch": 0.24593619712371761, + "grad_norm": 1.03125, + "learning_rate": 0.000187310463349745, + "loss": 0.8918, + "step": 2625 + }, + { + "epoch": 0.24640464702300088, + "grad_norm": 0.578125, + "learning_rate": 0.00018723061577598113, + "loss": 0.9158, + "step": 2630 + }, + { + "epoch": 0.24687309692228415, + "grad_norm": 0.5390625, + "learning_rate": 0.00018715053490269093, + "loss": 0.9127, + "step": 2635 + }, + { + "epoch": 0.24734154682156742, + "grad_norm": 0.6171875, + "learning_rate": 0.0001870702209440518, + "loss": 0.9032, + "step": 2640 + }, + { + "epoch": 0.24780999672085072, + "grad_norm": 0.640625, + "learning_rate": 0.00018698967411486455, + "loss": 0.9363, + "step": 2645 + }, + { + "epoch": 0.24827844662013399, + "grad_norm": 0.80859375, + "learning_rate": 0.00018690889463055283, + "loss": 0.9133, + "step": 2650 + }, + { + "epoch": 0.24874689651941725, + "grad_norm": 0.609375, + "learning_rate": 0.00018682788270716255, + "loss": 0.9094, + "step": 2655 + }, + { + "epoch": 0.24921534641870052, + "grad_norm": 0.71875, + "learning_rate": 0.00018674663856136126, + "loss": 0.9445, + "step": 2660 + }, + { + "epoch": 0.2496837963179838, + "grad_norm": 0.68359375, + "learning_rate": 0.00018666516241043753, + "loss": 0.9238, + "step": 2665 + }, + { + "epoch": 0.25015224621726706, + "grad_norm": 1.15625, + "learning_rate": 0.00018658345447230054, + "loss": 0.9355, + "step": 2670 + }, + { + "epoch": 0.25062069611655036, + "grad_norm": 0.8046875, + "learning_rate": 0.00018650151496547933, + "loss": 0.9529, + "step": 2675 + }, + { + "epoch": 0.2510891460158336, + "grad_norm": 0.59375, + "learning_rate": 0.00018641934410912225, + "loss": 0.9202, + "step": 2680 + }, + { + "epoch": 0.2515575959151169, + "grad_norm": 0.7734375, + "learning_rate": 0.00018633694212299645, + "loss": 0.8886, + "step": 2685 + }, + { + "epoch": 0.25202604581440013, + "grad_norm": 0.65234375, + "learning_rate": 0.0001862543092274872, + "loss": 0.9449, + "step": 2690 + }, + { + "epoch": 0.25249449571368343, + "grad_norm": 0.765625, + "learning_rate": 0.0001861714456435974, + "loss": 0.9299, + "step": 2695 + }, + { + "epoch": 0.25296294561296667, + "grad_norm": 0.6640625, + "learning_rate": 0.00018608835159294684, + "loss": 0.9357, + "step": 2700 + }, + { + "epoch": 0.25343139551224997, + "grad_norm": 0.62109375, + "learning_rate": 0.00018600502729777176, + "loss": 0.9285, + "step": 2705 + }, + { + "epoch": 0.2538998454115332, + "grad_norm": 0.6484375, + "learning_rate": 0.00018592147298092423, + "loss": 0.9083, + "step": 2710 + }, + { + "epoch": 0.2543682953108165, + "grad_norm": 0.63671875, + "learning_rate": 0.00018583768886587136, + "loss": 0.9, + "step": 2715 + }, + { + "epoch": 0.2548367452100998, + "grad_norm": 0.58203125, + "learning_rate": 0.00018575367517669502, + "loss": 0.905, + "step": 2720 + }, + { + "epoch": 0.25530519510938304, + "grad_norm": 0.87890625, + "learning_rate": 0.00018566943213809103, + "loss": 0.9207, + "step": 2725 + }, + { + "epoch": 0.25577364500866634, + "grad_norm": 1.1796875, + "learning_rate": 0.00018558495997536854, + "loss": 0.8881, + "step": 2730 + }, + { + "epoch": 0.2562420949079496, + "grad_norm": 0.63671875, + "learning_rate": 0.00018550025891444968, + "loss": 0.9189, + "step": 2735 + }, + { + "epoch": 0.2567105448072329, + "grad_norm": 1.265625, + "learning_rate": 0.0001854153291818685, + "loss": 0.9398, + "step": 2740 + }, + { + "epoch": 0.2571789947065161, + "grad_norm": 1.203125, + "learning_rate": 0.00018533017100477083, + "loss": 0.9478, + "step": 2745 + }, + { + "epoch": 0.2576474446057994, + "grad_norm": 1.59375, + "learning_rate": 0.00018524478461091348, + "loss": 0.9454, + "step": 2750 + }, + { + "epoch": 0.25811589450508265, + "grad_norm": 1.46875, + "learning_rate": 0.00018515917022866353, + "loss": 0.9035, + "step": 2755 + }, + { + "epoch": 0.25858434440436595, + "grad_norm": 0.74609375, + "learning_rate": 0.0001850733280869979, + "loss": 0.9178, + "step": 2760 + }, + { + "epoch": 0.25905279430364925, + "grad_norm": 1.25, + "learning_rate": 0.00018498725841550257, + "loss": 0.931, + "step": 2765 + }, + { + "epoch": 0.2595212442029325, + "grad_norm": 0.7578125, + "learning_rate": 0.00018490096144437214, + "loss": 0.9373, + "step": 2770 + }, + { + "epoch": 0.2599896941022158, + "grad_norm": 0.80078125, + "learning_rate": 0.0001848144374044091, + "loss": 0.9457, + "step": 2775 + }, + { + "epoch": 0.260458144001499, + "grad_norm": 0.67578125, + "learning_rate": 0.0001847276865270232, + "loss": 0.8974, + "step": 2780 + }, + { + "epoch": 0.2609265939007823, + "grad_norm": 0.640625, + "learning_rate": 0.00018464070904423093, + "loss": 0.9225, + "step": 2785 + }, + { + "epoch": 0.26139504380006556, + "grad_norm": 0.5859375, + "learning_rate": 0.00018455350518865478, + "loss": 0.9621, + "step": 2790 + }, + { + "epoch": 0.26186349369934886, + "grad_norm": 0.59765625, + "learning_rate": 0.00018446607519352273, + "loss": 0.8968, + "step": 2795 + }, + { + "epoch": 0.2623319435986321, + "grad_norm": 0.6796875, + "learning_rate": 0.00018437841929266752, + "loss": 0.9271, + "step": 2800 + }, + { + "epoch": 0.2628003934979154, + "grad_norm": 0.58203125, + "learning_rate": 0.00018429053772052613, + "loss": 0.918, + "step": 2805 + }, + { + "epoch": 0.2632688433971987, + "grad_norm": 0.5859375, + "learning_rate": 0.0001842024307121391, + "loss": 0.9287, + "step": 2810 + }, + { + "epoch": 0.26373729329648193, + "grad_norm": 0.6015625, + "learning_rate": 0.00018411409850314984, + "loss": 0.9226, + "step": 2815 + }, + { + "epoch": 0.26420574319576523, + "grad_norm": 0.61328125, + "learning_rate": 0.00018402554132980413, + "loss": 0.9207, + "step": 2820 + }, + { + "epoch": 0.26467419309504847, + "grad_norm": 0.58203125, + "learning_rate": 0.00018393675942894935, + "loss": 0.8867, + "step": 2825 + }, + { + "epoch": 0.26514264299433177, + "grad_norm": 0.84375, + "learning_rate": 0.00018384775303803398, + "loss": 0.9316, + "step": 2830 + }, + { + "epoch": 0.265611092893615, + "grad_norm": 0.6328125, + "learning_rate": 0.00018375852239510695, + "loss": 0.9316, + "step": 2835 + }, + { + "epoch": 0.2660795427928983, + "grad_norm": 0.609375, + "learning_rate": 0.0001836690677388168, + "loss": 0.9294, + "step": 2840 + }, + { + "epoch": 0.26654799269218155, + "grad_norm": 0.59765625, + "learning_rate": 0.0001835793893084113, + "loss": 0.9246, + "step": 2845 + }, + { + "epoch": 0.26701644259146484, + "grad_norm": 0.66015625, + "learning_rate": 0.00018348948734373674, + "loss": 0.9205, + "step": 2850 + }, + { + "epoch": 0.26748489249074814, + "grad_norm": 0.7578125, + "learning_rate": 0.00018339936208523717, + "loss": 0.8891, + "step": 2855 + }, + { + "epoch": 0.2679533423900314, + "grad_norm": 0.61328125, + "learning_rate": 0.0001833090137739539, + "loss": 0.9092, + "step": 2860 + }, + { + "epoch": 0.2684217922893147, + "grad_norm": 0.68359375, + "learning_rate": 0.0001832184426515247, + "loss": 0.9424, + "step": 2865 + }, + { + "epoch": 0.2688902421885979, + "grad_norm": 0.8515625, + "learning_rate": 0.00018312764896018347, + "loss": 0.9164, + "step": 2870 + }, + { + "epoch": 0.2693586920878812, + "grad_norm": 0.546875, + "learning_rate": 0.00018303663294275908, + "loss": 0.9174, + "step": 2875 + }, + { + "epoch": 0.26982714198716445, + "grad_norm": 0.6484375, + "learning_rate": 0.00018294539484267527, + "loss": 0.9424, + "step": 2880 + }, + { + "epoch": 0.27029559188644775, + "grad_norm": 0.70703125, + "learning_rate": 0.00018285393490394963, + "loss": 0.911, + "step": 2885 + }, + { + "epoch": 0.270764041785731, + "grad_norm": 0.55859375, + "learning_rate": 0.00018276225337119305, + "loss": 0.943, + "step": 2890 + }, + { + "epoch": 0.2712324916850143, + "grad_norm": 0.66796875, + "learning_rate": 0.0001826703504896091, + "loss": 0.9203, + "step": 2895 + }, + { + "epoch": 0.2717009415842976, + "grad_norm": 0.7265625, + "learning_rate": 0.0001825782265049933, + "loss": 0.9013, + "step": 2900 + }, + { + "epoch": 0.2721693914835808, + "grad_norm": 0.625, + "learning_rate": 0.00018248588166373266, + "loss": 0.9295, + "step": 2905 + }, + { + "epoch": 0.2726378413828641, + "grad_norm": 0.91015625, + "learning_rate": 0.0001823933162128047, + "loss": 0.9222, + "step": 2910 + }, + { + "epoch": 0.27310629128214736, + "grad_norm": 0.88671875, + "learning_rate": 0.0001823005303997771, + "loss": 0.8786, + "step": 2915 + }, + { + "epoch": 0.27357474118143066, + "grad_norm": 0.62109375, + "learning_rate": 0.00018220752447280675, + "loss": 0.9336, + "step": 2920 + }, + { + "epoch": 0.2740431910807139, + "grad_norm": 0.5859375, + "learning_rate": 0.0001821142986806394, + "loss": 0.8944, + "step": 2925 + }, + { + "epoch": 0.2745116409799972, + "grad_norm": 0.59765625, + "learning_rate": 0.0001820208532726088, + "loss": 0.9404, + "step": 2930 + }, + { + "epoch": 0.27498009087928044, + "grad_norm": 0.66796875, + "learning_rate": 0.0001819271884986359, + "loss": 0.9117, + "step": 2935 + }, + { + "epoch": 0.27544854077856373, + "grad_norm": 0.62109375, + "learning_rate": 0.00018183330460922855, + "loss": 0.9196, + "step": 2940 + }, + { + "epoch": 0.27591699067784703, + "grad_norm": 0.67578125, + "learning_rate": 0.00018173920185548056, + "loss": 0.914, + "step": 2945 + }, + { + "epoch": 0.27638544057713027, + "grad_norm": 0.66015625, + "learning_rate": 0.0001816448804890711, + "loss": 0.9055, + "step": 2950 + }, + { + "epoch": 0.27685389047641357, + "grad_norm": 0.58203125, + "learning_rate": 0.00018155034076226394, + "loss": 0.9059, + "step": 2955 + }, + { + "epoch": 0.2773223403756968, + "grad_norm": 0.6484375, + "learning_rate": 0.000181455582927907, + "loss": 0.9109, + "step": 2960 + }, + { + "epoch": 0.2777907902749801, + "grad_norm": 0.671875, + "learning_rate": 0.0001813606072394314, + "loss": 0.9409, + "step": 2965 + }, + { + "epoch": 0.27825924017426334, + "grad_norm": 0.79296875, + "learning_rate": 0.0001812654139508511, + "loss": 0.9086, + "step": 2970 + }, + { + "epoch": 0.27872769007354664, + "grad_norm": 0.5546875, + "learning_rate": 0.00018117000331676172, + "loss": 0.9079, + "step": 2975 + }, + { + "epoch": 0.2791961399728299, + "grad_norm": 0.94140625, + "learning_rate": 0.0001810743755923405, + "loss": 0.9151, + "step": 2980 + }, + { + "epoch": 0.2796645898721132, + "grad_norm": 0.9296875, + "learning_rate": 0.00018097853103334505, + "loss": 0.9239, + "step": 2985 + }, + { + "epoch": 0.2801330397713965, + "grad_norm": 0.59765625, + "learning_rate": 0.00018088246989611312, + "loss": 0.931, + "step": 2990 + }, + { + "epoch": 0.2806014896706797, + "grad_norm": 0.99609375, + "learning_rate": 0.00018078619243756152, + "loss": 0.9391, + "step": 2995 + }, + { + "epoch": 0.281069939569963, + "grad_norm": 0.6484375, + "learning_rate": 0.00018068969891518573, + "loss": 0.9131, + "step": 3000 + }, + { + "epoch": 0.28153838946924625, + "grad_norm": 0.73046875, + "learning_rate": 0.000180592989587059, + "loss": 0.9385, + "step": 3005 + }, + { + "epoch": 0.28200683936852955, + "grad_norm": 0.72265625, + "learning_rate": 0.00018049606471183188, + "loss": 0.9403, + "step": 3010 + }, + { + "epoch": 0.2824752892678128, + "grad_norm": 0.78125, + "learning_rate": 0.00018039892454873126, + "loss": 0.9198, + "step": 3015 + }, + { + "epoch": 0.2829437391670961, + "grad_norm": 0.66015625, + "learning_rate": 0.00018030156935755994, + "loss": 0.9111, + "step": 3020 + }, + { + "epoch": 0.2834121890663793, + "grad_norm": 0.71875, + "learning_rate": 0.00018020399939869573, + "loss": 0.9265, + "step": 3025 + }, + { + "epoch": 0.2838806389656626, + "grad_norm": 0.8828125, + "learning_rate": 0.0001801062149330909, + "loss": 0.9019, + "step": 3030 + }, + { + "epoch": 0.2843490888649459, + "grad_norm": 0.7109375, + "learning_rate": 0.0001800082162222714, + "loss": 0.9214, + "step": 3035 + }, + { + "epoch": 0.28481753876422916, + "grad_norm": 1.0, + "learning_rate": 0.00017991000352833618, + "loss": 0.8984, + "step": 3040 + }, + { + "epoch": 0.28528598866351246, + "grad_norm": 0.70703125, + "learning_rate": 0.0001798115771139565, + "loss": 0.9474, + "step": 3045 + }, + { + "epoch": 0.2857544385627957, + "grad_norm": 0.7734375, + "learning_rate": 0.00017971293724237516, + "loss": 0.8959, + "step": 3050 + }, + { + "epoch": 0.286222888462079, + "grad_norm": 0.55078125, + "learning_rate": 0.00017961408417740594, + "loss": 0.938, + "step": 3055 + }, + { + "epoch": 0.28669133836136224, + "grad_norm": 0.7109375, + "learning_rate": 0.0001795150181834328, + "loss": 0.915, + "step": 3060 + }, + { + "epoch": 0.28715978826064553, + "grad_norm": 0.62890625, + "learning_rate": 0.0001794157395254091, + "loss": 0.9051, + "step": 3065 + }, + { + "epoch": 0.2876282381599288, + "grad_norm": 0.640625, + "learning_rate": 0.0001793162484688571, + "loss": 0.9939, + "step": 3070 + }, + { + "epoch": 0.28809668805921207, + "grad_norm": 0.58984375, + "learning_rate": 0.00017921654527986694, + "loss": 0.9213, + "step": 3075 + }, + { + "epoch": 0.28856513795849537, + "grad_norm": 0.69140625, + "learning_rate": 0.0001791166302250963, + "loss": 0.9166, + "step": 3080 + }, + { + "epoch": 0.2890335878577786, + "grad_norm": 0.60546875, + "learning_rate": 0.00017901650357176943, + "loss": 0.9027, + "step": 3085 + }, + { + "epoch": 0.2895020377570619, + "grad_norm": 0.609375, + "learning_rate": 0.00017891616558767646, + "loss": 0.9376, + "step": 3090 + }, + { + "epoch": 0.28997048765634514, + "grad_norm": 0.77734375, + "learning_rate": 0.00017881561654117277, + "loss": 1.0183, + "step": 3095 + }, + { + "epoch": 0.29043893755562844, + "grad_norm": 0.7734375, + "learning_rate": 0.0001787148567011782, + "loss": 0.9606, + "step": 3100 + }, + { + "epoch": 0.2909073874549117, + "grad_norm": 0.6796875, + "learning_rate": 0.0001786138863371764, + "loss": 0.9098, + "step": 3105 + }, + { + "epoch": 0.291375837354195, + "grad_norm": 0.55078125, + "learning_rate": 0.00017851270571921405, + "loss": 0.9015, + "step": 3110 + }, + { + "epoch": 0.2918442872534782, + "grad_norm": 0.61328125, + "learning_rate": 0.00017841131511790015, + "loss": 0.8943, + "step": 3115 + }, + { + "epoch": 0.2923127371527615, + "grad_norm": 0.98046875, + "learning_rate": 0.0001783097148044053, + "loss": 0.907, + "step": 3120 + }, + { + "epoch": 0.2927811870520448, + "grad_norm": 1.375, + "learning_rate": 0.00017820790505046103, + "loss": 0.9288, + "step": 3125 + }, + { + "epoch": 0.29324963695132805, + "grad_norm": 0.6640625, + "learning_rate": 0.0001781058861283589, + "loss": 0.9369, + "step": 3130 + }, + { + "epoch": 0.29371808685061135, + "grad_norm": 0.828125, + "learning_rate": 0.00017800365831095006, + "loss": 0.9314, + "step": 3135 + }, + { + "epoch": 0.2941865367498946, + "grad_norm": 0.5546875, + "learning_rate": 0.00017790122187164422, + "loss": 0.9102, + "step": 3140 + }, + { + "epoch": 0.2946549866491779, + "grad_norm": 1.21875, + "learning_rate": 0.0001777985770844091, + "loss": 0.9221, + "step": 3145 + }, + { + "epoch": 0.2951234365484611, + "grad_norm": 1.1171875, + "learning_rate": 0.00017769572422376964, + "loss": 0.9235, + "step": 3150 + }, + { + "epoch": 0.2955918864477444, + "grad_norm": 0.60546875, + "learning_rate": 0.0001775926635648073, + "loss": 0.9155, + "step": 3155 + }, + { + "epoch": 0.29606033634702766, + "grad_norm": 1.0, + "learning_rate": 0.00017748939538315929, + "loss": 0.9374, + "step": 3160 + }, + { + "epoch": 0.29652878624631096, + "grad_norm": 1.1015625, + "learning_rate": 0.00017738591995501783, + "loss": 0.9158, + "step": 3165 + }, + { + "epoch": 0.2969972361455942, + "grad_norm": 0.8046875, + "learning_rate": 0.00017728223755712942, + "loss": 0.9084, + "step": 3170 + }, + { + "epoch": 0.2974656860448775, + "grad_norm": 0.71484375, + "learning_rate": 0.0001771783484667941, + "loss": 0.8989, + "step": 3175 + }, + { + "epoch": 0.2979341359441608, + "grad_norm": 1.140625, + "learning_rate": 0.00017707425296186474, + "loss": 0.9225, + "step": 3180 + }, + { + "epoch": 0.29840258584344403, + "grad_norm": 0.6484375, + "learning_rate": 0.00017696995132074626, + "loss": 0.918, + "step": 3185 + }, + { + "epoch": 0.29887103574272733, + "grad_norm": 0.55859375, + "learning_rate": 0.0001768654438223949, + "loss": 0.9197, + "step": 3190 + }, + { + "epoch": 0.29933948564201057, + "grad_norm": 1.3359375, + "learning_rate": 0.00017676073074631741, + "loss": 0.9206, + "step": 3195 + }, + { + "epoch": 0.29980793554129387, + "grad_norm": 0.66796875, + "learning_rate": 0.00017665581237257043, + "loss": 0.9458, + "step": 3200 + }, + { + "epoch": 0.3002763854405771, + "grad_norm": 0.81640625, + "learning_rate": 0.00017655068898175965, + "loss": 1.0539, + "step": 3205 + }, + { + "epoch": 0.3007448353398604, + "grad_norm": 0.60546875, + "learning_rate": 0.0001764453608550391, + "loss": 0.9123, + "step": 3210 + }, + { + "epoch": 0.30121328523914365, + "grad_norm": 0.58203125, + "learning_rate": 0.00017633982827411032, + "loss": 0.9205, + "step": 3215 + }, + { + "epoch": 0.30168173513842694, + "grad_norm": 0.60546875, + "learning_rate": 0.0001762340915212217, + "loss": 0.9407, + "step": 3220 + }, + { + "epoch": 0.30215018503771024, + "grad_norm": 0.8203125, + "learning_rate": 0.00017612815087916773, + "loss": 0.9268, + "step": 3225 + }, + { + "epoch": 0.3026186349369935, + "grad_norm": 0.8359375, + "learning_rate": 0.00017602200663128815, + "loss": 0.9035, + "step": 3230 + }, + { + "epoch": 0.3030870848362768, + "grad_norm": 0.6875, + "learning_rate": 0.00017591565906146722, + "loss": 0.9534, + "step": 3235 + }, + { + "epoch": 0.30355553473556, + "grad_norm": 0.55078125, + "learning_rate": 0.00017580910845413315, + "loss": 0.9116, + "step": 3240 + }, + { + "epoch": 0.3040239846348433, + "grad_norm": 0.73046875, + "learning_rate": 0.00017570235509425694, + "loss": 0.9181, + "step": 3245 + }, + { + "epoch": 0.30449243453412655, + "grad_norm": 0.73828125, + "learning_rate": 0.00017559539926735205, + "loss": 0.918, + "step": 3250 + }, + { + "epoch": 0.30496088443340985, + "grad_norm": 0.62109375, + "learning_rate": 0.0001754882412594733, + "loss": 0.9134, + "step": 3255 + }, + { + "epoch": 0.3054293343326931, + "grad_norm": 0.9453125, + "learning_rate": 0.00017538088135721637, + "loss": 0.9195, + "step": 3260 + }, + { + "epoch": 0.3058977842319764, + "grad_norm": 1.0546875, + "learning_rate": 0.00017527331984771682, + "loss": 0.9192, + "step": 3265 + }, + { + "epoch": 0.3063662341312597, + "grad_norm": 0.63671875, + "learning_rate": 0.00017516555701864946, + "loss": 0.9033, + "step": 3270 + }, + { + "epoch": 0.3068346840305429, + "grad_norm": 0.578125, + "learning_rate": 0.0001750575931582275, + "loss": 0.8793, + "step": 3275 + }, + { + "epoch": 0.3073031339298262, + "grad_norm": 0.65625, + "learning_rate": 0.00017494942855520183, + "loss": 0.925, + "step": 3280 + }, + { + "epoch": 0.30777158382910946, + "grad_norm": 0.61328125, + "learning_rate": 0.00017484106349886024, + "loss": 0.9184, + "step": 3285 + }, + { + "epoch": 0.30824003372839276, + "grad_norm": 0.65625, + "learning_rate": 0.00017473249827902655, + "loss": 0.9181, + "step": 3290 + }, + { + "epoch": 0.308708483627676, + "grad_norm": 0.6484375, + "learning_rate": 0.00017462373318606007, + "loss": 0.8968, + "step": 3295 + }, + { + "epoch": 0.3091769335269593, + "grad_norm": 0.6171875, + "learning_rate": 0.00017451476851085458, + "loss": 0.9257, + "step": 3300 + }, + { + "epoch": 0.30964538342624254, + "grad_norm": 0.62890625, + "learning_rate": 0.00017440560454483762, + "loss": 0.9103, + "step": 3305 + }, + { + "epoch": 0.31011383332552583, + "grad_norm": 0.5625, + "learning_rate": 0.0001742962415799698, + "loss": 0.8803, + "step": 3310 + }, + { + "epoch": 0.31058228322480913, + "grad_norm": 0.859375, + "learning_rate": 0.00017418667990874392, + "loss": 0.9336, + "step": 3315 + }, + { + "epoch": 0.31105073312409237, + "grad_norm": 0.73828125, + "learning_rate": 0.00017407691982418425, + "loss": 0.8923, + "step": 3320 + }, + { + "epoch": 0.31151918302337567, + "grad_norm": 0.68359375, + "learning_rate": 0.0001739669616198457, + "loss": 0.9206, + "step": 3325 + }, + { + "epoch": 0.3119876329226589, + "grad_norm": 0.98828125, + "learning_rate": 0.00017385680558981302, + "loss": 0.9134, + "step": 3330 + }, + { + "epoch": 0.3124560828219422, + "grad_norm": 1.34375, + "learning_rate": 0.00017374645202870013, + "loss": 0.9002, + "step": 3335 + }, + { + "epoch": 0.31292453272122545, + "grad_norm": 0.64453125, + "learning_rate": 0.00017363590123164918, + "loss": 0.8985, + "step": 3340 + }, + { + "epoch": 0.31339298262050874, + "grad_norm": 1.078125, + "learning_rate": 0.00017352515349432983, + "loss": 0.9056, + "step": 3345 + }, + { + "epoch": 0.313861432519792, + "grad_norm": 0.85546875, + "learning_rate": 0.00017341420911293854, + "loss": 0.9401, + "step": 3350 + }, + { + "epoch": 0.3143298824190753, + "grad_norm": 0.65625, + "learning_rate": 0.0001733030683841976, + "loss": 0.8958, + "step": 3355 + }, + { + "epoch": 0.3147983323183586, + "grad_norm": 0.6640625, + "learning_rate": 0.00017319173160535448, + "loss": 0.901, + "step": 3360 + }, + { + "epoch": 0.3152667822176418, + "grad_norm": 0.890625, + "learning_rate": 0.000173080199074181, + "loss": 0.943, + "step": 3365 + }, + { + "epoch": 0.3157352321169251, + "grad_norm": 0.671875, + "learning_rate": 0.0001729684710889725, + "loss": 0.9003, + "step": 3370 + }, + { + "epoch": 0.31620368201620835, + "grad_norm": 0.66796875, + "learning_rate": 0.0001728565479485471, + "loss": 0.9535, + "step": 3375 + }, + { + "epoch": 0.31667213191549165, + "grad_norm": 0.58984375, + "learning_rate": 0.00017274442995224474, + "loss": 0.9193, + "step": 3380 + }, + { + "epoch": 0.3171405818147749, + "grad_norm": 0.6328125, + "learning_rate": 0.0001726321173999267, + "loss": 0.9472, + "step": 3385 + }, + { + "epoch": 0.3176090317140582, + "grad_norm": 0.953125, + "learning_rate": 0.00017251961059197446, + "loss": 0.9295, + "step": 3390 + }, + { + "epoch": 0.31807748161334143, + "grad_norm": 0.87109375, + "learning_rate": 0.0001724069098292891, + "loss": 0.8673, + "step": 3395 + }, + { + "epoch": 0.3185459315126247, + "grad_norm": 0.96875, + "learning_rate": 0.00017229401541329038, + "loss": 0.9175, + "step": 3400 + }, + { + "epoch": 0.319014381411908, + "grad_norm": 0.59375, + "learning_rate": 0.00017218092764591608, + "loss": 0.9012, + "step": 3405 + }, + { + "epoch": 0.31948283131119126, + "grad_norm": 0.98828125, + "learning_rate": 0.000172067646829621, + "loss": 0.9051, + "step": 3410 + }, + { + "epoch": 0.31995128121047456, + "grad_norm": 1.0546875, + "learning_rate": 0.00017195417326737635, + "loss": 0.9079, + "step": 3415 + }, + { + "epoch": 0.3204197311097578, + "grad_norm": 0.6640625, + "learning_rate": 0.00017184050726266873, + "loss": 0.9397, + "step": 3420 + }, + { + "epoch": 0.3208881810090411, + "grad_norm": 0.80859375, + "learning_rate": 0.00017172664911949953, + "loss": 0.9497, + "step": 3425 + }, + { + "epoch": 0.32135663090832434, + "grad_norm": 0.609375, + "learning_rate": 0.00017161259914238398, + "loss": 0.9032, + "step": 3430 + }, + { + "epoch": 0.32182508080760763, + "grad_norm": 0.546875, + "learning_rate": 0.00017149835763635035, + "loss": 0.9019, + "step": 3435 + }, + { + "epoch": 0.3222935307068909, + "grad_norm": 0.60546875, + "learning_rate": 0.0001713839249069392, + "loss": 0.8877, + "step": 3440 + }, + { + "epoch": 0.32276198060617417, + "grad_norm": 0.59765625, + "learning_rate": 0.00017126930126020246, + "loss": 0.9301, + "step": 3445 + }, + { + "epoch": 0.32323043050545747, + "grad_norm": 0.5703125, + "learning_rate": 0.00017115448700270278, + "loss": 0.9423, + "step": 3450 + }, + { + "epoch": 0.3236988804047407, + "grad_norm": 0.8203125, + "learning_rate": 0.00017103948244151244, + "loss": 0.9034, + "step": 3455 + }, + { + "epoch": 0.324167330304024, + "grad_norm": 0.70703125, + "learning_rate": 0.00017092428788421287, + "loss": 0.8691, + "step": 3460 + }, + { + "epoch": 0.32463578020330724, + "grad_norm": 0.68359375, + "learning_rate": 0.00017080890363889347, + "loss": 0.8703, + "step": 3465 + }, + { + "epoch": 0.32510423010259054, + "grad_norm": 0.59765625, + "learning_rate": 0.0001706933300141511, + "loss": 0.9055, + "step": 3470 + }, + { + "epoch": 0.3255726800018738, + "grad_norm": 0.671875, + "learning_rate": 0.00017057756731908908, + "loss": 0.8936, + "step": 3475 + }, + { + "epoch": 0.3260411299011571, + "grad_norm": 0.58203125, + "learning_rate": 0.00017046161586331632, + "loss": 0.9462, + "step": 3480 + }, + { + "epoch": 0.3265095798004403, + "grad_norm": 1.03125, + "learning_rate": 0.00017034547595694673, + "loss": 0.909, + "step": 3485 + }, + { + "epoch": 0.3269780296997236, + "grad_norm": 0.56640625, + "learning_rate": 0.0001702291479105981, + "loss": 0.8918, + "step": 3490 + }, + { + "epoch": 0.3274464795990069, + "grad_norm": 0.58984375, + "learning_rate": 0.00017011263203539149, + "loss": 0.9313, + "step": 3495 + }, + { + "epoch": 0.32791492949829015, + "grad_norm": 0.69140625, + "learning_rate": 0.0001699959286429502, + "loss": 0.9146, + "step": 3500 + }, + { + "epoch": 0.32838337939757345, + "grad_norm": 1.0390625, + "learning_rate": 0.00016987903804539917, + "loss": 0.9142, + "step": 3505 + }, + { + "epoch": 0.3288518292968567, + "grad_norm": 0.76171875, + "learning_rate": 0.00016976196055536392, + "loss": 0.9114, + "step": 3510 + }, + { + "epoch": 0.32932027919614, + "grad_norm": 1.2265625, + "learning_rate": 0.00016964469648596996, + "loss": 1.0432, + "step": 3515 + }, + { + "epoch": 0.3297887290954232, + "grad_norm": 0.6875, + "learning_rate": 0.00016952724615084164, + "loss": 0.9197, + "step": 3520 + }, + { + "epoch": 0.3302571789947065, + "grad_norm": 0.6640625, + "learning_rate": 0.00016940960986410158, + "loss": 0.8996, + "step": 3525 + }, + { + "epoch": 0.33072562889398976, + "grad_norm": 1.0078125, + "learning_rate": 0.0001692917879403697, + "loss": 0.8883, + "step": 3530 + }, + { + "epoch": 0.33119407879327306, + "grad_norm": 0.7890625, + "learning_rate": 0.00016917378069476242, + "loss": 0.8973, + "step": 3535 + }, + { + "epoch": 0.33166252869255636, + "grad_norm": 0.73828125, + "learning_rate": 0.0001690555884428918, + "loss": 0.939, + "step": 3540 + }, + { + "epoch": 0.3321309785918396, + "grad_norm": 0.59765625, + "learning_rate": 0.00016893721150086473, + "loss": 0.9165, + "step": 3545 + }, + { + "epoch": 0.3325994284911229, + "grad_norm": 0.59375, + "learning_rate": 0.00016881865018528197, + "loss": 0.8865, + "step": 3550 + }, + { + "epoch": 0.33306787839040614, + "grad_norm": 0.56640625, + "learning_rate": 0.0001686999048132375, + "loss": 0.8801, + "step": 3555 + }, + { + "epoch": 0.33353632828968943, + "grad_norm": 0.5859375, + "learning_rate": 0.0001685809757023175, + "loss": 0.8828, + "step": 3560 + }, + { + "epoch": 0.3340047781889727, + "grad_norm": 0.6015625, + "learning_rate": 0.0001684618631705996, + "loss": 0.8856, + "step": 3565 + }, + { + "epoch": 0.33447322808825597, + "grad_norm": 0.75390625, + "learning_rate": 0.0001683425675366519, + "loss": 0.892, + "step": 3570 + }, + { + "epoch": 0.3349416779875392, + "grad_norm": 0.77734375, + "learning_rate": 0.00016822308911953232, + "loss": 0.9194, + "step": 3575 + }, + { + "epoch": 0.3354101278868225, + "grad_norm": 0.55078125, + "learning_rate": 0.00016810342823878762, + "loss": 0.902, + "step": 3580 + }, + { + "epoch": 0.3358785777861058, + "grad_norm": 0.625, + "learning_rate": 0.00016798358521445247, + "loss": 0.886, + "step": 3585 + }, + { + "epoch": 0.33634702768538904, + "grad_norm": 0.61328125, + "learning_rate": 0.00016786356036704874, + "loss": 0.9143, + "step": 3590 + }, + { + "epoch": 0.33681547758467234, + "grad_norm": 0.57421875, + "learning_rate": 0.00016774335401758468, + "loss": 0.8993, + "step": 3595 + }, + { + "epoch": 0.3372839274839556, + "grad_norm": 0.87890625, + "learning_rate": 0.0001676229664875538, + "loss": 0.8811, + "step": 3600 + }, + { + "epoch": 0.3377523773832389, + "grad_norm": 0.6953125, + "learning_rate": 0.00016750239809893433, + "loss": 0.9153, + "step": 3605 + }, + { + "epoch": 0.3382208272825221, + "grad_norm": 0.6328125, + "learning_rate": 0.00016738164917418812, + "loss": 0.8942, + "step": 3610 + }, + { + "epoch": 0.3386892771818054, + "grad_norm": 0.56640625, + "learning_rate": 0.0001672607200362599, + "loss": 0.8946, + "step": 3615 + }, + { + "epoch": 0.33915772708108866, + "grad_norm": 0.62890625, + "learning_rate": 0.00016713961100857635, + "loss": 0.9014, + "step": 3620 + }, + { + "epoch": 0.33962617698037195, + "grad_norm": 0.609375, + "learning_rate": 0.00016701832241504532, + "loss": 0.9209, + "step": 3625 + }, + { + "epoch": 0.34009462687965525, + "grad_norm": 0.61328125, + "learning_rate": 0.00016689685458005487, + "loss": 0.9042, + "step": 3630 + }, + { + "epoch": 0.3405630767789385, + "grad_norm": 0.69921875, + "learning_rate": 0.00016677520782847248, + "loss": 0.8944, + "step": 3635 + }, + { + "epoch": 0.3410315266782218, + "grad_norm": 0.57421875, + "learning_rate": 0.00016665338248564407, + "loss": 0.9111, + "step": 3640 + }, + { + "epoch": 0.341499976577505, + "grad_norm": 1.0234375, + "learning_rate": 0.00016653137887739328, + "loss": 0.8778, + "step": 3645 + }, + { + "epoch": 0.3419684264767883, + "grad_norm": 0.7578125, + "learning_rate": 0.00016640919733002053, + "loss": 0.8687, + "step": 3650 + }, + { + "epoch": 0.34243687637607156, + "grad_norm": 0.75, + "learning_rate": 0.0001662868381703021, + "loss": 0.9047, + "step": 3655 + }, + { + "epoch": 0.34290532627535486, + "grad_norm": 0.69921875, + "learning_rate": 0.00016616430172548932, + "loss": 0.8688, + "step": 3660 + }, + { + "epoch": 0.3433737761746381, + "grad_norm": 0.75, + "learning_rate": 0.0001660415883233076, + "loss": 0.8891, + "step": 3665 + }, + { + "epoch": 0.3438422260739214, + "grad_norm": 0.77734375, + "learning_rate": 0.00016591869829195573, + "loss": 0.89, + "step": 3670 + }, + { + "epoch": 0.34431067597320464, + "grad_norm": 0.94140625, + "learning_rate": 0.00016579563196010487, + "loss": 0.9202, + "step": 3675 + }, + { + "epoch": 0.34477912587248793, + "grad_norm": 0.578125, + "learning_rate": 0.00016567238965689766, + "loss": 0.862, + "step": 3680 + }, + { + "epoch": 0.34524757577177123, + "grad_norm": 0.76171875, + "learning_rate": 0.00016554897171194738, + "loss": 0.9204, + "step": 3685 + }, + { + "epoch": 0.34571602567105447, + "grad_norm": 0.75, + "learning_rate": 0.0001654253784553371, + "loss": 0.9066, + "step": 3690 + }, + { + "epoch": 0.34618447557033777, + "grad_norm": 0.55078125, + "learning_rate": 0.00016530161021761875, + "loss": 0.9057, + "step": 3695 + }, + { + "epoch": 0.346652925469621, + "grad_norm": 0.578125, + "learning_rate": 0.00016517766732981228, + "loss": 0.9034, + "step": 3700 + }, + { + "epoch": 0.3471213753689043, + "grad_norm": 0.99609375, + "learning_rate": 0.00016505355012340464, + "loss": 0.9083, + "step": 3705 + }, + { + "epoch": 0.34758982526818755, + "grad_norm": 1.125, + "learning_rate": 0.0001649292589303491, + "loss": 0.9135, + "step": 3710 + }, + { + "epoch": 0.34805827516747084, + "grad_norm": 0.640625, + "learning_rate": 0.00016480479408306428, + "loss": 0.8928, + "step": 3715 + }, + { + "epoch": 0.3485267250667541, + "grad_norm": 1.015625, + "learning_rate": 0.00016468015591443314, + "loss": 0.9567, + "step": 3720 + }, + { + "epoch": 0.3489951749660374, + "grad_norm": 0.58984375, + "learning_rate": 0.00016455534475780218, + "loss": 0.8747, + "step": 3725 + }, + { + "epoch": 0.3494636248653207, + "grad_norm": 0.58984375, + "learning_rate": 0.00016443036094698074, + "loss": 0.9174, + "step": 3730 + }, + { + "epoch": 0.3499320747646039, + "grad_norm": 5.0, + "learning_rate": 0.00016430520481623977, + "loss": 1.063, + "step": 3735 + }, + { + "epoch": 0.3504005246638872, + "grad_norm": 0.6328125, + "learning_rate": 0.00016417987670031106, + "loss": 0.8845, + "step": 3740 + }, + { + "epoch": 0.35086897456317045, + "grad_norm": 0.59375, + "learning_rate": 0.00016405437693438652, + "loss": 0.9233, + "step": 3745 + }, + { + "epoch": 0.35133742446245375, + "grad_norm": 0.5546875, + "learning_rate": 0.00016392870585411702, + "loss": 0.9112, + "step": 3750 + }, + { + "epoch": 0.351805874361737, + "grad_norm": 0.625, + "learning_rate": 0.0001638028637956117, + "loss": 0.9126, + "step": 3755 + }, + { + "epoch": 0.3522743242610203, + "grad_norm": 0.578125, + "learning_rate": 0.00016367685109543688, + "loss": 0.8878, + "step": 3760 + }, + { + "epoch": 0.35274277416030353, + "grad_norm": 0.5546875, + "learning_rate": 0.00016355066809061537, + "loss": 0.8896, + "step": 3765 + }, + { + "epoch": 0.3532112240595868, + "grad_norm": 0.75390625, + "learning_rate": 0.00016342431511862538, + "loss": 0.9075, + "step": 3770 + }, + { + "epoch": 0.3536796739588701, + "grad_norm": 0.5703125, + "learning_rate": 0.00016329779251739982, + "loss": 0.8908, + "step": 3775 + }, + { + "epoch": 0.35414812385815336, + "grad_norm": 0.7265625, + "learning_rate": 0.0001631711006253251, + "loss": 0.8959, + "step": 3780 + }, + { + "epoch": 0.35461657375743666, + "grad_norm": 0.65234375, + "learning_rate": 0.00016304423978124054, + "loss": 0.8979, + "step": 3785 + }, + { + "epoch": 0.3550850236567199, + "grad_norm": 0.83203125, + "learning_rate": 0.00016291721032443723, + "loss": 0.9007, + "step": 3790 + }, + { + "epoch": 0.3555534735560032, + "grad_norm": 0.6015625, + "learning_rate": 0.00016279001259465734, + "loss": 0.928, + "step": 3795 + }, + { + "epoch": 0.35602192345528644, + "grad_norm": 0.7734375, + "learning_rate": 0.00016266264693209293, + "loss": 0.8735, + "step": 3800 + }, + { + "epoch": 0.35649037335456973, + "grad_norm": 0.62109375, + "learning_rate": 0.00016253511367738535, + "loss": 0.909, + "step": 3805 + }, + { + "epoch": 0.356958823253853, + "grad_norm": 0.58203125, + "learning_rate": 0.00016240741317162408, + "loss": 0.8885, + "step": 3810 + }, + { + "epoch": 0.35742727315313627, + "grad_norm": 0.5859375, + "learning_rate": 0.00016227954575634594, + "loss": 0.9338, + "step": 3815 + }, + { + "epoch": 0.35789572305241957, + "grad_norm": 0.6640625, + "learning_rate": 0.00016215151177353417, + "loss": 0.9067, + "step": 3820 + }, + { + "epoch": 0.3583641729517028, + "grad_norm": 0.6796875, + "learning_rate": 0.00016202331156561748, + "loss": 0.9244, + "step": 3825 + }, + { + "epoch": 0.3588326228509861, + "grad_norm": 0.71875, + "learning_rate": 0.0001618949454754691, + "loss": 0.907, + "step": 3830 + }, + { + "epoch": 0.35930107275026935, + "grad_norm": 0.73046875, + "learning_rate": 0.00016176641384640607, + "loss": 0.8841, + "step": 3835 + }, + { + "epoch": 0.35976952264955264, + "grad_norm": 0.83984375, + "learning_rate": 0.00016163771702218803, + "loss": 0.9133, + "step": 3840 + }, + { + "epoch": 0.3602379725488359, + "grad_norm": 0.671875, + "learning_rate": 0.0001615088553470164, + "loss": 0.9089, + "step": 3845 + }, + { + "epoch": 0.3607064224481192, + "grad_norm": 0.80078125, + "learning_rate": 0.00016137982916553367, + "loss": 0.8985, + "step": 3850 + }, + { + "epoch": 0.3611748723474024, + "grad_norm": 0.6171875, + "learning_rate": 0.00016125063882282216, + "loss": 0.9012, + "step": 3855 + }, + { + "epoch": 0.3616433222466857, + "grad_norm": 0.6171875, + "learning_rate": 0.0001611212846644033, + "loss": 0.9211, + "step": 3860 + }, + { + "epoch": 0.362111772145969, + "grad_norm": 0.68359375, + "learning_rate": 0.0001609917670362366, + "loss": 0.905, + "step": 3865 + }, + { + "epoch": 0.36258022204525225, + "grad_norm": 0.5859375, + "learning_rate": 0.00016086208628471877, + "loss": 0.948, + "step": 3870 + }, + { + "epoch": 0.36304867194453555, + "grad_norm": 0.6171875, + "learning_rate": 0.00016073224275668294, + "loss": 0.8925, + "step": 3875 + }, + { + "epoch": 0.3635171218438188, + "grad_norm": 0.53125, + "learning_rate": 0.00016060223679939742, + "loss": 0.8875, + "step": 3880 + }, + { + "epoch": 0.3639855717431021, + "grad_norm": 0.77734375, + "learning_rate": 0.00016047206876056497, + "loss": 0.8999, + "step": 3885 + }, + { + "epoch": 0.36445402164238533, + "grad_norm": 0.71875, + "learning_rate": 0.00016034173898832189, + "loss": 0.8879, + "step": 3890 + }, + { + "epoch": 0.3649224715416686, + "grad_norm": 0.8046875, + "learning_rate": 0.00016021124783123706, + "loss": 0.8873, + "step": 3895 + }, + { + "epoch": 0.36539092144095187, + "grad_norm": 0.70703125, + "learning_rate": 0.0001600805956383109, + "loss": 0.8845, + "step": 3900 + }, + { + "epoch": 0.36585937134023516, + "grad_norm": 0.82421875, + "learning_rate": 0.00015994978275897452, + "loss": 0.9079, + "step": 3905 + }, + { + "epoch": 0.36632782123951846, + "grad_norm": 0.69921875, + "learning_rate": 0.0001598188095430889, + "loss": 0.9029, + "step": 3910 + }, + { + "epoch": 0.3667962711388017, + "grad_norm": 0.546875, + "learning_rate": 0.00015968767634094374, + "loss": 0.9574, + "step": 3915 + }, + { + "epoch": 0.367264721038085, + "grad_norm": 0.5703125, + "learning_rate": 0.0001595563835032567, + "loss": 0.8818, + "step": 3920 + }, + { + "epoch": 0.36773317093736824, + "grad_norm": 0.5859375, + "learning_rate": 0.0001594249313811723, + "loss": 0.9133, + "step": 3925 + }, + { + "epoch": 0.36820162083665153, + "grad_norm": 0.7734375, + "learning_rate": 0.00015929332032626112, + "loss": 0.9072, + "step": 3930 + }, + { + "epoch": 0.3686700707359348, + "grad_norm": 1.296875, + "learning_rate": 0.00015916155069051882, + "loss": 1.1787, + "step": 3935 + }, + { + "epoch": 0.36913852063521807, + "grad_norm": 0.99609375, + "learning_rate": 0.0001590296228263652, + "loss": 0.8941, + "step": 3940 + }, + { + "epoch": 0.3696069705345013, + "grad_norm": 0.625, + "learning_rate": 0.00015889753708664314, + "loss": 0.9176, + "step": 3945 + }, + { + "epoch": 0.3700754204337846, + "grad_norm": 0.61328125, + "learning_rate": 0.00015876529382461793, + "loss": 0.8835, + "step": 3950 + }, + { + "epoch": 0.3705438703330679, + "grad_norm": 1.015625, + "learning_rate": 0.00015863289339397598, + "loss": 0.8881, + "step": 3955 + }, + { + "epoch": 0.37101232023235114, + "grad_norm": 0.6015625, + "learning_rate": 0.00015850033614882417, + "loss": 0.9039, + "step": 3960 + }, + { + "epoch": 0.37148077013163444, + "grad_norm": 0.6171875, + "learning_rate": 0.0001583676224436887, + "loss": 1.0914, + "step": 3965 + }, + { + "epoch": 0.3719492200309177, + "grad_norm": 0.6015625, + "learning_rate": 0.00015823475263351434, + "loss": 0.8746, + "step": 3970 + }, + { + "epoch": 0.372417669930201, + "grad_norm": 0.62890625, + "learning_rate": 0.00015810172707366325, + "loss": 0.9153, + "step": 3975 + }, + { + "epoch": 0.3728861198294842, + "grad_norm": 0.890625, + "learning_rate": 0.00015796854611991419, + "loss": 0.9237, + "step": 3980 + }, + { + "epoch": 0.3733545697287675, + "grad_norm": 0.61328125, + "learning_rate": 0.00015783521012846153, + "loss": 0.8913, + "step": 3985 + }, + { + "epoch": 0.37382301962805076, + "grad_norm": 0.73046875, + "learning_rate": 0.00015770171945591422, + "loss": 0.9, + "step": 3990 + }, + { + "epoch": 0.37429146952733405, + "grad_norm": 0.55078125, + "learning_rate": 0.00015756807445929511, + "loss": 0.8885, + "step": 3995 + }, + { + "epoch": 0.37475991942661735, + "grad_norm": 0.62890625, + "learning_rate": 0.00015743427549603947, + "loss": 0.8945, + "step": 4000 + }, + { + "epoch": 0.3752283693259006, + "grad_norm": 0.671875, + "learning_rate": 0.0001573003229239947, + "loss": 0.8817, + "step": 4005 + }, + { + "epoch": 0.3756968192251839, + "grad_norm": 0.58984375, + "learning_rate": 0.00015716621710141877, + "loss": 0.9255, + "step": 4010 + }, + { + "epoch": 0.3761652691244671, + "grad_norm": 0.61328125, + "learning_rate": 0.00015703195838697966, + "loss": 0.939, + "step": 4015 + }, + { + "epoch": 0.3766337190237504, + "grad_norm": 0.63671875, + "learning_rate": 0.00015689754713975416, + "loss": 0.8979, + "step": 4020 + }, + { + "epoch": 0.37710216892303366, + "grad_norm": 0.58203125, + "learning_rate": 0.00015676298371922713, + "loss": 0.8675, + "step": 4025 + }, + { + "epoch": 0.37757061882231696, + "grad_norm": 0.6953125, + "learning_rate": 0.0001566282684852903, + "loss": 0.9282, + "step": 4030 + }, + { + "epoch": 0.3780390687216002, + "grad_norm": 0.58984375, + "learning_rate": 0.00015649340179824158, + "loss": 0.9367, + "step": 4035 + }, + { + "epoch": 0.3785075186208835, + "grad_norm": 0.61328125, + "learning_rate": 0.00015635838401878373, + "loss": 0.8955, + "step": 4040 + }, + { + "epoch": 0.3789759685201668, + "grad_norm": 0.87890625, + "learning_rate": 0.0001562232155080238, + "loss": 0.9024, + "step": 4045 + }, + { + "epoch": 0.37944441841945004, + "grad_norm": 0.66015625, + "learning_rate": 0.00015608789662747188, + "loss": 0.8621, + "step": 4050 + }, + { + "epoch": 0.37991286831873333, + "grad_norm": 0.6484375, + "learning_rate": 0.00015595242773904026, + "loss": 0.8734, + "step": 4055 + }, + { + "epoch": 0.3803813182180166, + "grad_norm": 0.6484375, + "learning_rate": 0.0001558168092050424, + "loss": 1.0425, + "step": 4060 + }, + { + "epoch": 0.38084976811729987, + "grad_norm": 0.609375, + "learning_rate": 0.00015568104138819202, + "loss": 0.9059, + "step": 4065 + }, + { + "epoch": 0.3813182180165831, + "grad_norm": 0.8984375, + "learning_rate": 0.00015554512465160207, + "loss": 0.8936, + "step": 4070 + }, + { + "epoch": 0.3817866679158664, + "grad_norm": 0.57421875, + "learning_rate": 0.0001554090593587838, + "loss": 0.8847, + "step": 4075 + }, + { + "epoch": 0.38225511781514965, + "grad_norm": 0.6171875, + "learning_rate": 0.00015527284587364579, + "loss": 0.8846, + "step": 4080 + }, + { + "epoch": 0.38272356771443294, + "grad_norm": 0.58203125, + "learning_rate": 0.0001551364845604929, + "loss": 0.9156, + "step": 4085 + }, + { + "epoch": 0.38319201761371624, + "grad_norm": 0.671875, + "learning_rate": 0.00015499997578402548, + "loss": 0.9226, + "step": 4090 + }, + { + "epoch": 0.3836604675129995, + "grad_norm": 0.5546875, + "learning_rate": 0.00015486331990933818, + "loss": 0.9058, + "step": 4095 + }, + { + "epoch": 0.3841289174122828, + "grad_norm": 0.6796875, + "learning_rate": 0.00015472651730191904, + "loss": 0.9001, + "step": 4100 + }, + { + "epoch": 0.384597367311566, + "grad_norm": 0.61328125, + "learning_rate": 0.00015458956832764865, + "loss": 0.8936, + "step": 4105 + }, + { + "epoch": 0.3850658172108493, + "grad_norm": 0.58984375, + "learning_rate": 0.00015445247335279895, + "loss": 0.916, + "step": 4110 + }, + { + "epoch": 0.38553426711013256, + "grad_norm": 0.65234375, + "learning_rate": 0.00015431523274403244, + "loss": 0.9164, + "step": 4115 + }, + { + "epoch": 0.38600271700941585, + "grad_norm": 0.55078125, + "learning_rate": 0.00015417784686840103, + "loss": 0.8978, + "step": 4120 + }, + { + "epoch": 0.3864711669086991, + "grad_norm": 0.671875, + "learning_rate": 0.00015404031609334527, + "loss": 0.9038, + "step": 4125 + }, + { + "epoch": 0.3869396168079824, + "grad_norm": 0.765625, + "learning_rate": 0.0001539026407866931, + "loss": 0.914, + "step": 4130 + }, + { + "epoch": 0.38740806670726563, + "grad_norm": 0.58984375, + "learning_rate": 0.00015376482131665924, + "loss": 0.9174, + "step": 4135 + }, + { + "epoch": 0.3878765166065489, + "grad_norm": 1.0546875, + "learning_rate": 0.00015362685805184366, + "loss": 0.9186, + "step": 4140 + }, + { + "epoch": 0.3883449665058322, + "grad_norm": 0.7578125, + "learning_rate": 0.00015348875136123117, + "loss": 0.9066, + "step": 4145 + }, + { + "epoch": 0.38881341640511546, + "grad_norm": 0.9375, + "learning_rate": 0.00015335050161419003, + "loss": 0.9188, + "step": 4150 + }, + { + "epoch": 0.38928186630439876, + "grad_norm": 0.69921875, + "learning_rate": 0.0001532121091804713, + "loss": 0.9164, + "step": 4155 + }, + { + "epoch": 0.389750316203682, + "grad_norm": 0.65625, + "learning_rate": 0.0001530735744302073, + "loss": 0.8988, + "step": 4160 + }, + { + "epoch": 0.3902187661029653, + "grad_norm": 0.5625, + "learning_rate": 0.00015293489773391138, + "loss": 0.9127, + "step": 4165 + }, + { + "epoch": 0.39068721600224854, + "grad_norm": 0.65625, + "learning_rate": 0.00015279607946247625, + "loss": 0.8976, + "step": 4170 + }, + { + "epoch": 0.39115566590153183, + "grad_norm": 0.59375, + "learning_rate": 0.00015265711998717338, + "loss": 0.895, + "step": 4175 + }, + { + "epoch": 0.3916241158008151, + "grad_norm": 0.58203125, + "learning_rate": 0.0001525180196796519, + "loss": 0.9064, + "step": 4180 + }, + { + "epoch": 0.39209256570009837, + "grad_norm": 0.6015625, + "learning_rate": 0.00015237877891193747, + "loss": 0.8892, + "step": 4185 + }, + { + "epoch": 0.39256101559938167, + "grad_norm": 0.83203125, + "learning_rate": 0.00015223939805643162, + "loss": 0.8649, + "step": 4190 + }, + { + "epoch": 0.3930294654986649, + "grad_norm": 0.58203125, + "learning_rate": 0.00015209987748591043, + "loss": 0.889, + "step": 4195 + }, + { + "epoch": 0.3934979153979482, + "grad_norm": 0.578125, + "learning_rate": 0.00015196021757352357, + "loss": 0.882, + "step": 4200 + }, + { + "epoch": 0.39396636529723145, + "grad_norm": 0.546875, + "learning_rate": 0.00015182041869279354, + "loss": 0.8787, + "step": 4205 + }, + { + "epoch": 0.39443481519651474, + "grad_norm": 0.6796875, + "learning_rate": 0.00015168048121761438, + "loss": 0.8888, + "step": 4210 + }, + { + "epoch": 0.394903265095798, + "grad_norm": 0.98046875, + "learning_rate": 0.00015154040552225095, + "loss": 0.9049, + "step": 4215 + }, + { + "epoch": 0.3953717149950813, + "grad_norm": 0.69921875, + "learning_rate": 0.0001514001919813376, + "loss": 0.9112, + "step": 4220 + }, + { + "epoch": 0.3958401648943645, + "grad_norm": 0.52734375, + "learning_rate": 0.00015125984096987755, + "loss": 0.9035, + "step": 4225 + }, + { + "epoch": 0.3963086147936478, + "grad_norm": 0.5625, + "learning_rate": 0.0001511193528632415, + "loss": 0.8673, + "step": 4230 + }, + { + "epoch": 0.3967770646929311, + "grad_norm": 0.52734375, + "learning_rate": 0.00015097872803716692, + "loss": 0.9297, + "step": 4235 + }, + { + "epoch": 0.39724551459221435, + "grad_norm": 0.5390625, + "learning_rate": 0.00015083796686775694, + "loss": 0.9039, + "step": 4240 + }, + { + "epoch": 0.39771396449149765, + "grad_norm": 0.60546875, + "learning_rate": 0.00015069706973147926, + "loss": 0.8993, + "step": 4245 + }, + { + "epoch": 0.3981824143907809, + "grad_norm": 0.5625, + "learning_rate": 0.00015055603700516533, + "loss": 0.8693, + "step": 4250 + }, + { + "epoch": 0.3986508642900642, + "grad_norm": 0.60546875, + "learning_rate": 0.00015041486906600918, + "loss": 0.8812, + "step": 4255 + }, + { + "epoch": 0.39911931418934743, + "grad_norm": 0.6796875, + "learning_rate": 0.00015027356629156646, + "loss": 0.8836, + "step": 4260 + }, + { + "epoch": 0.3995877640886307, + "grad_norm": 0.85546875, + "learning_rate": 0.00015013212905975345, + "loss": 0.881, + "step": 4265 + }, + { + "epoch": 0.40005621398791397, + "grad_norm": 0.57421875, + "learning_rate": 0.00014999055774884604, + "loss": 0.8774, + "step": 4270 + }, + { + "epoch": 0.40052466388719726, + "grad_norm": 0.609375, + "learning_rate": 0.0001498488527374788, + "loss": 0.9092, + "step": 4275 + }, + { + "epoch": 0.40099311378648056, + "grad_norm": 0.58984375, + "learning_rate": 0.0001497070144046437, + "loss": 0.8907, + "step": 4280 + }, + { + "epoch": 0.4014615636857638, + "grad_norm": 0.59765625, + "learning_rate": 0.00014956504312968943, + "loss": 0.9038, + "step": 4285 + }, + { + "epoch": 0.4019300135850471, + "grad_norm": 0.58984375, + "learning_rate": 0.00014942293929232019, + "loss": 1.023, + "step": 4290 + }, + { + "epoch": 0.40239846348433034, + "grad_norm": 0.5625, + "learning_rate": 0.00014928070327259476, + "loss": 0.8826, + "step": 4295 + }, + { + "epoch": 0.40286691338361363, + "grad_norm": 0.61328125, + "learning_rate": 0.0001491383354509253, + "loss": 0.8968, + "step": 4300 + }, + { + "epoch": 0.4033353632828969, + "grad_norm": 0.55859375, + "learning_rate": 0.00014899583620807668, + "loss": 0.8791, + "step": 4305 + }, + { + "epoch": 0.40380381318218017, + "grad_norm": 0.7421875, + "learning_rate": 0.0001488532059251651, + "loss": 0.9137, + "step": 4310 + }, + { + "epoch": 0.4042722630814634, + "grad_norm": 1.109375, + "learning_rate": 0.00014871044498365735, + "loss": 0.8906, + "step": 4315 + }, + { + "epoch": 0.4047407129807467, + "grad_norm": 0.875, + "learning_rate": 0.00014856755376536953, + "loss": 0.908, + "step": 4320 + }, + { + "epoch": 0.40520916288003, + "grad_norm": 0.953125, + "learning_rate": 0.0001484245326524663, + "loss": 0.8763, + "step": 4325 + }, + { + "epoch": 0.40567761277931325, + "grad_norm": 0.57421875, + "learning_rate": 0.0001482813820274596, + "loss": 0.9505, + "step": 4330 + }, + { + "epoch": 0.40614606267859654, + "grad_norm": 0.62109375, + "learning_rate": 0.0001481381022732079, + "loss": 0.8893, + "step": 4335 + }, + { + "epoch": 0.4066145125778798, + "grad_norm": 0.65625, + "learning_rate": 0.0001479946937729149, + "loss": 0.9082, + "step": 4340 + }, + { + "epoch": 0.4070829624771631, + "grad_norm": 0.78515625, + "learning_rate": 0.00014785115691012864, + "loss": 0.9011, + "step": 4345 + }, + { + "epoch": 0.4075514123764463, + "grad_norm": 0.54296875, + "learning_rate": 0.0001477074920687406, + "loss": 0.8987, + "step": 4350 + }, + { + "epoch": 0.4080198622757296, + "grad_norm": 0.86328125, + "learning_rate": 0.0001475636996329844, + "loss": 0.8585, + "step": 4355 + }, + { + "epoch": 0.40848831217501286, + "grad_norm": 0.70703125, + "learning_rate": 0.00014741977998743497, + "loss": 0.8986, + "step": 4360 + }, + { + "epoch": 0.40895676207429615, + "grad_norm": 0.6484375, + "learning_rate": 0.00014727573351700743, + "loss": 0.9062, + "step": 4365 + }, + { + "epoch": 0.40942521197357945, + "grad_norm": 0.859375, + "learning_rate": 0.00014713156060695615, + "loss": 0.9112, + "step": 4370 + }, + { + "epoch": 0.4098936618728627, + "grad_norm": 0.55078125, + "learning_rate": 0.0001469872616428736, + "loss": 0.9111, + "step": 4375 + }, + { + "epoch": 0.410362111772146, + "grad_norm": 0.7890625, + "learning_rate": 0.0001468428370106895, + "loss": 0.8804, + "step": 4380 + }, + { + "epoch": 0.41083056167142923, + "grad_norm": 0.65625, + "learning_rate": 0.00014669828709666947, + "loss": 0.8998, + "step": 4385 + }, + { + "epoch": 0.4112990115707125, + "grad_norm": 0.77734375, + "learning_rate": 0.0001465536122874144, + "loss": 0.89, + "step": 4390 + }, + { + "epoch": 0.41176746146999577, + "grad_norm": 0.62109375, + "learning_rate": 0.00014640881296985913, + "loss": 0.8845, + "step": 4395 + }, + { + "epoch": 0.41223591136927906, + "grad_norm": 0.55859375, + "learning_rate": 0.00014626388953127144, + "loss": 0.8634, + "step": 4400 + }, + { + "epoch": 0.4127043612685623, + "grad_norm": 0.74609375, + "learning_rate": 0.00014611884235925119, + "loss": 0.9032, + "step": 4405 + }, + { + "epoch": 0.4131728111678456, + "grad_norm": 0.6796875, + "learning_rate": 0.0001459736718417291, + "loss": 0.9157, + "step": 4410 + }, + { + "epoch": 0.4136412610671289, + "grad_norm": 0.6796875, + "learning_rate": 0.0001458283783669658, + "loss": 0.903, + "step": 4415 + }, + { + "epoch": 0.41410971096641214, + "grad_norm": 0.59375, + "learning_rate": 0.0001456829623235507, + "loss": 0.8987, + "step": 4420 + }, + { + "epoch": 0.41457816086569543, + "grad_norm": 0.55078125, + "learning_rate": 0.00014553742410040117, + "loss": 0.9064, + "step": 4425 + }, + { + "epoch": 0.4150466107649787, + "grad_norm": 0.76171875, + "learning_rate": 0.0001453917640867612, + "loss": 0.8856, + "step": 4430 + }, + { + "epoch": 0.41551506066426197, + "grad_norm": 0.578125, + "learning_rate": 0.00014524598267220062, + "loss": 0.9017, + "step": 4435 + }, + { + "epoch": 0.4159835105635452, + "grad_norm": 0.96484375, + "learning_rate": 0.00014510008024661388, + "loss": 0.8495, + "step": 4440 + }, + { + "epoch": 0.4164519604628285, + "grad_norm": 0.5703125, + "learning_rate": 0.00014495405720021907, + "loss": 0.8981, + "step": 4445 + }, + { + "epoch": 0.41692041036211175, + "grad_norm": 0.58984375, + "learning_rate": 0.00014480791392355695, + "loss": 1.005, + "step": 4450 + }, + { + "epoch": 0.41738886026139504, + "grad_norm": 0.609375, + "learning_rate": 0.00014466165080748982, + "loss": 0.8967, + "step": 4455 + }, + { + "epoch": 0.41785731016067834, + "grad_norm": 0.64453125, + "learning_rate": 0.00014451526824320042, + "loss": 0.8819, + "step": 4460 + }, + { + "epoch": 0.4183257600599616, + "grad_norm": 0.63671875, + "learning_rate": 0.00014436876662219102, + "loss": 0.9043, + "step": 4465 + }, + { + "epoch": 0.4187942099592449, + "grad_norm": 0.97265625, + "learning_rate": 0.0001442221463362823, + "loss": 0.9211, + "step": 4470 + }, + { + "epoch": 0.4192626598585281, + "grad_norm": 0.96875, + "learning_rate": 0.00014407540777761231, + "loss": 0.906, + "step": 4475 + }, + { + "epoch": 0.4197311097578114, + "grad_norm": 0.9140625, + "learning_rate": 0.0001439285513386354, + "loss": 0.8962, + "step": 4480 + }, + { + "epoch": 0.42019955965709466, + "grad_norm": 0.64453125, + "learning_rate": 0.00014378157741212126, + "loss": 0.9116, + "step": 4485 + }, + { + "epoch": 0.42066800955637795, + "grad_norm": 0.625, + "learning_rate": 0.0001436344863911537, + "loss": 0.8853, + "step": 4490 + }, + { + "epoch": 0.4211364594556612, + "grad_norm": 1.265625, + "learning_rate": 0.0001434872786691298, + "loss": 0.9244, + "step": 4495 + }, + { + "epoch": 0.4216049093549445, + "grad_norm": 0.99609375, + "learning_rate": 0.00014333995463975863, + "loss": 0.9226, + "step": 4500 + }, + { + "epoch": 0.4220733592542278, + "grad_norm": 0.546875, + "learning_rate": 0.00014319251469706052, + "loss": 0.8893, + "step": 4505 + }, + { + "epoch": 0.422541809153511, + "grad_norm": 0.60546875, + "learning_rate": 0.00014304495923536564, + "loss": 0.8685, + "step": 4510 + }, + { + "epoch": 0.4230102590527943, + "grad_norm": 0.55859375, + "learning_rate": 0.00014289728864931315, + "loss": 0.8663, + "step": 4515 + }, + { + "epoch": 0.42347870895207756, + "grad_norm": 0.6484375, + "learning_rate": 0.0001427495033338502, + "loss": 0.8739, + "step": 4520 + }, + { + "epoch": 0.42394715885136086, + "grad_norm": 0.75, + "learning_rate": 0.00014260160368423068, + "loss": 0.8792, + "step": 4525 + }, + { + "epoch": 0.4244156087506441, + "grad_norm": 0.578125, + "learning_rate": 0.00014245359009601436, + "loss": 0.8911, + "step": 4530 + }, + { + "epoch": 0.4248840586499274, + "grad_norm": 0.56640625, + "learning_rate": 0.0001423054629650656, + "loss": 0.9243, + "step": 4535 + }, + { + "epoch": 0.42535250854921064, + "grad_norm": 0.8125, + "learning_rate": 0.00014215722268755268, + "loss": 0.9, + "step": 4540 + }, + { + "epoch": 0.42582095844849394, + "grad_norm": 0.75, + "learning_rate": 0.0001420088696599462, + "loss": 0.8846, + "step": 4545 + }, + { + "epoch": 0.42628940834777723, + "grad_norm": 0.8125, + "learning_rate": 0.00014186040427901847, + "loss": 0.8815, + "step": 4550 + }, + { + "epoch": 0.4267578582470605, + "grad_norm": 0.67578125, + "learning_rate": 0.00014171182694184233, + "loss": 0.9076, + "step": 4555 + }, + { + "epoch": 0.42722630814634377, + "grad_norm": 0.57421875, + "learning_rate": 0.00014156313804578993, + "loss": 0.9037, + "step": 4560 + }, + { + "epoch": 0.427694758045627, + "grad_norm": 0.5546875, + "learning_rate": 0.00014141433798853188, + "loss": 0.9161, + "step": 4565 + }, + { + "epoch": 0.4281632079449103, + "grad_norm": 0.54296875, + "learning_rate": 0.00014126542716803603, + "loss": 0.8896, + "step": 4570 + }, + { + "epoch": 0.42863165784419355, + "grad_norm": 1.3828125, + "learning_rate": 0.00014111640598256652, + "loss": 0.8899, + "step": 4575 + }, + { + "epoch": 0.42910010774347684, + "grad_norm": 0.95703125, + "learning_rate": 0.0001409672748306826, + "loss": 0.8595, + "step": 4580 + }, + { + "epoch": 0.4295685576427601, + "grad_norm": 0.5546875, + "learning_rate": 0.00014081803411123767, + "loss": 0.8784, + "step": 4585 + }, + { + "epoch": 0.4300370075420434, + "grad_norm": 0.82421875, + "learning_rate": 0.00014066868422337822, + "loss": 0.8833, + "step": 4590 + }, + { + "epoch": 0.4305054574413267, + "grad_norm": 1.03125, + "learning_rate": 0.0001405192255665426, + "loss": 0.8583, + "step": 4595 + }, + { + "epoch": 0.4309739073406099, + "grad_norm": 0.9765625, + "learning_rate": 0.00014036965854046014, + "loss": 0.9033, + "step": 4600 + }, + { + "epoch": 0.4314423572398932, + "grad_norm": 0.56640625, + "learning_rate": 0.00014021998354514997, + "loss": 0.9071, + "step": 4605 + }, + { + "epoch": 0.43191080713917646, + "grad_norm": 0.625, + "learning_rate": 0.00014007020098092, + "loss": 0.9111, + "step": 4610 + }, + { + "epoch": 0.43237925703845975, + "grad_norm": 0.74609375, + "learning_rate": 0.00013992031124836587, + "loss": 0.9257, + "step": 4615 + }, + { + "epoch": 0.432847706937743, + "grad_norm": 0.578125, + "learning_rate": 0.00013977031474836978, + "loss": 0.9058, + "step": 4620 + }, + { + "epoch": 0.4333161568370263, + "grad_norm": 0.60546875, + "learning_rate": 0.00013962021188209948, + "loss": 0.8888, + "step": 4625 + }, + { + "epoch": 0.43378460673630953, + "grad_norm": 0.5703125, + "learning_rate": 0.00013947000305100725, + "loss": 0.882, + "step": 4630 + }, + { + "epoch": 0.4342530566355928, + "grad_norm": 0.62890625, + "learning_rate": 0.00013931968865682883, + "loss": 0.8791, + "step": 4635 + }, + { + "epoch": 0.43472150653487607, + "grad_norm": 0.5703125, + "learning_rate": 0.00013916926910158208, + "loss": 0.9085, + "step": 4640 + }, + { + "epoch": 0.43518995643415936, + "grad_norm": 0.57421875, + "learning_rate": 0.00013901874478756634, + "loss": 0.8917, + "step": 4645 + }, + { + "epoch": 0.43565840633344266, + "grad_norm": 0.63671875, + "learning_rate": 0.00013886811611736098, + "loss": 0.8787, + "step": 4650 + }, + { + "epoch": 0.4361268562327259, + "grad_norm": 0.55078125, + "learning_rate": 0.00013871738349382457, + "loss": 0.8935, + "step": 4655 + }, + { + "epoch": 0.4365953061320092, + "grad_norm": 0.57421875, + "learning_rate": 0.00013856654732009364, + "loss": 0.9202, + "step": 4660 + }, + { + "epoch": 0.43706375603129244, + "grad_norm": 0.58203125, + "learning_rate": 0.0001384156079995817, + "loss": 0.8838, + "step": 4665 + }, + { + "epoch": 0.43753220593057573, + "grad_norm": 0.640625, + "learning_rate": 0.00013826456593597807, + "loss": 0.9288, + "step": 4670 + }, + { + "epoch": 0.438000655829859, + "grad_norm": 0.5625, + "learning_rate": 0.000138113421533247, + "loss": 0.8848, + "step": 4675 + }, + { + "epoch": 0.43846910572914227, + "grad_norm": 0.55078125, + "learning_rate": 0.00013796217519562625, + "loss": 0.9011, + "step": 4680 + }, + { + "epoch": 0.4389375556284255, + "grad_norm": 0.58203125, + "learning_rate": 0.00013781082732762637, + "loss": 0.897, + "step": 4685 + }, + { + "epoch": 0.4394060055277088, + "grad_norm": 0.64453125, + "learning_rate": 0.00013765937833402935, + "loss": 0.8621, + "step": 4690 + }, + { + "epoch": 0.4398744554269921, + "grad_norm": 0.65234375, + "learning_rate": 0.00013750782861988775, + "loss": 0.892, + "step": 4695 + }, + { + "epoch": 0.44034290532627535, + "grad_norm": 0.65234375, + "learning_rate": 0.00013735617859052336, + "loss": 0.9176, + "step": 4700 + }, + { + "epoch": 0.44081135522555864, + "grad_norm": 0.54296875, + "learning_rate": 0.00013720442865152642, + "loss": 0.9059, + "step": 4705 + }, + { + "epoch": 0.4412798051248419, + "grad_norm": 0.62890625, + "learning_rate": 0.00013705257920875423, + "loss": 0.9004, + "step": 4710 + }, + { + "epoch": 0.4417482550241252, + "grad_norm": 0.62890625, + "learning_rate": 0.00013690063066833038, + "loss": 0.923, + "step": 4715 + }, + { + "epoch": 0.4422167049234084, + "grad_norm": 0.65234375, + "learning_rate": 0.00013674858343664333, + "loss": 0.8823, + "step": 4720 + }, + { + "epoch": 0.4426851548226917, + "grad_norm": 0.6328125, + "learning_rate": 0.00013659643792034563, + "loss": 0.8835, + "step": 4725 + }, + { + "epoch": 0.44315360472197496, + "grad_norm": 0.79296875, + "learning_rate": 0.0001364441945263526, + "loss": 0.938, + "step": 4730 + }, + { + "epoch": 0.44362205462125825, + "grad_norm": 0.57421875, + "learning_rate": 0.00013629185366184138, + "loss": 0.875, + "step": 4735 + }, + { + "epoch": 0.44409050452054155, + "grad_norm": 0.59375, + "learning_rate": 0.00013613941573424978, + "loss": 0.8917, + "step": 4740 + }, + { + "epoch": 0.4445589544198248, + "grad_norm": 0.5625, + "learning_rate": 0.0001359868811512752, + "loss": 0.8446, + "step": 4745 + }, + { + "epoch": 0.4450274043191081, + "grad_norm": 0.5703125, + "learning_rate": 0.0001358342503208736, + "loss": 0.8666, + "step": 4750 + }, + { + "epoch": 0.44549585421839133, + "grad_norm": 0.63671875, + "learning_rate": 0.00013568152365125828, + "loss": 0.9304, + "step": 4755 + }, + { + "epoch": 0.4459643041176746, + "grad_norm": 0.71875, + "learning_rate": 0.00013552870155089888, + "loss": 0.8906, + "step": 4760 + }, + { + "epoch": 0.44643275401695787, + "grad_norm": 0.78515625, + "learning_rate": 0.00013537578442852035, + "loss": 0.8833, + "step": 4765 + }, + { + "epoch": 0.44690120391624116, + "grad_norm": 0.67578125, + "learning_rate": 0.00013522277269310164, + "loss": 0.9079, + "step": 4770 + }, + { + "epoch": 0.4473696538155244, + "grad_norm": 0.76953125, + "learning_rate": 0.00013506966675387488, + "loss": 0.9203, + "step": 4775 + }, + { + "epoch": 0.4478381037148077, + "grad_norm": 0.671875, + "learning_rate": 0.00013491646702032406, + "loss": 0.8886, + "step": 4780 + }, + { + "epoch": 0.448306553614091, + "grad_norm": 0.59765625, + "learning_rate": 0.00013476317390218406, + "loss": 0.8876, + "step": 4785 + }, + { + "epoch": 0.44877500351337424, + "grad_norm": 0.5546875, + "learning_rate": 0.00013460978780943948, + "loss": 0.8907, + "step": 4790 + }, + { + "epoch": 0.44924345341265753, + "grad_norm": 0.80859375, + "learning_rate": 0.0001344563091523237, + "loss": 0.8603, + "step": 4795 + }, + { + "epoch": 0.4497119033119408, + "grad_norm": 0.5625, + "learning_rate": 0.0001343027383413175, + "loss": 0.8745, + "step": 4800 + }, + { + "epoch": 0.45018035321122407, + "grad_norm": 0.640625, + "learning_rate": 0.0001341490757871482, + "loss": 0.8741, + "step": 4805 + }, + { + "epoch": 0.4506488031105073, + "grad_norm": 0.61328125, + "learning_rate": 0.0001339953219007885, + "loss": 0.907, + "step": 4810 + }, + { + "epoch": 0.4511172530097906, + "grad_norm": 0.56640625, + "learning_rate": 0.00013384147709345542, + "loss": 0.8919, + "step": 4815 + }, + { + "epoch": 0.45158570290907385, + "grad_norm": 0.5390625, + "learning_rate": 0.00013368754177660904, + "loss": 0.8807, + "step": 4820 + }, + { + "epoch": 0.45205415280835715, + "grad_norm": 0.5859375, + "learning_rate": 0.0001335335163619516, + "loss": 0.8906, + "step": 4825 + }, + { + "epoch": 0.45252260270764044, + "grad_norm": 0.890625, + "learning_rate": 0.00013337940126142622, + "loss": 1.0339, + "step": 4830 + }, + { + "epoch": 0.4529910526069237, + "grad_norm": 0.78125, + "learning_rate": 0.00013322519688721598, + "loss": 0.9284, + "step": 4835 + }, + { + "epoch": 0.453459502506207, + "grad_norm": 0.5625, + "learning_rate": 0.00013307090365174263, + "loss": 0.9172, + "step": 4840 + }, + { + "epoch": 0.4539279524054902, + "grad_norm": 0.78515625, + "learning_rate": 0.00013291652196766564, + "loss": 0.8695, + "step": 4845 + }, + { + "epoch": 0.4543964023047735, + "grad_norm": 0.65234375, + "learning_rate": 0.00013276205224788107, + "loss": 0.8932, + "step": 4850 + }, + { + "epoch": 0.45486485220405676, + "grad_norm": 0.671875, + "learning_rate": 0.00013260749490552037, + "loss": 0.881, + "step": 4855 + }, + { + "epoch": 0.45533330210334005, + "grad_norm": 0.6640625, + "learning_rate": 0.00013245285035394936, + "loss": 0.893, + "step": 4860 + }, + { + "epoch": 0.4558017520026233, + "grad_norm": 0.58984375, + "learning_rate": 0.00013229811900676707, + "loss": 0.8971, + "step": 4865 + }, + { + "epoch": 0.4562702019019066, + "grad_norm": 0.62109375, + "learning_rate": 0.0001321433012778047, + "loss": 0.8746, + "step": 4870 + }, + { + "epoch": 0.4567386518011899, + "grad_norm": 0.55078125, + "learning_rate": 0.00013198839758112453, + "loss": 0.878, + "step": 4875 + }, + { + "epoch": 0.45720710170047313, + "grad_norm": 0.58984375, + "learning_rate": 0.00013183340833101864, + "loss": 0.8936, + "step": 4880 + }, + { + "epoch": 0.4576755515997564, + "grad_norm": 0.640625, + "learning_rate": 0.000131678333942008, + "loss": 0.8799, + "step": 4885 + }, + { + "epoch": 0.45814400149903967, + "grad_norm": 0.63671875, + "learning_rate": 0.0001315231748288413, + "loss": 0.8842, + "step": 4890 + }, + { + "epoch": 0.45861245139832296, + "grad_norm": 0.68359375, + "learning_rate": 0.00013136793140649383, + "loss": 0.8941, + "step": 4895 + }, + { + "epoch": 0.4590809012976062, + "grad_norm": 1.046875, + "learning_rate": 0.00013121260409016628, + "loss": 0.879, + "step": 4900 + }, + { + "epoch": 0.4595493511968895, + "grad_norm": 0.5859375, + "learning_rate": 0.0001310571932952838, + "loss": 1.0178, + "step": 4905 + }, + { + "epoch": 0.46001780109617274, + "grad_norm": 0.5546875, + "learning_rate": 0.00013090169943749476, + "loss": 0.8989, + "step": 4910 + }, + { + "epoch": 0.46048625099545604, + "grad_norm": 0.60546875, + "learning_rate": 0.0001307461229326697, + "loss": 0.8916, + "step": 4915 + }, + { + "epoch": 0.46095470089473933, + "grad_norm": 0.55078125, + "learning_rate": 0.00013059046419690027, + "loss": 0.8907, + "step": 4920 + }, + { + "epoch": 0.4614231507940226, + "grad_norm": 1.15625, + "learning_rate": 0.00013043472364649786, + "loss": 0.9318, + "step": 4925 + }, + { + "epoch": 0.46189160069330587, + "grad_norm": 0.59765625, + "learning_rate": 0.0001302789016979929, + "loss": 0.8905, + "step": 4930 + }, + { + "epoch": 0.4623600505925891, + "grad_norm": 0.625, + "learning_rate": 0.0001301229987681334, + "loss": 0.8817, + "step": 4935 + }, + { + "epoch": 0.4628285004918724, + "grad_norm": 0.67578125, + "learning_rate": 0.0001299670152738839, + "loss": 0.8732, + "step": 4940 + }, + { + "epoch": 0.46329695039115565, + "grad_norm": 0.59765625, + "learning_rate": 0.00012981095163242453, + "loss": 0.8962, + "step": 4945 + }, + { + "epoch": 0.46376540029043894, + "grad_norm": 0.5703125, + "learning_rate": 0.00012965480826114973, + "loss": 0.8704, + "step": 4950 + }, + { + "epoch": 0.4642338501897222, + "grad_norm": 0.609375, + "learning_rate": 0.0001294985855776672, + "loss": 0.8947, + "step": 4955 + }, + { + "epoch": 0.4647023000890055, + "grad_norm": 0.58984375, + "learning_rate": 0.00012934228399979662, + "loss": 0.8959, + "step": 4960 + }, + { + "epoch": 0.4651707499882888, + "grad_norm": 0.70703125, + "learning_rate": 0.00012918590394556893, + "loss": 0.8834, + "step": 4965 + }, + { + "epoch": 0.465639199887572, + "grad_norm": 0.88671875, + "learning_rate": 0.0001290294458332247, + "loss": 0.8817, + "step": 4970 + }, + { + "epoch": 0.4661076497868553, + "grad_norm": 0.61328125, + "learning_rate": 0.00012887291008121352, + "loss": 0.8666, + "step": 4975 + }, + { + "epoch": 0.46657609968613856, + "grad_norm": 0.60546875, + "learning_rate": 0.00012871629710819234, + "loss": 0.9036, + "step": 4980 + }, + { + "epoch": 0.46704454958542185, + "grad_norm": 0.703125, + "learning_rate": 0.00012855960733302492, + "loss": 0.8676, + "step": 4985 + }, + { + "epoch": 0.4675129994847051, + "grad_norm": 0.75, + "learning_rate": 0.00012840284117478023, + "loss": 0.875, + "step": 4990 + }, + { + "epoch": 0.4679814493839884, + "grad_norm": 0.75, + "learning_rate": 0.0001282459990527316, + "loss": 0.8944, + "step": 4995 + }, + { + "epoch": 0.46844989928327163, + "grad_norm": 0.54296875, + "learning_rate": 0.0001280890813863555, + "loss": 0.8764, + "step": 5000 + }, + { + "epoch": 0.4689183491825549, + "grad_norm": 0.578125, + "learning_rate": 0.00012793208859533052, + "loss": 0.876, + "step": 5005 + }, + { + "epoch": 0.4693867990818382, + "grad_norm": 0.58984375, + "learning_rate": 0.00012777502109953613, + "loss": 0.9109, + "step": 5010 + }, + { + "epoch": 0.46985524898112146, + "grad_norm": 0.625, + "learning_rate": 0.00012761787931905153, + "loss": 0.8625, + "step": 5015 + }, + { + "epoch": 0.47032369888040476, + "grad_norm": 0.65625, + "learning_rate": 0.00012746066367415467, + "loss": 0.8682, + "step": 5020 + }, + { + "epoch": 0.470792148779688, + "grad_norm": 0.5625, + "learning_rate": 0.00012730337458532106, + "loss": 0.894, + "step": 5025 + }, + { + "epoch": 0.4712605986789713, + "grad_norm": 0.5703125, + "learning_rate": 0.0001271460124732226, + "loss": 0.8831, + "step": 5030 + }, + { + "epoch": 0.47172904857825454, + "grad_norm": 0.79296875, + "learning_rate": 0.0001269885777587265, + "loss": 0.8789, + "step": 5035 + }, + { + "epoch": 0.47219749847753784, + "grad_norm": 0.5546875, + "learning_rate": 0.00012683107086289413, + "loss": 0.8636, + "step": 5040 + }, + { + "epoch": 0.4726659483768211, + "grad_norm": 0.58203125, + "learning_rate": 0.00012667349220697998, + "loss": 0.9318, + "step": 5045 + }, + { + "epoch": 0.4731343982761044, + "grad_norm": 0.546875, + "learning_rate": 0.00012651584221243033, + "loss": 0.907, + "step": 5050 + }, + { + "epoch": 0.47360284817538767, + "grad_norm": 0.6328125, + "learning_rate": 0.00012635812130088248, + "loss": 0.877, + "step": 5055 + }, + { + "epoch": 0.4740712980746709, + "grad_norm": 0.61328125, + "learning_rate": 0.0001262003298941631, + "loss": 0.8606, + "step": 5060 + }, + { + "epoch": 0.4745397479739542, + "grad_norm": 0.58984375, + "learning_rate": 0.00012604246841428767, + "loss": 0.8781, + "step": 5065 + }, + { + "epoch": 0.47500819787323745, + "grad_norm": 0.59765625, + "learning_rate": 0.00012588453728345892, + "loss": 0.8644, + "step": 5070 + }, + { + "epoch": 0.47547664777252074, + "grad_norm": 0.89453125, + "learning_rate": 0.00012572653692406594, + "loss": 0.9029, + "step": 5075 + }, + { + "epoch": 0.475945097671804, + "grad_norm": 0.56640625, + "learning_rate": 0.0001255684677586829, + "loss": 0.8548, + "step": 5080 + }, + { + "epoch": 0.4764135475710873, + "grad_norm": 0.57421875, + "learning_rate": 0.00012541033021006806, + "loss": 1.0169, + "step": 5085 + }, + { + "epoch": 0.4768819974703705, + "grad_norm": 0.6015625, + "learning_rate": 0.00012525212470116258, + "loss": 0.8897, + "step": 5090 + }, + { + "epoch": 0.4773504473696538, + "grad_norm": 0.84765625, + "learning_rate": 0.00012509385165508934, + "loss": 0.8701, + "step": 5095 + }, + { + "epoch": 0.47781889726893706, + "grad_norm": 0.8046875, + "learning_rate": 0.00012493551149515183, + "loss": 0.866, + "step": 5100 + }, + { + "epoch": 0.47828734716822036, + "grad_norm": 0.66015625, + "learning_rate": 0.00012477710464483307, + "loss": 0.8859, + "step": 5105 + }, + { + "epoch": 0.47875579706750365, + "grad_norm": 0.53515625, + "learning_rate": 0.00012461863152779445, + "loss": 0.8764, + "step": 5110 + }, + { + "epoch": 0.4792242469667869, + "grad_norm": 0.6640625, + "learning_rate": 0.0001244600925678746, + "loss": 0.8771, + "step": 5115 + }, + { + "epoch": 0.4796926968660702, + "grad_norm": 0.578125, + "learning_rate": 0.00012430148818908822, + "loss": 0.8593, + "step": 5120 + }, + { + "epoch": 0.48016114676535343, + "grad_norm": 0.5703125, + "learning_rate": 0.00012414281881562497, + "loss": 0.8778, + "step": 5125 + }, + { + "epoch": 0.4806295966646367, + "grad_norm": 0.546875, + "learning_rate": 0.00012398408487184837, + "loss": 0.8781, + "step": 5130 + }, + { + "epoch": 0.48109804656391997, + "grad_norm": 0.6796875, + "learning_rate": 0.00012382528678229465, + "loss": 0.8822, + "step": 5135 + }, + { + "epoch": 0.48156649646320326, + "grad_norm": 0.5390625, + "learning_rate": 0.00012366642497167146, + "loss": 0.8901, + "step": 5140 + }, + { + "epoch": 0.4820349463624865, + "grad_norm": 0.59765625, + "learning_rate": 0.00012350749986485708, + "loss": 0.8575, + "step": 5145 + }, + { + "epoch": 0.4825033962617698, + "grad_norm": 0.546875, + "learning_rate": 0.00012334851188689893, + "loss": 0.9169, + "step": 5150 + }, + { + "epoch": 0.4829718461610531, + "grad_norm": 0.578125, + "learning_rate": 0.00012318946146301263, + "loss": 0.8796, + "step": 5155 + }, + { + "epoch": 0.48344029606033634, + "grad_norm": 0.625, + "learning_rate": 0.00012303034901858085, + "loss": 0.8967, + "step": 5160 + }, + { + "epoch": 0.48390874595961964, + "grad_norm": 0.7890625, + "learning_rate": 0.00012287117497915203, + "loss": 0.8841, + "step": 5165 + }, + { + "epoch": 0.4843771958589029, + "grad_norm": 2.203125, + "learning_rate": 0.00012271193977043943, + "loss": 1.0036, + "step": 5170 + }, + { + "epoch": 0.4848456457581862, + "grad_norm": 0.65234375, + "learning_rate": 0.00012255264381831992, + "loss": 0.9065, + "step": 5175 + }, + { + "epoch": 0.4853140956574694, + "grad_norm": 0.58984375, + "learning_rate": 0.00012239328754883277, + "loss": 0.9108, + "step": 5180 + }, + { + "epoch": 0.4857825455567527, + "grad_norm": 0.66796875, + "learning_rate": 0.00012223387138817854, + "loss": 0.9077, + "step": 5185 + }, + { + "epoch": 0.48625099545603595, + "grad_norm": 0.5703125, + "learning_rate": 0.00012207439576271813, + "loss": 0.8911, + "step": 5190 + }, + { + "epoch": 0.48671944535531925, + "grad_norm": 0.58203125, + "learning_rate": 0.00012191486109897137, + "loss": 0.8984, + "step": 5195 + }, + { + "epoch": 0.48718789525460254, + "grad_norm": 0.63671875, + "learning_rate": 0.00012175526782361588, + "loss": 0.903, + "step": 5200 + }, + { + "epoch": 0.4876563451538858, + "grad_norm": 0.7109375, + "learning_rate": 0.00012159561636348624, + "loss": 0.8632, + "step": 5205 + }, + { + "epoch": 0.4881247950531691, + "grad_norm": 0.6328125, + "learning_rate": 0.00012143590714557253, + "loss": 0.8835, + "step": 5210 + }, + { + "epoch": 0.4885932449524523, + "grad_norm": 0.75390625, + "learning_rate": 0.00012127614059701939, + "loss": 0.9025, + "step": 5215 + }, + { + "epoch": 0.4890616948517356, + "grad_norm": 0.74609375, + "learning_rate": 0.00012111631714512465, + "loss": 0.8723, + "step": 5220 + }, + { + "epoch": 0.48953014475101886, + "grad_norm": 0.58984375, + "learning_rate": 0.00012095643721733846, + "loss": 0.8803, + "step": 5225 + }, + { + "epoch": 0.48999859465030215, + "grad_norm": 0.578125, + "learning_rate": 0.00012079650124126196, + "loss": 0.9091, + "step": 5230 + }, + { + "epoch": 0.4904670445495854, + "grad_norm": 0.81640625, + "learning_rate": 0.00012063650964464619, + "loss": 0.887, + "step": 5235 + }, + { + "epoch": 0.4909354944488687, + "grad_norm": 0.68359375, + "learning_rate": 0.00012047646285539098, + "loss": 0.877, + "step": 5240 + }, + { + "epoch": 0.491403944348152, + "grad_norm": 0.671875, + "learning_rate": 0.00012031636130154371, + "loss": 0.906, + "step": 5245 + }, + { + "epoch": 0.49187239424743523, + "grad_norm": 0.68359375, + "learning_rate": 0.0001201562054112983, + "loss": 0.8813, + "step": 5250 + }, + { + "epoch": 0.4923408441467185, + "grad_norm": 0.96875, + "learning_rate": 0.00011999599561299398, + "loss": 0.8678, + "step": 5255 + }, + { + "epoch": 0.49280929404600177, + "grad_norm": 0.6875, + "learning_rate": 0.0001198357323351141, + "loss": 0.8975, + "step": 5260 + }, + { + "epoch": 0.49327774394528506, + "grad_norm": 0.5859375, + "learning_rate": 0.00011967541600628509, + "loss": 0.8598, + "step": 5265 + }, + { + "epoch": 0.4937461938445683, + "grad_norm": 0.6015625, + "learning_rate": 0.0001195150470552753, + "loss": 0.8887, + "step": 5270 + }, + { + "epoch": 0.4942146437438516, + "grad_norm": 0.69921875, + "learning_rate": 0.00011935462591099374, + "loss": 0.8972, + "step": 5275 + }, + { + "epoch": 0.49468309364313484, + "grad_norm": 0.58984375, + "learning_rate": 0.00011919415300248907, + "loss": 0.8584, + "step": 5280 + }, + { + "epoch": 0.49515154354241814, + "grad_norm": 0.79296875, + "learning_rate": 0.00011903362875894835, + "loss": 0.8616, + "step": 5285 + }, + { + "epoch": 0.49561999344170143, + "grad_norm": 0.6171875, + "learning_rate": 0.00011887305360969595, + "loss": 0.873, + "step": 5290 + }, + { + "epoch": 0.4960884433409847, + "grad_norm": 0.5546875, + "learning_rate": 0.00011871242798419244, + "loss": 0.8739, + "step": 5295 + }, + { + "epoch": 0.49655689324026797, + "grad_norm": 0.6875, + "learning_rate": 0.00011855175231203333, + "loss": 0.8845, + "step": 5300 + }, + { + "epoch": 0.4970253431395512, + "grad_norm": 0.59765625, + "learning_rate": 0.00011839102702294798, + "loss": 0.9028, + "step": 5305 + }, + { + "epoch": 0.4974937930388345, + "grad_norm": 0.74609375, + "learning_rate": 0.00011823025254679847, + "loss": 0.8753, + "step": 5310 + }, + { + "epoch": 0.49796224293811775, + "grad_norm": 0.63671875, + "learning_rate": 0.00011806942931357847, + "loss": 0.879, + "step": 5315 + }, + { + "epoch": 0.49843069283740105, + "grad_norm": 0.609375, + "learning_rate": 0.00011790855775341199, + "loss": 0.8764, + "step": 5320 + }, + { + "epoch": 0.4988991427366843, + "grad_norm": 0.59765625, + "learning_rate": 0.00011774763829655225, + "loss": 0.8722, + "step": 5325 + }, + { + "epoch": 0.4993675926359676, + "grad_norm": 0.5625, + "learning_rate": 0.00011758667137338073, + "loss": 0.8959, + "step": 5330 + }, + { + "epoch": 0.4998360425352509, + "grad_norm": 0.82421875, + "learning_rate": 0.00011742565741440572, + "loss": 0.8866, + "step": 5335 + }, + { + "epoch": 0.5003044924345341, + "grad_norm": 0.5234375, + "learning_rate": 0.00011726459685026135, + "loss": 1.0222, + "step": 5340 + }, + { + "epoch": 0.5007729423338174, + "grad_norm": 0.76171875, + "learning_rate": 0.00011710349011170638, + "loss": 0.8995, + "step": 5345 + }, + { + "epoch": 0.5012413922331007, + "grad_norm": 0.54296875, + "learning_rate": 0.00011694233762962312, + "loss": 0.8826, + "step": 5350 + }, + { + "epoch": 0.5017098421323839, + "grad_norm": 0.9921875, + "learning_rate": 0.00011678113983501616, + "loss": 0.8776, + "step": 5355 + }, + { + "epoch": 0.5021782920316672, + "grad_norm": 0.59375, + "learning_rate": 0.0001166198971590113, + "loss": 0.8749, + "step": 5360 + }, + { + "epoch": 0.5026467419309505, + "grad_norm": 0.5546875, + "learning_rate": 0.00011645861003285439, + "loss": 0.8867, + "step": 5365 + }, + { + "epoch": 0.5031151918302338, + "grad_norm": 0.55859375, + "learning_rate": 0.00011629727888791011, + "loss": 0.868, + "step": 5370 + }, + { + "epoch": 0.503583641729517, + "grad_norm": 0.58203125, + "learning_rate": 0.00011613590415566098, + "loss": 0.8676, + "step": 5375 + }, + { + "epoch": 0.5040520916288003, + "grad_norm": 0.7109375, + "learning_rate": 0.00011597448626770595, + "loss": 0.86, + "step": 5380 + }, + { + "epoch": 0.5045205415280836, + "grad_norm": 0.5859375, + "learning_rate": 0.00011581302565575951, + "loss": 0.8729, + "step": 5385 + }, + { + "epoch": 0.5049889914273669, + "grad_norm": 0.6875, + "learning_rate": 0.00011565152275165034, + "loss": 0.873, + "step": 5390 + }, + { + "epoch": 0.5054574413266502, + "grad_norm": 0.5703125, + "learning_rate": 0.00011548997798732031, + "loss": 0.8759, + "step": 5395 + }, + { + "epoch": 0.5059258912259333, + "grad_norm": 0.64453125, + "learning_rate": 0.00011532839179482315, + "loss": 0.8717, + "step": 5400 + }, + { + "epoch": 0.5063943411252166, + "grad_norm": 0.5859375, + "learning_rate": 0.00011516676460632343, + "loss": 0.9097, + "step": 5405 + }, + { + "epoch": 0.5068627910244999, + "grad_norm": 0.57421875, + "learning_rate": 0.00011500509685409542, + "loss": 0.8884, + "step": 5410 + }, + { + "epoch": 0.5073312409237832, + "grad_norm": 0.80859375, + "learning_rate": 0.0001148433889705218, + "loss": 0.8796, + "step": 5415 + }, + { + "epoch": 0.5077996908230664, + "grad_norm": 0.80859375, + "learning_rate": 0.0001146816413880926, + "loss": 0.8702, + "step": 5420 + }, + { + "epoch": 0.5082681407223497, + "grad_norm": 0.671875, + "learning_rate": 0.00011451985453940406, + "loss": 0.875, + "step": 5425 + }, + { + "epoch": 0.508736590621633, + "grad_norm": 0.578125, + "learning_rate": 0.0001143580288571574, + "loss": 0.8904, + "step": 5430 + }, + { + "epoch": 0.5092050405209163, + "grad_norm": 0.58203125, + "learning_rate": 0.00011419616477415775, + "loss": 0.889, + "step": 5435 + }, + { + "epoch": 0.5096734904201996, + "grad_norm": 0.55078125, + "learning_rate": 0.00011403426272331286, + "loss": 0.8519, + "step": 5440 + }, + { + "epoch": 0.5101419403194828, + "grad_norm": 0.640625, + "learning_rate": 0.0001138723231376321, + "loss": 0.8653, + "step": 5445 + }, + { + "epoch": 0.5106103902187661, + "grad_norm": 0.59375, + "learning_rate": 0.00011371034645022518, + "loss": 0.903, + "step": 5450 + }, + { + "epoch": 0.5110788401180494, + "grad_norm": 0.73828125, + "learning_rate": 0.00011354833309430113, + "loss": 0.8613, + "step": 5455 + }, + { + "epoch": 0.5115472900173327, + "grad_norm": 0.88671875, + "learning_rate": 0.0001133862835031669, + "loss": 0.8777, + "step": 5460 + }, + { + "epoch": 0.5120157399166159, + "grad_norm": 0.66015625, + "learning_rate": 0.00011322419811022644, + "loss": 0.8916, + "step": 5465 + }, + { + "epoch": 0.5124841898158992, + "grad_norm": 0.5625, + "learning_rate": 0.00011306207734897951, + "loss": 0.8502, + "step": 5470 + }, + { + "epoch": 0.5129526397151825, + "grad_norm": 0.6640625, + "learning_rate": 0.00011289992165302035, + "loss": 0.8633, + "step": 5475 + }, + { + "epoch": 0.5134210896144658, + "grad_norm": 0.59375, + "learning_rate": 0.00011273773145603668, + "loss": 0.8879, + "step": 5480 + }, + { + "epoch": 0.513889539513749, + "grad_norm": 0.61328125, + "learning_rate": 0.00011257550719180852, + "loss": 0.8895, + "step": 5485 + }, + { + "epoch": 0.5143579894130322, + "grad_norm": 0.62109375, + "learning_rate": 0.00011241324929420695, + "loss": 0.8604, + "step": 5490 + }, + { + "epoch": 0.5148264393123155, + "grad_norm": 0.69921875, + "learning_rate": 0.00011225095819719306, + "loss": 0.8618, + "step": 5495 + }, + { + "epoch": 0.5152948892115988, + "grad_norm": 0.58984375, + "learning_rate": 0.00011208863433481672, + "loss": 0.9259, + "step": 5500 + }, + { + "epoch": 0.5157633391108821, + "grad_norm": 0.53515625, + "learning_rate": 0.00011192627814121533, + "loss": 0.8885, + "step": 5505 + }, + { + "epoch": 0.5162317890101653, + "grad_norm": 0.578125, + "learning_rate": 0.00011176389005061294, + "loss": 0.8766, + "step": 5510 + }, + { + "epoch": 0.5167002389094486, + "grad_norm": 0.58203125, + "learning_rate": 0.00011160147049731878, + "loss": 0.8666, + "step": 5515 + }, + { + "epoch": 0.5171686888087319, + "grad_norm": 0.5625, + "learning_rate": 0.00011143901991572627, + "loss": 0.8706, + "step": 5520 + }, + { + "epoch": 0.5176371387080152, + "grad_norm": 0.57421875, + "learning_rate": 0.00011127653874031178, + "loss": 0.891, + "step": 5525 + }, + { + "epoch": 0.5181055886072985, + "grad_norm": 0.578125, + "learning_rate": 0.00011111402740563354, + "loss": 0.8846, + "step": 5530 + }, + { + "epoch": 0.5185740385065817, + "grad_norm": 0.57421875, + "learning_rate": 0.0001109514863463305, + "loss": 0.8769, + "step": 5535 + }, + { + "epoch": 0.519042488405865, + "grad_norm": 0.80078125, + "learning_rate": 0.00011078891599712094, + "loss": 0.8819, + "step": 5540 + }, + { + "epoch": 0.5195109383051483, + "grad_norm": 0.55859375, + "learning_rate": 0.00011062631679280165, + "loss": 0.8946, + "step": 5545 + }, + { + "epoch": 0.5199793882044316, + "grad_norm": 0.59375, + "learning_rate": 0.0001104636891682465, + "loss": 0.884, + "step": 5550 + }, + { + "epoch": 0.5204478381037148, + "grad_norm": 0.69921875, + "learning_rate": 0.00011030103355840538, + "loss": 0.8788, + "step": 5555 + }, + { + "epoch": 0.520916288002998, + "grad_norm": 0.8515625, + "learning_rate": 0.00011013835039830305, + "loss": 0.9092, + "step": 5560 + }, + { + "epoch": 0.5213847379022813, + "grad_norm": 0.86328125, + "learning_rate": 0.00010997564012303791, + "loss": 0.8494, + "step": 5565 + }, + { + "epoch": 0.5218531878015646, + "grad_norm": 0.67578125, + "learning_rate": 0.00010981290316778097, + "loss": 0.8551, + "step": 5570 + }, + { + "epoch": 0.5223216377008479, + "grad_norm": 0.5625, + "learning_rate": 0.00010965013996777451, + "loss": 0.8804, + "step": 5575 + }, + { + "epoch": 0.5227900876001311, + "grad_norm": 0.625, + "learning_rate": 0.00010948735095833104, + "loss": 0.874, + "step": 5580 + }, + { + "epoch": 0.5232585374994144, + "grad_norm": 0.5546875, + "learning_rate": 0.00010932453657483206, + "loss": 0.8647, + "step": 5585 + }, + { + "epoch": 0.5237269873986977, + "grad_norm": 0.59765625, + "learning_rate": 0.000109161697252727, + "loss": 0.8544, + "step": 5590 + }, + { + "epoch": 0.524195437297981, + "grad_norm": 0.8046875, + "learning_rate": 0.00010899883342753192, + "loss": 0.8719, + "step": 5595 + }, + { + "epoch": 0.5246638871972642, + "grad_norm": 0.66015625, + "learning_rate": 0.00010883594553482848, + "loss": 0.8891, + "step": 5600 + }, + { + "epoch": 0.5251323370965475, + "grad_norm": 0.66796875, + "learning_rate": 0.00010867303401026263, + "loss": 0.8787, + "step": 5605 + }, + { + "epoch": 0.5256007869958308, + "grad_norm": 0.66015625, + "learning_rate": 0.00010851009928954356, + "loss": 0.8583, + "step": 5610 + }, + { + "epoch": 0.5260692368951141, + "grad_norm": 0.6015625, + "learning_rate": 0.00010834714180844259, + "loss": 0.8872, + "step": 5615 + }, + { + "epoch": 0.5265376867943974, + "grad_norm": 0.67578125, + "learning_rate": 0.00010818416200279174, + "loss": 0.8636, + "step": 5620 + }, + { + "epoch": 0.5270061366936806, + "grad_norm": 0.5390625, + "learning_rate": 0.00010802116030848284, + "loss": 0.8783, + "step": 5625 + }, + { + "epoch": 0.5274745865929639, + "grad_norm": 0.59765625, + "learning_rate": 0.00010785813716146625, + "loss": 0.8731, + "step": 5630 + }, + { + "epoch": 0.5279430364922472, + "grad_norm": 0.5625, + "learning_rate": 0.00010769509299774976, + "loss": 0.886, + "step": 5635 + }, + { + "epoch": 0.5284114863915305, + "grad_norm": 0.54296875, + "learning_rate": 0.00010753202825339725, + "loss": 0.8794, + "step": 5640 + }, + { + "epoch": 0.5288799362908136, + "grad_norm": 0.625, + "learning_rate": 0.0001073689433645277, + "loss": 0.8998, + "step": 5645 + }, + { + "epoch": 0.5293483861900969, + "grad_norm": 0.68359375, + "learning_rate": 0.000107205838767314, + "loss": 0.8788, + "step": 5650 + }, + { + "epoch": 0.5298168360893802, + "grad_norm": 0.67578125, + "learning_rate": 0.00010704271489798172, + "loss": 0.8976, + "step": 5655 + }, + { + "epoch": 0.5302852859886635, + "grad_norm": 0.60546875, + "learning_rate": 0.00010687957219280798, + "loss": 0.8837, + "step": 5660 + }, + { + "epoch": 0.5307537358879468, + "grad_norm": 0.56640625, + "learning_rate": 0.00010671641108812022, + "loss": 0.8613, + "step": 5665 + }, + { + "epoch": 0.53122218578723, + "grad_norm": 0.61328125, + "learning_rate": 0.00010655323202029524, + "loss": 0.8839, + "step": 5670 + }, + { + "epoch": 0.5316906356865133, + "grad_norm": 0.5703125, + "learning_rate": 0.0001063900354257577, + "loss": 0.8867, + "step": 5675 + }, + { + "epoch": 0.5321590855857966, + "grad_norm": 0.58984375, + "learning_rate": 0.00010622682174097927, + "loss": 0.8875, + "step": 5680 + }, + { + "epoch": 0.5326275354850799, + "grad_norm": 0.5703125, + "learning_rate": 0.00010606359140247723, + "loss": 0.8742, + "step": 5685 + }, + { + "epoch": 0.5330959853843631, + "grad_norm": 0.65234375, + "learning_rate": 0.00010590034484681348, + "loss": 0.8676, + "step": 5690 + }, + { + "epoch": 0.5335644352836464, + "grad_norm": 0.60546875, + "learning_rate": 0.00010573708251059331, + "loss": 1.0158, + "step": 5695 + }, + { + "epoch": 0.5340328851829297, + "grad_norm": 0.6875, + "learning_rate": 0.00010557380483046406, + "loss": 0.8818, + "step": 5700 + }, + { + "epoch": 0.534501335082213, + "grad_norm": 0.66796875, + "learning_rate": 0.0001054105122431143, + "loss": 0.8693, + "step": 5705 + }, + { + "epoch": 0.5349697849814963, + "grad_norm": 0.6875, + "learning_rate": 0.00010524720518527231, + "loss": 0.8966, + "step": 5710 + }, + { + "epoch": 0.5354382348807795, + "grad_norm": 0.5390625, + "learning_rate": 0.00010508388409370526, + "loss": 0.9026, + "step": 5715 + }, + { + "epoch": 0.5359066847800628, + "grad_norm": 0.62109375, + "learning_rate": 0.00010492054940521763, + "loss": 0.8762, + "step": 5720 + }, + { + "epoch": 0.536375134679346, + "grad_norm": 0.5546875, + "learning_rate": 0.00010475720155665043, + "loss": 0.8968, + "step": 5725 + }, + { + "epoch": 0.5368435845786294, + "grad_norm": 0.625, + "learning_rate": 0.0001045938409848798, + "loss": 0.8619, + "step": 5730 + }, + { + "epoch": 0.5373120344779125, + "grad_norm": 0.671875, + "learning_rate": 0.00010443046812681594, + "loss": 0.8887, + "step": 5735 + }, + { + "epoch": 0.5377804843771958, + "grad_norm": 0.59765625, + "learning_rate": 0.00010426708341940183, + "loss": 0.8597, + "step": 5740 + }, + { + "epoch": 0.5382489342764791, + "grad_norm": 0.5703125, + "learning_rate": 0.00010410368729961225, + "loss": 0.8761, + "step": 5745 + }, + { + "epoch": 0.5387173841757624, + "grad_norm": 0.7421875, + "learning_rate": 0.00010394028020445244, + "loss": 0.8793, + "step": 5750 + }, + { + "epoch": 0.5391858340750457, + "grad_norm": 0.86328125, + "learning_rate": 0.00010377686257095702, + "loss": 0.8988, + "step": 5755 + }, + { + "epoch": 0.5396542839743289, + "grad_norm": 0.609375, + "learning_rate": 0.00010361343483618877, + "loss": 0.864, + "step": 5760 + }, + { + "epoch": 0.5401227338736122, + "grad_norm": 0.6640625, + "learning_rate": 0.00010344999743723749, + "loss": 0.8618, + "step": 5765 + }, + { + "epoch": 0.5405911837728955, + "grad_norm": 0.59765625, + "learning_rate": 0.00010328655081121883, + "loss": 0.8766, + "step": 5770 + }, + { + "epoch": 0.5410596336721788, + "grad_norm": 0.54296875, + "learning_rate": 0.00010312309539527318, + "loss": 0.8545, + "step": 5775 + }, + { + "epoch": 0.541528083571462, + "grad_norm": 0.53125, + "learning_rate": 0.0001029596316265643, + "loss": 0.8839, + "step": 5780 + }, + { + "epoch": 0.5419965334707453, + "grad_norm": 0.68359375, + "learning_rate": 0.00010279615994227843, + "loss": 0.8966, + "step": 5785 + }, + { + "epoch": 0.5424649833700286, + "grad_norm": 0.6875, + "learning_rate": 0.00010263268077962289, + "loss": 0.8905, + "step": 5790 + }, + { + "epoch": 0.5429334332693119, + "grad_norm": 0.55078125, + "learning_rate": 0.00010246919457582507, + "loss": 0.8528, + "step": 5795 + }, + { + "epoch": 0.5434018831685952, + "grad_norm": 0.64453125, + "learning_rate": 0.0001023057017681311, + "loss": 0.844, + "step": 5800 + }, + { + "epoch": 0.5438703330678784, + "grad_norm": 0.640625, + "learning_rate": 0.00010214220279380489, + "loss": 0.8905, + "step": 5805 + }, + { + "epoch": 0.5443387829671616, + "grad_norm": 0.5546875, + "learning_rate": 0.00010197869809012673, + "loss": 0.8604, + "step": 5810 + }, + { + "epoch": 0.544807232866445, + "grad_norm": 0.57421875, + "learning_rate": 0.0001018151880943923, + "loss": 0.9017, + "step": 5815 + }, + { + "epoch": 0.5452756827657282, + "grad_norm": 0.57421875, + "learning_rate": 0.0001016516732439114, + "loss": 0.9017, + "step": 5820 + }, + { + "epoch": 0.5457441326650114, + "grad_norm": 0.5625, + "learning_rate": 0.00010148815397600686, + "loss": 0.8484, + "step": 5825 + }, + { + "epoch": 0.5462125825642947, + "grad_norm": 0.55859375, + "learning_rate": 0.00010132463072801328, + "loss": 0.8643, + "step": 5830 + }, + { + "epoch": 0.546681032463578, + "grad_norm": 0.6484375, + "learning_rate": 0.00010116110393727592, + "loss": 0.8603, + "step": 5835 + }, + { + "epoch": 0.5471494823628613, + "grad_norm": 0.5546875, + "learning_rate": 0.00010099757404114952, + "loss": 0.8969, + "step": 5840 + }, + { + "epoch": 0.5476179322621446, + "grad_norm": 0.578125, + "learning_rate": 0.0001008340414769971, + "loss": 0.8644, + "step": 5845 + }, + { + "epoch": 0.5480863821614278, + "grad_norm": 0.578125, + "learning_rate": 0.00010067050668218885, + "loss": 0.8508, + "step": 5850 + }, + { + "epoch": 0.5485548320607111, + "grad_norm": 0.57421875, + "learning_rate": 0.00010050697009410095, + "loss": 0.8716, + "step": 5855 + }, + { + "epoch": 0.5490232819599944, + "grad_norm": 0.578125, + "learning_rate": 0.00010034343215011426, + "loss": 0.8546, + "step": 5860 + }, + { + "epoch": 0.5494917318592777, + "grad_norm": 0.52734375, + "learning_rate": 0.00010017989328761339, + "loss": 0.8651, + "step": 5865 + }, + { + "epoch": 0.5499601817585609, + "grad_norm": 0.609375, + "learning_rate": 0.00010001635394398534, + "loss": 0.8669, + "step": 5870 + }, + { + "epoch": 0.5504286316578442, + "grad_norm": 0.55859375, + "learning_rate": 9.985281455661846e-05, + "loss": 0.868, + "step": 5875 + }, + { + "epoch": 0.5508970815571275, + "grad_norm": 0.625, + "learning_rate": 9.968927556290115e-05, + "loss": 0.8629, + "step": 5880 + }, + { + "epoch": 0.5513655314564108, + "grad_norm": 0.5703125, + "learning_rate": 9.952573740022081e-05, + "loss": 0.8788, + "step": 5885 + }, + { + "epoch": 0.5518339813556941, + "grad_norm": 0.578125, + "learning_rate": 9.93622005059625e-05, + "loss": 0.8598, + "step": 5890 + }, + { + "epoch": 0.5523024312549772, + "grad_norm": 0.5859375, + "learning_rate": 9.919866531750807e-05, + "loss": 0.8892, + "step": 5895 + }, + { + "epoch": 0.5527708811542605, + "grad_norm": 0.671875, + "learning_rate": 9.903513227223467e-05, + "loss": 0.8647, + "step": 5900 + }, + { + "epoch": 0.5532393310535438, + "grad_norm": 0.5546875, + "learning_rate": 9.887160180751376e-05, + "loss": 0.8844, + "step": 5905 + }, + { + "epoch": 0.5537077809528271, + "grad_norm": 0.55859375, + "learning_rate": 9.870807436070994e-05, + "loss": 0.8746, + "step": 5910 + }, + { + "epoch": 0.5541762308521103, + "grad_norm": 0.7578125, + "learning_rate": 9.854455036917966e-05, + "loss": 0.8988, + "step": 5915 + }, + { + "epoch": 0.5546446807513936, + "grad_norm": 0.59765625, + "learning_rate": 9.838103027027016e-05, + "loss": 0.8698, + "step": 5920 + }, + { + "epoch": 0.5551131306506769, + "grad_norm": 0.5703125, + "learning_rate": 9.821751450131835e-05, + "loss": 0.8662, + "step": 5925 + }, + { + "epoch": 0.5555815805499602, + "grad_norm": 0.59375, + "learning_rate": 9.805400349964942e-05, + "loss": 0.8566, + "step": 5930 + }, + { + "epoch": 0.5560500304492435, + "grad_norm": 0.5234375, + "learning_rate": 9.789049770257588e-05, + "loss": 0.8628, + "step": 5935 + }, + { + "epoch": 0.5565184803485267, + "grad_norm": 0.60546875, + "learning_rate": 9.772699754739631e-05, + "loss": 0.8937, + "step": 5940 + }, + { + "epoch": 0.55698693024781, + "grad_norm": 0.5625, + "learning_rate": 9.756350347139426e-05, + "loss": 0.8857, + "step": 5945 + }, + { + "epoch": 0.5574553801470933, + "grad_norm": 0.765625, + "learning_rate": 9.740001591183691e-05, + "loss": 0.8391, + "step": 5950 + }, + { + "epoch": 0.5579238300463766, + "grad_norm": 0.5625, + "learning_rate": 9.723653530597411e-05, + "loss": 0.8685, + "step": 5955 + }, + { + "epoch": 0.5583922799456598, + "grad_norm": 0.62109375, + "learning_rate": 9.707306209103706e-05, + "loss": 0.8784, + "step": 5960 + }, + { + "epoch": 0.5588607298449431, + "grad_norm": 0.546875, + "learning_rate": 9.690959670423724e-05, + "loss": 0.8867, + "step": 5965 + }, + { + "epoch": 0.5593291797442264, + "grad_norm": 0.57421875, + "learning_rate": 9.674613958276506e-05, + "loss": 0.9051, + "step": 5970 + }, + { + "epoch": 0.5597976296435097, + "grad_norm": 0.6640625, + "learning_rate": 9.658269116378902e-05, + "loss": 0.8998, + "step": 5975 + }, + { + "epoch": 0.560266079542793, + "grad_norm": 0.5625, + "learning_rate": 9.641925188445423e-05, + "loss": 0.87, + "step": 5980 + }, + { + "epoch": 0.5607345294420761, + "grad_norm": 0.59765625, + "learning_rate": 9.625582218188131e-05, + "loss": 0.8709, + "step": 5985 + }, + { + "epoch": 0.5612029793413594, + "grad_norm": 0.59375, + "learning_rate": 9.609240249316543e-05, + "loss": 0.8752, + "step": 5990 + }, + { + "epoch": 0.5616714292406427, + "grad_norm": 0.5390625, + "learning_rate": 9.59289932553748e-05, + "loss": 0.8497, + "step": 5995 + }, + { + "epoch": 0.562139879139926, + "grad_norm": 0.56640625, + "learning_rate": 9.576559490554978e-05, + "loss": 0.8478, + "step": 6000 + }, + { + "epoch": 0.5626083290392092, + "grad_norm": 0.578125, + "learning_rate": 9.560220788070165e-05, + "loss": 0.8729, + "step": 6005 + }, + { + "epoch": 0.5630767789384925, + "grad_norm": 0.55859375, + "learning_rate": 9.543883261781125e-05, + "loss": 0.8664, + "step": 6010 + }, + { + "epoch": 0.5635452288377758, + "grad_norm": 0.55078125, + "learning_rate": 9.52754695538281e-05, + "loss": 0.8771, + "step": 6015 + }, + { + "epoch": 0.5640136787370591, + "grad_norm": 0.66015625, + "learning_rate": 9.511211912566899e-05, + "loss": 0.8805, + "step": 6020 + }, + { + "epoch": 0.5644821286363424, + "grad_norm": 0.5546875, + "learning_rate": 9.494878177021704e-05, + "loss": 0.8739, + "step": 6025 + }, + { + "epoch": 0.5649505785356256, + "grad_norm": 0.5546875, + "learning_rate": 9.478545792432028e-05, + "loss": 0.8677, + "step": 6030 + }, + { + "epoch": 0.5654190284349089, + "grad_norm": 0.57421875, + "learning_rate": 9.462214802479074e-05, + "loss": 0.8532, + "step": 6035 + }, + { + "epoch": 0.5658874783341922, + "grad_norm": 0.57421875, + "learning_rate": 9.4458852508403e-05, + "loss": 0.8711, + "step": 6040 + }, + { + "epoch": 0.5663559282334755, + "grad_norm": 0.52734375, + "learning_rate": 9.429557181189332e-05, + "loss": 0.8325, + "step": 6045 + }, + { + "epoch": 0.5668243781327587, + "grad_norm": 0.56640625, + "learning_rate": 9.413230637195817e-05, + "loss": 0.8963, + "step": 6050 + }, + { + "epoch": 0.567292828032042, + "grad_norm": 0.546875, + "learning_rate": 9.396905662525336e-05, + "loss": 0.8704, + "step": 6055 + }, + { + "epoch": 0.5677612779313252, + "grad_norm": 0.55859375, + "learning_rate": 9.380582300839264e-05, + "loss": 0.8896, + "step": 6060 + }, + { + "epoch": 0.5682297278306085, + "grad_norm": 0.6171875, + "learning_rate": 9.364260595794663e-05, + "loss": 0.8826, + "step": 6065 + }, + { + "epoch": 0.5686981777298918, + "grad_norm": 0.55859375, + "learning_rate": 9.347940591044171e-05, + "loss": 0.8602, + "step": 6070 + }, + { + "epoch": 0.569166627629175, + "grad_norm": 0.671875, + "learning_rate": 9.331622330235869e-05, + "loss": 0.9071, + "step": 6075 + }, + { + "epoch": 0.5696350775284583, + "grad_norm": 0.6953125, + "learning_rate": 9.315305857013181e-05, + "loss": 0.8597, + "step": 6080 + }, + { + "epoch": 0.5701035274277416, + "grad_norm": 0.625, + "learning_rate": 9.29899121501475e-05, + "loss": 0.9042, + "step": 6085 + }, + { + "epoch": 0.5705719773270249, + "grad_norm": 0.640625, + "learning_rate": 9.282678447874314e-05, + "loss": 0.8935, + "step": 6090 + }, + { + "epoch": 0.5710404272263081, + "grad_norm": 0.6015625, + "learning_rate": 9.266367599220601e-05, + "loss": 0.855, + "step": 6095 + }, + { + "epoch": 0.5715088771255914, + "grad_norm": 0.5625, + "learning_rate": 9.25005871267721e-05, + "loss": 0.8672, + "step": 6100 + }, + { + "epoch": 0.5719773270248747, + "grad_norm": 0.62109375, + "learning_rate": 9.233751831862491e-05, + "loss": 0.9126, + "step": 6105 + }, + { + "epoch": 0.572445776924158, + "grad_norm": 0.60546875, + "learning_rate": 9.217447000389429e-05, + "loss": 0.8768, + "step": 6110 + }, + { + "epoch": 0.5729142268234413, + "grad_norm": 0.63671875, + "learning_rate": 9.20114426186553e-05, + "loss": 0.8743, + "step": 6115 + }, + { + "epoch": 0.5733826767227245, + "grad_norm": 0.61328125, + "learning_rate": 9.184843659892701e-05, + "loss": 0.8724, + "step": 6120 + }, + { + "epoch": 0.5738511266220078, + "grad_norm": 0.55859375, + "learning_rate": 9.168545238067134e-05, + "loss": 0.8537, + "step": 6125 + }, + { + "epoch": 0.5743195765212911, + "grad_norm": 0.5703125, + "learning_rate": 9.152249039979184e-05, + "loss": 0.8569, + "step": 6130 + }, + { + "epoch": 0.5747880264205744, + "grad_norm": 0.58203125, + "learning_rate": 9.135955109213275e-05, + "loss": 0.8927, + "step": 6135 + }, + { + "epoch": 0.5752564763198575, + "grad_norm": 0.65625, + "learning_rate": 9.119663489347753e-05, + "loss": 0.8405, + "step": 6140 + }, + { + "epoch": 0.5757249262191408, + "grad_norm": 0.6875, + "learning_rate": 9.103374223954785e-05, + "loss": 0.858, + "step": 6145 + }, + { + "epoch": 0.5761933761184241, + "grad_norm": 0.56640625, + "learning_rate": 9.08708735660025e-05, + "loss": 0.8859, + "step": 6150 + }, + { + "epoch": 0.5766618260177074, + "grad_norm": 0.58203125, + "learning_rate": 9.070802930843602e-05, + "loss": 0.8602, + "step": 6155 + }, + { + "epoch": 0.5771302759169907, + "grad_norm": 0.68359375, + "learning_rate": 9.054520990237774e-05, + "loss": 0.857, + "step": 6160 + }, + { + "epoch": 0.5775987258162739, + "grad_norm": 0.5546875, + "learning_rate": 9.038241578329048e-05, + "loss": 0.8562, + "step": 6165 + }, + { + "epoch": 0.5780671757155572, + "grad_norm": 0.5859375, + "learning_rate": 9.02196473865694e-05, + "loss": 0.8498, + "step": 6170 + }, + { + "epoch": 0.5785356256148405, + "grad_norm": 0.65625, + "learning_rate": 9.005690514754088e-05, + "loss": 0.8696, + "step": 6175 + }, + { + "epoch": 0.5790040755141238, + "grad_norm": 0.71484375, + "learning_rate": 8.989418950146143e-05, + "loss": 0.8728, + "step": 6180 + }, + { + "epoch": 0.579472525413407, + "grad_norm": 0.62109375, + "learning_rate": 8.973150088351633e-05, + "loss": 0.8778, + "step": 6185 + }, + { + "epoch": 0.5799409753126903, + "grad_norm": 0.59375, + "learning_rate": 8.956883972881859e-05, + "loss": 0.8915, + "step": 6190 + }, + { + "epoch": 0.5804094252119736, + "grad_norm": 0.5703125, + "learning_rate": 8.940620647240783e-05, + "loss": 0.8785, + "step": 6195 + }, + { + "epoch": 0.5808778751112569, + "grad_norm": 0.56640625, + "learning_rate": 8.9243601549249e-05, + "loss": 0.8806, + "step": 6200 + }, + { + "epoch": 0.5813463250105402, + "grad_norm": 0.59375, + "learning_rate": 8.908102539423131e-05, + "loss": 0.8693, + "step": 6205 + }, + { + "epoch": 0.5818147749098234, + "grad_norm": 0.5390625, + "learning_rate": 8.891847844216693e-05, + "loss": 0.8833, + "step": 6210 + }, + { + "epoch": 0.5822832248091067, + "grad_norm": 0.671875, + "learning_rate": 8.875596112779008e-05, + "loss": 0.8786, + "step": 6215 + }, + { + "epoch": 0.58275167470839, + "grad_norm": 0.6640625, + "learning_rate": 8.85934738857556e-05, + "loss": 1.011, + "step": 6220 + }, + { + "epoch": 0.5832201246076733, + "grad_norm": 0.54296875, + "learning_rate": 8.843101715063792e-05, + "loss": 1.006, + "step": 6225 + }, + { + "epoch": 0.5836885745069564, + "grad_norm": 0.9375, + "learning_rate": 8.826859135692995e-05, + "loss": 0.8741, + "step": 6230 + }, + { + "epoch": 0.5841570244062397, + "grad_norm": 0.98828125, + "learning_rate": 8.810619693904173e-05, + "loss": 0.925, + "step": 6235 + }, + { + "epoch": 0.584625474305523, + "grad_norm": 0.84765625, + "learning_rate": 8.794383433129952e-05, + "loss": 0.8497, + "step": 6240 + }, + { + "epoch": 0.5850939242048063, + "grad_norm": 0.62109375, + "learning_rate": 8.77815039679444e-05, + "loss": 0.8662, + "step": 6245 + }, + { + "epoch": 0.5855623741040896, + "grad_norm": 0.58984375, + "learning_rate": 8.761920628313119e-05, + "loss": 0.8964, + "step": 6250 + }, + { + "epoch": 0.5860308240033728, + "grad_norm": 0.65625, + "learning_rate": 8.745694171092742e-05, + "loss": 0.897, + "step": 6255 + }, + { + "epoch": 0.5864992739026561, + "grad_norm": 0.55078125, + "learning_rate": 8.729471068531199e-05, + "loss": 0.8566, + "step": 6260 + }, + { + "epoch": 0.5869677238019394, + "grad_norm": 0.60546875, + "learning_rate": 8.71325136401741e-05, + "loss": 0.9079, + "step": 6265 + }, + { + "epoch": 0.5874361737012227, + "grad_norm": 0.66796875, + "learning_rate": 8.697035100931203e-05, + "loss": 0.8806, + "step": 6270 + }, + { + "epoch": 0.5879046236005059, + "grad_norm": 0.5625, + "learning_rate": 8.680822322643209e-05, + "loss": 0.873, + "step": 6275 + }, + { + "epoch": 0.5883730734997892, + "grad_norm": 0.62890625, + "learning_rate": 8.664613072514735e-05, + "loss": 0.8524, + "step": 6280 + }, + { + "epoch": 0.5888415233990725, + "grad_norm": 0.6328125, + "learning_rate": 8.64840739389765e-05, + "loss": 0.8885, + "step": 6285 + }, + { + "epoch": 0.5893099732983558, + "grad_norm": 0.64453125, + "learning_rate": 8.632205330134269e-05, + "loss": 0.8719, + "step": 6290 + }, + { + "epoch": 0.5897784231976391, + "grad_norm": 0.70703125, + "learning_rate": 8.616006924557248e-05, + "loss": 0.863, + "step": 6295 + }, + { + "epoch": 0.5902468730969223, + "grad_norm": 0.59375, + "learning_rate": 8.599812220489453e-05, + "loss": 0.8987, + "step": 6300 + }, + { + "epoch": 0.5907153229962055, + "grad_norm": 0.66796875, + "learning_rate": 8.583621261243847e-05, + "loss": 0.8778, + "step": 6305 + }, + { + "epoch": 0.5911837728954888, + "grad_norm": 0.6640625, + "learning_rate": 8.567434090123387e-05, + "loss": 0.893, + "step": 6310 + }, + { + "epoch": 0.5916522227947721, + "grad_norm": 0.6640625, + "learning_rate": 8.55125075042089e-05, + "loss": 0.9027, + "step": 6315 + }, + { + "epoch": 0.5921206726940553, + "grad_norm": 0.60546875, + "learning_rate": 8.53507128541893e-05, + "loss": 0.8564, + "step": 6320 + }, + { + "epoch": 0.5925891225933386, + "grad_norm": 0.5390625, + "learning_rate": 8.518895738389719e-05, + "loss": 0.869, + "step": 6325 + }, + { + "epoch": 0.5930575724926219, + "grad_norm": 0.5859375, + "learning_rate": 8.502724152594983e-05, + "loss": 0.8895, + "step": 6330 + }, + { + "epoch": 0.5935260223919052, + "grad_norm": 0.54296875, + "learning_rate": 8.48655657128586e-05, + "loss": 0.8549, + "step": 6335 + }, + { + "epoch": 0.5939944722911884, + "grad_norm": 0.55859375, + "learning_rate": 8.470393037702783e-05, + "loss": 0.8571, + "step": 6340 + }, + { + "epoch": 0.5944629221904717, + "grad_norm": 0.5859375, + "learning_rate": 8.454233595075347e-05, + "loss": 0.9031, + "step": 6345 + }, + { + "epoch": 0.594931372089755, + "grad_norm": 0.53125, + "learning_rate": 8.438078286622211e-05, + "loss": 0.8523, + "step": 6350 + }, + { + "epoch": 0.5953998219890383, + "grad_norm": 0.53125, + "learning_rate": 8.421927155550984e-05, + "loss": 0.8472, + "step": 6355 + }, + { + "epoch": 0.5958682718883216, + "grad_norm": 0.73828125, + "learning_rate": 8.405780245058093e-05, + "loss": 0.8302, + "step": 6360 + }, + { + "epoch": 0.5963367217876048, + "grad_norm": 0.61328125, + "learning_rate": 8.38963759832868e-05, + "loss": 0.8678, + "step": 6365 + }, + { + "epoch": 0.5968051716868881, + "grad_norm": 0.56640625, + "learning_rate": 8.373499258536483e-05, + "loss": 0.8584, + "step": 6370 + }, + { + "epoch": 0.5972736215861714, + "grad_norm": 0.609375, + "learning_rate": 8.357365268843727e-05, + "loss": 0.863, + "step": 6375 + }, + { + "epoch": 0.5977420714854547, + "grad_norm": 0.57421875, + "learning_rate": 8.341235672400992e-05, + "loss": 0.8899, + "step": 6380 + }, + { + "epoch": 0.5982105213847378, + "grad_norm": 0.5859375, + "learning_rate": 8.325110512347115e-05, + "loss": 0.8759, + "step": 6385 + }, + { + "epoch": 0.5986789712840211, + "grad_norm": 0.58203125, + "learning_rate": 8.308989831809068e-05, + "loss": 0.8537, + "step": 6390 + }, + { + "epoch": 0.5991474211833044, + "grad_norm": 0.58203125, + "learning_rate": 8.292873673901839e-05, + "loss": 0.865, + "step": 6395 + }, + { + "epoch": 0.5996158710825877, + "grad_norm": 0.53515625, + "learning_rate": 8.276762081728327e-05, + "loss": 0.8583, + "step": 6400 + }, + { + "epoch": 0.600084320981871, + "grad_norm": 0.5546875, + "learning_rate": 8.260655098379214e-05, + "loss": 0.8602, + "step": 6405 + }, + { + "epoch": 0.6005527708811542, + "grad_norm": 0.546875, + "learning_rate": 8.244552766932852e-05, + "loss": 0.8685, + "step": 6410 + }, + { + "epoch": 0.6010212207804375, + "grad_norm": 0.59375, + "learning_rate": 8.228455130455156e-05, + "loss": 0.8539, + "step": 6415 + }, + { + "epoch": 0.6014896706797208, + "grad_norm": 0.55859375, + "learning_rate": 8.21236223199949e-05, + "loss": 0.8566, + "step": 6420 + }, + { + "epoch": 0.6019581205790041, + "grad_norm": 0.58203125, + "learning_rate": 8.196274114606539e-05, + "loss": 0.8589, + "step": 6425 + }, + { + "epoch": 0.6024265704782873, + "grad_norm": 0.5859375, + "learning_rate": 8.180190821304199e-05, + "loss": 0.8537, + "step": 6430 + }, + { + "epoch": 0.6028950203775706, + "grad_norm": 0.55078125, + "learning_rate": 8.164112395107472e-05, + "loss": 0.8657, + "step": 6435 + }, + { + "epoch": 0.6033634702768539, + "grad_norm": 0.53515625, + "learning_rate": 8.148038879018334e-05, + "loss": 0.8321, + "step": 6440 + }, + { + "epoch": 0.6038319201761372, + "grad_norm": 0.55078125, + "learning_rate": 8.13197031602564e-05, + "loss": 0.8817, + "step": 6445 + }, + { + "epoch": 0.6043003700754205, + "grad_norm": 0.671875, + "learning_rate": 8.11590674910498e-05, + "loss": 0.8418, + "step": 6450 + }, + { + "epoch": 0.6047688199747037, + "grad_norm": 0.8125, + "learning_rate": 8.099848221218602e-05, + "loss": 0.879, + "step": 6455 + }, + { + "epoch": 0.605237269873987, + "grad_norm": 0.7421875, + "learning_rate": 8.083794775315265e-05, + "loss": 0.8397, + "step": 6460 + }, + { + "epoch": 0.6057057197732703, + "grad_norm": 0.5703125, + "learning_rate": 8.067746454330137e-05, + "loss": 0.891, + "step": 6465 + }, + { + "epoch": 0.6061741696725536, + "grad_norm": 0.64453125, + "learning_rate": 8.051703301184684e-05, + "loss": 0.8681, + "step": 6470 + }, + { + "epoch": 0.6066426195718367, + "grad_norm": 0.77734375, + "learning_rate": 8.035665358786546e-05, + "loss": 0.8485, + "step": 6475 + }, + { + "epoch": 0.60711106947112, + "grad_norm": 0.69140625, + "learning_rate": 8.019632670029433e-05, + "loss": 0.8705, + "step": 6480 + }, + { + "epoch": 0.6075795193704033, + "grad_norm": 0.60546875, + "learning_rate": 8.003605277793e-05, + "loss": 0.8835, + "step": 6485 + }, + { + "epoch": 0.6080479692696866, + "grad_norm": 0.78125, + "learning_rate": 7.987583224942728e-05, + "loss": 0.8667, + "step": 6490 + }, + { + "epoch": 0.6085164191689699, + "grad_norm": 0.6171875, + "learning_rate": 7.971566554329831e-05, + "loss": 0.8857, + "step": 6495 + }, + { + "epoch": 0.6089848690682531, + "grad_norm": 0.703125, + "learning_rate": 7.955555308791125e-05, + "loss": 0.84, + "step": 6500 + }, + { + "epoch": 0.6094533189675364, + "grad_norm": 0.578125, + "learning_rate": 7.939549531148914e-05, + "loss": 0.8466, + "step": 6505 + }, + { + "epoch": 0.6099217688668197, + "grad_norm": 0.546875, + "learning_rate": 7.923549264210872e-05, + "loss": 0.8594, + "step": 6510 + }, + { + "epoch": 0.610390218766103, + "grad_norm": 0.56640625, + "learning_rate": 7.907554550769952e-05, + "loss": 0.8864, + "step": 6515 + }, + { + "epoch": 0.6108586686653862, + "grad_norm": 0.59375, + "learning_rate": 7.891565433604234e-05, + "loss": 0.8643, + "step": 6520 + }, + { + "epoch": 0.6113271185646695, + "grad_norm": 0.58203125, + "learning_rate": 7.875581955476849e-05, + "loss": 0.8533, + "step": 6525 + }, + { + "epoch": 0.6117955684639528, + "grad_norm": 0.53125, + "learning_rate": 7.859604159135828e-05, + "loss": 0.8311, + "step": 6530 + }, + { + "epoch": 0.6122640183632361, + "grad_norm": 0.546875, + "learning_rate": 7.843632087314021e-05, + "loss": 0.8928, + "step": 6535 + }, + { + "epoch": 0.6127324682625194, + "grad_norm": 0.58203125, + "learning_rate": 7.82766578272896e-05, + "loss": 0.8723, + "step": 6540 + }, + { + "epoch": 0.6132009181618026, + "grad_norm": 0.62890625, + "learning_rate": 7.811705288082754e-05, + "loss": 0.8435, + "step": 6545 + }, + { + "epoch": 0.6136693680610859, + "grad_norm": 0.62890625, + "learning_rate": 7.795750646061974e-05, + "loss": 0.8795, + "step": 6550 + }, + { + "epoch": 0.6141378179603691, + "grad_norm": 0.796875, + "learning_rate": 7.779801899337537e-05, + "loss": 0.879, + "step": 6555 + }, + { + "epoch": 0.6146062678596524, + "grad_norm": 0.56640625, + "learning_rate": 7.763859090564598e-05, + "loss": 0.8712, + "step": 6560 + }, + { + "epoch": 0.6150747177589356, + "grad_norm": 0.6328125, + "learning_rate": 7.747922262382424e-05, + "loss": 0.8458, + "step": 6565 + }, + { + "epoch": 0.6155431676582189, + "grad_norm": 0.71875, + "learning_rate": 7.731991457414287e-05, + "loss": 0.8514, + "step": 6570 + }, + { + "epoch": 0.6160116175575022, + "grad_norm": 0.6640625, + "learning_rate": 7.716066718267352e-05, + "loss": 0.8556, + "step": 6575 + }, + { + "epoch": 0.6164800674567855, + "grad_norm": 0.77734375, + "learning_rate": 7.700148087532563e-05, + "loss": 0.8541, + "step": 6580 + }, + { + "epoch": 0.6169485173560688, + "grad_norm": 0.62890625, + "learning_rate": 7.684235607784526e-05, + "loss": 0.8539, + "step": 6585 + }, + { + "epoch": 0.617416967255352, + "grad_norm": 0.6171875, + "learning_rate": 7.668329321581392e-05, + "loss": 0.8759, + "step": 6590 + }, + { + "epoch": 0.6178854171546353, + "grad_norm": 0.546875, + "learning_rate": 7.652429271464755e-05, + "loss": 0.8643, + "step": 6595 + }, + { + "epoch": 0.6183538670539186, + "grad_norm": 0.578125, + "learning_rate": 7.636535499959521e-05, + "loss": 0.8598, + "step": 6600 + }, + { + "epoch": 0.6188223169532019, + "grad_norm": 0.671875, + "learning_rate": 7.620648049573815e-05, + "loss": 0.8576, + "step": 6605 + }, + { + "epoch": 0.6192907668524851, + "grad_norm": 0.6875, + "learning_rate": 7.604766962798842e-05, + "loss": 0.9068, + "step": 6610 + }, + { + "epoch": 0.6197592167517684, + "grad_norm": 0.65625, + "learning_rate": 7.588892282108798e-05, + "loss": 0.8765, + "step": 6615 + }, + { + "epoch": 0.6202276666510517, + "grad_norm": 0.56640625, + "learning_rate": 7.573024049960743e-05, + "loss": 0.8731, + "step": 6620 + }, + { + "epoch": 0.620696116550335, + "grad_norm": 0.55859375, + "learning_rate": 7.557162308794493e-05, + "loss": 0.8686, + "step": 6625 + }, + { + "epoch": 0.6211645664496183, + "grad_norm": 0.59375, + "learning_rate": 7.541307101032497e-05, + "loss": 0.8672, + "step": 6630 + }, + { + "epoch": 0.6216330163489014, + "grad_norm": 0.6875, + "learning_rate": 7.525458469079737e-05, + "loss": 0.9871, + "step": 6635 + }, + { + "epoch": 0.6221014662481847, + "grad_norm": 0.578125, + "learning_rate": 7.509616455323606e-05, + "loss": 0.8472, + "step": 6640 + }, + { + "epoch": 0.622569916147468, + "grad_norm": 0.6640625, + "learning_rate": 7.493781102133796e-05, + "loss": 0.8612, + "step": 6645 + }, + { + "epoch": 0.6230383660467513, + "grad_norm": 0.8671875, + "learning_rate": 7.47795245186218e-05, + "loss": 0.8473, + "step": 6650 + }, + { + "epoch": 0.6235068159460345, + "grad_norm": 0.70703125, + "learning_rate": 7.462130546842711e-05, + "loss": 0.8701, + "step": 6655 + }, + { + "epoch": 0.6239752658453178, + "grad_norm": 0.59765625, + "learning_rate": 7.446315429391304e-05, + "loss": 0.8779, + "step": 6660 + }, + { + "epoch": 0.6244437157446011, + "grad_norm": 0.69140625, + "learning_rate": 7.430507141805714e-05, + "loss": 0.8669, + "step": 6665 + }, + { + "epoch": 0.6249121656438844, + "grad_norm": 0.75390625, + "learning_rate": 7.414705726365427e-05, + "loss": 0.8459, + "step": 6670 + }, + { + "epoch": 0.6253806155431677, + "grad_norm": 0.6328125, + "learning_rate": 7.398911225331561e-05, + "loss": 0.8663, + "step": 6675 + }, + { + "epoch": 0.6258490654424509, + "grad_norm": 0.5546875, + "learning_rate": 7.383123680946729e-05, + "loss": 0.8371, + "step": 6680 + }, + { + "epoch": 0.6263175153417342, + "grad_norm": 0.7109375, + "learning_rate": 7.367343135434952e-05, + "loss": 0.8637, + "step": 6685 + }, + { + "epoch": 0.6267859652410175, + "grad_norm": 0.69140625, + "learning_rate": 7.351569631001512e-05, + "loss": 0.8644, + "step": 6690 + }, + { + "epoch": 0.6272544151403008, + "grad_norm": 0.578125, + "learning_rate": 7.33580320983288e-05, + "loss": 0.8658, + "step": 6695 + }, + { + "epoch": 0.627722865039584, + "grad_norm": 0.53125, + "learning_rate": 7.32004391409657e-05, + "loss": 0.8808, + "step": 6700 + }, + { + "epoch": 0.6281913149388673, + "grad_norm": 0.578125, + "learning_rate": 7.304291785941048e-05, + "loss": 0.8874, + "step": 6705 + }, + { + "epoch": 0.6286597648381506, + "grad_norm": 0.55859375, + "learning_rate": 7.288546867495603e-05, + "loss": 0.8561, + "step": 6710 + }, + { + "epoch": 0.6291282147374339, + "grad_norm": 0.57421875, + "learning_rate": 7.27280920087024e-05, + "loss": 0.877, + "step": 6715 + }, + { + "epoch": 0.6295966646367172, + "grad_norm": 0.5390625, + "learning_rate": 7.25707882815558e-05, + "loss": 0.9023, + "step": 6720 + }, + { + "epoch": 0.6300651145360003, + "grad_norm": 0.57421875, + "learning_rate": 7.241355791422728e-05, + "loss": 0.8688, + "step": 6725 + }, + { + "epoch": 0.6305335644352836, + "grad_norm": 0.640625, + "learning_rate": 7.225640132723165e-05, + "loss": 0.865, + "step": 6730 + }, + { + "epoch": 0.6310020143345669, + "grad_norm": 0.5625, + "learning_rate": 7.209931894088648e-05, + "loss": 0.8624, + "step": 6735 + }, + { + "epoch": 0.6314704642338502, + "grad_norm": 0.57421875, + "learning_rate": 7.194231117531084e-05, + "loss": 0.8862, + "step": 6740 + }, + { + "epoch": 0.6319389141331334, + "grad_norm": 0.58984375, + "learning_rate": 7.178537845042428e-05, + "loss": 0.8646, + "step": 6745 + }, + { + "epoch": 0.6324073640324167, + "grad_norm": 0.625, + "learning_rate": 7.162852118594554e-05, + "loss": 0.8422, + "step": 6750 + }, + { + "epoch": 0.6328758139317, + "grad_norm": 0.59765625, + "learning_rate": 7.147173980139166e-05, + "loss": 0.874, + "step": 6755 + }, + { + "epoch": 0.6333442638309833, + "grad_norm": 0.5859375, + "learning_rate": 7.131503471607668e-05, + "loss": 0.8584, + "step": 6760 + }, + { + "epoch": 0.6338127137302666, + "grad_norm": 0.546875, + "learning_rate": 7.115840634911064e-05, + "loss": 0.8536, + "step": 6765 + }, + { + "epoch": 0.6342811636295498, + "grad_norm": 0.55078125, + "learning_rate": 7.100185511939822e-05, + "loss": 0.8542, + "step": 6770 + }, + { + "epoch": 0.6347496135288331, + "grad_norm": 0.54296875, + "learning_rate": 7.084538144563802e-05, + "loss": 0.8574, + "step": 6775 + }, + { + "epoch": 0.6352180634281164, + "grad_norm": 0.58984375, + "learning_rate": 7.068898574632103e-05, + "loss": 0.8519, + "step": 6780 + }, + { + "epoch": 0.6356865133273997, + "grad_norm": 0.6484375, + "learning_rate": 7.053266843972987e-05, + "loss": 0.8891, + "step": 6785 + }, + { + "epoch": 0.6361549632266829, + "grad_norm": 0.54296875, + "learning_rate": 7.037642994393736e-05, + "loss": 0.8495, + "step": 6790 + }, + { + "epoch": 0.6366234131259662, + "grad_norm": 0.6953125, + "learning_rate": 7.022027067680558e-05, + "loss": 0.8409, + "step": 6795 + }, + { + "epoch": 0.6370918630252494, + "grad_norm": 0.5546875, + "learning_rate": 7.006419105598474e-05, + "loss": 0.8634, + "step": 6800 + }, + { + "epoch": 0.6375603129245327, + "grad_norm": 0.5546875, + "learning_rate": 6.9908191498912e-05, + "loss": 0.8375, + "step": 6805 + }, + { + "epoch": 0.638028762823816, + "grad_norm": 0.61328125, + "learning_rate": 6.975227242281042e-05, + "loss": 0.8665, + "step": 6810 + }, + { + "epoch": 0.6384972127230992, + "grad_norm": 0.546875, + "learning_rate": 6.959643424468774e-05, + "loss": 0.8747, + "step": 6815 + }, + { + "epoch": 0.6389656626223825, + "grad_norm": 0.54296875, + "learning_rate": 6.944067738133544e-05, + "loss": 0.8916, + "step": 6820 + }, + { + "epoch": 0.6394341125216658, + "grad_norm": 0.5703125, + "learning_rate": 6.928500224932746e-05, + "loss": 0.8789, + "step": 6825 + }, + { + "epoch": 0.6399025624209491, + "grad_norm": 0.59375, + "learning_rate": 6.912940926501914e-05, + "loss": 0.8581, + "step": 6830 + }, + { + "epoch": 0.6403710123202323, + "grad_norm": 0.53515625, + "learning_rate": 6.897389884454618e-05, + "loss": 0.9861, + "step": 6835 + }, + { + "epoch": 0.6408394622195156, + "grad_norm": 0.52734375, + "learning_rate": 6.881847140382337e-05, + "loss": 0.8445, + "step": 6840 + }, + { + "epoch": 0.6413079121187989, + "grad_norm": 0.5390625, + "learning_rate": 6.86631273585437e-05, + "loss": 0.8399, + "step": 6845 + }, + { + "epoch": 0.6417763620180822, + "grad_norm": 0.5625, + "learning_rate": 6.850786712417693e-05, + "loss": 0.8553, + "step": 6850 + }, + { + "epoch": 0.6422448119173655, + "grad_norm": 0.6328125, + "learning_rate": 6.835269111596883e-05, + "loss": 0.8622, + "step": 6855 + }, + { + "epoch": 0.6427132618166487, + "grad_norm": 0.546875, + "learning_rate": 6.819759974893983e-05, + "loss": 0.8592, + "step": 6860 + }, + { + "epoch": 0.643181711715932, + "grad_norm": 0.6875, + "learning_rate": 6.804259343788401e-05, + "loss": 0.8471, + "step": 6865 + }, + { + "epoch": 0.6436501616152153, + "grad_norm": 0.5546875, + "learning_rate": 6.788767259736797e-05, + "loss": 0.8955, + "step": 6870 + }, + { + "epoch": 0.6441186115144986, + "grad_norm": 0.53515625, + "learning_rate": 6.773283764172968e-05, + "loss": 0.8356, + "step": 6875 + }, + { + "epoch": 0.6445870614137817, + "grad_norm": 0.578125, + "learning_rate": 6.757808898507747e-05, + "loss": 0.8782, + "step": 6880 + }, + { + "epoch": 0.645055511313065, + "grad_norm": 0.71484375, + "learning_rate": 6.742342704128885e-05, + "loss": 0.8937, + "step": 6885 + }, + { + "epoch": 0.6455239612123483, + "grad_norm": 0.5625, + "learning_rate": 6.726885222400933e-05, + "loss": 0.8786, + "step": 6890 + }, + { + "epoch": 0.6459924111116316, + "grad_norm": 0.75390625, + "learning_rate": 6.71143649466515e-05, + "loss": 0.8774, + "step": 6895 + }, + { + "epoch": 0.6464608610109149, + "grad_norm": 0.6171875, + "learning_rate": 6.695996562239378e-05, + "loss": 0.8905, + "step": 6900 + }, + { + "epoch": 0.6469293109101981, + "grad_norm": 0.7109375, + "learning_rate": 6.680565466417939e-05, + "loss": 0.8554, + "step": 6905 + }, + { + "epoch": 0.6473977608094814, + "grad_norm": 0.73828125, + "learning_rate": 6.665143248471512e-05, + "loss": 0.8392, + "step": 6910 + }, + { + "epoch": 0.6478662107087647, + "grad_norm": 0.8515625, + "learning_rate": 6.649729949647048e-05, + "loss": 0.8728, + "step": 6915 + }, + { + "epoch": 0.648334660608048, + "grad_norm": 1.046875, + "learning_rate": 6.634325611167627e-05, + "loss": 0.8674, + "step": 6920 + }, + { + "epoch": 0.6488031105073312, + "grad_norm": 0.5390625, + "learning_rate": 6.618930274232382e-05, + "loss": 0.8614, + "step": 6925 + }, + { + "epoch": 0.6492715604066145, + "grad_norm": 0.7578125, + "learning_rate": 6.603543980016349e-05, + "loss": 0.863, + "step": 6930 + }, + { + "epoch": 0.6497400103058978, + "grad_norm": 0.60546875, + "learning_rate": 6.588166769670399e-05, + "loss": 0.8457, + "step": 6935 + }, + { + "epoch": 0.6502084602051811, + "grad_norm": 0.578125, + "learning_rate": 6.572798684321095e-05, + "loss": 0.8615, + "step": 6940 + }, + { + "epoch": 0.6506769101044644, + "grad_norm": 0.625, + "learning_rate": 6.557439765070607e-05, + "loss": 0.859, + "step": 6945 + }, + { + "epoch": 0.6511453600037476, + "grad_norm": 0.64453125, + "learning_rate": 6.542090052996582e-05, + "loss": 0.8606, + "step": 6950 + }, + { + "epoch": 0.6516138099030309, + "grad_norm": 0.55078125, + "learning_rate": 6.526749589152039e-05, + "loss": 0.866, + "step": 6955 + }, + { + "epoch": 0.6520822598023142, + "grad_norm": 0.61328125, + "learning_rate": 6.511418414565273e-05, + "loss": 0.8903, + "step": 6960 + }, + { + "epoch": 0.6525507097015975, + "grad_norm": 0.55859375, + "learning_rate": 6.496096570239731e-05, + "loss": 0.8673, + "step": 6965 + }, + { + "epoch": 0.6530191596008806, + "grad_norm": 0.6796875, + "learning_rate": 6.480784097153895e-05, + "loss": 0.8825, + "step": 6970 + }, + { + "epoch": 0.6534876095001639, + "grad_norm": 0.7265625, + "learning_rate": 6.4654810362612e-05, + "loss": 0.85, + "step": 6975 + }, + { + "epoch": 0.6539560593994472, + "grad_norm": 0.65625, + "learning_rate": 6.450187428489898e-05, + "loss": 0.9843, + "step": 6980 + }, + { + "epoch": 0.6544245092987305, + "grad_norm": 0.59765625, + "learning_rate": 6.434903314742964e-05, + "loss": 0.8613, + "step": 6985 + }, + { + "epoch": 0.6548929591980138, + "grad_norm": 0.609375, + "learning_rate": 6.419628735897972e-05, + "loss": 0.8594, + "step": 6990 + }, + { + "epoch": 0.655361409097297, + "grad_norm": 0.578125, + "learning_rate": 6.404363732807009e-05, + "loss": 0.8643, + "step": 6995 + }, + { + "epoch": 0.6558298589965803, + "grad_norm": 0.64453125, + "learning_rate": 6.389108346296537e-05, + "loss": 0.8643, + "step": 7000 + }, + { + "epoch": 0.6562983088958636, + "grad_norm": 0.71875, + "learning_rate": 6.373862617167314e-05, + "loss": 0.8376, + "step": 7005 + }, + { + "epoch": 0.6567667587951469, + "grad_norm": 0.5546875, + "learning_rate": 6.358626586194247e-05, + "loss": 0.8656, + "step": 7010 + }, + { + "epoch": 0.6572352086944301, + "grad_norm": 0.58203125, + "learning_rate": 6.343400294126328e-05, + "loss": 0.8631, + "step": 7015 + }, + { + "epoch": 0.6577036585937134, + "grad_norm": 0.546875, + "learning_rate": 6.328183781686487e-05, + "loss": 0.8717, + "step": 7020 + }, + { + "epoch": 0.6581721084929967, + "grad_norm": 0.578125, + "learning_rate": 6.312977089571507e-05, + "loss": 0.8714, + "step": 7025 + }, + { + "epoch": 0.65864055839228, + "grad_norm": 0.546875, + "learning_rate": 6.297780258451904e-05, + "loss": 0.8613, + "step": 7030 + }, + { + "epoch": 0.6591090082915633, + "grad_norm": 0.5546875, + "learning_rate": 6.282593328971814e-05, + "loss": 0.8567, + "step": 7035 + }, + { + "epoch": 0.6595774581908465, + "grad_norm": 0.546875, + "learning_rate": 6.267416341748902e-05, + "loss": 0.8709, + "step": 7040 + }, + { + "epoch": 0.6600459080901298, + "grad_norm": 0.53515625, + "learning_rate": 6.252249337374238e-05, + "loss": 0.8628, + "step": 7045 + }, + { + "epoch": 0.660514357989413, + "grad_norm": 0.515625, + "learning_rate": 6.237092356412187e-05, + "loss": 0.8699, + "step": 7050 + }, + { + "epoch": 0.6609828078886963, + "grad_norm": 0.5703125, + "learning_rate": 6.221945439400308e-05, + "loss": 0.8392, + "step": 7055 + }, + { + "epoch": 0.6614512577879795, + "grad_norm": 0.546875, + "learning_rate": 6.206808626849254e-05, + "loss": 0.8342, + "step": 7060 + }, + { + "epoch": 0.6619197076872628, + "grad_norm": 0.57421875, + "learning_rate": 6.19168195924264e-05, + "loss": 0.8653, + "step": 7065 + }, + { + "epoch": 0.6623881575865461, + "grad_norm": 0.578125, + "learning_rate": 6.176565477036961e-05, + "loss": 0.881, + "step": 7070 + }, + { + "epoch": 0.6628566074858294, + "grad_norm": 0.7109375, + "learning_rate": 6.161459220661462e-05, + "loss": 0.8736, + "step": 7075 + }, + { + "epoch": 0.6633250573851127, + "grad_norm": 0.5625, + "learning_rate": 6.146363230518036e-05, + "loss": 0.8709, + "step": 7080 + }, + { + "epoch": 0.6637935072843959, + "grad_norm": 0.55859375, + "learning_rate": 6.131277546981136e-05, + "loss": 0.8563, + "step": 7085 + }, + { + "epoch": 0.6642619571836792, + "grad_norm": 0.55078125, + "learning_rate": 6.116202210397621e-05, + "loss": 0.8636, + "step": 7090 + }, + { + "epoch": 0.6647304070829625, + "grad_norm": 0.72265625, + "learning_rate": 6.101137261086707e-05, + "loss": 0.8489, + "step": 7095 + }, + { + "epoch": 0.6651988569822458, + "grad_norm": 0.59375, + "learning_rate": 6.086082739339808e-05, + "loss": 0.8412, + "step": 7100 + }, + { + "epoch": 0.665667306881529, + "grad_norm": 0.5703125, + "learning_rate": 6.071038685420463e-05, + "loss": 0.8515, + "step": 7105 + }, + { + "epoch": 0.6661357567808123, + "grad_norm": 0.53125, + "learning_rate": 6.056005139564206e-05, + "loss": 0.846, + "step": 7110 + }, + { + "epoch": 0.6666042066800956, + "grad_norm": 0.70703125, + "learning_rate": 6.040982141978469e-05, + "loss": 0.878, + "step": 7115 + }, + { + "epoch": 0.6670726565793789, + "grad_norm": 0.58984375, + "learning_rate": 6.025969732842476e-05, + "loss": 0.8628, + "step": 7120 + }, + { + "epoch": 0.6675411064786622, + "grad_norm": 0.54296875, + "learning_rate": 6.0109679523071306e-05, + "loss": 0.8544, + "step": 7125 + }, + { + "epoch": 0.6680095563779453, + "grad_norm": 0.58984375, + "learning_rate": 5.995976840494904e-05, + "loss": 0.8405, + "step": 7130 + }, + { + "epoch": 0.6684780062772286, + "grad_norm": 0.54296875, + "learning_rate": 5.9809964374997396e-05, + "loss": 0.8664, + "step": 7135 + }, + { + "epoch": 0.6689464561765119, + "grad_norm": 0.5546875, + "learning_rate": 5.966026783386941e-05, + "loss": 0.8669, + "step": 7140 + }, + { + "epoch": 0.6694149060757952, + "grad_norm": 0.54296875, + "learning_rate": 5.9510679181930604e-05, + "loss": 0.8692, + "step": 7145 + }, + { + "epoch": 0.6698833559750784, + "grad_norm": 0.64453125, + "learning_rate": 5.936119881925798e-05, + "loss": 0.8288, + "step": 7150 + }, + { + "epoch": 0.6703518058743617, + "grad_norm": 0.52734375, + "learning_rate": 5.921182714563889e-05, + "loss": 0.8463, + "step": 7155 + }, + { + "epoch": 0.670820255773645, + "grad_norm": 0.546875, + "learning_rate": 5.9062564560569976e-05, + "loss": 0.8557, + "step": 7160 + }, + { + "epoch": 0.6712887056729283, + "grad_norm": 0.5390625, + "learning_rate": 5.891341146325623e-05, + "loss": 0.8691, + "step": 7165 + }, + { + "epoch": 0.6717571555722116, + "grad_norm": 0.6015625, + "learning_rate": 5.876436825260967e-05, + "loss": 0.8596, + "step": 7170 + }, + { + "epoch": 0.6722256054714948, + "grad_norm": 0.52734375, + "learning_rate": 5.86154353272485e-05, + "loss": 0.9965, + "step": 7175 + }, + { + "epoch": 0.6726940553707781, + "grad_norm": 0.57421875, + "learning_rate": 5.8466613085495944e-05, + "loss": 0.8696, + "step": 7180 + }, + { + "epoch": 0.6731625052700614, + "grad_norm": 0.6171875, + "learning_rate": 5.831790192537923e-05, + "loss": 0.8722, + "step": 7185 + }, + { + "epoch": 0.6736309551693447, + "grad_norm": 0.59765625, + "learning_rate": 5.816930224462851e-05, + "loss": 0.8875, + "step": 7190 + }, + { + "epoch": 0.6740994050686279, + "grad_norm": 0.640625, + "learning_rate": 5.802081444067568e-05, + "loss": 0.888, + "step": 7195 + }, + { + "epoch": 0.6745678549679112, + "grad_norm": 0.55859375, + "learning_rate": 5.7872438910653527e-05, + "loss": 0.8824, + "step": 7200 + }, + { + "epoch": 0.6750363048671945, + "grad_norm": 0.5390625, + "learning_rate": 5.772417605139455e-05, + "loss": 0.8589, + "step": 7205 + }, + { + "epoch": 0.6755047547664778, + "grad_norm": 0.54296875, + "learning_rate": 5.757602625942986e-05, + "loss": 0.8611, + "step": 7210 + }, + { + "epoch": 0.675973204665761, + "grad_norm": 0.64453125, + "learning_rate": 5.742798993098813e-05, + "loss": 0.8933, + "step": 7215 + }, + { + "epoch": 0.6764416545650442, + "grad_norm": 0.63671875, + "learning_rate": 5.7280067461994704e-05, + "loss": 0.8925, + "step": 7220 + }, + { + "epoch": 0.6769101044643275, + "grad_norm": 0.5234375, + "learning_rate": 5.713225924807034e-05, + "loss": 0.8408, + "step": 7225 + }, + { + "epoch": 0.6773785543636108, + "grad_norm": 0.546875, + "learning_rate": 5.6984565684530165e-05, + "loss": 0.8312, + "step": 7230 + }, + { + "epoch": 0.6778470042628941, + "grad_norm": 0.58203125, + "learning_rate": 5.6836987166382725e-05, + "loss": 0.8491, + "step": 7235 + }, + { + "epoch": 0.6783154541621773, + "grad_norm": 0.55078125, + "learning_rate": 5.668952408832895e-05, + "loss": 0.8495, + "step": 7240 + }, + { + "epoch": 0.6787839040614606, + "grad_norm": 0.671875, + "learning_rate": 5.6542176844760896e-05, + "loss": 0.8628, + "step": 7245 + }, + { + "epoch": 0.6792523539607439, + "grad_norm": 0.5703125, + "learning_rate": 5.6394945829760815e-05, + "loss": 0.8639, + "step": 7250 + }, + { + "epoch": 0.6797208038600272, + "grad_norm": 0.55859375, + "learning_rate": 5.62478314371002e-05, + "loss": 0.9912, + "step": 7255 + }, + { + "epoch": 0.6801892537593105, + "grad_norm": 0.6484375, + "learning_rate": 5.610083406023865e-05, + "loss": 0.8715, + "step": 7260 + }, + { + "epoch": 0.6806577036585937, + "grad_norm": 0.578125, + "learning_rate": 5.595395409232265e-05, + "loss": 0.8925, + "step": 7265 + }, + { + "epoch": 0.681126153557877, + "grad_norm": 0.55078125, + "learning_rate": 5.58071919261848e-05, + "loss": 0.8934, + "step": 7270 + }, + { + "epoch": 0.6815946034571603, + "grad_norm": 0.5390625, + "learning_rate": 5.566054795434268e-05, + "loss": 0.8599, + "step": 7275 + }, + { + "epoch": 0.6820630533564436, + "grad_norm": 0.578125, + "learning_rate": 5.551402256899757e-05, + "loss": 0.8519, + "step": 7280 + }, + { + "epoch": 0.6825315032557268, + "grad_norm": 0.59375, + "learning_rate": 5.536761616203381e-05, + "loss": 0.8797, + "step": 7285 + }, + { + "epoch": 0.68299995315501, + "grad_norm": 0.59375, + "learning_rate": 5.522132912501732e-05, + "loss": 0.8693, + "step": 7290 + }, + { + "epoch": 0.6834684030542933, + "grad_norm": 0.5703125, + "learning_rate": 5.5075161849194966e-05, + "loss": 0.8494, + "step": 7295 + }, + { + "epoch": 0.6839368529535766, + "grad_norm": 0.5625, + "learning_rate": 5.492911472549314e-05, + "loss": 0.838, + "step": 7300 + }, + { + "epoch": 0.6844053028528598, + "grad_norm": 0.55859375, + "learning_rate": 5.4783188144516975e-05, + "loss": 0.8484, + "step": 7305 + }, + { + "epoch": 0.6848737527521431, + "grad_norm": 0.5390625, + "learning_rate": 5.463738249654924e-05, + "loss": 0.8885, + "step": 7310 + }, + { + "epoch": 0.6853422026514264, + "grad_norm": 0.55078125, + "learning_rate": 5.4491698171549156e-05, + "loss": 0.8772, + "step": 7315 + }, + { + "epoch": 0.6858106525507097, + "grad_norm": 0.59765625, + "learning_rate": 5.4346135559151555e-05, + "loss": 0.8939, + "step": 7320 + }, + { + "epoch": 0.686279102449993, + "grad_norm": 0.67578125, + "learning_rate": 5.420069504866576e-05, + "loss": 0.8538, + "step": 7325 + }, + { + "epoch": 0.6867475523492762, + "grad_norm": 0.625, + "learning_rate": 5.405537702907445e-05, + "loss": 0.8417, + "step": 7330 + }, + { + "epoch": 0.6872160022485595, + "grad_norm": 0.51953125, + "learning_rate": 5.391018188903271e-05, + "loss": 0.8472, + "step": 7335 + }, + { + "epoch": 0.6876844521478428, + "grad_norm": 0.55078125, + "learning_rate": 5.376511001686704e-05, + "loss": 0.8479, + "step": 7340 + }, + { + "epoch": 0.6881529020471261, + "grad_norm": 0.5859375, + "learning_rate": 5.362016180057425e-05, + "loss": 0.8684, + "step": 7345 + }, + { + "epoch": 0.6886213519464093, + "grad_norm": 0.5390625, + "learning_rate": 5.347533762782044e-05, + "loss": 0.8443, + "step": 7350 + }, + { + "epoch": 0.6890898018456926, + "grad_norm": 0.5625, + "learning_rate": 5.333063788593986e-05, + "loss": 0.8753, + "step": 7355 + }, + { + "epoch": 0.6895582517449759, + "grad_norm": 0.5625, + "learning_rate": 5.318606296193405e-05, + "loss": 0.8848, + "step": 7360 + }, + { + "epoch": 0.6900267016442592, + "grad_norm": 0.58203125, + "learning_rate": 5.304161324247077e-05, + "loss": 0.8622, + "step": 7365 + }, + { + "epoch": 0.6904951515435425, + "grad_norm": 0.54296875, + "learning_rate": 5.289728911388279e-05, + "loss": 0.8593, + "step": 7370 + }, + { + "epoch": 0.6909636014428256, + "grad_norm": 0.52734375, + "learning_rate": 5.275309096216704e-05, + "loss": 0.8212, + "step": 7375 + }, + { + "epoch": 0.6914320513421089, + "grad_norm": 0.58203125, + "learning_rate": 5.260901917298355e-05, + "loss": 0.8615, + "step": 7380 + }, + { + "epoch": 0.6919005012413922, + "grad_norm": 0.5546875, + "learning_rate": 5.246507413165445e-05, + "loss": 0.9007, + "step": 7385 + }, + { + "epoch": 0.6923689511406755, + "grad_norm": 0.58984375, + "learning_rate": 5.232125622316267e-05, + "loss": 0.8551, + "step": 7390 + }, + { + "epoch": 0.6928374010399587, + "grad_norm": 0.6640625, + "learning_rate": 5.2177565832151355e-05, + "loss": 0.8723, + "step": 7395 + }, + { + "epoch": 0.693305850939242, + "grad_norm": 0.59765625, + "learning_rate": 5.2034003342922514e-05, + "loss": 0.8591, + "step": 7400 + }, + { + "epoch": 0.6937743008385253, + "grad_norm": 0.60546875, + "learning_rate": 5.189056913943604e-05, + "loss": 0.877, + "step": 7405 + }, + { + "epoch": 0.6942427507378086, + "grad_norm": 0.58203125, + "learning_rate": 5.17472636053087e-05, + "loss": 0.8619, + "step": 7410 + }, + { + "epoch": 0.6947112006370919, + "grad_norm": 0.62109375, + "learning_rate": 5.160408712381327e-05, + "loss": 0.8525, + "step": 7415 + }, + { + "epoch": 0.6951796505363751, + "grad_norm": 0.6875, + "learning_rate": 5.146104007787731e-05, + "loss": 0.832, + "step": 7420 + }, + { + "epoch": 0.6956481004356584, + "grad_norm": 0.71875, + "learning_rate": 5.131812285008208e-05, + "loss": 0.8796, + "step": 7425 + }, + { + "epoch": 0.6961165503349417, + "grad_norm": 0.59375, + "learning_rate": 5.117533582266183e-05, + "loss": 0.8522, + "step": 7430 + }, + { + "epoch": 0.696585000234225, + "grad_norm": 0.5703125, + "learning_rate": 5.103267937750251e-05, + "loss": 0.8844, + "step": 7435 + }, + { + "epoch": 0.6970534501335082, + "grad_norm": 0.5546875, + "learning_rate": 5.089015389614076e-05, + "loss": 0.8797, + "step": 7440 + }, + { + "epoch": 0.6975219000327915, + "grad_norm": 0.546875, + "learning_rate": 5.074775975976307e-05, + "loss": 0.8726, + "step": 7445 + }, + { + "epoch": 0.6979903499320748, + "grad_norm": 0.734375, + "learning_rate": 5.0605497349204525e-05, + "loss": 0.8703, + "step": 7450 + }, + { + "epoch": 0.6984587998313581, + "grad_norm": 0.56640625, + "learning_rate": 5.046336704494804e-05, + "loss": 0.8829, + "step": 7455 + }, + { + "epoch": 0.6989272497306414, + "grad_norm": 0.57421875, + "learning_rate": 5.032136922712307e-05, + "loss": 0.8909, + "step": 7460 + }, + { + "epoch": 0.6993956996299245, + "grad_norm": 0.5625, + "learning_rate": 5.017950427550485e-05, + "loss": 0.8634, + "step": 7465 + }, + { + "epoch": 0.6998641495292078, + "grad_norm": 0.640625, + "learning_rate": 5.003777256951321e-05, + "loss": 0.8671, + "step": 7470 + }, + { + "epoch": 0.7003325994284911, + "grad_norm": 0.69140625, + "learning_rate": 4.989617448821166e-05, + "loss": 0.8632, + "step": 7475 + }, + { + "epoch": 0.7008010493277744, + "grad_norm": 0.66796875, + "learning_rate": 4.9754710410306196e-05, + "loss": 0.8988, + "step": 7480 + }, + { + "epoch": 0.7012694992270576, + "grad_norm": 0.578125, + "learning_rate": 4.961338071414462e-05, + "loss": 0.845, + "step": 7485 + }, + { + "epoch": 0.7017379491263409, + "grad_norm": 0.5546875, + "learning_rate": 4.9472185777715155e-05, + "loss": 0.8852, + "step": 7490 + }, + { + "epoch": 0.7022063990256242, + "grad_norm": 0.51953125, + "learning_rate": 4.933112597864564e-05, + "loss": 0.8198, + "step": 7495 + }, + { + "epoch": 0.7026748489249075, + "grad_norm": 0.66015625, + "learning_rate": 4.9190201694202574e-05, + "loss": 0.8643, + "step": 7500 + }, + { + "epoch": 0.7031432988241908, + "grad_norm": 0.58984375, + "learning_rate": 4.904941330128996e-05, + "loss": 0.8516, + "step": 7505 + }, + { + "epoch": 0.703611748723474, + "grad_norm": 0.57421875, + "learning_rate": 4.890876117644839e-05, + "loss": 0.8518, + "step": 7510 + }, + { + "epoch": 0.7040801986227573, + "grad_norm": 0.578125, + "learning_rate": 4.876824569585391e-05, + "loss": 0.8658, + "step": 7515 + }, + { + "epoch": 0.7045486485220406, + "grad_norm": 0.703125, + "learning_rate": 4.862786723531722e-05, + "loss": 0.8338, + "step": 7520 + }, + { + "epoch": 0.7050170984213239, + "grad_norm": 0.6796875, + "learning_rate": 4.848762617028254e-05, + "loss": 0.8643, + "step": 7525 + }, + { + "epoch": 0.7054855483206071, + "grad_norm": 0.5546875, + "learning_rate": 4.8347522875826576e-05, + "loss": 0.8682, + "step": 7530 + }, + { + "epoch": 0.7059539982198904, + "grad_norm": 0.625, + "learning_rate": 4.8207557726657536e-05, + "loss": 0.8499, + "step": 7535 + }, + { + "epoch": 0.7064224481191737, + "grad_norm": 0.5703125, + "learning_rate": 4.8067731097114264e-05, + "loss": 0.86, + "step": 7540 + }, + { + "epoch": 0.706890898018457, + "grad_norm": 0.58203125, + "learning_rate": 4.79280433611651e-05, + "loss": 0.8588, + "step": 7545 + }, + { + "epoch": 0.7073593479177402, + "grad_norm": 0.76953125, + "learning_rate": 4.778849489240681e-05, + "loss": 0.8547, + "step": 7550 + }, + { + "epoch": 0.7078277978170234, + "grad_norm": 0.54296875, + "learning_rate": 4.764908606406382e-05, + "loss": 0.864, + "step": 7555 + }, + { + "epoch": 0.7082962477163067, + "grad_norm": 0.55078125, + "learning_rate": 4.7509817248987046e-05, + "loss": 0.8515, + "step": 7560 + }, + { + "epoch": 0.70876469761559, + "grad_norm": 0.61328125, + "learning_rate": 4.737068881965289e-05, + "loss": 0.9113, + "step": 7565 + }, + { + "epoch": 0.7092331475148733, + "grad_norm": 0.54296875, + "learning_rate": 4.723170114816228e-05, + "loss": 0.8639, + "step": 7570 + }, + { + "epoch": 0.7097015974141565, + "grad_norm": 0.71484375, + "learning_rate": 4.7092854606239775e-05, + "loss": 0.8589, + "step": 7575 + }, + { + "epoch": 0.7101700473134398, + "grad_norm": 0.61328125, + "learning_rate": 4.6954149565232464e-05, + "loss": 0.823, + "step": 7580 + }, + { + "epoch": 0.7106384972127231, + "grad_norm": 0.6484375, + "learning_rate": 4.681558639610888e-05, + "loss": 0.8547, + "step": 7585 + }, + { + "epoch": 0.7111069471120064, + "grad_norm": 0.640625, + "learning_rate": 4.667716546945824e-05, + "loss": 0.8447, + "step": 7590 + }, + { + "epoch": 0.7115753970112897, + "grad_norm": 0.58984375, + "learning_rate": 4.653888715548932e-05, + "loss": 0.8617, + "step": 7595 + }, + { + "epoch": 0.7120438469105729, + "grad_norm": 0.6640625, + "learning_rate": 4.6400751824029366e-05, + "loss": 0.8794, + "step": 7600 + }, + { + "epoch": 0.7125122968098562, + "grad_norm": 0.5625, + "learning_rate": 4.626275984452337e-05, + "loss": 0.8515, + "step": 7605 + }, + { + "epoch": 0.7129807467091395, + "grad_norm": 0.5703125, + "learning_rate": 4.612491158603278e-05, + "loss": 0.8772, + "step": 7610 + }, + { + "epoch": 0.7134491966084228, + "grad_norm": 0.5703125, + "learning_rate": 4.59872074172348e-05, + "loss": 0.8956, + "step": 7615 + }, + { + "epoch": 0.713917646507706, + "grad_norm": 0.53125, + "learning_rate": 4.584964770642112e-05, + "loss": 0.8616, + "step": 7620 + }, + { + "epoch": 0.7143860964069892, + "grad_norm": 0.6015625, + "learning_rate": 4.571223282149718e-05, + "loss": 0.8765, + "step": 7625 + }, + { + "epoch": 0.7148545463062725, + "grad_norm": 0.5390625, + "learning_rate": 4.5574963129981063e-05, + "loss": 0.8697, + "step": 7630 + }, + { + "epoch": 0.7153229962055558, + "grad_norm": 0.6015625, + "learning_rate": 4.543783899900255e-05, + "loss": 0.8554, + "step": 7635 + }, + { + "epoch": 0.7157914461048391, + "grad_norm": 0.5390625, + "learning_rate": 4.5300860795302005e-05, + "loss": 0.8471, + "step": 7640 + }, + { + "epoch": 0.7162598960041223, + "grad_norm": 0.58203125, + "learning_rate": 4.516402888522966e-05, + "loss": 0.8595, + "step": 7645 + }, + { + "epoch": 0.7167283459034056, + "grad_norm": 0.65625, + "learning_rate": 4.502734363474436e-05, + "loss": 0.8475, + "step": 7650 + }, + { + "epoch": 0.7171967958026889, + "grad_norm": 0.6015625, + "learning_rate": 4.489080540941279e-05, + "loss": 0.8655, + "step": 7655 + }, + { + "epoch": 0.7176652457019722, + "grad_norm": 0.56640625, + "learning_rate": 4.475441457440836e-05, + "loss": 0.8455, + "step": 7660 + }, + { + "epoch": 0.7181336956012554, + "grad_norm": 0.671875, + "learning_rate": 4.461817149451028e-05, + "loss": 0.8461, + "step": 7665 + }, + { + "epoch": 0.7186021455005387, + "grad_norm": 0.64453125, + "learning_rate": 4.448207653410271e-05, + "loss": 0.8912, + "step": 7670 + }, + { + "epoch": 0.719070595399822, + "grad_norm": 0.7578125, + "learning_rate": 4.434613005717345e-05, + "loss": 0.8614, + "step": 7675 + }, + { + "epoch": 0.7195390452991053, + "grad_norm": 0.6484375, + "learning_rate": 4.4210332427313336e-05, + "loss": 0.8799, + "step": 7680 + }, + { + "epoch": 0.7200074951983886, + "grad_norm": 0.8359375, + "learning_rate": 4.4074684007715104e-05, + "loss": 0.8748, + "step": 7685 + }, + { + "epoch": 0.7204759450976718, + "grad_norm": 0.5390625, + "learning_rate": 4.393918516117236e-05, + "loss": 0.8428, + "step": 7690 + }, + { + "epoch": 0.7209443949969551, + "grad_norm": 0.6484375, + "learning_rate": 4.380383625007866e-05, + "loss": 0.8637, + "step": 7695 + }, + { + "epoch": 0.7214128448962384, + "grad_norm": 0.66015625, + "learning_rate": 4.3668637636426626e-05, + "loss": 0.8726, + "step": 7700 + }, + { + "epoch": 0.7218812947955217, + "grad_norm": 0.58984375, + "learning_rate": 4.353358968180692e-05, + "loss": 0.8501, + "step": 7705 + }, + { + "epoch": 0.7223497446948048, + "grad_norm": 0.6328125, + "learning_rate": 4.3398692747407154e-05, + "loss": 0.8696, + "step": 7710 + }, + { + "epoch": 0.7228181945940881, + "grad_norm": 0.53515625, + "learning_rate": 4.326394719401115e-05, + "loss": 0.8518, + "step": 7715 + }, + { + "epoch": 0.7232866444933714, + "grad_norm": 0.58984375, + "learning_rate": 4.312935338199783e-05, + "loss": 0.8656, + "step": 7720 + }, + { + "epoch": 0.7237550943926547, + "grad_norm": 0.55859375, + "learning_rate": 4.299491167134023e-05, + "loss": 0.8786, + "step": 7725 + }, + { + "epoch": 0.724223544291938, + "grad_norm": 0.62890625, + "learning_rate": 4.28606224216046e-05, + "loss": 0.8635, + "step": 7730 + }, + { + "epoch": 0.7246919941912212, + "grad_norm": 0.58203125, + "learning_rate": 4.272648599194948e-05, + "loss": 0.8453, + "step": 7735 + }, + { + "epoch": 0.7251604440905045, + "grad_norm": 0.54296875, + "learning_rate": 4.2592502741124706e-05, + "loss": 0.8648, + "step": 7740 + }, + { + "epoch": 0.7256288939897878, + "grad_norm": 0.53515625, + "learning_rate": 4.245867302747032e-05, + "loss": 0.8661, + "step": 7745 + }, + { + "epoch": 0.7260973438890711, + "grad_norm": 0.5546875, + "learning_rate": 4.232499720891584e-05, + "loss": 0.8901, + "step": 7750 + }, + { + "epoch": 0.7265657937883543, + "grad_norm": 0.62109375, + "learning_rate": 4.219147564297918e-05, + "loss": 0.8346, + "step": 7755 + }, + { + "epoch": 0.7270342436876376, + "grad_norm": 0.71484375, + "learning_rate": 4.20581086867656e-05, + "loss": 0.8532, + "step": 7760 + }, + { + "epoch": 0.7275026935869209, + "grad_norm": 0.921875, + "learning_rate": 4.192489669696701e-05, + "loss": 0.8474, + "step": 7765 + }, + { + "epoch": 0.7279711434862042, + "grad_norm": 0.62109375, + "learning_rate": 4.179184002986069e-05, + "loss": 0.8478, + "step": 7770 + }, + { + "epoch": 0.7284395933854875, + "grad_norm": 0.8203125, + "learning_rate": 4.1658939041308674e-05, + "loss": 0.8956, + "step": 7775 + }, + { + "epoch": 0.7289080432847707, + "grad_norm": 0.61328125, + "learning_rate": 4.1526194086756475e-05, + "loss": 0.8706, + "step": 7780 + }, + { + "epoch": 0.729376493184054, + "grad_norm": 0.63671875, + "learning_rate": 4.1393605521232414e-05, + "loss": 0.8881, + "step": 7785 + }, + { + "epoch": 0.7298449430833372, + "grad_norm": 0.578125, + "learning_rate": 4.12611736993465e-05, + "loss": 0.8509, + "step": 7790 + }, + { + "epoch": 0.7303133929826205, + "grad_norm": 0.671875, + "learning_rate": 4.112889897528955e-05, + "loss": 0.8266, + "step": 7795 + }, + { + "epoch": 0.7307818428819037, + "grad_norm": 0.67578125, + "learning_rate": 4.0996781702832156e-05, + "loss": 0.8849, + "step": 7800 + }, + { + "epoch": 0.731250292781187, + "grad_norm": 0.5390625, + "learning_rate": 4.0864822235323895e-05, + "loss": 0.859, + "step": 7805 + }, + { + "epoch": 0.7317187426804703, + "grad_norm": 0.59375, + "learning_rate": 4.073302092569219e-05, + "loss": 0.8548, + "step": 7810 + }, + { + "epoch": 0.7321871925797536, + "grad_norm": 0.5703125, + "learning_rate": 4.060137812644158e-05, + "loss": 0.8576, + "step": 7815 + }, + { + "epoch": 0.7326556424790369, + "grad_norm": 0.671875, + "learning_rate": 4.046989418965255e-05, + "loss": 0.8251, + "step": 7820 + }, + { + "epoch": 0.7331240923783201, + "grad_norm": 0.5703125, + "learning_rate": 4.033856946698079e-05, + "loss": 0.8535, + "step": 7825 + }, + { + "epoch": 0.7335925422776034, + "grad_norm": 0.61328125, + "learning_rate": 4.020740430965619e-05, + "loss": 0.8719, + "step": 7830 + }, + { + "epoch": 0.7340609921768867, + "grad_norm": 0.53125, + "learning_rate": 4.007639906848177e-05, + "loss": 1.0032, + "step": 7835 + }, + { + "epoch": 0.73452944207617, + "grad_norm": 0.5390625, + "learning_rate": 3.9945554093832934e-05, + "loss": 0.8566, + "step": 7840 + }, + { + "epoch": 0.7349978919754532, + "grad_norm": 0.609375, + "learning_rate": 3.981486973565648e-05, + "loss": 0.8843, + "step": 7845 + }, + { + "epoch": 0.7354663418747365, + "grad_norm": 0.70703125, + "learning_rate": 3.968434634346956e-05, + "loss": 0.887, + "step": 7850 + }, + { + "epoch": 0.7359347917740198, + "grad_norm": 0.6171875, + "learning_rate": 3.955398426635879e-05, + "loss": 0.8646, + "step": 7855 + }, + { + "epoch": 0.7364032416733031, + "grad_norm": 0.60546875, + "learning_rate": 3.942378385297946e-05, + "loss": 0.8363, + "step": 7860 + }, + { + "epoch": 0.7368716915725864, + "grad_norm": 0.6015625, + "learning_rate": 3.929374545155445e-05, + "loss": 0.8445, + "step": 7865 + }, + { + "epoch": 0.7373401414718695, + "grad_norm": 0.55859375, + "learning_rate": 3.9163869409873275e-05, + "loss": 0.8709, + "step": 7870 + }, + { + "epoch": 0.7378085913711528, + "grad_norm": 0.57421875, + "learning_rate": 3.9034156075291274e-05, + "loss": 0.8639, + "step": 7875 + }, + { + "epoch": 0.7382770412704361, + "grad_norm": 0.609375, + "learning_rate": 3.890460579472866e-05, + "loss": 0.8563, + "step": 7880 + }, + { + "epoch": 0.7387454911697194, + "grad_norm": 0.56640625, + "learning_rate": 3.877521891466947e-05, + "loss": 0.8573, + "step": 7885 + }, + { + "epoch": 0.7392139410690026, + "grad_norm": 0.58984375, + "learning_rate": 3.8645995781160715e-05, + "loss": 0.8599, + "step": 7890 + }, + { + "epoch": 0.7396823909682859, + "grad_norm": 0.62109375, + "learning_rate": 3.851693673981156e-05, + "loss": 0.8644, + "step": 7895 + }, + { + "epoch": 0.7401508408675692, + "grad_norm": 0.62890625, + "learning_rate": 3.838804213579227e-05, + "loss": 0.8637, + "step": 7900 + }, + { + "epoch": 0.7406192907668525, + "grad_norm": 0.578125, + "learning_rate": 3.825931231383324e-05, + "loss": 0.8653, + "step": 7905 + }, + { + "epoch": 0.7410877406661358, + "grad_norm": 0.5859375, + "learning_rate": 3.813074761822426e-05, + "loss": 0.8737, + "step": 7910 + }, + { + "epoch": 0.741556190565419, + "grad_norm": 0.5625, + "learning_rate": 3.8002348392813425e-05, + "loss": 0.8605, + "step": 7915 + }, + { + "epoch": 0.7420246404647023, + "grad_norm": 0.5703125, + "learning_rate": 3.787411498100635e-05, + "loss": 0.8502, + "step": 7920 + }, + { + "epoch": 0.7424930903639856, + "grad_norm": 0.55859375, + "learning_rate": 3.7746047725765066e-05, + "loss": 0.8986, + "step": 7925 + }, + { + "epoch": 0.7429615402632689, + "grad_norm": 0.57421875, + "learning_rate": 3.761814696960726e-05, + "loss": 0.8629, + "step": 7930 + }, + { + "epoch": 0.7434299901625521, + "grad_norm": 0.546875, + "learning_rate": 3.749041305460539e-05, + "loss": 0.8515, + "step": 7935 + }, + { + "epoch": 0.7438984400618354, + "grad_norm": 0.62109375, + "learning_rate": 3.736284632238556e-05, + "loss": 0.8472, + "step": 7940 + }, + { + "epoch": 0.7443668899611187, + "grad_norm": 0.54296875, + "learning_rate": 3.7235447114126856e-05, + "loss": 0.834, + "step": 7945 + }, + { + "epoch": 0.744835339860402, + "grad_norm": 0.6640625, + "learning_rate": 3.7108215770560284e-05, + "loss": 0.876, + "step": 7950 + }, + { + "epoch": 0.7453037897596853, + "grad_norm": 0.54296875, + "learning_rate": 3.698115263196791e-05, + "loss": 0.8666, + "step": 7955 + }, + { + "epoch": 0.7457722396589684, + "grad_norm": 0.5703125, + "learning_rate": 3.685425803818184e-05, + "loss": 0.8773, + "step": 7960 + }, + { + "epoch": 0.7462406895582517, + "grad_norm": 0.52734375, + "learning_rate": 3.672753232858356e-05, + "loss": 0.85, + "step": 7965 + }, + { + "epoch": 0.746709139457535, + "grad_norm": 0.62109375, + "learning_rate": 3.66009758421027e-05, + "loss": 0.8638, + "step": 7970 + }, + { + "epoch": 0.7471775893568183, + "grad_norm": 0.55078125, + "learning_rate": 3.6474588917216476e-05, + "loss": 0.836, + "step": 7975 + }, + { + "epoch": 0.7476460392561015, + "grad_norm": 0.59765625, + "learning_rate": 3.6348371891948455e-05, + "loss": 0.8471, + "step": 7980 + }, + { + "epoch": 0.7481144891553848, + "grad_norm": 0.546875, + "learning_rate": 3.6222325103867885e-05, + "loss": 0.8684, + "step": 7985 + }, + { + "epoch": 0.7485829390546681, + "grad_norm": 0.578125, + "learning_rate": 3.6096448890088743e-05, + "loss": 0.8594, + "step": 7990 + }, + { + "epoch": 0.7490513889539514, + "grad_norm": 0.609375, + "learning_rate": 3.597074358726869e-05, + "loss": 0.8713, + "step": 7995 + }, + { + "epoch": 0.7495198388532347, + "grad_norm": 0.5703125, + "learning_rate": 3.584520953160839e-05, + "loss": 0.8725, + "step": 8000 + }, + { + "epoch": 0.7499882887525179, + "grad_norm": 0.5703125, + "learning_rate": 3.57198470588505e-05, + "loss": 0.8481, + "step": 8005 + }, + { + "epoch": 0.7504567386518012, + "grad_norm": 0.5234375, + "learning_rate": 3.55946565042787e-05, + "loss": 0.85, + "step": 8010 + }, + { + "epoch": 0.7509251885510845, + "grad_norm": 0.625, + "learning_rate": 3.546963820271689e-05, + "loss": 0.8442, + "step": 8015 + }, + { + "epoch": 0.7513936384503678, + "grad_norm": 0.5625, + "learning_rate": 3.534479248852833e-05, + "loss": 0.8686, + "step": 8020 + }, + { + "epoch": 0.751862088349651, + "grad_norm": 0.5625, + "learning_rate": 3.522011969561469e-05, + "loss": 0.8741, + "step": 8025 + }, + { + "epoch": 0.7523305382489343, + "grad_norm": 0.59765625, + "learning_rate": 3.509562015741509e-05, + "loss": 0.876, + "step": 8030 + }, + { + "epoch": 0.7527989881482176, + "grad_norm": 0.5390625, + "learning_rate": 3.4971294206905326e-05, + "loss": 0.838, + "step": 8035 + }, + { + "epoch": 0.7532674380475008, + "grad_norm": 0.73046875, + "learning_rate": 3.484714217659698e-05, + "loss": 0.8595, + "step": 8040 + }, + { + "epoch": 0.7537358879467841, + "grad_norm": 0.55078125, + "learning_rate": 3.4723164398536376e-05, + "loss": 0.8687, + "step": 8045 + }, + { + "epoch": 0.7542043378460673, + "grad_norm": 0.578125, + "learning_rate": 3.459936120430384e-05, + "loss": 0.8403, + "step": 8050 + }, + { + "epoch": 0.7546727877453506, + "grad_norm": 0.52734375, + "learning_rate": 3.447573292501279e-05, + "loss": 0.8502, + "step": 8055 + }, + { + "epoch": 0.7551412376446339, + "grad_norm": 0.58203125, + "learning_rate": 3.435227989130888e-05, + "loss": 0.8602, + "step": 8060 + }, + { + "epoch": 0.7556096875439172, + "grad_norm": 0.546875, + "learning_rate": 3.422900243336893e-05, + "loss": 0.91, + "step": 8065 + }, + { + "epoch": 0.7560781374432004, + "grad_norm": 0.6171875, + "learning_rate": 3.410590088090029e-05, + "loss": 0.8619, + "step": 8070 + }, + { + "epoch": 0.7565465873424837, + "grad_norm": 0.53125, + "learning_rate": 3.398297556313983e-05, + "loss": 0.8591, + "step": 8075 + }, + { + "epoch": 0.757015037241767, + "grad_norm": 0.60546875, + "learning_rate": 3.386022680885309e-05, + "loss": 0.873, + "step": 8080 + }, + { + "epoch": 0.7574834871410503, + "grad_norm": 0.5703125, + "learning_rate": 3.373765494633333e-05, + "loss": 0.8759, + "step": 8085 + }, + { + "epoch": 0.7579519370403336, + "grad_norm": 0.59375, + "learning_rate": 3.361526030340072e-05, + "loss": 0.8043, + "step": 8090 + }, + { + "epoch": 0.7584203869396168, + "grad_norm": 0.5859375, + "learning_rate": 3.3493043207401496e-05, + "loss": 0.8367, + "step": 8095 + }, + { + "epoch": 0.7588888368389001, + "grad_norm": 0.5703125, + "learning_rate": 3.3371003985207085e-05, + "loss": 0.8603, + "step": 8100 + }, + { + "epoch": 0.7593572867381834, + "grad_norm": 0.60546875, + "learning_rate": 3.3249142963213045e-05, + "loss": 0.8679, + "step": 8105 + }, + { + "epoch": 0.7598257366374667, + "grad_norm": 0.61328125, + "learning_rate": 3.312746046733844e-05, + "loss": 0.8993, + "step": 8110 + }, + { + "epoch": 0.7602941865367498, + "grad_norm": 0.56640625, + "learning_rate": 3.30059568230249e-05, + "loss": 0.8778, + "step": 8115 + }, + { + "epoch": 0.7607626364360331, + "grad_norm": 0.71875, + "learning_rate": 3.288463235523557e-05, + "loss": 0.8611, + "step": 8120 + }, + { + "epoch": 0.7612310863353164, + "grad_norm": 0.71875, + "learning_rate": 3.276348738845454e-05, + "loss": 0.8965, + "step": 8125 + }, + { + "epoch": 0.7616995362345997, + "grad_norm": 0.54296875, + "learning_rate": 3.264252224668569e-05, + "loss": 0.8374, + "step": 8130 + }, + { + "epoch": 0.762167986133883, + "grad_norm": 0.53125, + "learning_rate": 3.252173725345208e-05, + "loss": 0.8614, + "step": 8135 + }, + { + "epoch": 0.7626364360331662, + "grad_norm": 0.546875, + "learning_rate": 3.2401132731794824e-05, + "loss": 0.8493, + "step": 8140 + }, + { + "epoch": 0.7631048859324495, + "grad_norm": 0.58984375, + "learning_rate": 3.228070900427249e-05, + "loss": 0.8571, + "step": 8145 + }, + { + "epoch": 0.7635733358317328, + "grad_norm": 0.640625, + "learning_rate": 3.216046639296008e-05, + "loss": 0.8627, + "step": 8150 + }, + { + "epoch": 0.7640417857310161, + "grad_norm": 0.5859375, + "learning_rate": 3.204040521944809e-05, + "loss": 0.8788, + "step": 8155 + }, + { + "epoch": 0.7645102356302993, + "grad_norm": 0.5625, + "learning_rate": 3.1920525804841886e-05, + "loss": 0.8318, + "step": 8160 + }, + { + "epoch": 0.7649786855295826, + "grad_norm": 0.55078125, + "learning_rate": 3.180082846976071e-05, + "loss": 0.8791, + "step": 8165 + }, + { + "epoch": 0.7654471354288659, + "grad_norm": 0.5703125, + "learning_rate": 3.168131353433676e-05, + "loss": 0.8471, + "step": 8170 + }, + { + "epoch": 0.7659155853281492, + "grad_norm": 0.578125, + "learning_rate": 3.1561981318214396e-05, + "loss": 0.8715, + "step": 8175 + }, + { + "epoch": 0.7663840352274325, + "grad_norm": 0.54296875, + "learning_rate": 3.1442832140549374e-05, + "loss": 0.8531, + "step": 8180 + }, + { + "epoch": 0.7668524851267157, + "grad_norm": 0.5625, + "learning_rate": 3.132386632000789e-05, + "loss": 0.8577, + "step": 8185 + }, + { + "epoch": 0.767320935025999, + "grad_norm": 0.578125, + "learning_rate": 3.120508417476567e-05, + "loss": 0.8718, + "step": 8190 + }, + { + "epoch": 0.7677893849252823, + "grad_norm": 0.5625, + "learning_rate": 3.10864860225073e-05, + "loss": 0.8602, + "step": 8195 + }, + { + "epoch": 0.7682578348245656, + "grad_norm": 0.55078125, + "learning_rate": 3.0968072180425244e-05, + "loss": 0.8472, + "step": 8200 + }, + { + "epoch": 0.7687262847238487, + "grad_norm": 0.62890625, + "learning_rate": 3.084984296521898e-05, + "loss": 0.859, + "step": 8205 + }, + { + "epoch": 0.769194734623132, + "grad_norm": 0.54296875, + "learning_rate": 3.0731798693094213e-05, + "loss": 0.8413, + "step": 8210 + }, + { + "epoch": 0.7696631845224153, + "grad_norm": 0.55859375, + "learning_rate": 3.061393967976205e-05, + "loss": 0.8662, + "step": 8215 + }, + { + "epoch": 0.7701316344216986, + "grad_norm": 0.64453125, + "learning_rate": 3.0496266240438154e-05, + "loss": 0.8721, + "step": 8220 + }, + { + "epoch": 0.7706000843209819, + "grad_norm": 0.578125, + "learning_rate": 3.0378778689841736e-05, + "loss": 0.8383, + "step": 8225 + }, + { + "epoch": 0.7710685342202651, + "grad_norm": 0.58984375, + "learning_rate": 3.0261477342194967e-05, + "loss": 0.8414, + "step": 8230 + }, + { + "epoch": 0.7715369841195484, + "grad_norm": 0.5703125, + "learning_rate": 3.014436251122197e-05, + "loss": 0.8352, + "step": 8235 + }, + { + "epoch": 0.7720054340188317, + "grad_norm": 0.54296875, + "learning_rate": 3.0027434510148055e-05, + "loss": 0.8286, + "step": 8240 + }, + { + "epoch": 0.772473883918115, + "grad_norm": 0.5703125, + "learning_rate": 2.9910693651698827e-05, + "loss": 0.8791, + "step": 8245 + }, + { + "epoch": 0.7729423338173982, + "grad_norm": 0.57421875, + "learning_rate": 2.979414024809931e-05, + "loss": 0.8561, + "step": 8250 + }, + { + "epoch": 0.7734107837166815, + "grad_norm": 0.57421875, + "learning_rate": 2.9677774611073307e-05, + "loss": 0.8361, + "step": 8255 + }, + { + "epoch": 0.7738792336159648, + "grad_norm": 0.54296875, + "learning_rate": 2.9561597051842384e-05, + "loss": 0.8456, + "step": 8260 + }, + { + "epoch": 0.7743476835152481, + "grad_norm": 0.58984375, + "learning_rate": 2.9445607881125024e-05, + "loss": 0.8525, + "step": 8265 + }, + { + "epoch": 0.7748161334145313, + "grad_norm": 0.6015625, + "learning_rate": 2.9329807409135945e-05, + "loss": 0.8702, + "step": 8270 + }, + { + "epoch": 0.7752845833138146, + "grad_norm": 0.58203125, + "learning_rate": 2.921419594558521e-05, + "loss": 0.8843, + "step": 8275 + }, + { + "epoch": 0.7757530332130979, + "grad_norm": 0.5546875, + "learning_rate": 2.909877379967725e-05, + "loss": 0.8471, + "step": 8280 + }, + { + "epoch": 0.7762214831123811, + "grad_norm": 0.578125, + "learning_rate": 2.8983541280110306e-05, + "loss": 0.8709, + "step": 8285 + }, + { + "epoch": 0.7766899330116644, + "grad_norm": 0.53515625, + "learning_rate": 2.8868498695075343e-05, + "loss": 0.8505, + "step": 8290 + }, + { + "epoch": 0.7771583829109476, + "grad_norm": 0.546875, + "learning_rate": 2.8753646352255437e-05, + "loss": 0.836, + "step": 8295 + }, + { + "epoch": 0.7776268328102309, + "grad_norm": 0.58203125, + "learning_rate": 2.8638984558824777e-05, + "loss": 0.8435, + "step": 8300 + }, + { + "epoch": 0.7780952827095142, + "grad_norm": 0.56640625, + "learning_rate": 2.8524513621447992e-05, + "loss": 0.8428, + "step": 8305 + }, + { + "epoch": 0.7785637326087975, + "grad_norm": 0.5546875, + "learning_rate": 2.8410233846279255e-05, + "loss": 0.8552, + "step": 8310 + }, + { + "epoch": 0.7790321825080807, + "grad_norm": 0.58984375, + "learning_rate": 2.8296145538961404e-05, + "loss": 0.8468, + "step": 8315 + }, + { + "epoch": 0.779500632407364, + "grad_norm": 0.546875, + "learning_rate": 2.818224900462527e-05, + "loss": 0.8536, + "step": 8320 + }, + { + "epoch": 0.7799690823066473, + "grad_norm": 0.58203125, + "learning_rate": 2.80685445478888e-05, + "loss": 0.8225, + "step": 8325 + }, + { + "epoch": 0.7804375322059306, + "grad_norm": 0.6484375, + "learning_rate": 2.7955032472856146e-05, + "loss": 0.8995, + "step": 8330 + }, + { + "epoch": 0.7809059821052139, + "grad_norm": 0.55859375, + "learning_rate": 2.7841713083116938e-05, + "loss": 1.0004, + "step": 8335 + }, + { + "epoch": 0.7813744320044971, + "grad_norm": 0.5859375, + "learning_rate": 2.772858668174555e-05, + "loss": 0.8579, + "step": 8340 + }, + { + "epoch": 0.7818428819037804, + "grad_norm": 0.546875, + "learning_rate": 2.761565357130016e-05, + "loss": 0.89, + "step": 8345 + }, + { + "epoch": 0.7823113318030637, + "grad_norm": 0.5703125, + "learning_rate": 2.7502914053821936e-05, + "loss": 0.8625, + "step": 8350 + }, + { + "epoch": 0.782779781702347, + "grad_norm": 0.5625, + "learning_rate": 2.7390368430834368e-05, + "loss": 0.883, + "step": 8355 + }, + { + "epoch": 0.7832482316016302, + "grad_norm": 0.5390625, + "learning_rate": 2.7278017003342306e-05, + "loss": 0.8934, + "step": 8360 + }, + { + "epoch": 0.7837166815009134, + "grad_norm": 0.56640625, + "learning_rate": 2.716586007183133e-05, + "loss": 0.8615, + "step": 8365 + }, + { + "epoch": 0.7841851314001967, + "grad_norm": 0.546875, + "learning_rate": 2.7053897936266615e-05, + "loss": 0.8772, + "step": 8370 + }, + { + "epoch": 0.78465358129948, + "grad_norm": 0.6328125, + "learning_rate": 2.6942130896092553e-05, + "loss": 0.8452, + "step": 8375 + }, + { + "epoch": 0.7851220311987633, + "grad_norm": 0.609375, + "learning_rate": 2.6830559250231725e-05, + "loss": 0.8703, + "step": 8380 + }, + { + "epoch": 0.7855904810980465, + "grad_norm": 0.55078125, + "learning_rate": 2.6719183297084026e-05, + "loss": 0.8448, + "step": 8385 + }, + { + "epoch": 0.7860589309973298, + "grad_norm": 0.55859375, + "learning_rate": 2.6608003334526055e-05, + "loss": 0.8319, + "step": 8390 + }, + { + "epoch": 0.7865273808966131, + "grad_norm": 0.8203125, + "learning_rate": 2.6497019659910216e-05, + "loss": 0.87, + "step": 8395 + }, + { + "epoch": 0.7869958307958964, + "grad_norm": 0.73828125, + "learning_rate": 2.6386232570063964e-05, + "loss": 0.8662, + "step": 8400 + }, + { + "epoch": 0.7874642806951796, + "grad_norm": 0.66015625, + "learning_rate": 2.627564236128891e-05, + "loss": 0.8614, + "step": 8405 + }, + { + "epoch": 0.7879327305944629, + "grad_norm": 0.56640625, + "learning_rate": 2.616524932936012e-05, + "loss": 0.8524, + "step": 8410 + }, + { + "epoch": 0.7884011804937462, + "grad_norm": 0.59765625, + "learning_rate": 2.6055053769525372e-05, + "loss": 0.837, + "step": 8415 + }, + { + "epoch": 0.7888696303930295, + "grad_norm": 0.58984375, + "learning_rate": 2.5945055976504275e-05, + "loss": 0.894, + "step": 8420 + }, + { + "epoch": 0.7893380802923128, + "grad_norm": 0.57421875, + "learning_rate": 2.583525624448745e-05, + "loss": 0.8656, + "step": 8425 + }, + { + "epoch": 0.789806530191596, + "grad_norm": 0.58984375, + "learning_rate": 2.5725654867135872e-05, + "loss": 0.8296, + "step": 8430 + }, + { + "epoch": 0.7902749800908793, + "grad_norm": 0.59375, + "learning_rate": 2.561625213758001e-05, + "loss": 0.8475, + "step": 8435 + }, + { + "epoch": 0.7907434299901626, + "grad_norm": 0.54296875, + "learning_rate": 2.5507048348418983e-05, + "loss": 0.8399, + "step": 8440 + }, + { + "epoch": 0.7912118798894459, + "grad_norm": 0.546875, + "learning_rate": 2.5398043791719928e-05, + "loss": 0.8458, + "step": 8445 + }, + { + "epoch": 0.791680329788729, + "grad_norm": 0.6484375, + "learning_rate": 2.528923875901703e-05, + "loss": 0.8844, + "step": 8450 + }, + { + "epoch": 0.7921487796880123, + "grad_norm": 0.55859375, + "learning_rate": 2.518063354131097e-05, + "loss": 0.8779, + "step": 8455 + }, + { + "epoch": 0.7926172295872956, + "grad_norm": 0.55078125, + "learning_rate": 2.507222842906788e-05, + "loss": 0.8833, + "step": 8460 + }, + { + "epoch": 0.7930856794865789, + "grad_norm": 0.64453125, + "learning_rate": 2.496402371221882e-05, + "loss": 0.8489, + "step": 8465 + }, + { + "epoch": 0.7935541293858622, + "grad_norm": 0.56640625, + "learning_rate": 2.485601968015887e-05, + "loss": 0.8629, + "step": 8470 + }, + { + "epoch": 0.7940225792851454, + "grad_norm": 0.5546875, + "learning_rate": 2.4748216621746302e-05, + "loss": 0.8429, + "step": 8475 + }, + { + "epoch": 0.7944910291844287, + "grad_norm": 0.5703125, + "learning_rate": 2.4640614825301955e-05, + "loss": 0.8517, + "step": 8480 + }, + { + "epoch": 0.794959479083712, + "grad_norm": 0.58984375, + "learning_rate": 2.453321457860841e-05, + "loss": 0.8591, + "step": 8485 + }, + { + "epoch": 0.7954279289829953, + "grad_norm": 0.5546875, + "learning_rate": 2.4426016168909117e-05, + "loss": 0.8764, + "step": 8490 + }, + { + "epoch": 0.7958963788822785, + "grad_norm": 0.55078125, + "learning_rate": 2.4319019882907734e-05, + "loss": 0.8394, + "step": 8495 + }, + { + "epoch": 0.7963648287815618, + "grad_norm": 0.54296875, + "learning_rate": 2.421222600676737e-05, + "loss": 0.8574, + "step": 8500 + }, + { + "epoch": 0.7968332786808451, + "grad_norm": 0.5859375, + "learning_rate": 2.4105634826109812e-05, + "loss": 0.8843, + "step": 8505 + }, + { + "epoch": 0.7973017285801284, + "grad_norm": 0.54296875, + "learning_rate": 2.3999246626014626e-05, + "loss": 0.8522, + "step": 8510 + }, + { + "epoch": 0.7977701784794117, + "grad_norm": 0.5703125, + "learning_rate": 2.3893061691018616e-05, + "loss": 0.844, + "step": 8515 + }, + { + "epoch": 0.7982386283786949, + "grad_norm": 0.58203125, + "learning_rate": 2.3787080305114884e-05, + "loss": 0.8395, + "step": 8520 + }, + { + "epoch": 0.7987070782779782, + "grad_norm": 0.61328125, + "learning_rate": 2.3681302751752232e-05, + "loss": 0.8556, + "step": 8525 + }, + { + "epoch": 0.7991755281772615, + "grad_norm": 0.546875, + "learning_rate": 2.3575729313834118e-05, + "loss": 0.8485, + "step": 8530 + }, + { + "epoch": 0.7996439780765447, + "grad_norm": 0.55859375, + "learning_rate": 2.3470360273718262e-05, + "loss": 0.8715, + "step": 8535 + }, + { + "epoch": 0.8001124279758279, + "grad_norm": 0.67578125, + "learning_rate": 2.3365195913215654e-05, + "loss": 0.861, + "step": 8540 + }, + { + "epoch": 0.8005808778751112, + "grad_norm": 0.62109375, + "learning_rate": 2.3260236513589927e-05, + "loss": 0.8597, + "step": 8545 + }, + { + "epoch": 0.8010493277743945, + "grad_norm": 0.52734375, + "learning_rate": 2.3155482355556425e-05, + "loss": 0.8451, + "step": 8550 + }, + { + "epoch": 0.8015177776736778, + "grad_norm": 0.5703125, + "learning_rate": 2.3050933719281663e-05, + "loss": 0.8659, + "step": 8555 + }, + { + "epoch": 0.8019862275729611, + "grad_norm": 0.5625, + "learning_rate": 2.294659088438249e-05, + "loss": 0.8382, + "step": 8560 + }, + { + "epoch": 0.8024546774722443, + "grad_norm": 0.55859375, + "learning_rate": 2.2842454129925284e-05, + "loss": 0.8526, + "step": 8565 + }, + { + "epoch": 0.8029231273715276, + "grad_norm": 0.56640625, + "learning_rate": 2.2738523734425242e-05, + "loss": 0.8859, + "step": 8570 + }, + { + "epoch": 0.8033915772708109, + "grad_norm": 0.5859375, + "learning_rate": 2.2634799975845733e-05, + "loss": 0.884, + "step": 8575 + }, + { + "epoch": 0.8038600271700942, + "grad_norm": 0.609375, + "learning_rate": 2.2531283131597447e-05, + "loss": 0.8675, + "step": 8580 + }, + { + "epoch": 0.8043284770693774, + "grad_norm": 0.58984375, + "learning_rate": 2.2427973478537612e-05, + "loss": 0.8808, + "step": 8585 + }, + { + "epoch": 0.8047969269686607, + "grad_norm": 0.62890625, + "learning_rate": 2.2324871292969385e-05, + "loss": 0.8342, + "step": 8590 + }, + { + "epoch": 0.805265376867944, + "grad_norm": 0.57421875, + "learning_rate": 2.222197685064107e-05, + "loss": 0.8497, + "step": 8595 + }, + { + "epoch": 0.8057338267672273, + "grad_norm": 0.5546875, + "learning_rate": 2.2119290426745254e-05, + "loss": 0.8725, + "step": 8600 + }, + { + "epoch": 0.8062022766665106, + "grad_norm": 0.5390625, + "learning_rate": 2.201681229591831e-05, + "loss": 0.8519, + "step": 8605 + }, + { + "epoch": 0.8066707265657937, + "grad_norm": 0.55859375, + "learning_rate": 2.1914542732239407e-05, + "loss": 0.856, + "step": 8610 + }, + { + "epoch": 0.807139176465077, + "grad_norm": 0.59765625, + "learning_rate": 2.1812482009229995e-05, + "loss": 0.8556, + "step": 8615 + }, + { + "epoch": 0.8076076263643603, + "grad_norm": 0.56640625, + "learning_rate": 2.1710630399852884e-05, + "loss": 0.8398, + "step": 8620 + }, + { + "epoch": 0.8080760762636436, + "grad_norm": 0.57421875, + "learning_rate": 2.160898817651169e-05, + "loss": 0.8692, + "step": 8625 + }, + { + "epoch": 0.8085445261629268, + "grad_norm": 0.578125, + "learning_rate": 2.1507555611050013e-05, + "loss": 0.8461, + "step": 8630 + }, + { + "epoch": 0.8090129760622101, + "grad_norm": 0.72265625, + "learning_rate": 2.140633297475063e-05, + "loss": 0.8698, + "step": 8635 + }, + { + "epoch": 0.8094814259614934, + "grad_norm": 0.58984375, + "learning_rate": 2.1305320538334983e-05, + "loss": 0.9851, + "step": 8640 + }, + { + "epoch": 0.8099498758607767, + "grad_norm": 0.55859375, + "learning_rate": 2.1204518571962273e-05, + "loss": 0.9966, + "step": 8645 + }, + { + "epoch": 0.81041832576006, + "grad_norm": 0.72265625, + "learning_rate": 2.110392734522877e-05, + "loss": 0.8402, + "step": 8650 + }, + { + "epoch": 0.8108867756593432, + "grad_norm": 0.61328125, + "learning_rate": 2.1003547127167145e-05, + "loss": 0.8469, + "step": 8655 + }, + { + "epoch": 0.8113552255586265, + "grad_norm": 0.546875, + "learning_rate": 2.0903378186245713e-05, + "loss": 0.858, + "step": 8660 + }, + { + "epoch": 0.8118236754579098, + "grad_norm": 0.55078125, + "learning_rate": 2.08034207903678e-05, + "loss": 0.8404, + "step": 8665 + }, + { + "epoch": 0.8122921253571931, + "grad_norm": 0.5390625, + "learning_rate": 2.070367520687081e-05, + "loss": 0.8561, + "step": 8670 + }, + { + "epoch": 0.8127605752564763, + "grad_norm": 0.6796875, + "learning_rate": 2.060414170252576e-05, + "loss": 1.0098, + "step": 8675 + }, + { + "epoch": 0.8132290251557596, + "grad_norm": 0.59375, + "learning_rate": 2.050482054353643e-05, + "loss": 0.8824, + "step": 8680 + }, + { + "epoch": 0.8136974750550429, + "grad_norm": 0.55859375, + "learning_rate": 2.0405711995538725e-05, + "loss": 0.8625, + "step": 8685 + }, + { + "epoch": 0.8141659249543262, + "grad_norm": 0.5390625, + "learning_rate": 2.0306816323599763e-05, + "loss": 0.8539, + "step": 8690 + }, + { + "epoch": 0.8146343748536095, + "grad_norm": 0.57421875, + "learning_rate": 2.02081337922175e-05, + "loss": 0.8683, + "step": 8695 + }, + { + "epoch": 0.8151028247528926, + "grad_norm": 0.62890625, + "learning_rate": 2.010966466531976e-05, + "loss": 0.86, + "step": 8700 + }, + { + "epoch": 0.8155712746521759, + "grad_norm": 0.5859375, + "learning_rate": 2.0011409206263654e-05, + "loss": 0.8658, + "step": 8705 + }, + { + "epoch": 0.8160397245514592, + "grad_norm": 0.5625, + "learning_rate": 1.991336767783476e-05, + "loss": 0.8606, + "step": 8710 + }, + { + "epoch": 0.8165081744507425, + "grad_norm": 0.5625, + "learning_rate": 1.9815540342246562e-05, + "loss": 0.9975, + "step": 8715 + }, + { + "epoch": 0.8169766243500257, + "grad_norm": 0.56640625, + "learning_rate": 1.9717927461139705e-05, + "loss": 0.881, + "step": 8720 + }, + { + "epoch": 0.817445074249309, + "grad_norm": 0.52734375, + "learning_rate": 1.9620529295581192e-05, + "loss": 0.8489, + "step": 8725 + }, + { + "epoch": 0.8179135241485923, + "grad_norm": 0.51953125, + "learning_rate": 1.95233461060638e-05, + "loss": 0.8271, + "step": 8730 + }, + { + "epoch": 0.8183819740478756, + "grad_norm": 0.55078125, + "learning_rate": 1.9426378152505352e-05, + "loss": 0.8778, + "step": 8735 + }, + { + "epoch": 0.8188504239471589, + "grad_norm": 0.59375, + "learning_rate": 1.9329625694248075e-05, + "loss": 0.8552, + "step": 8740 + }, + { + "epoch": 0.8193188738464421, + "grad_norm": 0.53125, + "learning_rate": 1.9233088990057734e-05, + "loss": 0.8524, + "step": 8745 + }, + { + "epoch": 0.8197873237457254, + "grad_norm": 0.52734375, + "learning_rate": 1.9136768298123165e-05, + "loss": 0.8551, + "step": 8750 + }, + { + "epoch": 0.8202557736450087, + "grad_norm": 0.51953125, + "learning_rate": 1.904066387605543e-05, + "loss": 0.8538, + "step": 8755 + }, + { + "epoch": 0.820724223544292, + "grad_norm": 0.56640625, + "learning_rate": 1.8944775980887152e-05, + "loss": 0.8867, + "step": 8760 + }, + { + "epoch": 0.8211926734435752, + "grad_norm": 0.59765625, + "learning_rate": 1.884910486907191e-05, + "loss": 0.8367, + "step": 8765 + }, + { + "epoch": 0.8216611233428585, + "grad_norm": 0.60546875, + "learning_rate": 1.8753650796483403e-05, + "loss": 0.8553, + "step": 8770 + }, + { + "epoch": 0.8221295732421418, + "grad_norm": 0.55078125, + "learning_rate": 1.8658414018414983e-05, + "loss": 0.8617, + "step": 8775 + }, + { + "epoch": 0.822598023141425, + "grad_norm": 0.58984375, + "learning_rate": 1.8563394789578692e-05, + "loss": 0.8509, + "step": 8780 + }, + { + "epoch": 0.8230664730407083, + "grad_norm": 0.625, + "learning_rate": 1.8468593364104856e-05, + "loss": 0.8399, + "step": 8785 + }, + { + "epoch": 0.8235349229399915, + "grad_norm": 0.546875, + "learning_rate": 1.8374009995541274e-05, + "loss": 0.8335, + "step": 8790 + }, + { + "epoch": 0.8240033728392748, + "grad_norm": 0.58984375, + "learning_rate": 1.8279644936852447e-05, + "loss": 0.8516, + "step": 8795 + }, + { + "epoch": 0.8244718227385581, + "grad_norm": 0.61328125, + "learning_rate": 1.8185498440419092e-05, + "loss": 0.8404, + "step": 8800 + }, + { + "epoch": 0.8249402726378414, + "grad_norm": 0.57421875, + "learning_rate": 1.80915707580374e-05, + "loss": 0.8458, + "step": 8805 + }, + { + "epoch": 0.8254087225371246, + "grad_norm": 0.5859375, + "learning_rate": 1.7997862140918265e-05, + "loss": 0.8718, + "step": 8810 + }, + { + "epoch": 0.8258771724364079, + "grad_norm": 0.57421875, + "learning_rate": 1.790437283968669e-05, + "loss": 0.871, + "step": 8815 + }, + { + "epoch": 0.8263456223356912, + "grad_norm": 0.56640625, + "learning_rate": 1.7811103104381176e-05, + "loss": 0.8513, + "step": 8820 + }, + { + "epoch": 0.8268140722349745, + "grad_norm": 0.54296875, + "learning_rate": 1.7718053184452975e-05, + "loss": 0.8573, + "step": 8825 + }, + { + "epoch": 0.8272825221342578, + "grad_norm": 0.5859375, + "learning_rate": 1.7625223328765394e-05, + "loss": 0.8435, + "step": 8830 + }, + { + "epoch": 0.827750972033541, + "grad_norm": 0.69921875, + "learning_rate": 1.753261378559322e-05, + "loss": 0.8535, + "step": 8835 + }, + { + "epoch": 0.8282194219328243, + "grad_norm": 0.53125, + "learning_rate": 1.7440224802621995e-05, + "loss": 0.8452, + "step": 8840 + }, + { + "epoch": 0.8286878718321076, + "grad_norm": 0.59375, + "learning_rate": 1.734805662694743e-05, + "loss": 0.8796, + "step": 8845 + }, + { + "epoch": 0.8291563217313909, + "grad_norm": 0.61328125, + "learning_rate": 1.7256109505074525e-05, + "loss": 0.8524, + "step": 8850 + }, + { + "epoch": 0.829624771630674, + "grad_norm": 0.5546875, + "learning_rate": 1.7164383682917208e-05, + "loss": 0.8433, + "step": 8855 + }, + { + "epoch": 0.8300932215299573, + "grad_norm": 0.51953125, + "learning_rate": 1.7072879405797516e-05, + "loss": 0.8526, + "step": 8860 + }, + { + "epoch": 0.8305616714292406, + "grad_norm": 0.55078125, + "learning_rate": 1.6981596918444953e-05, + "loss": 0.8537, + "step": 8865 + }, + { + "epoch": 0.8310301213285239, + "grad_norm": 0.5703125, + "learning_rate": 1.6890536464995788e-05, + "loss": 0.863, + "step": 8870 + }, + { + "epoch": 0.8314985712278072, + "grad_norm": 0.56640625, + "learning_rate": 1.6799698288992526e-05, + "loss": 0.8806, + "step": 8875 + }, + { + "epoch": 0.8319670211270904, + "grad_norm": 0.546875, + "learning_rate": 1.67090826333832e-05, + "loss": 0.8733, + "step": 8880 + }, + { + "epoch": 0.8324354710263737, + "grad_norm": 0.578125, + "learning_rate": 1.6618689740520642e-05, + "loss": 0.8297, + "step": 8885 + }, + { + "epoch": 0.832903920925657, + "grad_norm": 0.56640625, + "learning_rate": 1.6528519852161916e-05, + "loss": 0.8539, + "step": 8890 + }, + { + "epoch": 0.8333723708249403, + "grad_norm": 0.5703125, + "learning_rate": 1.643857320946769e-05, + "loss": 0.8509, + "step": 8895 + }, + { + "epoch": 0.8338408207242235, + "grad_norm": 0.59375, + "learning_rate": 1.6348850053001564e-05, + "loss": 0.8596, + "step": 8900 + }, + { + "epoch": 0.8343092706235068, + "grad_norm": 0.546875, + "learning_rate": 1.6259350622729362e-05, + "loss": 0.8681, + "step": 8905 + }, + { + "epoch": 0.8347777205227901, + "grad_norm": 0.55859375, + "learning_rate": 1.6170075158018605e-05, + "loss": 0.9096, + "step": 8910 + }, + { + "epoch": 0.8352461704220734, + "grad_norm": 0.578125, + "learning_rate": 1.6081023897637826e-05, + "loss": 0.8656, + "step": 8915 + }, + { + "epoch": 0.8357146203213567, + "grad_norm": 0.5625, + "learning_rate": 1.5992197079755834e-05, + "loss": 0.842, + "step": 8920 + }, + { + "epoch": 0.8361830702206399, + "grad_norm": 0.54296875, + "learning_rate": 1.590359494194128e-05, + "loss": 0.8458, + "step": 8925 + }, + { + "epoch": 0.8366515201199232, + "grad_norm": 0.59375, + "learning_rate": 1.5815217721161792e-05, + "loss": 0.8653, + "step": 8930 + }, + { + "epoch": 0.8371199700192065, + "grad_norm": 0.515625, + "learning_rate": 1.5727065653783558e-05, + "loss": 0.85, + "step": 8935 + }, + { + "epoch": 0.8375884199184898, + "grad_norm": 0.5546875, + "learning_rate": 1.563913897557049e-05, + "loss": 0.8852, + "step": 8940 + }, + { + "epoch": 0.8380568698177729, + "grad_norm": 0.54296875, + "learning_rate": 1.5551437921683766e-05, + "loss": 0.8552, + "step": 8945 + }, + { + "epoch": 0.8385253197170562, + "grad_norm": 0.6015625, + "learning_rate": 1.5463962726681125e-05, + "loss": 0.8604, + "step": 8950 + }, + { + "epoch": 0.8389937696163395, + "grad_norm": 0.5703125, + "learning_rate": 1.5376713624516193e-05, + "loss": 0.8476, + "step": 8955 + }, + { + "epoch": 0.8394622195156228, + "grad_norm": 0.52734375, + "learning_rate": 1.528969084853794e-05, + "loss": 0.8465, + "step": 8960 + }, + { + "epoch": 0.8399306694149061, + "grad_norm": 0.546875, + "learning_rate": 1.5202894631490072e-05, + "loss": 0.8677, + "step": 8965 + }, + { + "epoch": 0.8403991193141893, + "grad_norm": 0.5703125, + "learning_rate": 1.5116325205510262e-05, + "loss": 0.8704, + "step": 8970 + }, + { + "epoch": 0.8408675692134726, + "grad_norm": 0.5859375, + "learning_rate": 1.5029982802129661e-05, + "loss": 0.8784, + "step": 8975 + }, + { + "epoch": 0.8413360191127559, + "grad_norm": 0.58984375, + "learning_rate": 1.4943867652272292e-05, + "loss": 0.9825, + "step": 8980 + }, + { + "epoch": 0.8418044690120392, + "grad_norm": 0.5703125, + "learning_rate": 1.4857979986254333e-05, + "loss": 0.8586, + "step": 8985 + }, + { + "epoch": 0.8422729189113224, + "grad_norm": 0.5546875, + "learning_rate": 1.4772320033783605e-05, + "loss": 0.8557, + "step": 8990 + }, + { + "epoch": 0.8427413688106057, + "grad_norm": 0.625, + "learning_rate": 1.4686888023958822e-05, + "loss": 0.8477, + "step": 8995 + }, + { + "epoch": 0.843209818709889, + "grad_norm": 0.55078125, + "learning_rate": 1.4601684185269149e-05, + "loss": 0.8458, + "step": 9000 + }, + { + "epoch": 0.8436782686091723, + "grad_norm": 0.55859375, + "learning_rate": 1.4516708745593477e-05, + "loss": 0.8444, + "step": 9005 + }, + { + "epoch": 0.8441467185084556, + "grad_norm": 0.59375, + "learning_rate": 1.4431961932199811e-05, + "loss": 0.8613, + "step": 9010 + }, + { + "epoch": 0.8446151684077388, + "grad_norm": 0.61328125, + "learning_rate": 1.434744397174469e-05, + "loss": 0.8436, + "step": 9015 + }, + { + "epoch": 0.845083618307022, + "grad_norm": 0.60546875, + "learning_rate": 1.426315509027264e-05, + "loss": 0.8626, + "step": 9020 + }, + { + "epoch": 0.8455520682063054, + "grad_norm": 0.54296875, + "learning_rate": 1.417909551321549e-05, + "loss": 0.8186, + "step": 9025 + }, + { + "epoch": 0.8460205181055886, + "grad_norm": 0.56640625, + "learning_rate": 1.4095265465391727e-05, + "loss": 0.8445, + "step": 9030 + }, + { + "epoch": 0.8464889680048718, + "grad_norm": 0.61328125, + "learning_rate": 1.401166517100604e-05, + "loss": 0.8795, + "step": 9035 + }, + { + "epoch": 0.8469574179041551, + "grad_norm": 0.5703125, + "learning_rate": 1.3928294853648627e-05, + "loss": 0.8581, + "step": 9040 + }, + { + "epoch": 0.8474258678034384, + "grad_norm": 0.60546875, + "learning_rate": 1.3845154736294574e-05, + "loss": 0.8539, + "step": 9045 + }, + { + "epoch": 0.8478943177027217, + "grad_norm": 0.62109375, + "learning_rate": 1.376224504130329e-05, + "loss": 0.8707, + "step": 9050 + }, + { + "epoch": 0.848362767602005, + "grad_norm": 0.57421875, + "learning_rate": 1.3679565990417942e-05, + "loss": 0.8364, + "step": 9055 + }, + { + "epoch": 0.8488312175012882, + "grad_norm": 0.6328125, + "learning_rate": 1.359711780476488e-05, + "loss": 0.8418, + "step": 9060 + }, + { + "epoch": 0.8492996674005715, + "grad_norm": 0.546875, + "learning_rate": 1.3514900704852895e-05, + "loss": 0.8515, + "step": 9065 + }, + { + "epoch": 0.8497681172998548, + "grad_norm": 0.62109375, + "learning_rate": 1.3432914910572813e-05, + "loss": 0.8587, + "step": 9070 + }, + { + "epoch": 0.8502365671991381, + "grad_norm": 0.53515625, + "learning_rate": 1.3351160641196835e-05, + "loss": 0.8613, + "step": 9075 + }, + { + "epoch": 0.8507050170984213, + "grad_norm": 0.5546875, + "learning_rate": 1.326963811537787e-05, + "loss": 0.8201, + "step": 9080 + }, + { + "epoch": 0.8511734669977046, + "grad_norm": 0.546875, + "learning_rate": 1.3188347551149139e-05, + "loss": 0.8405, + "step": 9085 + }, + { + "epoch": 0.8516419168969879, + "grad_norm": 0.5703125, + "learning_rate": 1.3107289165923353e-05, + "loss": 0.8412, + "step": 9090 + }, + { + "epoch": 0.8521103667962712, + "grad_norm": 0.5625, + "learning_rate": 1.3026463176492376e-05, + "loss": 0.8518, + "step": 9095 + }, + { + "epoch": 0.8525788166955545, + "grad_norm": 0.5859375, + "learning_rate": 1.2945869799026423e-05, + "loss": 0.8523, + "step": 9100 + }, + { + "epoch": 0.8530472665948376, + "grad_norm": 0.5703125, + "learning_rate": 1.2865509249073649e-05, + "loss": 0.8493, + "step": 9105 + }, + { + "epoch": 0.853515716494121, + "grad_norm": 0.59375, + "learning_rate": 1.278538174155951e-05, + "loss": 0.8818, + "step": 9110 + }, + { + "epoch": 0.8539841663934042, + "grad_norm": 0.5546875, + "learning_rate": 1.2705487490786138e-05, + "loss": 0.8731, + "step": 9115 + }, + { + "epoch": 0.8544526162926875, + "grad_norm": 0.53125, + "learning_rate": 1.2625826710431866e-05, + "loss": 0.8304, + "step": 9120 + }, + { + "epoch": 0.8549210661919707, + "grad_norm": 0.60546875, + "learning_rate": 1.2546399613550597e-05, + "loss": 0.8529, + "step": 9125 + }, + { + "epoch": 0.855389516091254, + "grad_norm": 0.62109375, + "learning_rate": 1.2467206412571208e-05, + "loss": 0.8409, + "step": 9130 + }, + { + "epoch": 0.8558579659905373, + "grad_norm": 0.54296875, + "learning_rate": 1.2388247319297053e-05, + "loss": 0.8343, + "step": 9135 + }, + { + "epoch": 0.8563264158898206, + "grad_norm": 0.6015625, + "learning_rate": 1.230952254490535e-05, + "loss": 0.9583, + "step": 9140 + }, + { + "epoch": 0.8567948657891039, + "grad_norm": 0.5390625, + "learning_rate": 1.2231032299946643e-05, + "loss": 0.8788, + "step": 9145 + }, + { + "epoch": 0.8572633156883871, + "grad_norm": 0.5078125, + "learning_rate": 1.2152776794344223e-05, + "loss": 0.832, + "step": 9150 + }, + { + "epoch": 0.8577317655876704, + "grad_norm": 0.53515625, + "learning_rate": 1.2074756237393514e-05, + "loss": 0.8818, + "step": 9155 + }, + { + "epoch": 0.8582002154869537, + "grad_norm": 0.5859375, + "learning_rate": 1.1996970837761645e-05, + "loss": 0.864, + "step": 9160 + }, + { + "epoch": 0.858668665386237, + "grad_norm": 0.57421875, + "learning_rate": 1.1919420803486791e-05, + "loss": 0.8819, + "step": 9165 + }, + { + "epoch": 0.8591371152855202, + "grad_norm": 0.53125, + "learning_rate": 1.1842106341977588e-05, + "loss": 0.8678, + "step": 9170 + }, + { + "epoch": 0.8596055651848035, + "grad_norm": 0.58203125, + "learning_rate": 1.1765027660012672e-05, + "loss": 0.8634, + "step": 9175 + }, + { + "epoch": 0.8600740150840868, + "grad_norm": 0.55859375, + "learning_rate": 1.1688184963740078e-05, + "loss": 0.8707, + "step": 9180 + }, + { + "epoch": 0.8605424649833701, + "grad_norm": 0.55859375, + "learning_rate": 1.161157845867672e-05, + "loss": 0.8717, + "step": 9185 + }, + { + "epoch": 0.8610109148826534, + "grad_norm": 0.5546875, + "learning_rate": 1.1535208349707748e-05, + "loss": 0.8843, + "step": 9190 + }, + { + "epoch": 0.8614793647819365, + "grad_norm": 0.58203125, + "learning_rate": 1.1459074841086136e-05, + "loss": 0.8742, + "step": 9195 + }, + { + "epoch": 0.8619478146812198, + "grad_norm": 0.56640625, + "learning_rate": 1.1383178136432049e-05, + "loss": 0.8853, + "step": 9200 + }, + { + "epoch": 0.8624162645805031, + "grad_norm": 0.609375, + "learning_rate": 1.130751843873229e-05, + "loss": 0.8491, + "step": 9205 + }, + { + "epoch": 0.8628847144797864, + "grad_norm": 0.59765625, + "learning_rate": 1.1232095950339783e-05, + "loss": 0.9069, + "step": 9210 + }, + { + "epoch": 0.8633531643790696, + "grad_norm": 0.5859375, + "learning_rate": 1.1156910872973059e-05, + "loss": 0.8875, + "step": 9215 + }, + { + "epoch": 0.8638216142783529, + "grad_norm": 0.5390625, + "learning_rate": 1.1081963407715711e-05, + "loss": 0.8397, + "step": 9220 + }, + { + "epoch": 0.8642900641776362, + "grad_norm": 0.5546875, + "learning_rate": 1.1007253755015755e-05, + "loss": 0.8587, + "step": 9225 + }, + { + "epoch": 0.8647585140769195, + "grad_norm": 0.5625, + "learning_rate": 1.0932782114685258e-05, + "loss": 0.8695, + "step": 9230 + }, + { + "epoch": 0.8652269639762027, + "grad_norm": 0.55859375, + "learning_rate": 1.0858548685899683e-05, + "loss": 0.8453, + "step": 9235 + }, + { + "epoch": 0.865695413875486, + "grad_norm": 0.56640625, + "learning_rate": 1.0784553667197373e-05, + "loss": 0.8721, + "step": 9240 + }, + { + "epoch": 0.8661638637747693, + "grad_norm": 0.578125, + "learning_rate": 1.0710797256479089e-05, + "loss": 0.8571, + "step": 9245 + }, + { + "epoch": 0.8666323136740526, + "grad_norm": 0.5625, + "learning_rate": 1.063727965100737e-05, + "loss": 0.8583, + "step": 9250 + }, + { + "epoch": 0.8671007635733359, + "grad_norm": 0.53125, + "learning_rate": 1.0564001047406135e-05, + "loss": 0.8438, + "step": 9255 + }, + { + "epoch": 0.8675692134726191, + "grad_norm": 0.56640625, + "learning_rate": 1.0490961641660036e-05, + "loss": 0.8507, + "step": 9260 + }, + { + "epoch": 0.8680376633719024, + "grad_norm": 0.6015625, + "learning_rate": 1.0418161629113987e-05, + "loss": 0.8789, + "step": 9265 + }, + { + "epoch": 0.8685061132711857, + "grad_norm": 0.578125, + "learning_rate": 1.0345601204472699e-05, + "loss": 0.882, + "step": 9270 + }, + { + "epoch": 0.868974563170469, + "grad_norm": 0.5546875, + "learning_rate": 1.0273280561800058e-05, + "loss": 0.8675, + "step": 9275 + }, + { + "epoch": 0.8694430130697521, + "grad_norm": 0.5625, + "learning_rate": 1.0201199894518621e-05, + "loss": 0.8368, + "step": 9280 + }, + { + "epoch": 0.8699114629690354, + "grad_norm": 0.67578125, + "learning_rate": 1.01293593954092e-05, + "loss": 0.8722, + "step": 9285 + }, + { + "epoch": 0.8703799128683187, + "grad_norm": 0.53515625, + "learning_rate": 1.0057759256610222e-05, + "loss": 0.8477, + "step": 9290 + }, + { + "epoch": 0.870848362767602, + "grad_norm": 0.58984375, + "learning_rate": 9.986399669617252e-06, + "loss": 0.8723, + "step": 9295 + }, + { + "epoch": 0.8713168126668853, + "grad_norm": 0.61328125, + "learning_rate": 9.915280825282547e-06, + "loss": 0.8331, + "step": 9300 + }, + { + "epoch": 0.8717852625661685, + "grad_norm": 0.609375, + "learning_rate": 9.844402913814477e-06, + "loss": 0.876, + "step": 9305 + }, + { + "epoch": 0.8722537124654518, + "grad_norm": 0.578125, + "learning_rate": 9.773766124777028e-06, + "loss": 0.8313, + "step": 9310 + }, + { + "epoch": 0.8727221623647351, + "grad_norm": 0.5703125, + "learning_rate": 9.703370647089282e-06, + "loss": 0.8557, + "step": 9315 + }, + { + "epoch": 0.8731906122640184, + "grad_norm": 0.55859375, + "learning_rate": 9.633216669024958e-06, + "loss": 0.8799, + "step": 9320 + }, + { + "epoch": 0.8736590621633016, + "grad_norm": 0.53515625, + "learning_rate": 9.56330437821189e-06, + "loss": 0.8421, + "step": 9325 + }, + { + "epoch": 0.8741275120625849, + "grad_norm": 0.66796875, + "learning_rate": 9.493633961631476e-06, + "loss": 0.8514, + "step": 9330 + }, + { + "epoch": 0.8745959619618682, + "grad_norm": 0.58203125, + "learning_rate": 9.424205605618219e-06, + "loss": 0.8492, + "step": 9335 + }, + { + "epoch": 0.8750644118611515, + "grad_norm": 0.55859375, + "learning_rate": 9.355019495859286e-06, + "loss": 0.865, + "step": 9340 + }, + { + "epoch": 0.8755328617604348, + "grad_norm": 0.57421875, + "learning_rate": 9.286075817393901e-06, + "loss": 0.8825, + "step": 9345 + }, + { + "epoch": 0.876001311659718, + "grad_norm": 0.57421875, + "learning_rate": 9.217374754612906e-06, + "loss": 0.8685, + "step": 9350 + }, + { + "epoch": 0.8764697615590012, + "grad_norm": 0.5625, + "learning_rate": 9.148916491258286e-06, + "loss": 0.8457, + "step": 9355 + }, + { + "epoch": 0.8769382114582845, + "grad_norm": 0.53515625, + "learning_rate": 9.08070121042265e-06, + "loss": 0.8657, + "step": 9360 + }, + { + "epoch": 0.8774066613575678, + "grad_norm": 0.68359375, + "learning_rate": 9.012729094548733e-06, + "loss": 0.9798, + "step": 9365 + }, + { + "epoch": 0.877875111256851, + "grad_norm": 0.640625, + "learning_rate": 8.945000325428898e-06, + "loss": 0.8415, + "step": 9370 + }, + { + "epoch": 0.8783435611561343, + "grad_norm": 0.70703125, + "learning_rate": 8.877515084204735e-06, + "loss": 0.8643, + "step": 9375 + }, + { + "epoch": 0.8788120110554176, + "grad_norm": 0.55078125, + "learning_rate": 8.810273551366499e-06, + "loss": 0.8601, + "step": 9380 + }, + { + "epoch": 0.8792804609547009, + "grad_norm": 0.61328125, + "learning_rate": 8.74327590675259e-06, + "loss": 0.8531, + "step": 9385 + }, + { + "epoch": 0.8797489108539842, + "grad_norm": 0.5546875, + "learning_rate": 8.676522329549186e-06, + "loss": 0.8621, + "step": 9390 + }, + { + "epoch": 0.8802173607532674, + "grad_norm": 0.58984375, + "learning_rate": 8.610012998289696e-06, + "loss": 0.8623, + "step": 9395 + }, + { + "epoch": 0.8806858106525507, + "grad_norm": 0.5390625, + "learning_rate": 8.543748090854242e-06, + "loss": 0.8502, + "step": 9400 + }, + { + "epoch": 0.881154260551834, + "grad_norm": 0.5625, + "learning_rate": 8.477727784469302e-06, + "loss": 0.8303, + "step": 9405 + }, + { + "epoch": 0.8816227104511173, + "grad_norm": 0.5625, + "learning_rate": 8.411952255707079e-06, + "loss": 0.8424, + "step": 9410 + }, + { + "epoch": 0.8820911603504005, + "grad_norm": 0.5859375, + "learning_rate": 8.346421680485217e-06, + "loss": 0.8754, + "step": 9415 + }, + { + "epoch": 0.8825596102496838, + "grad_norm": 0.56640625, + "learning_rate": 8.281136234066133e-06, + "loss": 0.877, + "step": 9420 + }, + { + "epoch": 0.8830280601489671, + "grad_norm": 0.6640625, + "learning_rate": 8.21609609105669e-06, + "loss": 0.8639, + "step": 9425 + }, + { + "epoch": 0.8834965100482504, + "grad_norm": 0.59375, + "learning_rate": 8.151301425407699e-06, + "loss": 1.0118, + "step": 9430 + }, + { + "epoch": 0.8839649599475337, + "grad_norm": 0.58984375, + "learning_rate": 8.086752410413412e-06, + "loss": 0.846, + "step": 9435 + }, + { + "epoch": 0.8844334098468168, + "grad_norm": 0.53515625, + "learning_rate": 8.022449218711082e-06, + "loss": 0.9843, + "step": 9440 + }, + { + "epoch": 0.8849018597461001, + "grad_norm": 0.6328125, + "learning_rate": 7.958392022280525e-06, + "loss": 0.8849, + "step": 9445 + }, + { + "epoch": 0.8853703096453834, + "grad_norm": 0.55859375, + "learning_rate": 7.8945809924436e-06, + "loss": 0.8647, + "step": 9450 + }, + { + "epoch": 0.8858387595446667, + "grad_norm": 0.55859375, + "learning_rate": 7.831016299863847e-06, + "loss": 0.8792, + "step": 9455 + }, + { + "epoch": 0.8863072094439499, + "grad_norm": 0.5703125, + "learning_rate": 7.767698114545919e-06, + "loss": 0.8314, + "step": 9460 + }, + { + "epoch": 0.8867756593432332, + "grad_norm": 0.5625, + "learning_rate": 7.704626605835218e-06, + "loss": 0.8854, + "step": 9465 + }, + { + "epoch": 0.8872441092425165, + "grad_norm": 0.53125, + "learning_rate": 7.641801942417415e-06, + "loss": 0.8566, + "step": 9470 + }, + { + "epoch": 0.8877125591417998, + "grad_norm": 0.66015625, + "learning_rate": 7.579224292317921e-06, + "loss": 0.8811, + "step": 9475 + }, + { + "epoch": 0.8881810090410831, + "grad_norm": 0.6640625, + "learning_rate": 7.5168938229015825e-06, + "loss": 0.8842, + "step": 9480 + }, + { + "epoch": 0.8886494589403663, + "grad_norm": 0.51953125, + "learning_rate": 7.4548107008721395e-06, + "loss": 0.8628, + "step": 9485 + }, + { + "epoch": 0.8891179088396496, + "grad_norm": 0.65625, + "learning_rate": 7.392975092271781e-06, + "loss": 0.8632, + "step": 9490 + }, + { + "epoch": 0.8895863587389329, + "grad_norm": 0.59765625, + "learning_rate": 7.331387162480708e-06, + "loss": 0.8568, + "step": 9495 + }, + { + "epoch": 0.8900548086382162, + "grad_norm": 0.625, + "learning_rate": 7.2700470762167125e-06, + "loss": 0.8797, + "step": 9500 + }, + { + "epoch": 0.8905232585374994, + "grad_norm": 0.59765625, + "learning_rate": 7.208954997534767e-06, + "loss": 0.8497, + "step": 9505 + }, + { + "epoch": 0.8909917084367827, + "grad_norm": 0.60546875, + "learning_rate": 7.1481110898264705e-06, + "loss": 0.8354, + "step": 9510 + }, + { + "epoch": 0.891460158336066, + "grad_norm": 0.57421875, + "learning_rate": 7.087515515819742e-06, + "loss": 0.8595, + "step": 9515 + }, + { + "epoch": 0.8919286082353493, + "grad_norm": 0.56640625, + "learning_rate": 7.0271684375783134e-06, + "loss": 0.8524, + "step": 9520 + }, + { + "epoch": 0.8923970581346325, + "grad_norm": 0.5625, + "learning_rate": 6.967070016501298e-06, + "loss": 0.8456, + "step": 9525 + }, + { + "epoch": 0.8928655080339157, + "grad_norm": 0.5625, + "learning_rate": 6.907220413322768e-06, + "loss": 0.8684, + "step": 9530 + }, + { + "epoch": 0.893333957933199, + "grad_norm": 0.55859375, + "learning_rate": 6.847619788111348e-06, + "loss": 0.8543, + "step": 9535 + }, + { + "epoch": 0.8938024078324823, + "grad_norm": 0.59375, + "learning_rate": 6.788268300269785e-06, + "loss": 0.8336, + "step": 9540 + }, + { + "epoch": 0.8942708577317656, + "grad_norm": 0.6015625, + "learning_rate": 6.729166108534435e-06, + "loss": 0.8674, + "step": 9545 + }, + { + "epoch": 0.8947393076310488, + "grad_norm": 0.58984375, + "learning_rate": 6.6703133709749895e-06, + "loss": 0.8718, + "step": 9550 + }, + { + "epoch": 0.8952077575303321, + "grad_norm": 0.52734375, + "learning_rate": 6.61171024499393e-06, + "loss": 0.8599, + "step": 9555 + }, + { + "epoch": 0.8956762074296154, + "grad_norm": 0.5859375, + "learning_rate": 6.553356887326135e-06, + "loss": 0.8693, + "step": 9560 + }, + { + "epoch": 0.8961446573288987, + "grad_norm": 0.53515625, + "learning_rate": 6.495253454038508e-06, + "loss": 0.8403, + "step": 9565 + }, + { + "epoch": 0.896613107228182, + "grad_norm": 0.5234375, + "learning_rate": 6.4374001005295e-06, + "loss": 0.8618, + "step": 9570 + }, + { + "epoch": 0.8970815571274652, + "grad_norm": 0.5859375, + "learning_rate": 6.379796981528752e-06, + "loss": 0.8476, + "step": 9575 + }, + { + "epoch": 0.8975500070267485, + "grad_norm": 0.54296875, + "learning_rate": 6.3224442510965934e-06, + "loss": 0.8465, + "step": 9580 + }, + { + "epoch": 0.8980184569260318, + "grad_norm": 0.6484375, + "learning_rate": 6.265342062623736e-06, + "loss": 0.8695, + "step": 9585 + }, + { + "epoch": 0.8984869068253151, + "grad_norm": 0.61328125, + "learning_rate": 6.208490568830805e-06, + "loss": 0.8419, + "step": 9590 + }, + { + "epoch": 0.8989553567245983, + "grad_norm": 0.55078125, + "learning_rate": 6.151889921767939e-06, + "loss": 0.8749, + "step": 9595 + }, + { + "epoch": 0.8994238066238815, + "grad_norm": 0.5390625, + "learning_rate": 6.095540272814337e-06, + "loss": 0.8456, + "step": 9600 + }, + { + "epoch": 0.8998922565231648, + "grad_norm": 0.53515625, + "learning_rate": 6.039441772677989e-06, + "loss": 0.8403, + "step": 9605 + }, + { + "epoch": 0.9003607064224481, + "grad_norm": 0.6171875, + "learning_rate": 5.9835945713951016e-06, + "loss": 0.8151, + "step": 9610 + }, + { + "epoch": 0.9008291563217314, + "grad_norm": 0.6171875, + "learning_rate": 5.927998818329827e-06, + "loss": 0.8659, + "step": 9615 + }, + { + "epoch": 0.9012976062210146, + "grad_norm": 0.54296875, + "learning_rate": 5.872654662173782e-06, + "loss": 0.8467, + "step": 9620 + }, + { + "epoch": 0.9017660561202979, + "grad_norm": 0.56640625, + "learning_rate": 5.817562250945707e-06, + "loss": 0.8416, + "step": 9625 + }, + { + "epoch": 0.9022345060195812, + "grad_norm": 0.56640625, + "learning_rate": 5.762721731991061e-06, + "loss": 0.9013, + "step": 9630 + }, + { + "epoch": 0.9027029559188645, + "grad_norm": 0.5546875, + "learning_rate": 5.708133251981573e-06, + "loss": 0.8782, + "step": 9635 + }, + { + "epoch": 0.9031714058181477, + "grad_norm": 0.63671875, + "learning_rate": 5.653796956914914e-06, + "loss": 0.8547, + "step": 9640 + }, + { + "epoch": 0.903639855717431, + "grad_norm": 0.5390625, + "learning_rate": 5.599712992114303e-06, + "loss": 0.841, + "step": 9645 + }, + { + "epoch": 0.9041083056167143, + "grad_norm": 0.58984375, + "learning_rate": 5.5458815022280566e-06, + "loss": 0.8368, + "step": 9650 + }, + { + "epoch": 0.9045767555159976, + "grad_norm": 0.6953125, + "learning_rate": 5.492302631229251e-06, + "loss": 0.8432, + "step": 9655 + }, + { + "epoch": 0.9050452054152809, + "grad_norm": 0.5546875, + "learning_rate": 5.438976522415351e-06, + "loss": 0.8256, + "step": 9660 + }, + { + "epoch": 0.9055136553145641, + "grad_norm": 0.56640625, + "learning_rate": 5.385903318407814e-06, + "loss": 0.8407, + "step": 9665 + }, + { + "epoch": 0.9059821052138474, + "grad_norm": 0.5703125, + "learning_rate": 5.333083161151653e-06, + "loss": 0.8563, + "step": 9670 + }, + { + "epoch": 0.9064505551131307, + "grad_norm": 0.77734375, + "learning_rate": 5.280516191915142e-06, + "loss": 0.8367, + "step": 9675 + }, + { + "epoch": 0.906919005012414, + "grad_norm": 0.54296875, + "learning_rate": 5.228202551289407e-06, + "loss": 0.8634, + "step": 9680 + }, + { + "epoch": 0.9073874549116971, + "grad_norm": 0.5625, + "learning_rate": 5.176142379188009e-06, + "loss": 0.8403, + "step": 9685 + }, + { + "epoch": 0.9078559048109804, + "grad_norm": 0.609375, + "learning_rate": 5.124335814846614e-06, + "loss": 0.8543, + "step": 9690 + }, + { + "epoch": 0.9083243547102637, + "grad_norm": 0.6328125, + "learning_rate": 5.072782996822634e-06, + "loss": 0.8842, + "step": 9695 + }, + { + "epoch": 0.908792804609547, + "grad_norm": 0.5625, + "learning_rate": 5.021484062994819e-06, + "loss": 0.8307, + "step": 9700 + }, + { + "epoch": 0.9092612545088303, + "grad_norm": 0.5859375, + "learning_rate": 4.970439150562867e-06, + "loss": 0.8947, + "step": 9705 + }, + { + "epoch": 0.9097297044081135, + "grad_norm": 0.5234375, + "learning_rate": 4.91964839604716e-06, + "loss": 0.8414, + "step": 9710 + }, + { + "epoch": 0.9101981543073968, + "grad_norm": 0.55078125, + "learning_rate": 4.869111935288273e-06, + "loss": 0.8414, + "step": 9715 + }, + { + "epoch": 0.9106666042066801, + "grad_norm": 0.58203125, + "learning_rate": 4.818829903446731e-06, + "loss": 0.8286, + "step": 9720 + }, + { + "epoch": 0.9111350541059634, + "grad_norm": 0.5859375, + "learning_rate": 4.768802435002506e-06, + "loss": 0.8573, + "step": 9725 + }, + { + "epoch": 0.9116035040052466, + "grad_norm": 0.5859375, + "learning_rate": 4.7190296637547705e-06, + "loss": 0.845, + "step": 9730 + }, + { + "epoch": 0.9120719539045299, + "grad_norm": 0.6015625, + "learning_rate": 4.6695117228215405e-06, + "loss": 0.8924, + "step": 9735 + }, + { + "epoch": 0.9125404038038132, + "grad_norm": 0.5625, + "learning_rate": 4.620248744639233e-06, + "loss": 0.8385, + "step": 9740 + }, + { + "epoch": 0.9130088537030965, + "grad_norm": 0.54296875, + "learning_rate": 4.5712408609623895e-06, + "loss": 0.8617, + "step": 9745 + }, + { + "epoch": 0.9134773036023798, + "grad_norm": 0.578125, + "learning_rate": 4.522488202863307e-06, + "loss": 0.8635, + "step": 9750 + }, + { + "epoch": 0.913945753501663, + "grad_norm": 0.55859375, + "learning_rate": 4.4739909007316614e-06, + "loss": 0.8691, + "step": 9755 + }, + { + "epoch": 0.9144142034009463, + "grad_norm": 0.54296875, + "learning_rate": 4.425749084274166e-06, + "loss": 0.8787, + "step": 9760 + }, + { + "epoch": 0.9148826533002296, + "grad_norm": 0.5703125, + "learning_rate": 4.37776288251428e-06, + "loss": 0.8434, + "step": 9765 + }, + { + "epoch": 0.9153511031995128, + "grad_norm": 0.52734375, + "learning_rate": 4.33003242379173e-06, + "loss": 0.865, + "step": 9770 + }, + { + "epoch": 0.915819553098796, + "grad_norm": 0.546875, + "learning_rate": 4.2825578357623685e-06, + "loss": 0.8321, + "step": 9775 + }, + { + "epoch": 0.9162880029980793, + "grad_norm": 0.61328125, + "learning_rate": 4.235339245397607e-06, + "loss": 0.8723, + "step": 9780 + }, + { + "epoch": 0.9167564528973626, + "grad_norm": 0.58203125, + "learning_rate": 4.188376778984271e-06, + "loss": 0.8666, + "step": 9785 + }, + { + "epoch": 0.9172249027966459, + "grad_norm": 0.60546875, + "learning_rate": 4.141670562124145e-06, + "loss": 0.8793, + "step": 9790 + }, + { + "epoch": 0.9176933526959292, + "grad_norm": 0.58203125, + "learning_rate": 4.0952207197336505e-06, + "loss": 0.8508, + "step": 9795 + }, + { + "epoch": 0.9181618025952124, + "grad_norm": 0.56640625, + "learning_rate": 4.049027376043568e-06, + "loss": 0.8862, + "step": 9800 + }, + { + "epoch": 0.9186302524944957, + "grad_norm": 0.56640625, + "learning_rate": 4.003090654598663e-06, + "loss": 0.8892, + "step": 9805 + }, + { + "epoch": 0.919098702393779, + "grad_norm": 0.5625, + "learning_rate": 3.957410678257356e-06, + "loss": 0.8437, + "step": 9810 + }, + { + "epoch": 0.9195671522930623, + "grad_norm": 0.5703125, + "learning_rate": 3.911987569191367e-06, + "loss": 0.8285, + "step": 9815 + }, + { + "epoch": 0.9200356021923455, + "grad_norm": 0.59765625, + "learning_rate": 3.86682144888546e-06, + "loss": 0.8639, + "step": 9820 + }, + { + "epoch": 0.9205040520916288, + "grad_norm": 0.5703125, + "learning_rate": 3.821912438137087e-06, + "loss": 0.8581, + "step": 9825 + }, + { + "epoch": 0.9209725019909121, + "grad_norm": 0.55078125, + "learning_rate": 3.7772606570560144e-06, + "loss": 0.8247, + "step": 9830 + }, + { + "epoch": 0.9214409518901954, + "grad_norm": 0.5390625, + "learning_rate": 3.7328662250640802e-06, + "loss": 0.8383, + "step": 9835 + }, + { + "epoch": 0.9219094017894787, + "grad_norm": 0.53125, + "learning_rate": 3.688729260894841e-06, + "loss": 0.8371, + "step": 9840 + }, + { + "epoch": 0.9223778516887619, + "grad_norm": 0.6015625, + "learning_rate": 3.6448498825932044e-06, + "loss": 0.8343, + "step": 9845 + }, + { + "epoch": 0.9228463015880451, + "grad_norm": 0.55859375, + "learning_rate": 3.601228207515206e-06, + "loss": 0.8403, + "step": 9850 + }, + { + "epoch": 0.9233147514873284, + "grad_norm": 0.546875, + "learning_rate": 3.5578643523276443e-06, + "loss": 0.8676, + "step": 9855 + }, + { + "epoch": 0.9237832013866117, + "grad_norm": 0.62109375, + "learning_rate": 3.51475843300777e-06, + "loss": 0.8966, + "step": 9860 + }, + { + "epoch": 0.9242516512858949, + "grad_norm": 0.5390625, + "learning_rate": 3.4719105648429726e-06, + "loss": 0.8625, + "step": 9865 + }, + { + "epoch": 0.9247201011851782, + "grad_norm": 0.55859375, + "learning_rate": 3.429320862430485e-06, + "loss": 0.8733, + "step": 9870 + }, + { + "epoch": 0.9251885510844615, + "grad_norm": 0.5390625, + "learning_rate": 3.3869894396770794e-06, + "loss": 0.8656, + "step": 9875 + }, + { + "epoch": 0.9256570009837448, + "grad_norm": 0.55859375, + "learning_rate": 3.3449164097987593e-06, + "loss": 0.8613, + "step": 9880 + }, + { + "epoch": 0.9261254508830281, + "grad_norm": 0.59375, + "learning_rate": 3.3031018853204255e-06, + "loss": 0.8558, + "step": 9885 + }, + { + "epoch": 0.9265939007823113, + "grad_norm": 0.57421875, + "learning_rate": 3.261545978075631e-06, + "loss": 0.8414, + "step": 9890 + }, + { + "epoch": 0.9270623506815946, + "grad_norm": 0.5546875, + "learning_rate": 3.220248799206227e-06, + "loss": 0.8681, + "step": 9895 + }, + { + "epoch": 0.9275308005808779, + "grad_norm": 0.5390625, + "learning_rate": 3.1792104591621296e-06, + "loss": 0.8521, + "step": 9900 + }, + { + "epoch": 0.9279992504801612, + "grad_norm": 0.6328125, + "learning_rate": 3.1384310677009533e-06, + "loss": 0.8701, + "step": 9905 + }, + { + "epoch": 0.9284677003794444, + "grad_norm": 0.5390625, + "learning_rate": 3.0979107338877544e-06, + "loss": 0.8667, + "step": 9910 + }, + { + "epoch": 0.9289361502787277, + "grad_norm": 0.60546875, + "learning_rate": 3.0576495660947667e-06, + "loss": 0.8552, + "step": 9915 + }, + { + "epoch": 0.929404600178011, + "grad_norm": 0.578125, + "learning_rate": 3.017647672001045e-06, + "loss": 0.8674, + "step": 9920 + }, + { + "epoch": 0.9298730500772943, + "grad_norm": 0.56640625, + "learning_rate": 2.97790515859222e-06, + "loss": 0.8786, + "step": 9925 + }, + { + "epoch": 0.9303414999765776, + "grad_norm": 0.55859375, + "learning_rate": 2.9384221321602124e-06, + "loss": 0.8628, + "step": 9930 + }, + { + "epoch": 0.9308099498758607, + "grad_norm": 0.53515625, + "learning_rate": 2.8991986983029518e-06, + "loss": 0.8541, + "step": 9935 + }, + { + "epoch": 0.931278399775144, + "grad_norm": 0.609375, + "learning_rate": 2.8602349619240577e-06, + "loss": 0.8647, + "step": 9940 + }, + { + "epoch": 0.9317468496744273, + "grad_norm": 0.58203125, + "learning_rate": 2.8215310272325936e-06, + "loss": 0.883, + "step": 9945 + }, + { + "epoch": 0.9322152995737106, + "grad_norm": 0.55859375, + "learning_rate": 2.783086997742812e-06, + "loss": 0.8388, + "step": 9950 + }, + { + "epoch": 0.9326837494729938, + "grad_norm": 0.53125, + "learning_rate": 2.744902976273789e-06, + "loss": 0.8612, + "step": 9955 + }, + { + "epoch": 0.9331521993722771, + "grad_norm": 0.5546875, + "learning_rate": 2.7069790649492554e-06, + "loss": 0.859, + "step": 9960 + }, + { + "epoch": 0.9336206492715604, + "grad_norm": 0.5703125, + "learning_rate": 2.6693153651972445e-06, + "loss": 0.8376, + "step": 9965 + }, + { + "epoch": 0.9340890991708437, + "grad_norm": 0.546875, + "learning_rate": 2.631911977749868e-06, + "loss": 0.879, + "step": 9970 + }, + { + "epoch": 0.934557549070127, + "grad_norm": 0.52734375, + "learning_rate": 2.594769002642994e-06, + "loss": 0.8423, + "step": 9975 + }, + { + "epoch": 0.9350259989694102, + "grad_norm": 0.5625, + "learning_rate": 2.5578865392160587e-06, + "loss": 0.8501, + "step": 9980 + }, + { + "epoch": 0.9354944488686935, + "grad_norm": 0.55078125, + "learning_rate": 2.521264686111746e-06, + "loss": 0.8467, + "step": 9985 + }, + { + "epoch": 0.9359628987679768, + "grad_norm": 0.5546875, + "learning_rate": 2.484903541275696e-06, + "loss": 0.8587, + "step": 9990 + }, + { + "epoch": 0.9364313486672601, + "grad_norm": 0.54296875, + "learning_rate": 2.4488032019563402e-06, + "loss": 0.8517, + "step": 9995 + }, + { + "epoch": 0.9368997985665433, + "grad_norm": 0.5703125, + "learning_rate": 2.4129637647045343e-06, + "loss": 0.8644, + "step": 10000 + }, + { + "epoch": 0.9373682484658266, + "grad_norm": 0.53125, + "learning_rate": 2.37738532537336e-06, + "loss": 0.8182, + "step": 10005 + }, + { + "epoch": 0.9378366983651099, + "grad_norm": 0.56640625, + "learning_rate": 2.342067979117868e-06, + "loss": 0.8404, + "step": 10010 + }, + { + "epoch": 0.9383051482643932, + "grad_norm": 0.57421875, + "learning_rate": 2.307011820394811e-06, + "loss": 0.8284, + "step": 10015 + }, + { + "epoch": 0.9387735981636764, + "grad_norm": 0.56640625, + "learning_rate": 2.2722169429623796e-06, + "loss": 0.842, + "step": 10020 + }, + { + "epoch": 0.9392420480629596, + "grad_norm": 0.5859375, + "learning_rate": 2.237683439879956e-06, + "loss": 0.8749, + "step": 10025 + }, + { + "epoch": 0.9397104979622429, + "grad_norm": 0.56640625, + "learning_rate": 2.2034114035079156e-06, + "loss": 0.8509, + "step": 10030 + }, + { + "epoch": 0.9401789478615262, + "grad_norm": 0.55078125, + "learning_rate": 2.169400925507281e-06, + "loss": 0.8514, + "step": 10035 + }, + { + "epoch": 0.9406473977608095, + "grad_norm": 0.58203125, + "learning_rate": 2.13565209683958e-06, + "loss": 0.8871, + "step": 10040 + }, + { + "epoch": 0.9411158476600927, + "grad_norm": 0.52734375, + "learning_rate": 2.1021650077665324e-06, + "loss": 0.8399, + "step": 10045 + }, + { + "epoch": 0.941584297559376, + "grad_norm": 0.5546875, + "learning_rate": 2.0689397478498295e-06, + "loss": 0.8421, + "step": 10050 + }, + { + "epoch": 0.9420527474586593, + "grad_norm": 0.63671875, + "learning_rate": 2.035976405950901e-06, + "loss": 0.8468, + "step": 10055 + }, + { + "epoch": 0.9425211973579426, + "grad_norm": 0.59375, + "learning_rate": 2.0032750702306923e-06, + "loss": 0.8717, + "step": 10060 + }, + { + "epoch": 0.9429896472572259, + "grad_norm": 0.578125, + "learning_rate": 1.9708358281493753e-06, + "loss": 0.8816, + "step": 10065 + }, + { + "epoch": 0.9434580971565091, + "grad_norm": 0.6171875, + "learning_rate": 1.9386587664661615e-06, + "loss": 0.8588, + "step": 10070 + }, + { + "epoch": 0.9439265470557924, + "grad_norm": 0.56640625, + "learning_rate": 1.9067439712390778e-06, + "loss": 0.8395, + "step": 10075 + }, + { + "epoch": 0.9443949969550757, + "grad_norm": 0.66015625, + "learning_rate": 1.8750915278246905e-06, + "loss": 0.9044, + "step": 10080 + }, + { + "epoch": 0.944863446854359, + "grad_norm": 0.5703125, + "learning_rate": 1.8437015208779274e-06, + "loss": 0.8487, + "step": 10085 + }, + { + "epoch": 0.9453318967536422, + "grad_norm": 0.5625, + "learning_rate": 1.8125740343517773e-06, + "loss": 0.8761, + "step": 10090 + }, + { + "epoch": 0.9458003466529254, + "grad_norm": 1.140625, + "learning_rate": 1.7817091514971684e-06, + "loss": 0.8454, + "step": 10095 + }, + { + "epoch": 0.9462687965522087, + "grad_norm": 0.5703125, + "learning_rate": 1.751106954862669e-06, + "loss": 0.8976, + "step": 10100 + }, + { + "epoch": 0.946737246451492, + "grad_norm": 0.53515625, + "learning_rate": 1.7207675262942868e-06, + "loss": 0.8612, + "step": 10105 + }, + { + "epoch": 0.9472056963507753, + "grad_norm": 0.55078125, + "learning_rate": 1.6906909469352584e-06, + "loss": 0.8359, + "step": 10110 + }, + { + "epoch": 0.9476741462500585, + "grad_norm": 0.58984375, + "learning_rate": 1.6608772972258268e-06, + "loss": 0.8584, + "step": 10115 + }, + { + "epoch": 0.9481425961493418, + "grad_norm": 0.56640625, + "learning_rate": 1.6313266569030094e-06, + "loss": 0.8528, + "step": 10120 + }, + { + "epoch": 0.9486110460486251, + "grad_norm": 0.52734375, + "learning_rate": 1.6020391050004413e-06, + "loss": 0.8418, + "step": 10125 + }, + { + "epoch": 0.9490794959479084, + "grad_norm": 0.5703125, + "learning_rate": 1.573014719848087e-06, + "loss": 0.8983, + "step": 10130 + }, + { + "epoch": 0.9495479458471916, + "grad_norm": 0.56640625, + "learning_rate": 1.5442535790720635e-06, + "loss": 0.8467, + "step": 10135 + }, + { + "epoch": 0.9500163957464749, + "grad_norm": 0.5625, + "learning_rate": 1.5157557595944616e-06, + "loss": 0.8263, + "step": 10140 + }, + { + "epoch": 0.9504848456457582, + "grad_norm": 0.66796875, + "learning_rate": 1.4875213376331132e-06, + "loss": 0.9152, + "step": 10145 + }, + { + "epoch": 0.9509532955450415, + "grad_norm": 0.609375, + "learning_rate": 1.4595503887013695e-06, + "loss": 0.8345, + "step": 10150 + }, + { + "epoch": 0.9514217454443248, + "grad_norm": 0.57421875, + "learning_rate": 1.4318429876079454e-06, + "loss": 0.8724, + "step": 10155 + }, + { + "epoch": 0.951890195343608, + "grad_norm": 0.59375, + "learning_rate": 1.4043992084566748e-06, + "loss": 0.8628, + "step": 10160 + }, + { + "epoch": 0.9523586452428913, + "grad_norm": 0.5546875, + "learning_rate": 1.377219124646356e-06, + "loss": 0.8688, + "step": 10165 + }, + { + "epoch": 0.9528270951421746, + "grad_norm": 0.6171875, + "learning_rate": 1.350302808870474e-06, + "loss": 0.878, + "step": 10170 + }, + { + "epoch": 0.9532955450414579, + "grad_norm": 0.61328125, + "learning_rate": 1.3236503331171102e-06, + "loss": 0.8659, + "step": 10175 + }, + { + "epoch": 0.953763994940741, + "grad_norm": 0.56640625, + "learning_rate": 1.2972617686687116e-06, + "loss": 0.8511, + "step": 10180 + }, + { + "epoch": 0.9542324448400243, + "grad_norm": 0.54296875, + "learning_rate": 1.2711371861018451e-06, + "loss": 0.8434, + "step": 10185 + }, + { + "epoch": 0.9547008947393076, + "grad_norm": 0.5546875, + "learning_rate": 1.2452766552870977e-06, + "loss": 0.85, + "step": 10190 + }, + { + "epoch": 0.9551693446385909, + "grad_norm": 0.546875, + "learning_rate": 1.2196802453888211e-06, + "loss": 0.8661, + "step": 10195 + }, + { + "epoch": 0.9556377945378741, + "grad_norm": 0.546875, + "learning_rate": 1.1943480248649884e-06, + "loss": 0.8296, + "step": 10200 + }, + { + "epoch": 0.9561062444371574, + "grad_norm": 0.59375, + "learning_rate": 1.1692800614669707e-06, + "loss": 0.8898, + "step": 10205 + }, + { + "epoch": 0.9565746943364407, + "grad_norm": 0.5625, + "learning_rate": 1.1444764222393934e-06, + "loss": 0.8616, + "step": 10210 + }, + { + "epoch": 0.957043144235724, + "grad_norm": 0.58984375, + "learning_rate": 1.119937173519947e-06, + "loss": 0.8209, + "step": 10215 + }, + { + "epoch": 0.9575115941350073, + "grad_norm": 0.55078125, + "learning_rate": 1.0956623809392107e-06, + "loss": 0.8521, + "step": 10220 + }, + { + "epoch": 0.9579800440342905, + "grad_norm": 0.6015625, + "learning_rate": 1.0716521094204513e-06, + "loss": 0.8661, + "step": 10225 + }, + { + "epoch": 0.9584484939335738, + "grad_norm": 0.57421875, + "learning_rate": 1.04790642317949e-06, + "loss": 0.8529, + "step": 10230 + }, + { + "epoch": 0.9589169438328571, + "grad_norm": 0.55859375, + "learning_rate": 1.0244253857245034e-06, + "loss": 0.8605, + "step": 10235 + }, + { + "epoch": 0.9593853937321404, + "grad_norm": 0.56640625, + "learning_rate": 1.0012090598558677e-06, + "loss": 0.8326, + "step": 10240 + }, + { + "epoch": 0.9598538436314236, + "grad_norm": 0.53125, + "learning_rate": 9.782575076659584e-07, + "loss": 0.8343, + "step": 10245 + }, + { + "epoch": 0.9603222935307069, + "grad_norm": 0.515625, + "learning_rate": 9.555707905390398e-07, + "loss": 0.8287, + "step": 10250 + }, + { + "epoch": 0.9607907434299902, + "grad_norm": 0.546875, + "learning_rate": 9.331489691510765e-07, + "loss": 0.8548, + "step": 10255 + }, + { + "epoch": 0.9612591933292735, + "grad_norm": 0.69140625, + "learning_rate": 9.109921034695213e-07, + "loss": 0.8439, + "step": 10260 + }, + { + "epoch": 0.9617276432285568, + "grad_norm": 0.63671875, + "learning_rate": 8.891002527532388e-07, + "loss": 0.9948, + "step": 10265 + }, + { + "epoch": 0.9621960931278399, + "grad_norm": 0.5546875, + "learning_rate": 8.674734755522939e-07, + "loss": 0.8555, + "step": 10270 + }, + { + "epoch": 0.9626645430271232, + "grad_norm": 0.5625, + "learning_rate": 8.461118297078296e-07, + "loss": 0.85, + "step": 10275 + }, + { + "epoch": 0.9631329929264065, + "grad_norm": 0.5546875, + "learning_rate": 8.25015372351845e-07, + "loss": 0.8547, + "step": 10280 + }, + { + "epoch": 0.9636014428256898, + "grad_norm": 0.58203125, + "learning_rate": 8.041841599071287e-07, + "loss": 0.8644, + "step": 10285 + }, + { + "epoch": 0.964069892724973, + "grad_norm": 0.58203125, + "learning_rate": 7.836182480870701e-07, + "loss": 0.8666, + "step": 10290 + }, + { + "epoch": 0.9645383426242563, + "grad_norm": 0.5390625, + "learning_rate": 7.633176918954599e-07, + "loss": 0.8699, + "step": 10295 + }, + { + "epoch": 0.9650067925235396, + "grad_norm": 0.5390625, + "learning_rate": 7.432825456264559e-07, + "loss": 0.8444, + "step": 10300 + }, + { + "epoch": 0.9654752424228229, + "grad_norm": 0.5703125, + "learning_rate": 7.235128628643284e-07, + "loss": 0.8359, + "step": 10305 + }, + { + "epoch": 0.9659436923221062, + "grad_norm": 0.62109375, + "learning_rate": 7.040086964833825e-07, + "loss": 0.8682, + "step": 10310 + }, + { + "epoch": 0.9664121422213894, + "grad_norm": 0.57421875, + "learning_rate": 6.8477009864778e-07, + "loss": 0.877, + "step": 10315 + }, + { + "epoch": 0.9668805921206727, + "grad_norm": 0.53515625, + "learning_rate": 6.657971208114288e-07, + "loss": 0.878, + "step": 10320 + }, + { + "epoch": 0.967349042019956, + "grad_norm": 0.53515625, + "learning_rate": 6.470898137178494e-07, + "loss": 0.8769, + "step": 10325 + }, + { + "epoch": 0.9678174919192393, + "grad_norm": 0.7265625, + "learning_rate": 6.286482273999639e-07, + "loss": 0.847, + "step": 10330 + }, + { + "epoch": 0.9682859418185225, + "grad_norm": 0.5703125, + "learning_rate": 6.104724111800852e-07, + "loss": 0.8791, + "step": 10335 + }, + { + "epoch": 0.9687543917178058, + "grad_norm": 0.609375, + "learning_rate": 5.925624136696839e-07, + "loss": 0.986, + "step": 10340 + }, + { + "epoch": 0.969222841617089, + "grad_norm": 0.57421875, + "learning_rate": 5.749182827693101e-07, + "loss": 0.8673, + "step": 10345 + }, + { + "epoch": 0.9696912915163723, + "grad_norm": 0.55078125, + "learning_rate": 5.575400656684271e-07, + "loss": 0.8658, + "step": 10350 + }, + { + "epoch": 0.9701597414156556, + "grad_norm": 0.6171875, + "learning_rate": 5.404278088453451e-07, + "loss": 0.8949, + "step": 10355 + }, + { + "epoch": 0.9706281913149388, + "grad_norm": 0.55078125, + "learning_rate": 5.235815580670322e-07, + "loss": 0.8512, + "step": 10360 + }, + { + "epoch": 0.9710966412142221, + "grad_norm": 0.5546875, + "learning_rate": 5.070013583890254e-07, + "loss": 0.852, + "step": 10365 + }, + { + "epoch": 0.9715650911135054, + "grad_norm": 0.54296875, + "learning_rate": 4.906872541552976e-07, + "loss": 0.8552, + "step": 10370 + }, + { + "epoch": 0.9720335410127887, + "grad_norm": 0.578125, + "learning_rate": 4.746392889981577e-07, + "loss": 0.8644, + "step": 10375 + }, + { + "epoch": 0.9725019909120719, + "grad_norm": 0.58203125, + "learning_rate": 4.588575058381284e-07, + "loss": 0.8886, + "step": 10380 + }, + { + "epoch": 0.9729704408113552, + "grad_norm": 0.546875, + "learning_rate": 4.433419468837907e-07, + "loss": 0.8543, + "step": 10385 + }, + { + "epoch": 0.9734388907106385, + "grad_norm": 0.53125, + "learning_rate": 4.280926536317509e-07, + "loss": 0.8447, + "step": 10390 + }, + { + "epoch": 0.9739073406099218, + "grad_norm": 0.5625, + "learning_rate": 4.131096668664625e-07, + "loss": 0.8641, + "step": 10395 + }, + { + "epoch": 0.9743757905092051, + "grad_norm": 0.53515625, + "learning_rate": 3.9839302666011546e-07, + "loss": 0.8413, + "step": 10400 + }, + { + "epoch": 0.9748442404084883, + "grad_norm": 0.578125, + "learning_rate": 3.8394277237259193e-07, + "loss": 0.8857, + "step": 10405 + }, + { + "epoch": 0.9753126903077716, + "grad_norm": 0.578125, + "learning_rate": 3.697589426512993e-07, + "loss": 0.8598, + "step": 10410 + }, + { + "epoch": 0.9757811402070549, + "grad_norm": 0.66015625, + "learning_rate": 3.5584157543109285e-07, + "loss": 0.8419, + "step": 10415 + }, + { + "epoch": 0.9762495901063382, + "grad_norm": 0.58203125, + "learning_rate": 3.421907079341646e-07, + "loss": 0.8247, + "step": 10420 + }, + { + "epoch": 0.9767180400056213, + "grad_norm": 0.5390625, + "learning_rate": 3.288063766699545e-07, + "loss": 0.8495, + "step": 10425 + }, + { + "epoch": 0.9771864899049046, + "grad_norm": 0.54296875, + "learning_rate": 3.156886174350615e-07, + "loss": 0.8556, + "step": 10430 + }, + { + "epoch": 0.9776549398041879, + "grad_norm": 0.56640625, + "learning_rate": 3.028374653130994e-07, + "loss": 0.8362, + "step": 10435 + }, + { + "epoch": 0.9781233897034712, + "grad_norm": 0.57421875, + "learning_rate": 2.902529546746635e-07, + "loss": 0.8553, + "step": 10440 + }, + { + "epoch": 0.9785918396027545, + "grad_norm": 0.6015625, + "learning_rate": 2.7793511917723057e-07, + "loss": 0.8601, + "step": 10445 + }, + { + "epoch": 0.9790602895020377, + "grad_norm": 0.59375, + "learning_rate": 2.658839917649925e-07, + "loss": 0.8611, + "step": 10450 + }, + { + "epoch": 0.979528739401321, + "grad_norm": 0.57421875, + "learning_rate": 2.5409960466887817e-07, + "loss": 0.8485, + "step": 10455 + }, + { + "epoch": 0.9799971893006043, + "grad_norm": 0.54296875, + "learning_rate": 2.4258198940639853e-07, + "loss": 0.8384, + "step": 10460 + }, + { + "epoch": 0.9804656391998876, + "grad_norm": 0.55859375, + "learning_rate": 2.3133117678157956e-07, + "loss": 0.8731, + "step": 10465 + }, + { + "epoch": 0.9809340890991708, + "grad_norm": 0.6015625, + "learning_rate": 2.203471968848847e-07, + "loss": 0.8821, + "step": 10470 + }, + { + "epoch": 0.9814025389984541, + "grad_norm": 0.59375, + "learning_rate": 2.0963007909311494e-07, + "loss": 0.8541, + "step": 10475 + }, + { + "epoch": 0.9818709888977374, + "grad_norm": 0.54296875, + "learning_rate": 1.991798520693644e-07, + "loss": 0.8547, + "step": 10480 + }, + { + "epoch": 0.9823394387970207, + "grad_norm": 0.54296875, + "learning_rate": 1.889965437629093e-07, + "loss": 0.8453, + "step": 10485 + }, + { + "epoch": 0.982807888696304, + "grad_norm": 0.5625, + "learning_rate": 1.790801814091414e-07, + "loss": 0.8649, + "step": 10490 + }, + { + "epoch": 0.9832763385955872, + "grad_norm": 0.5546875, + "learning_rate": 1.6943079152953457e-07, + "loss": 0.8464, + "step": 10495 + }, + { + "epoch": 0.9837447884948705, + "grad_norm": 0.55859375, + "learning_rate": 1.6004839993151167e-07, + "loss": 0.8536, + "step": 10500 + }, + { + "epoch": 0.9842132383941538, + "grad_norm": 0.546875, + "learning_rate": 1.5093303170841121e-07, + "loss": 0.8611, + "step": 10505 + }, + { + "epoch": 0.984681688293437, + "grad_norm": 0.54296875, + "learning_rate": 1.420847112394097e-07, + "loss": 0.8606, + "step": 10510 + }, + { + "epoch": 0.9851501381927202, + "grad_norm": 0.5234375, + "learning_rate": 1.3350346218947707e-07, + "loss": 0.8608, + "step": 10515 + }, + { + "epoch": 0.9856185880920035, + "grad_norm": 0.5546875, + "learning_rate": 1.2518930750928803e-07, + "loss": 0.8629, + "step": 10520 + }, + { + "epoch": 0.9860870379912868, + "grad_norm": 0.56640625, + "learning_rate": 1.1714226943515538e-07, + "loss": 0.8474, + "step": 10525 + }, + { + "epoch": 0.9865554878905701, + "grad_norm": 0.5703125, + "learning_rate": 1.093623694890078e-07, + "loss": 0.8758, + "step": 10530 + }, + { + "epoch": 0.9870239377898534, + "grad_norm": 0.58984375, + "learning_rate": 1.0184962847828994e-07, + "loss": 0.8533, + "step": 10535 + }, + { + "epoch": 0.9874923876891366, + "grad_norm": 0.60546875, + "learning_rate": 9.460406649594022e-08, + "loss": 0.8661, + "step": 10540 + }, + { + "epoch": 0.9879608375884199, + "grad_norm": 0.58203125, + "learning_rate": 8.762570292032423e-08, + "loss": 0.883, + "step": 10545 + }, + { + "epoch": 0.9884292874877032, + "grad_norm": 0.5546875, + "learning_rate": 8.0914556415157e-08, + "loss": 0.8615, + "step": 10550 + }, + { + "epoch": 0.9888977373869865, + "grad_norm": 0.5625, + "learning_rate": 7.447064492951405e-08, + "loss": 0.8503, + "step": 10555 + }, + { + "epoch": 0.9893661872862697, + "grad_norm": 0.52734375, + "learning_rate": 6.829398569770939e-08, + "loss": 0.8334, + "step": 10560 + }, + { + "epoch": 0.989834637185553, + "grad_norm": 0.55859375, + "learning_rate": 6.238459523932871e-08, + "loss": 0.8601, + "step": 10565 + }, + { + "epoch": 0.9903030870848363, + "grad_norm": 0.53125, + "learning_rate": 5.674248935910731e-08, + "loss": 0.8565, + "step": 10570 + }, + { + "epoch": 0.9907715369841196, + "grad_norm": 0.56640625, + "learning_rate": 5.136768314694118e-08, + "loss": 0.8497, + "step": 10575 + }, + { + "epoch": 0.9912399868834029, + "grad_norm": 0.55078125, + "learning_rate": 4.626019097782042e-08, + "loss": 0.8954, + "step": 10580 + }, + { + "epoch": 0.991708436782686, + "grad_norm": 0.56640625, + "learning_rate": 4.1420026511806984e-08, + "loss": 0.9051, + "step": 10585 + }, + { + "epoch": 0.9921768866819693, + "grad_norm": 0.58984375, + "learning_rate": 3.6847202694001434e-08, + "loss": 0.8412, + "step": 10590 + }, + { + "epoch": 0.9926453365812526, + "grad_norm": 0.55078125, + "learning_rate": 3.254173175447628e-08, + "loss": 0.8516, + "step": 10595 + }, + { + "epoch": 0.9931137864805359, + "grad_norm": 0.6015625, + "learning_rate": 2.8503625208275986e-08, + "loss": 0.8652, + "step": 10600 + }, + { + "epoch": 0.9935822363798191, + "grad_norm": 0.55078125, + "learning_rate": 2.473289385538369e-08, + "loss": 0.8294, + "step": 10605 + }, + { + "epoch": 0.9940506862791024, + "grad_norm": 0.5625, + "learning_rate": 2.122954778065456e-08, + "loss": 0.8417, + "step": 10610 + }, + { + "epoch": 0.9945191361783857, + "grad_norm": 0.53125, + "learning_rate": 1.7993596353849118e-08, + "loss": 0.8383, + "step": 10615 + }, + { + "epoch": 0.994987586077669, + "grad_norm": 0.55859375, + "learning_rate": 1.502504822957773e-08, + "loss": 0.8529, + "step": 10620 + }, + { + "epoch": 0.9954560359769523, + "grad_norm": 0.53515625, + "learning_rate": 1.2323911347245087e-08, + "loss": 0.8473, + "step": 10625 + }, + { + "epoch": 0.9959244858762355, + "grad_norm": 0.5234375, + "learning_rate": 9.890192931083508e-09, + "loss": 0.8372, + "step": 10630 + }, + { + "epoch": 0.9963929357755188, + "grad_norm": 0.54296875, + "learning_rate": 7.723899490108544e-09, + "loss": 0.8238, + "step": 10635 + }, + { + "epoch": 0.9968613856748021, + "grad_norm": 0.5625, + "learning_rate": 5.825036818118967e-09, + "loss": 0.8652, + "step": 10640 + }, + { + "epoch": 0.9973298355740854, + "grad_norm": 0.5625, + "learning_rate": 4.193609993630166e-09, + "loss": 0.8583, + "step": 10645 + }, + { + "epoch": 0.9977982854733686, + "grad_norm": 0.671875, + "learning_rate": 2.8296233799185444e-09, + "loss": 0.8498, + "step": 10650 + }, + { + "epoch": 0.9982667353726519, + "grad_norm": 0.578125, + "learning_rate": 1.733080624999328e-09, + "loss": 0.8679, + "step": 10655 + }, + { + "epoch": 0.9987351852719352, + "grad_norm": 0.58984375, + "learning_rate": 9.039846615710446e-10, + "loss": 0.8622, + "step": 10660 + }, + { + "epoch": 0.9992036351712185, + "grad_norm": 0.53515625, + "learning_rate": 3.423377070932432e-10, + "loss": 0.841, + "step": 10665 + }, + { + "epoch": 0.9996720850705018, + "grad_norm": 0.54296875, + "learning_rate": 4.8141263664369664e-11, + "loss": 0.8429, + "step": 10670 + }, + { + "epoch": 0.9999531550100716, + "eval_loss": 0.8574451804161072, + "eval_runtime": 961.4713, + "eval_samples_per_second": 4.677, + "eval_steps_per_second": 1.17, + "step": 10673 + }, + { + "epoch": 0.9999531550100716, + "step": 10673, + "total_flos": 4.092531867595571e+18, + "train_loss": 1.0194565681556524, + "train_runtime": 58622.8041, + "train_samples_per_second": 1.457, + "train_steps_per_second": 0.182 + } + ], + "logging_steps": 5, + "max_steps": 10673, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 4.092531867595571e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}