{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9143553794574825, "eval_steps": 500, "global_step": 9000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001015950421619425, "grad_norm": 17.625, "learning_rate": 5e-06, "loss": 3.4264, "step": 10 }, { "epoch": 0.00203190084323885, "grad_norm": 12.5625, "learning_rate": 1e-05, "loss": 3.432, "step": 20 }, { "epoch": 0.003047851264858275, "grad_norm": 14.0625, "learning_rate": 1.5e-05, "loss": 3.23, "step": 30 }, { "epoch": 0.0040638016864777, "grad_norm": 12.4375, "learning_rate": 2e-05, "loss": 2.9762, "step": 40 }, { "epoch": 0.005079752108097125, "grad_norm": 10.0625, "learning_rate": 2.5e-05, "loss": 2.6173, "step": 50 }, { "epoch": 0.00609570252971655, "grad_norm": 10.1875, "learning_rate": 3e-05, "loss": 2.2004, "step": 60 }, { "epoch": 0.007111652951335975, "grad_norm": 7.03125, "learning_rate": 3.5e-05, "loss": 1.4176, "step": 70 }, { "epoch": 0.0081276033729554, "grad_norm": 4.375, "learning_rate": 4e-05, "loss": 1.0122, "step": 80 }, { "epoch": 0.009143553794574825, "grad_norm": 6.5625, "learning_rate": 4.5e-05, "loss": 0.9116, "step": 90 }, { "epoch": 0.01015950421619425, "grad_norm": 5.28125, "learning_rate": 5e-05, "loss": 0.6832, "step": 100 }, { "epoch": 0.011175454637813675, "grad_norm": 5.5, "learning_rate": 4.9999870035728426e-05, "loss": 0.7355, "step": 110 }, { "epoch": 0.0121914050594331, "grad_norm": 5.1875, "learning_rate": 4.9999480144264944e-05, "loss": 0.6673, "step": 120 }, { "epoch": 0.013207355481052525, "grad_norm": 4.5, "learning_rate": 4.9998830329663314e-05, "loss": 0.6792, "step": 130 }, { "epoch": 0.01422330590267195, "grad_norm": 3.9375, "learning_rate": 4.9997920598679756e-05, "loss": 0.6207, "step": 140 }, { "epoch": 0.015239256324291375, "grad_norm": 3.15625, "learning_rate": 4.999675096077286e-05, "loss": 0.483, "step": 150 }, { "epoch": 0.0162552067459108, "grad_norm": 5.28125, "learning_rate": 4.999532142810354e-05, "loss": 0.5319, "step": 160 }, { "epoch": 0.017271157167530225, "grad_norm": 4.59375, "learning_rate": 4.999363201553483e-05, "loss": 0.6052, "step": 170 }, { "epoch": 0.01828710758914965, "grad_norm": 5.03125, "learning_rate": 4.9991682740631794e-05, "loss": 0.4258, "step": 180 }, { "epoch": 0.019303058010769075, "grad_norm": 3.859375, "learning_rate": 4.998947362366133e-05, "loss": 0.4309, "step": 190 }, { "epoch": 0.0203190084323885, "grad_norm": 3.328125, "learning_rate": 4.998700468759193e-05, "loss": 0.3957, "step": 200 }, { "epoch": 0.021334958854007924, "grad_norm": 4.9375, "learning_rate": 4.9984275958093475e-05, "loss": 0.4777, "step": 210 }, { "epoch": 0.02235090927562735, "grad_norm": 4.78125, "learning_rate": 4.998128746353695e-05, "loss": 0.3549, "step": 220 }, { "epoch": 0.023366859697246774, "grad_norm": 4.0625, "learning_rate": 4.997803923499417e-05, "loss": 0.4447, "step": 230 }, { "epoch": 0.0243828101188662, "grad_norm": 6.375, "learning_rate": 4.99745313062374e-05, "loss": 0.3808, "step": 240 }, { "epoch": 0.025398760540485624, "grad_norm": 3.59375, "learning_rate": 4.99707637137391e-05, "loss": 0.3827, "step": 250 }, { "epoch": 0.02641471096210505, "grad_norm": 3.015625, "learning_rate": 4.996673649667145e-05, "loss": 0.3694, "step": 260 }, { "epoch": 0.027430661383724474, "grad_norm": 2.296875, "learning_rate": 4.9962449696906e-05, "loss": 0.3586, "step": 270 }, { "epoch": 0.0284466118053439, "grad_norm": 4.125, "learning_rate": 4.9957903359013214e-05, "loss": 0.3832, "step": 280 }, { "epoch": 0.029462562226963324, "grad_norm": 3.296875, "learning_rate": 4.995309753026201e-05, "loss": 0.328, "step": 290 }, { "epoch": 0.03047851264858275, "grad_norm": 4.5, "learning_rate": 4.994803226061927e-05, "loss": 0.3667, "step": 300 }, { "epoch": 0.03149446307020217, "grad_norm": 4.3125, "learning_rate": 4.994270760274933e-05, "loss": 0.3811, "step": 310 }, { "epoch": 0.0325104134918216, "grad_norm": 3.421875, "learning_rate": 4.99371236120134e-05, "loss": 0.3065, "step": 320 }, { "epoch": 0.03352636391344102, "grad_norm": 4.6875, "learning_rate": 4.993128034646902e-05, "loss": 0.4177, "step": 330 }, { "epoch": 0.03454231433506045, "grad_norm": 3.046875, "learning_rate": 4.992517786686947e-05, "loss": 0.33, "step": 340 }, { "epoch": 0.03555826475667987, "grad_norm": 1.8828125, "learning_rate": 4.9918816236663077e-05, "loss": 0.3287, "step": 350 }, { "epoch": 0.0365742151782993, "grad_norm": 3.8125, "learning_rate": 4.991219552199262e-05, "loss": 0.2934, "step": 360 }, { "epoch": 0.03759016559991872, "grad_norm": 4.28125, "learning_rate": 4.99053157916946e-05, "loss": 0.3176, "step": 370 }, { "epoch": 0.03860611602153815, "grad_norm": 2.609375, "learning_rate": 4.989817711729856e-05, "loss": 0.3318, "step": 380 }, { "epoch": 0.03962206644315757, "grad_norm": 2.375, "learning_rate": 4.98907795730263e-05, "loss": 0.3234, "step": 390 }, { "epoch": 0.040638016864777, "grad_norm": 4.46875, "learning_rate": 4.988312323579114e-05, "loss": 0.267, "step": 400 }, { "epoch": 0.04165396728639642, "grad_norm": 3.75, "learning_rate": 4.98752081851971e-05, "loss": 0.3081, "step": 410 }, { "epoch": 0.04266991770801585, "grad_norm": 2.203125, "learning_rate": 4.986703450353809e-05, "loss": 0.2917, "step": 420 }, { "epoch": 0.04368586812963527, "grad_norm": 1.6015625, "learning_rate": 4.985860227579703e-05, "loss": 0.2805, "step": 430 }, { "epoch": 0.0447018185512547, "grad_norm": 3.140625, "learning_rate": 4.984991158964499e-05, "loss": 0.3534, "step": 440 }, { "epoch": 0.04571776897287412, "grad_norm": 3.296875, "learning_rate": 4.9840962535440265e-05, "loss": 0.335, "step": 450 }, { "epoch": 0.04673371939449355, "grad_norm": 3.25, "learning_rate": 4.983175520622744e-05, "loss": 0.2544, "step": 460 }, { "epoch": 0.04774966981611297, "grad_norm": 2.25, "learning_rate": 4.982228969773642e-05, "loss": 0.3449, "step": 470 }, { "epoch": 0.0487656202377324, "grad_norm": 4.9375, "learning_rate": 4.9812566108381435e-05, "loss": 0.2964, "step": 480 }, { "epoch": 0.04978157065935182, "grad_norm": 1.5703125, "learning_rate": 4.9802584539260035e-05, "loss": 0.2799, "step": 490 }, { "epoch": 0.05079752108097125, "grad_norm": 2.828125, "learning_rate": 4.979234509415199e-05, "loss": 0.3231, "step": 500 }, { "epoch": 0.05181347150259067, "grad_norm": 2.9375, "learning_rate": 4.978184787951828e-05, "loss": 0.2943, "step": 510 }, { "epoch": 0.0528294219242101, "grad_norm": 2.34375, "learning_rate": 4.977109300449992e-05, "loss": 0.2705, "step": 520 }, { "epoch": 0.05384537234582952, "grad_norm": 3.140625, "learning_rate": 4.9760080580916876e-05, "loss": 0.2998, "step": 530 }, { "epoch": 0.05486132276744895, "grad_norm": 3.5625, "learning_rate": 4.974881072326688e-05, "loss": 0.2595, "step": 540 }, { "epoch": 0.05587727318906837, "grad_norm": 4.25, "learning_rate": 4.9737283548724236e-05, "loss": 0.2803, "step": 550 }, { "epoch": 0.0568932236106878, "grad_norm": 4.0625, "learning_rate": 4.97254991771386e-05, "loss": 0.3511, "step": 560 }, { "epoch": 0.05790917403230722, "grad_norm": 2.515625, "learning_rate": 4.971345773103377e-05, "loss": 0.312, "step": 570 }, { "epoch": 0.05892512445392665, "grad_norm": 3.21875, "learning_rate": 4.9701159335606365e-05, "loss": 0.2482, "step": 580 }, { "epoch": 0.05994107487554607, "grad_norm": 5.5, "learning_rate": 4.968860411872454e-05, "loss": 0.2537, "step": 590 }, { "epoch": 0.0609570252971655, "grad_norm": 3.546875, "learning_rate": 4.967579221092666e-05, "loss": 0.3125, "step": 600 }, { "epoch": 0.06197297571878492, "grad_norm": 2.984375, "learning_rate": 4.966272374541996e-05, "loss": 0.2354, "step": 610 }, { "epoch": 0.06298892614040434, "grad_norm": 3.6875, "learning_rate": 4.964939885807912e-05, "loss": 0.3213, "step": 620 }, { "epoch": 0.06400487656202378, "grad_norm": 2.140625, "learning_rate": 4.9635817687444876e-05, "loss": 0.3003, "step": 630 }, { "epoch": 0.0650208269836432, "grad_norm": 3.484375, "learning_rate": 4.962198037472259e-05, "loss": 0.2996, "step": 640 }, { "epoch": 0.06603677740526262, "grad_norm": 3.21875, "learning_rate": 4.9607887063780776e-05, "loss": 0.2257, "step": 650 }, { "epoch": 0.06705272782688204, "grad_norm": 5.375, "learning_rate": 4.9593537901149564e-05, "loss": 0.223, "step": 660 }, { "epoch": 0.06806867824850148, "grad_norm": 4.1875, "learning_rate": 4.957893303601924e-05, "loss": 0.3407, "step": 670 }, { "epoch": 0.0690846286701209, "grad_norm": 3.328125, "learning_rate": 4.956407262023866e-05, "loss": 0.2589, "step": 680 }, { "epoch": 0.07010057909174032, "grad_norm": 2.953125, "learning_rate": 4.954895680831367e-05, "loss": 0.2949, "step": 690 }, { "epoch": 0.07111652951335974, "grad_norm": 4.0625, "learning_rate": 4.9533585757405506e-05, "loss": 0.2995, "step": 700 }, { "epoch": 0.07213247993497918, "grad_norm": 4.625, "learning_rate": 4.951795962732917e-05, "loss": 0.2894, "step": 710 }, { "epoch": 0.0731484303565986, "grad_norm": 3.0, "learning_rate": 4.9502078580551755e-05, "loss": 0.3082, "step": 720 }, { "epoch": 0.07416438077821802, "grad_norm": 3.65625, "learning_rate": 4.9485942782190734e-05, "loss": 0.2308, "step": 730 }, { "epoch": 0.07518033119983744, "grad_norm": 4.78125, "learning_rate": 4.9469552400012306e-05, "loss": 0.2272, "step": 740 }, { "epoch": 0.07619628162145688, "grad_norm": 4.25, "learning_rate": 4.94529076044296e-05, "loss": 0.2701, "step": 750 }, { "epoch": 0.0772122320430763, "grad_norm": 3.140625, "learning_rate": 4.94360085685009e-05, "loss": 0.2686, "step": 760 }, { "epoch": 0.07822818246469572, "grad_norm": 0.765625, "learning_rate": 4.9418855467927894e-05, "loss": 0.2051, "step": 770 }, { "epoch": 0.07924413288631514, "grad_norm": 1.796875, "learning_rate": 4.940144848105379e-05, "loss": 0.2267, "step": 780 }, { "epoch": 0.08026008330793458, "grad_norm": 4.5625, "learning_rate": 4.93837877888615e-05, "loss": 0.2597, "step": 790 }, { "epoch": 0.081276033729554, "grad_norm": 3.03125, "learning_rate": 4.9365873574971745e-05, "loss": 0.3701, "step": 800 }, { "epoch": 0.08229198415117342, "grad_norm": 4.5625, "learning_rate": 4.9347706025641136e-05, "loss": 0.2559, "step": 810 }, { "epoch": 0.08330793457279284, "grad_norm": 3.90625, "learning_rate": 4.9329285329760275e-05, "loss": 0.2799, "step": 820 }, { "epoch": 0.08432388499441228, "grad_norm": 3.140625, "learning_rate": 4.9310611678851735e-05, "loss": 0.2866, "step": 830 }, { "epoch": 0.0853398354160317, "grad_norm": 2.46875, "learning_rate": 4.929168526706811e-05, "loss": 0.3105, "step": 840 }, { "epoch": 0.08635578583765112, "grad_norm": 13.625, "learning_rate": 4.927250629119e-05, "loss": 0.2454, "step": 850 }, { "epoch": 0.08737173625927054, "grad_norm": 3.921875, "learning_rate": 4.9253074950623925e-05, "loss": 0.2424, "step": 860 }, { "epoch": 0.08838768668088998, "grad_norm": 2.90625, "learning_rate": 4.9233391447400286e-05, "loss": 0.2481, "step": 870 }, { "epoch": 0.0894036371025094, "grad_norm": 2.96875, "learning_rate": 4.921345598617125e-05, "loss": 0.2231, "step": 880 }, { "epoch": 0.09041958752412882, "grad_norm": 5.375, "learning_rate": 4.9193268774208654e-05, "loss": 0.3447, "step": 890 }, { "epoch": 0.09143553794574824, "grad_norm": 2.0, "learning_rate": 4.9172830021401785e-05, "loss": 0.229, "step": 900 }, { "epoch": 0.09245148836736768, "grad_norm": 3.1875, "learning_rate": 4.9152139940255245e-05, "loss": 0.2122, "step": 910 }, { "epoch": 0.0934674387889871, "grad_norm": 3.40625, "learning_rate": 4.913119874588677e-05, "loss": 0.2386, "step": 920 }, { "epoch": 0.09448338921060652, "grad_norm": 1.4609375, "learning_rate": 4.911000665602489e-05, "loss": 0.1944, "step": 930 }, { "epoch": 0.09549933963222594, "grad_norm": 5.0625, "learning_rate": 4.9088563891006786e-05, "loss": 0.2038, "step": 940 }, { "epoch": 0.09651529005384538, "grad_norm": 4.53125, "learning_rate": 4.906687067377592e-05, "loss": 0.3122, "step": 950 }, { "epoch": 0.0975312404754648, "grad_norm": 2.84375, "learning_rate": 4.904492722987976e-05, "loss": 0.3157, "step": 960 }, { "epoch": 0.09854719089708422, "grad_norm": 2.171875, "learning_rate": 4.902273378746738e-05, "loss": 0.3077, "step": 970 }, { "epoch": 0.09956314131870364, "grad_norm": 2.84375, "learning_rate": 4.9000290577287165e-05, "loss": 0.2756, "step": 980 }, { "epoch": 0.10057909174032308, "grad_norm": 0.99609375, "learning_rate": 4.897759783268434e-05, "loss": 0.2915, "step": 990 }, { "epoch": 0.1015950421619425, "grad_norm": 3.53125, "learning_rate": 4.895465578959859e-05, "loss": 0.2052, "step": 1000 }, { "epoch": 0.10261099258356192, "grad_norm": 4.0, "learning_rate": 4.893146468656159e-05, "loss": 0.2499, "step": 1010 }, { "epoch": 0.10362694300518134, "grad_norm": 1.65625, "learning_rate": 4.890802476469452e-05, "loss": 0.278, "step": 1020 }, { "epoch": 0.10464289342680078, "grad_norm": 3.625, "learning_rate": 4.888433626770558e-05, "loss": 0.2143, "step": 1030 }, { "epoch": 0.1056588438484202, "grad_norm": 5.0625, "learning_rate": 4.886039944188741e-05, "loss": 0.2878, "step": 1040 }, { "epoch": 0.10667479427003962, "grad_norm": 4.5, "learning_rate": 4.883621453611461e-05, "loss": 0.2744, "step": 1050 }, { "epoch": 0.10769074469165904, "grad_norm": 4.5625, "learning_rate": 4.881178180184106e-05, "loss": 0.2734, "step": 1060 }, { "epoch": 0.10870669511327848, "grad_norm": 3.125, "learning_rate": 4.878710149309735e-05, "loss": 0.3574, "step": 1070 }, { "epoch": 0.1097226455348979, "grad_norm": 3.0625, "learning_rate": 4.876217386648816e-05, "loss": 0.2625, "step": 1080 }, { "epoch": 0.11073859595651732, "grad_norm": 4.0625, "learning_rate": 4.873699918118955e-05, "loss": 0.2437, "step": 1090 }, { "epoch": 0.11175454637813674, "grad_norm": 1.59375, "learning_rate": 4.87115776989463e-05, "loss": 0.2051, "step": 1100 }, { "epoch": 0.11277049679975618, "grad_norm": 4.375, "learning_rate": 4.8685909684069153e-05, "loss": 0.1727, "step": 1110 }, { "epoch": 0.1137864472213756, "grad_norm": 2.28125, "learning_rate": 4.865999540343211e-05, "loss": 0.2256, "step": 1120 }, { "epoch": 0.11480239764299502, "grad_norm": 2.265625, "learning_rate": 4.86338351264696e-05, "loss": 0.3529, "step": 1130 }, { "epoch": 0.11581834806461444, "grad_norm": 2.34375, "learning_rate": 4.8607429125173754e-05, "loss": 0.2113, "step": 1140 }, { "epoch": 0.11683429848623388, "grad_norm": 0.7578125, "learning_rate": 4.858077767409149e-05, "loss": 0.2759, "step": 1150 }, { "epoch": 0.1178502489078533, "grad_norm": 3.640625, "learning_rate": 4.855388105032174e-05, "loss": 0.2482, "step": 1160 }, { "epoch": 0.11886619932947272, "grad_norm": 3.5, "learning_rate": 4.852673953351249e-05, "loss": 0.1865, "step": 1170 }, { "epoch": 0.11988214975109214, "grad_norm": 3.75, "learning_rate": 4.849935340585796e-05, "loss": 0.2659, "step": 1180 }, { "epoch": 0.12089810017271158, "grad_norm": 3.375, "learning_rate": 4.8471722952095586e-05, "loss": 0.1506, "step": 1190 }, { "epoch": 0.121914050594331, "grad_norm": 3.34375, "learning_rate": 4.844384845950312e-05, "loss": 0.307, "step": 1200 }, { "epoch": 0.12293000101595042, "grad_norm": 1.578125, "learning_rate": 4.841573021789561e-05, "loss": 0.1952, "step": 1210 }, { "epoch": 0.12394595143756984, "grad_norm": 1.2890625, "learning_rate": 4.838736851962239e-05, "loss": 0.1779, "step": 1220 }, { "epoch": 0.12496190185918928, "grad_norm": 1.265625, "learning_rate": 4.835876365956408e-05, "loss": 0.1235, "step": 1230 }, { "epoch": 0.12597785228080868, "grad_norm": 1.9609375, "learning_rate": 4.8329915935129436e-05, "loss": 0.1876, "step": 1240 }, { "epoch": 0.12699380270242813, "grad_norm": 1.6328125, "learning_rate": 4.830082564625235e-05, "loss": 0.2188, "step": 1250 }, { "epoch": 0.12800975312404755, "grad_norm": 3.96875, "learning_rate": 4.8271493095388684e-05, "loss": 0.2622, "step": 1260 }, { "epoch": 0.12902570354566698, "grad_norm": 3.765625, "learning_rate": 4.824191858751312e-05, "loss": 0.2724, "step": 1270 }, { "epoch": 0.1300416539672864, "grad_norm": 5.59375, "learning_rate": 4.821210243011601e-05, "loss": 0.2413, "step": 1280 }, { "epoch": 0.13105760438890582, "grad_norm": 3.34375, "learning_rate": 4.818204493320016e-05, "loss": 0.2618, "step": 1290 }, { "epoch": 0.13207355481052524, "grad_norm": 2.78125, "learning_rate": 4.8151746409277634e-05, "loss": 0.2295, "step": 1300 }, { "epoch": 0.13308950523214466, "grad_norm": 3.1875, "learning_rate": 4.8121207173366484e-05, "loss": 0.2733, "step": 1310 }, { "epoch": 0.13410545565376408, "grad_norm": 2.28125, "learning_rate": 4.809042754298746e-05, "loss": 0.2311, "step": 1320 }, { "epoch": 0.13512140607538353, "grad_norm": 2.171875, "learning_rate": 4.805940783816075e-05, "loss": 0.2059, "step": 1330 }, { "epoch": 0.13613735649700295, "grad_norm": 2.796875, "learning_rate": 4.8028148381402625e-05, "loss": 0.2102, "step": 1340 }, { "epoch": 0.13715330691862238, "grad_norm": 2.96875, "learning_rate": 4.7996649497722084e-05, "loss": 0.2708, "step": 1350 }, { "epoch": 0.1381692573402418, "grad_norm": 2.4375, "learning_rate": 4.7964911514617485e-05, "loss": 0.2429, "step": 1360 }, { "epoch": 0.13918520776186122, "grad_norm": 5.8125, "learning_rate": 4.793293476207312e-05, "loss": 0.2725, "step": 1370 }, { "epoch": 0.14020115818348064, "grad_norm": 2.40625, "learning_rate": 4.790071957255585e-05, "loss": 0.2098, "step": 1380 }, { "epoch": 0.14121710860510006, "grad_norm": 4.25, "learning_rate": 4.786826628101154e-05, "loss": 0.2101, "step": 1390 }, { "epoch": 0.14223305902671948, "grad_norm": 2.578125, "learning_rate": 4.783557522486167e-05, "loss": 0.2624, "step": 1400 }, { "epoch": 0.14324900944833893, "grad_norm": 3.125, "learning_rate": 4.780264674399978e-05, "loss": 0.2518, "step": 1410 }, { "epoch": 0.14426495986995835, "grad_norm": 3.671875, "learning_rate": 4.7769481180787966e-05, "loss": 0.3112, "step": 1420 }, { "epoch": 0.14528091029157778, "grad_norm": 3.984375, "learning_rate": 4.773607888005327e-05, "loss": 0.2747, "step": 1430 }, { "epoch": 0.1462968607131972, "grad_norm": 3.234375, "learning_rate": 4.770244018908416e-05, "loss": 0.1572, "step": 1440 }, { "epoch": 0.14731281113481662, "grad_norm": 4.09375, "learning_rate": 4.766856545762687e-05, "loss": 0.2148, "step": 1450 }, { "epoch": 0.14832876155643604, "grad_norm": 1.6875, "learning_rate": 4.763445503788178e-05, "loss": 0.2531, "step": 1460 }, { "epoch": 0.14934471197805546, "grad_norm": 2.375, "learning_rate": 4.760010928449976e-05, "loss": 0.199, "step": 1470 }, { "epoch": 0.15036066239967488, "grad_norm": 4.6875, "learning_rate": 4.7565528554578485e-05, "loss": 0.2366, "step": 1480 }, { "epoch": 0.15137661282129433, "grad_norm": 5.4375, "learning_rate": 4.75307132076587e-05, "loss": 0.1862, "step": 1490 }, { "epoch": 0.15239256324291375, "grad_norm": 2.484375, "learning_rate": 4.749566360572049e-05, "loss": 0.2143, "step": 1500 }, { "epoch": 0.15340851366453317, "grad_norm": 2.1875, "learning_rate": 4.746038011317955e-05, "loss": 0.1877, "step": 1510 }, { "epoch": 0.1544244640861526, "grad_norm": 2.84375, "learning_rate": 4.742486309688333e-05, "loss": 0.2831, "step": 1520 }, { "epoch": 0.15544041450777202, "grad_norm": 2.015625, "learning_rate": 4.738911292610732e-05, "loss": 0.1708, "step": 1530 }, { "epoch": 0.15645636492939144, "grad_norm": 3.953125, "learning_rate": 4.735312997255107e-05, "loss": 0.192, "step": 1540 }, { "epoch": 0.15747231535101086, "grad_norm": 2.09375, "learning_rate": 4.7316914610334475e-05, "loss": 0.2586, "step": 1550 }, { "epoch": 0.15848826577263028, "grad_norm": 3.6875, "learning_rate": 4.728046721599378e-05, "loss": 0.2141, "step": 1560 }, { "epoch": 0.15950421619424973, "grad_norm": 2.9375, "learning_rate": 4.724378816847771e-05, "loss": 0.193, "step": 1570 }, { "epoch": 0.16052016661586915, "grad_norm": 1.5625, "learning_rate": 4.720687784914352e-05, "loss": 0.191, "step": 1580 }, { "epoch": 0.16153611703748857, "grad_norm": 3.75, "learning_rate": 4.716973664175304e-05, "loss": 0.2172, "step": 1590 }, { "epoch": 0.162552067459108, "grad_norm": 3.125, "learning_rate": 4.7132364932468645e-05, "loss": 0.2134, "step": 1600 }, { "epoch": 0.16356801788072742, "grad_norm": 4.09375, "learning_rate": 4.709476310984932e-05, "loss": 0.2055, "step": 1610 }, { "epoch": 0.16458396830234684, "grad_norm": 3.875, "learning_rate": 4.705693156484652e-05, "loss": 0.2136, "step": 1620 }, { "epoch": 0.16559991872396626, "grad_norm": 1.1796875, "learning_rate": 4.7018870690800196e-05, "loss": 0.1471, "step": 1630 }, { "epoch": 0.16661586914558568, "grad_norm": 2.5, "learning_rate": 4.698058088343465e-05, "loss": 0.2308, "step": 1640 }, { "epoch": 0.16763181956720513, "grad_norm": 1.390625, "learning_rate": 4.6942062540854425e-05, "loss": 0.2456, "step": 1650 }, { "epoch": 0.16864776998882455, "grad_norm": 3.125, "learning_rate": 4.69033160635402e-05, "loss": 0.2654, "step": 1660 }, { "epoch": 0.16966372041044397, "grad_norm": 3.984375, "learning_rate": 4.6864341854344587e-05, "loss": 0.2226, "step": 1670 }, { "epoch": 0.1706796708320634, "grad_norm": 2.328125, "learning_rate": 4.682514031848795e-05, "loss": 0.2438, "step": 1680 }, { "epoch": 0.17169562125368282, "grad_norm": 3.078125, "learning_rate": 4.678571186355423e-05, "loss": 0.1889, "step": 1690 }, { "epoch": 0.17271157167530224, "grad_norm": 3.328125, "learning_rate": 4.6746056899486644e-05, "loss": 0.2117, "step": 1700 }, { "epoch": 0.17372752209692166, "grad_norm": 2.78125, "learning_rate": 4.67061758385835e-05, "loss": 0.1953, "step": 1710 }, { "epoch": 0.17474347251854108, "grad_norm": 3.09375, "learning_rate": 4.6666069095493816e-05, "loss": 0.1844, "step": 1720 }, { "epoch": 0.17575942294016053, "grad_norm": 3.234375, "learning_rate": 4.662573708721309e-05, "loss": 0.2774, "step": 1730 }, { "epoch": 0.17677537336177995, "grad_norm": 4.03125, "learning_rate": 4.658518023307894e-05, "loss": 0.2527, "step": 1740 }, { "epoch": 0.17779132378339937, "grad_norm": 3.21875, "learning_rate": 4.654439895476671e-05, "loss": 0.2164, "step": 1750 }, { "epoch": 0.1788072742050188, "grad_norm": 2.390625, "learning_rate": 4.6503393676285146e-05, "loss": 0.2424, "step": 1760 }, { "epoch": 0.17982322462663822, "grad_norm": 1.8359375, "learning_rate": 4.646216482397192e-05, "loss": 0.2428, "step": 1770 }, { "epoch": 0.18083917504825764, "grad_norm": 2.796875, "learning_rate": 4.6420712826489275e-05, "loss": 0.2155, "step": 1780 }, { "epoch": 0.18185512546987706, "grad_norm": 0.69921875, "learning_rate": 4.6379038114819485e-05, "loss": 0.1544, "step": 1790 }, { "epoch": 0.18287107589149648, "grad_norm": 3.40625, "learning_rate": 4.6337141122260444e-05, "loss": 0.2029, "step": 1800 }, { "epoch": 0.18388702631311593, "grad_norm": 2.359375, "learning_rate": 4.629502228442112e-05, "loss": 0.1489, "step": 1810 }, { "epoch": 0.18490297673473535, "grad_norm": 1.4453125, "learning_rate": 4.6252682039217045e-05, "loss": 0.2101, "step": 1820 }, { "epoch": 0.18591892715635477, "grad_norm": 2.71875, "learning_rate": 4.621012082686573e-05, "loss": 0.2076, "step": 1830 }, { "epoch": 0.1869348775779742, "grad_norm": 3.0625, "learning_rate": 4.616733908988216e-05, "loss": 0.2719, "step": 1840 }, { "epoch": 0.18795082799959362, "grad_norm": 1.953125, "learning_rate": 4.612433727307409e-05, "loss": 0.2105, "step": 1850 }, { "epoch": 0.18896677842121304, "grad_norm": 3.46875, "learning_rate": 4.608111582353751e-05, "loss": 0.1877, "step": 1860 }, { "epoch": 0.18998272884283246, "grad_norm": 2.546875, "learning_rate": 4.603767519065197e-05, "loss": 0.2238, "step": 1870 }, { "epoch": 0.19099867926445188, "grad_norm": 1.5703125, "learning_rate": 4.599401582607589e-05, "loss": 0.243, "step": 1880 }, { "epoch": 0.19201462968607133, "grad_norm": 2.4375, "learning_rate": 4.595013818374185e-05, "loss": 0.1867, "step": 1890 }, { "epoch": 0.19303058010769075, "grad_norm": 2.203125, "learning_rate": 4.5906042719851925e-05, "loss": 0.1994, "step": 1900 }, { "epoch": 0.19404653052931017, "grad_norm": 3.984375, "learning_rate": 4.586172989287291e-05, "loss": 0.1899, "step": 1910 }, { "epoch": 0.1950624809509296, "grad_norm": 2.6875, "learning_rate": 4.5817200163531534e-05, "loss": 0.2528, "step": 1920 }, { "epoch": 0.19607843137254902, "grad_norm": 2.71875, "learning_rate": 4.577245399480972e-05, "loss": 0.2336, "step": 1930 }, { "epoch": 0.19709438179416844, "grad_norm": 2.640625, "learning_rate": 4.5727491851939715e-05, "loss": 0.2204, "step": 1940 }, { "epoch": 0.19811033221578786, "grad_norm": 1.78125, "learning_rate": 4.568231420239929e-05, "loss": 0.1656, "step": 1950 }, { "epoch": 0.19912628263740728, "grad_norm": 3.15625, "learning_rate": 4.563692151590687e-05, "loss": 0.2105, "step": 1960 }, { "epoch": 0.20014223305902673, "grad_norm": 1.3671875, "learning_rate": 4.5591314264416666e-05, "loss": 0.1464, "step": 1970 }, { "epoch": 0.20115818348064615, "grad_norm": 4.25, "learning_rate": 4.554549292211371e-05, "loss": 0.2103, "step": 1980 }, { "epoch": 0.20217413390226557, "grad_norm": 2.984375, "learning_rate": 4.549945796540901e-05, "loss": 0.144, "step": 1990 }, { "epoch": 0.203190084323885, "grad_norm": 1.859375, "learning_rate": 4.545320987293453e-05, "loss": 0.1963, "step": 2000 }, { "epoch": 0.20420603474550442, "grad_norm": 1.078125, "learning_rate": 4.540674912553824e-05, "loss": 0.2115, "step": 2010 }, { "epoch": 0.20522198516712384, "grad_norm": 4.25, "learning_rate": 4.536007620627911e-05, "loss": 0.1682, "step": 2020 }, { "epoch": 0.20623793558874326, "grad_norm": 2.71875, "learning_rate": 4.531319160042212e-05, "loss": 0.1992, "step": 2030 }, { "epoch": 0.20725388601036268, "grad_norm": 1.2890625, "learning_rate": 4.5266095795433126e-05, "loss": 0.1134, "step": 2040 }, { "epoch": 0.20826983643198213, "grad_norm": 3.296875, "learning_rate": 4.5218789280973925e-05, "loss": 0.1474, "step": 2050 }, { "epoch": 0.20928578685360155, "grad_norm": 1.9375, "learning_rate": 4.5171272548897024e-05, "loss": 0.1955, "step": 2060 }, { "epoch": 0.21030173727522097, "grad_norm": 2.734375, "learning_rate": 4.512354609324063e-05, "loss": 0.2042, "step": 2070 }, { "epoch": 0.2113176876968404, "grad_norm": 2.921875, "learning_rate": 4.507561041022347e-05, "loss": 0.2174, "step": 2080 }, { "epoch": 0.21233363811845982, "grad_norm": 2.40625, "learning_rate": 4.502746599823963e-05, "loss": 0.2634, "step": 2090 }, { "epoch": 0.21334958854007924, "grad_norm": 1.71875, "learning_rate": 4.497911335785339e-05, "loss": 0.1884, "step": 2100 }, { "epoch": 0.21436553896169866, "grad_norm": 0.79296875, "learning_rate": 4.4930552991794e-05, "loss": 0.1872, "step": 2110 }, { "epoch": 0.21538148938331808, "grad_norm": 3.171875, "learning_rate": 4.4881785404950474e-05, "loss": 0.2233, "step": 2120 }, { "epoch": 0.21639743980493753, "grad_norm": 2.59375, "learning_rate": 4.483281110436631e-05, "loss": 0.2374, "step": 2130 }, { "epoch": 0.21741339022655695, "grad_norm": 3.328125, "learning_rate": 4.478363059923426e-05, "loss": 0.2545, "step": 2140 }, { "epoch": 0.21842934064817637, "grad_norm": 2.3125, "learning_rate": 4.4734244400891014e-05, "loss": 0.2063, "step": 2150 }, { "epoch": 0.2194452910697958, "grad_norm": 3.40625, "learning_rate": 4.4684653022811865e-05, "loss": 0.1219, "step": 2160 }, { "epoch": 0.22046124149141522, "grad_norm": 4.1875, "learning_rate": 4.463485698060541e-05, "loss": 0.2805, "step": 2170 }, { "epoch": 0.22147719191303464, "grad_norm": 2.3125, "learning_rate": 4.458485679200814e-05, "loss": 0.1998, "step": 2180 }, { "epoch": 0.22249314233465406, "grad_norm": 3.578125, "learning_rate": 4.453465297687912e-05, "loss": 0.2489, "step": 2190 }, { "epoch": 0.22350909275627348, "grad_norm": 2.59375, "learning_rate": 4.448424605719452e-05, "loss": 0.2731, "step": 2200 }, { "epoch": 0.22452504317789293, "grad_norm": 3.28125, "learning_rate": 4.443363655704224e-05, "loss": 0.2425, "step": 2210 }, { "epoch": 0.22554099359951235, "grad_norm": 2.78125, "learning_rate": 4.438282500261641e-05, "loss": 0.2938, "step": 2220 }, { "epoch": 0.22655694402113177, "grad_norm": 1.1953125, "learning_rate": 4.433181192221197e-05, "loss": 0.1728, "step": 2230 }, { "epoch": 0.2275728944427512, "grad_norm": 1.34375, "learning_rate": 4.4280597846219155e-05, "loss": 0.216, "step": 2240 }, { "epoch": 0.22858884486437062, "grad_norm": 1.8515625, "learning_rate": 4.422918330711796e-05, "loss": 0.1612, "step": 2250 }, { "epoch": 0.22960479528599004, "grad_norm": 1.90625, "learning_rate": 4.417756883947263e-05, "loss": 0.107, "step": 2260 }, { "epoch": 0.23062074570760946, "grad_norm": 3.375, "learning_rate": 4.412575497992611e-05, "loss": 0.1756, "step": 2270 }, { "epoch": 0.23163669612922888, "grad_norm": 4.375, "learning_rate": 4.407374226719445e-05, "loss": 0.234, "step": 2280 }, { "epoch": 0.23265264655084833, "grad_norm": 3.25, "learning_rate": 4.402153124206119e-05, "loss": 0.2144, "step": 2290 }, { "epoch": 0.23366859697246775, "grad_norm": 1.703125, "learning_rate": 4.396912244737173e-05, "loss": 0.1696, "step": 2300 }, { "epoch": 0.23468454739408717, "grad_norm": 2.84375, "learning_rate": 4.391651642802778e-05, "loss": 0.2506, "step": 2310 }, { "epoch": 0.2357004978157066, "grad_norm": 4.5, "learning_rate": 4.386371373098155e-05, "loss": 0.1686, "step": 2320 }, { "epoch": 0.23671644823732602, "grad_norm": 2.515625, "learning_rate": 4.381071490523018e-05, "loss": 0.2403, "step": 2330 }, { "epoch": 0.23773239865894544, "grad_norm": 4.4375, "learning_rate": 4.3757520501809955e-05, "loss": 0.1611, "step": 2340 }, { "epoch": 0.23874834908056486, "grad_norm": 1.609375, "learning_rate": 4.370413107379065e-05, "loss": 0.1698, "step": 2350 }, { "epoch": 0.23976429950218428, "grad_norm": 4.96875, "learning_rate": 4.36505471762697e-05, "loss": 0.1928, "step": 2360 }, { "epoch": 0.24078024992380373, "grad_norm": 0.8984375, "learning_rate": 4.3596769366366474e-05, "loss": 0.2035, "step": 2370 }, { "epoch": 0.24179620034542315, "grad_norm": 5.75, "learning_rate": 4.354279820321649e-05, "loss": 0.16, "step": 2380 }, { "epoch": 0.24281215076704257, "grad_norm": 1.9453125, "learning_rate": 4.34886342479656e-05, "loss": 0.1851, "step": 2390 }, { "epoch": 0.243828101188662, "grad_norm": 1.015625, "learning_rate": 4.34342780637641e-05, "loss": 0.1726, "step": 2400 }, { "epoch": 0.24484405161028142, "grad_norm": 4.59375, "learning_rate": 4.337973021576095e-05, "loss": 0.2847, "step": 2410 }, { "epoch": 0.24586000203190084, "grad_norm": 1.03125, "learning_rate": 4.3324991271097846e-05, "loss": 0.2528, "step": 2420 }, { "epoch": 0.24687595245352026, "grad_norm": 2.1875, "learning_rate": 4.3270061798903374e-05, "loss": 0.1573, "step": 2430 }, { "epoch": 0.24789190287513968, "grad_norm": 0.98046875, "learning_rate": 4.321494237028701e-05, "loss": 0.1703, "step": 2440 }, { "epoch": 0.24890785329675913, "grad_norm": 3.8125, "learning_rate": 4.31596335583333e-05, "loss": 0.2613, "step": 2450 }, { "epoch": 0.24992380371837855, "grad_norm": 4.0625, "learning_rate": 4.310413593809579e-05, "loss": 0.22, "step": 2460 }, { "epoch": 0.250939754139998, "grad_norm": 3.15625, "learning_rate": 4.304845008659108e-05, "loss": 0.1263, "step": 2470 }, { "epoch": 0.25195570456161737, "grad_norm": 3.046875, "learning_rate": 4.2992576582792895e-05, "loss": 0.1639, "step": 2480 }, { "epoch": 0.2529716549832368, "grad_norm": 9.8125, "learning_rate": 4.293651600762595e-05, "loss": 0.2681, "step": 2490 }, { "epoch": 0.25398760540485626, "grad_norm": 3.734375, "learning_rate": 4.288026894395999e-05, "loss": 0.2292, "step": 2500 }, { "epoch": 0.25500355582647566, "grad_norm": 0.455078125, "learning_rate": 4.2823835976603723e-05, "loss": 0.2324, "step": 2510 }, { "epoch": 0.2560195062480951, "grad_norm": 5.625, "learning_rate": 4.276721769229869e-05, "loss": 0.1834, "step": 2520 }, { "epoch": 0.2570354566697145, "grad_norm": 1.3671875, "learning_rate": 4.271041467971323e-05, "loss": 0.1826, "step": 2530 }, { "epoch": 0.25805140709133395, "grad_norm": 5.0625, "learning_rate": 4.265342752943632e-05, "loss": 0.2463, "step": 2540 }, { "epoch": 0.25906735751295334, "grad_norm": 2.859375, "learning_rate": 4.2596256833971425e-05, "loss": 0.2598, "step": 2550 }, { "epoch": 0.2600833079345728, "grad_norm": 1.8515625, "learning_rate": 4.2538903187730374e-05, "loss": 0.1148, "step": 2560 }, { "epoch": 0.26109925835619224, "grad_norm": 2.71875, "learning_rate": 4.248136718702716e-05, "loss": 0.2123, "step": 2570 }, { "epoch": 0.26211520877781164, "grad_norm": 4.5625, "learning_rate": 4.242364943007172e-05, "loss": 0.2369, "step": 2580 }, { "epoch": 0.2631311591994311, "grad_norm": 2.296875, "learning_rate": 4.236575051696377e-05, "loss": 0.261, "step": 2590 }, { "epoch": 0.2641471096210505, "grad_norm": 2.75, "learning_rate": 4.2307671049686514e-05, "loss": 0.1564, "step": 2600 }, { "epoch": 0.26516306004266993, "grad_norm": 3.5, "learning_rate": 4.2249411632100396e-05, "loss": 0.1563, "step": 2610 }, { "epoch": 0.2661790104642893, "grad_norm": 2.84375, "learning_rate": 4.219097286993684e-05, "loss": 0.1697, "step": 2620 }, { "epoch": 0.26719496088590877, "grad_norm": 2.125, "learning_rate": 4.2132355370791946e-05, "loss": 0.1844, "step": 2630 }, { "epoch": 0.26821091130752817, "grad_norm": 4.03125, "learning_rate": 4.2073559744120156e-05, "loss": 0.2144, "step": 2640 }, { "epoch": 0.2692268617291476, "grad_norm": 2.375, "learning_rate": 4.201458660122793e-05, "loss": 0.2013, "step": 2650 }, { "epoch": 0.27024281215076706, "grad_norm": 3.625, "learning_rate": 4.1955436555267393e-05, "loss": 0.2166, "step": 2660 }, { "epoch": 0.27125876257238646, "grad_norm": 0.328125, "learning_rate": 4.189611022122997e-05, "loss": 0.1934, "step": 2670 }, { "epoch": 0.2722747129940059, "grad_norm": 2.75, "learning_rate": 4.1836608215939944e-05, "loss": 0.2157, "step": 2680 }, { "epoch": 0.2732906634156253, "grad_norm": 3.5, "learning_rate": 4.17769311580481e-05, "loss": 0.18, "step": 2690 }, { "epoch": 0.27430661383724475, "grad_norm": 2.109375, "learning_rate": 4.171707966802528e-05, "loss": 0.2178, "step": 2700 }, { "epoch": 0.27532256425886414, "grad_norm": 4.65625, "learning_rate": 4.16570543681559e-05, "loss": 0.1896, "step": 2710 }, { "epoch": 0.2763385146804836, "grad_norm": 4.8125, "learning_rate": 4.159685588253151e-05, "loss": 0.1322, "step": 2720 }, { "epoch": 0.27735446510210304, "grad_norm": 3.9375, "learning_rate": 4.153648483704429e-05, "loss": 0.184, "step": 2730 }, { "epoch": 0.27837041552372244, "grad_norm": 4.53125, "learning_rate": 4.147594185938057e-05, "loss": 0.2451, "step": 2740 }, { "epoch": 0.2793863659453419, "grad_norm": 1.0390625, "learning_rate": 4.141522757901426e-05, "loss": 0.2367, "step": 2750 }, { "epoch": 0.2804023163669613, "grad_norm": 3.375, "learning_rate": 4.1354342627200345e-05, "loss": 0.179, "step": 2760 }, { "epoch": 0.28141826678858073, "grad_norm": 2.953125, "learning_rate": 4.1293287636968286e-05, "loss": 0.1396, "step": 2770 }, { "epoch": 0.2824342172102001, "grad_norm": 2.546875, "learning_rate": 4.1232063243115485e-05, "loss": 0.1963, "step": 2780 }, { "epoch": 0.28345016763181957, "grad_norm": 5.09375, "learning_rate": 4.117067008220063e-05, "loss": 0.2457, "step": 2790 }, { "epoch": 0.28446611805343897, "grad_norm": 2.046875, "learning_rate": 4.110910879253712e-05, "loss": 0.2262, "step": 2800 }, { "epoch": 0.2854820684750584, "grad_norm": 2.1875, "learning_rate": 4.104738001418641e-05, "loss": 0.2499, "step": 2810 }, { "epoch": 0.28649801889667786, "grad_norm": 2.59375, "learning_rate": 4.098548438895135e-05, "loss": 0.1667, "step": 2820 }, { "epoch": 0.28751396931829726, "grad_norm": 2.875, "learning_rate": 4.092342256036954e-05, "loss": 0.2288, "step": 2830 }, { "epoch": 0.2885299197399167, "grad_norm": 3.015625, "learning_rate": 4.086119517370659e-05, "loss": 0.2038, "step": 2840 }, { "epoch": 0.2895458701615361, "grad_norm": 3.53125, "learning_rate": 4.0798802875949485e-05, "loss": 0.181, "step": 2850 }, { "epoch": 0.29056182058315555, "grad_norm": 2.296875, "learning_rate": 4.073624631579975e-05, "loss": 0.1886, "step": 2860 }, { "epoch": 0.29157777100477494, "grad_norm": 3.609375, "learning_rate": 4.067352614366685e-05, "loss": 0.2053, "step": 2870 }, { "epoch": 0.2925937214263944, "grad_norm": 2.328125, "learning_rate": 4.061064301166128e-05, "loss": 0.1409, "step": 2880 }, { "epoch": 0.29360967184801384, "grad_norm": 4.9375, "learning_rate": 4.054759757358787e-05, "loss": 0.184, "step": 2890 }, { "epoch": 0.29462562226963324, "grad_norm": 4.6875, "learning_rate": 4.048439048493898e-05, "loss": 0.2306, "step": 2900 }, { "epoch": 0.2956415726912527, "grad_norm": 4.09375, "learning_rate": 4.0421022402887676e-05, "loss": 0.1914, "step": 2910 }, { "epoch": 0.2966575231128721, "grad_norm": 2.3125, "learning_rate": 4.035749398628088e-05, "loss": 0.1653, "step": 2920 }, { "epoch": 0.29767347353449153, "grad_norm": 2.515625, "learning_rate": 4.029380589563256e-05, "loss": 0.1941, "step": 2930 }, { "epoch": 0.2986894239561109, "grad_norm": 1.78125, "learning_rate": 4.02299587931168e-05, "loss": 0.1117, "step": 2940 }, { "epoch": 0.29970537437773037, "grad_norm": 0.8359375, "learning_rate": 4.0165953342560974e-05, "loss": 0.1605, "step": 2950 }, { "epoch": 0.30072132479934977, "grad_norm": 3.046875, "learning_rate": 4.010179020943884e-05, "loss": 0.1726, "step": 2960 }, { "epoch": 0.3017372752209692, "grad_norm": 3.453125, "learning_rate": 4.003747006086357e-05, "loss": 0.2208, "step": 2970 }, { "epoch": 0.30275322564258866, "grad_norm": 2.515625, "learning_rate": 3.9972993565580866e-05, "loss": 0.1325, "step": 2980 }, { "epoch": 0.30376917606420806, "grad_norm": 3.046875, "learning_rate": 3.9908361393962e-05, "loss": 0.2014, "step": 2990 }, { "epoch": 0.3047851264858275, "grad_norm": 2.28125, "learning_rate": 3.984357421799681e-05, "loss": 0.165, "step": 3000 }, { "epoch": 0.3058010769074469, "grad_norm": 5.09375, "learning_rate": 3.9778632711286756e-05, "loss": 0.212, "step": 3010 }, { "epoch": 0.30681702732906635, "grad_norm": 4.25, "learning_rate": 3.971353754903788e-05, "loss": 0.2388, "step": 3020 }, { "epoch": 0.30783297775068574, "grad_norm": 2.34375, "learning_rate": 3.964828940805381e-05, "loss": 0.2175, "step": 3030 }, { "epoch": 0.3088489281723052, "grad_norm": 4.09375, "learning_rate": 3.95828889667287e-05, "loss": 0.2088, "step": 3040 }, { "epoch": 0.30986487859392464, "grad_norm": 2.359375, "learning_rate": 3.9517336905040244e-05, "loss": 0.1913, "step": 3050 }, { "epoch": 0.31088082901554404, "grad_norm": 1.1640625, "learning_rate": 3.9451633904542483e-05, "loss": 0.2185, "step": 3060 }, { "epoch": 0.3118967794371635, "grad_norm": 2.59375, "learning_rate": 3.9385780648358846e-05, "loss": 0.2072, "step": 3070 }, { "epoch": 0.3129127298587829, "grad_norm": 3.015625, "learning_rate": 3.9319777821174955e-05, "loss": 0.1902, "step": 3080 }, { "epoch": 0.31392868028040233, "grad_norm": 2.375, "learning_rate": 3.925362610923158e-05, "loss": 0.259, "step": 3090 }, { "epoch": 0.3149446307020217, "grad_norm": 4.65625, "learning_rate": 3.918732620031742e-05, "loss": 0.2026, "step": 3100 }, { "epoch": 0.31596058112364117, "grad_norm": 2.1875, "learning_rate": 3.912087878376205e-05, "loss": 0.1478, "step": 3110 }, { "epoch": 0.31697653154526056, "grad_norm": 2.34375, "learning_rate": 3.905428455042865e-05, "loss": 0.167, "step": 3120 }, { "epoch": 0.31799248196688, "grad_norm": 2.390625, "learning_rate": 3.898754419270693e-05, "loss": 0.1629, "step": 3130 }, { "epoch": 0.31900843238849946, "grad_norm": 1.546875, "learning_rate": 3.892065840450583e-05, "loss": 0.1308, "step": 3140 }, { "epoch": 0.32002438281011886, "grad_norm": 4.625, "learning_rate": 3.885362788124637e-05, "loss": 0.2008, "step": 3150 }, { "epoch": 0.3210403332317383, "grad_norm": 3.8125, "learning_rate": 3.8786453319854396e-05, "loss": 0.2225, "step": 3160 }, { "epoch": 0.3220562836533577, "grad_norm": 3.015625, "learning_rate": 3.8719135418753366e-05, "loss": 0.2243, "step": 3170 }, { "epoch": 0.32307223407497715, "grad_norm": 5.6875, "learning_rate": 3.865167487785702e-05, "loss": 0.1981, "step": 3180 }, { "epoch": 0.32408818449659654, "grad_norm": 4.84375, "learning_rate": 3.8584072398562164e-05, "loss": 0.2031, "step": 3190 }, { "epoch": 0.325104134918216, "grad_norm": 4.0625, "learning_rate": 3.851632868374136e-05, "loss": 0.1621, "step": 3200 }, { "epoch": 0.32612008533983544, "grad_norm": 3.421875, "learning_rate": 3.844844443773562e-05, "loss": 0.1674, "step": 3210 }, { "epoch": 0.32713603576145484, "grad_norm": 1.3671875, "learning_rate": 3.8380420366347046e-05, "loss": 0.1502, "step": 3220 }, { "epoch": 0.3281519861830743, "grad_norm": 3.734375, "learning_rate": 3.831225717683157e-05, "loss": 0.1868, "step": 3230 }, { "epoch": 0.3291679366046937, "grad_norm": 2.703125, "learning_rate": 3.8243955577891534e-05, "loss": 0.1818, "step": 3240 }, { "epoch": 0.3301838870263131, "grad_norm": 3.796875, "learning_rate": 3.8175516279668335e-05, "loss": 0.2215, "step": 3250 }, { "epoch": 0.3311998374479325, "grad_norm": 3.203125, "learning_rate": 3.810693999373505e-05, "loss": 0.2544, "step": 3260 }, { "epoch": 0.33221578786955197, "grad_norm": 4.0, "learning_rate": 3.8038227433089056e-05, "loss": 0.1175, "step": 3270 }, { "epoch": 0.33323173829117136, "grad_norm": 3.625, "learning_rate": 3.796937931214458e-05, "loss": 0.2213, "step": 3280 }, { "epoch": 0.3342476887127908, "grad_norm": 1.7265625, "learning_rate": 3.7900396346725296e-05, "loss": 0.1711, "step": 3290 }, { "epoch": 0.33526363913441026, "grad_norm": 3.140625, "learning_rate": 3.783127925405686e-05, "loss": 0.2628, "step": 3300 }, { "epoch": 0.33627958955602966, "grad_norm": 2.1875, "learning_rate": 3.77620287527595e-05, "loss": 0.1671, "step": 3310 }, { "epoch": 0.3372955399776491, "grad_norm": 5.28125, "learning_rate": 3.769264556284048e-05, "loss": 0.2109, "step": 3320 }, { "epoch": 0.3383114903992685, "grad_norm": 2.875, "learning_rate": 3.762313040568665e-05, "loss": 0.1978, "step": 3330 }, { "epoch": 0.33932744082088795, "grad_norm": 2.234375, "learning_rate": 3.755348400405697e-05, "loss": 0.1275, "step": 3340 }, { "epoch": 0.34034339124250734, "grad_norm": 1.9453125, "learning_rate": 3.7483707082074945e-05, "loss": 0.1482, "step": 3350 }, { "epoch": 0.3413593416641268, "grad_norm": 5.40625, "learning_rate": 3.741380036522111e-05, "loss": 0.1933, "step": 3360 }, { "epoch": 0.34237529208574624, "grad_norm": 4.53125, "learning_rate": 3.734376458032551e-05, "loss": 0.1925, "step": 3370 }, { "epoch": 0.34339124250736563, "grad_norm": 4.0625, "learning_rate": 3.727360045556014e-05, "loss": 0.2297, "step": 3380 }, { "epoch": 0.3444071929289851, "grad_norm": 2.53125, "learning_rate": 3.7203308720431336e-05, "loss": 0.1704, "step": 3390 }, { "epoch": 0.3454231433506045, "grad_norm": 1.859375, "learning_rate": 3.7132890105772234e-05, "loss": 0.258, "step": 3400 }, { "epoch": 0.3464390937722239, "grad_norm": 3.90625, "learning_rate": 3.706234534373515e-05, "loss": 0.2376, "step": 3410 }, { "epoch": 0.3474550441938433, "grad_norm": 1.1015625, "learning_rate": 3.6991675167783985e-05, "loss": 0.2403, "step": 3420 }, { "epoch": 0.34847099461546277, "grad_norm": 1.1640625, "learning_rate": 3.6920880312686556e-05, "loss": 0.1642, "step": 3430 }, { "epoch": 0.34948694503708216, "grad_norm": 2.875, "learning_rate": 3.684996151450702e-05, "loss": 0.1455, "step": 3440 }, { "epoch": 0.3505028954587016, "grad_norm": 0.59765625, "learning_rate": 3.6778919510598155e-05, "loss": 0.2175, "step": 3450 }, { "epoch": 0.35151884588032106, "grad_norm": 0.93359375, "learning_rate": 3.670775503959376e-05, "loss": 0.1858, "step": 3460 }, { "epoch": 0.35253479630194046, "grad_norm": 4.1875, "learning_rate": 3.6636468841400917e-05, "loss": 0.1911, "step": 3470 }, { "epoch": 0.3535507467235599, "grad_norm": 3.734375, "learning_rate": 3.656506165719233e-05, "loss": 0.2114, "step": 3480 }, { "epoch": 0.3545666971451793, "grad_norm": 1.171875, "learning_rate": 3.649353422939863e-05, "loss": 0.1841, "step": 3490 }, { "epoch": 0.35558264756679875, "grad_norm": 2.53125, "learning_rate": 3.6421887301700615e-05, "loss": 0.1505, "step": 3500 }, { "epoch": 0.35659859798841814, "grad_norm": 4.9375, "learning_rate": 3.6350121619021524e-05, "loss": 0.2625, "step": 3510 }, { "epoch": 0.3576145484100376, "grad_norm": 5.25, "learning_rate": 3.627823792751936e-05, "loss": 0.1676, "step": 3520 }, { "epoch": 0.35863049883165704, "grad_norm": 1.09375, "learning_rate": 3.620623697457905e-05, "loss": 0.1963, "step": 3530 }, { "epoch": 0.35964644925327643, "grad_norm": 4.03125, "learning_rate": 3.613411950880468e-05, "loss": 0.2048, "step": 3540 }, { "epoch": 0.3606623996748959, "grad_norm": 4.40625, "learning_rate": 3.606188628001178e-05, "loss": 0.226, "step": 3550 }, { "epoch": 0.3616783500965153, "grad_norm": 2.375, "learning_rate": 3.598953803921947e-05, "loss": 0.1884, "step": 3560 }, { "epoch": 0.3626943005181347, "grad_norm": 3.21875, "learning_rate": 3.591707553864266e-05, "loss": 0.224, "step": 3570 }, { "epoch": 0.3637102509397541, "grad_norm": 3.5625, "learning_rate": 3.584449953168423e-05, "loss": 0.1866, "step": 3580 }, { "epoch": 0.36472620136137357, "grad_norm": 2.359375, "learning_rate": 3.577181077292722e-05, "loss": 0.1663, "step": 3590 }, { "epoch": 0.36574215178299296, "grad_norm": 5.0, "learning_rate": 3.569901001812696e-05, "loss": 0.2032, "step": 3600 }, { "epoch": 0.3667581022046124, "grad_norm": 1.953125, "learning_rate": 3.562609802420321e-05, "loss": 0.2395, "step": 3610 }, { "epoch": 0.36777405262623186, "grad_norm": 3.796875, "learning_rate": 3.555307554923229e-05, "loss": 0.1799, "step": 3620 }, { "epoch": 0.36879000304785126, "grad_norm": 4.4375, "learning_rate": 3.547994335243925e-05, "loss": 0.1771, "step": 3630 }, { "epoch": 0.3698059534694707, "grad_norm": 1.890625, "learning_rate": 3.540670219418989e-05, "loss": 0.2123, "step": 3640 }, { "epoch": 0.3708219038910901, "grad_norm": 4.03125, "learning_rate": 3.53333528359829e-05, "loss": 0.2159, "step": 3650 }, { "epoch": 0.37183785431270955, "grad_norm": 3.265625, "learning_rate": 3.525989604044198e-05, "loss": 0.2749, "step": 3660 }, { "epoch": 0.37285380473432894, "grad_norm": 1.4375, "learning_rate": 3.5186332571307826e-05, "loss": 0.1613, "step": 3670 }, { "epoch": 0.3738697551559484, "grad_norm": 3.984375, "learning_rate": 3.511266319343025e-05, "loss": 0.1877, "step": 3680 }, { "epoch": 0.37488570557756784, "grad_norm": 2.203125, "learning_rate": 3.503888867276022e-05, "loss": 0.2185, "step": 3690 }, { "epoch": 0.37590165599918723, "grad_norm": 1.5078125, "learning_rate": 3.4965009776341894e-05, "loss": 0.2195, "step": 3700 }, { "epoch": 0.3769176064208067, "grad_norm": 4.375, "learning_rate": 3.489102727230461e-05, "loss": 0.2344, "step": 3710 }, { "epoch": 0.3779335568424261, "grad_norm": 2.984375, "learning_rate": 3.481694192985496e-05, "loss": 0.1863, "step": 3720 }, { "epoch": 0.3789495072640455, "grad_norm": 1.1328125, "learning_rate": 3.474275451926875e-05, "loss": 0.1894, "step": 3730 }, { "epoch": 0.3799654576856649, "grad_norm": 2.265625, "learning_rate": 3.4668465811883e-05, "loss": 0.2127, "step": 3740 }, { "epoch": 0.38098140810728437, "grad_norm": 2.921875, "learning_rate": 3.4594076580087914e-05, "loss": 0.2125, "step": 3750 }, { "epoch": 0.38199735852890376, "grad_norm": 2.390625, "learning_rate": 3.451958759731889e-05, "loss": 0.1801, "step": 3760 }, { "epoch": 0.3830133089505232, "grad_norm": 3.046875, "learning_rate": 3.4444999638048456e-05, "loss": 0.1949, "step": 3770 }, { "epoch": 0.38402925937214266, "grad_norm": 2.890625, "learning_rate": 3.437031347777817e-05, "loss": 0.2719, "step": 3780 }, { "epoch": 0.38504520979376206, "grad_norm": 3.9375, "learning_rate": 3.4295529893030634e-05, "loss": 0.1697, "step": 3790 }, { "epoch": 0.3860611602153815, "grad_norm": 2.0625, "learning_rate": 3.422064966134138e-05, "loss": 0.1557, "step": 3800 }, { "epoch": 0.3870771106370009, "grad_norm": 2.234375, "learning_rate": 3.4145673561250794e-05, "loss": 0.2129, "step": 3810 }, { "epoch": 0.38809306105862035, "grad_norm": 4.96875, "learning_rate": 3.4070602372296e-05, "loss": 0.2068, "step": 3820 }, { "epoch": 0.38910901148023974, "grad_norm": 2.234375, "learning_rate": 3.39954368750028e-05, "loss": 0.1634, "step": 3830 }, { "epoch": 0.3901249619018592, "grad_norm": 1.75, "learning_rate": 3.392017785087752e-05, "loss": 0.2299, "step": 3840 }, { "epoch": 0.39114091232347864, "grad_norm": 3.90625, "learning_rate": 3.38448260823989e-05, "loss": 0.1585, "step": 3850 }, { "epoch": 0.39215686274509803, "grad_norm": 2.8125, "learning_rate": 3.376938235300996e-05, "loss": 0.2382, "step": 3860 }, { "epoch": 0.3931728131667175, "grad_norm": 5.375, "learning_rate": 3.369384744710984e-05, "loss": 0.1987, "step": 3870 }, { "epoch": 0.3941887635883369, "grad_norm": 2.578125, "learning_rate": 3.361822215004566e-05, "loss": 0.2316, "step": 3880 }, { "epoch": 0.3952047140099563, "grad_norm": 2.0, "learning_rate": 3.354250724810436e-05, "loss": 0.2019, "step": 3890 }, { "epoch": 0.3962206644315757, "grad_norm": 2.3125, "learning_rate": 3.34667035285045e-05, "loss": 0.187, "step": 3900 }, { "epoch": 0.39723661485319517, "grad_norm": 3.53125, "learning_rate": 3.339081177938811e-05, "loss": 0.2353, "step": 3910 }, { "epoch": 0.39825256527481456, "grad_norm": 1.9609375, "learning_rate": 3.331483278981244e-05, "loss": 0.2078, "step": 3920 }, { "epoch": 0.399268515696434, "grad_norm": 1.2109375, "learning_rate": 3.323876734974183e-05, "loss": 0.1761, "step": 3930 }, { "epoch": 0.40028446611805346, "grad_norm": 4.0625, "learning_rate": 3.316261625003943e-05, "loss": 0.2081, "step": 3940 }, { "epoch": 0.40130041653967286, "grad_norm": 1.953125, "learning_rate": 3.308638028245902e-05, "loss": 0.2087, "step": 3950 }, { "epoch": 0.4023163669612923, "grad_norm": 2.390625, "learning_rate": 3.301006023963676e-05, "loss": 0.1579, "step": 3960 }, { "epoch": 0.4033323173829117, "grad_norm": 3.53125, "learning_rate": 3.293365691508295e-05, "loss": 0.1904, "step": 3970 }, { "epoch": 0.40434826780453115, "grad_norm": 3.0, "learning_rate": 3.285717110317379e-05, "loss": 0.1991, "step": 3980 }, { "epoch": 0.40536421822615054, "grad_norm": 7.21875, "learning_rate": 3.27806035991431e-05, "loss": 0.1445, "step": 3990 }, { "epoch": 0.40638016864777, "grad_norm": 1.0859375, "learning_rate": 3.2703955199074075e-05, "loss": 0.2393, "step": 4000 }, { "epoch": 0.40739611906938944, "grad_norm": 4.5625, "learning_rate": 3.262722669989098e-05, "loss": 0.1789, "step": 4010 }, { "epoch": 0.40841206949100883, "grad_norm": 3.09375, "learning_rate": 3.255041889935092e-05, "loss": 0.1511, "step": 4020 }, { "epoch": 0.4094280199126283, "grad_norm": 1.90625, "learning_rate": 3.247353259603547e-05, "loss": 0.2066, "step": 4030 }, { "epoch": 0.4104439703342477, "grad_norm": 2.28125, "learning_rate": 3.239656858934242e-05, "loss": 0.1564, "step": 4040 }, { "epoch": 0.4114599207558671, "grad_norm": 2.609375, "learning_rate": 3.231952767947746e-05, "loss": 0.1503, "step": 4050 }, { "epoch": 0.4124758711774865, "grad_norm": 1.4453125, "learning_rate": 3.2242410667445844e-05, "loss": 0.1633, "step": 4060 }, { "epoch": 0.41349182159910597, "grad_norm": 3.015625, "learning_rate": 3.2165218355044076e-05, "loss": 0.1492, "step": 4070 }, { "epoch": 0.41450777202072536, "grad_norm": 3.234375, "learning_rate": 3.2087951544851566e-05, "loss": 0.3051, "step": 4080 }, { "epoch": 0.4155237224423448, "grad_norm": 2.9375, "learning_rate": 3.20106110402223e-05, "loss": 0.2229, "step": 4090 }, { "epoch": 0.41653967286396426, "grad_norm": 3.171875, "learning_rate": 3.1933197645276455e-05, "loss": 0.2224, "step": 4100 }, { "epoch": 0.41755562328558365, "grad_norm": 2.09375, "learning_rate": 3.185571216489209e-05, "loss": 0.1297, "step": 4110 }, { "epoch": 0.4185715737072031, "grad_norm": 3.625, "learning_rate": 3.177815540469669e-05, "loss": 0.2074, "step": 4120 }, { "epoch": 0.4195875241288225, "grad_norm": 2.296875, "learning_rate": 3.1700528171058916e-05, "loss": 0.1949, "step": 4130 }, { "epoch": 0.42060347455044195, "grad_norm": 3.8125, "learning_rate": 3.162283127108011e-05, "loss": 0.1661, "step": 4140 }, { "epoch": 0.42161942497206134, "grad_norm": 2.5, "learning_rate": 3.154506551258594e-05, "loss": 0.2275, "step": 4150 }, { "epoch": 0.4226353753936808, "grad_norm": 2.96875, "learning_rate": 3.146723170411804e-05, "loss": 0.2242, "step": 4160 }, { "epoch": 0.42365132581530024, "grad_norm": 6.625, "learning_rate": 3.138933065492552e-05, "loss": 0.1897, "step": 4170 }, { "epoch": 0.42466727623691963, "grad_norm": 0.8515625, "learning_rate": 3.131136317495665e-05, "loss": 0.1629, "step": 4180 }, { "epoch": 0.4256832266585391, "grad_norm": 0.94140625, "learning_rate": 3.1233330074850364e-05, "loss": 0.1535, "step": 4190 }, { "epoch": 0.4266991770801585, "grad_norm": 2.6875, "learning_rate": 3.115523216592786e-05, "loss": 0.2494, "step": 4200 }, { "epoch": 0.4277151275017779, "grad_norm": 2.578125, "learning_rate": 3.107707026018417e-05, "loss": 0.1705, "step": 4210 }, { "epoch": 0.4287310779233973, "grad_norm": 3.0625, "learning_rate": 3.09988451702797e-05, "loss": 0.1507, "step": 4220 }, { "epoch": 0.42974702834501677, "grad_norm": 2.421875, "learning_rate": 3.0920557709531804e-05, "loss": 0.3071, "step": 4230 }, { "epoch": 0.43076297876663616, "grad_norm": 3.640625, "learning_rate": 3.0842208691906306e-05, "loss": 0.199, "step": 4240 }, { "epoch": 0.4317789291882556, "grad_norm": 3.5625, "learning_rate": 3.076379893200904e-05, "loss": 0.1987, "step": 4250 }, { "epoch": 0.43279487960987506, "grad_norm": 3.65625, "learning_rate": 3.068532924507739e-05, "loss": 0.1945, "step": 4260 }, { "epoch": 0.43381083003149445, "grad_norm": 5.875, "learning_rate": 3.060680044697183e-05, "loss": 0.1937, "step": 4270 }, { "epoch": 0.4348267804531139, "grad_norm": 2.859375, "learning_rate": 3.052821335416739e-05, "loss": 0.1643, "step": 4280 }, { "epoch": 0.4358427308747333, "grad_norm": 3.296875, "learning_rate": 3.0449568783745203e-05, "loss": 0.1455, "step": 4290 }, { "epoch": 0.43685868129635275, "grad_norm": 0.427734375, "learning_rate": 3.0370867553384023e-05, "loss": 0.1891, "step": 4300 }, { "epoch": 0.43787463171797214, "grad_norm": 0.361328125, "learning_rate": 3.029211048135171e-05, "loss": 0.1377, "step": 4310 }, { "epoch": 0.4388905821395916, "grad_norm": 1.8203125, "learning_rate": 3.021329838649668e-05, "loss": 0.2194, "step": 4320 }, { "epoch": 0.43990653256121104, "grad_norm": 1.8828125, "learning_rate": 3.0134432088239462e-05, "loss": 0.1915, "step": 4330 }, { "epoch": 0.44092248298283043, "grad_norm": 2.015625, "learning_rate": 3.0055512406564146e-05, "loss": 0.1794, "step": 4340 }, { "epoch": 0.4419384334044499, "grad_norm": 2.546875, "learning_rate": 2.9976540162009836e-05, "loss": 0.2154, "step": 4350 }, { "epoch": 0.4429543838260693, "grad_norm": 4.09375, "learning_rate": 2.9897516175662155e-05, "loss": 0.1861, "step": 4360 }, { "epoch": 0.4439703342476887, "grad_norm": 3.953125, "learning_rate": 2.9818441269144693e-05, "loss": 0.1857, "step": 4370 }, { "epoch": 0.4449862846693081, "grad_norm": 2.234375, "learning_rate": 2.9739316264610452e-05, "loss": 0.1493, "step": 4380 }, { "epoch": 0.44600223509092757, "grad_norm": 1.109375, "learning_rate": 2.966014198473332e-05, "loss": 0.186, "step": 4390 }, { "epoch": 0.44701818551254696, "grad_norm": 4.5625, "learning_rate": 2.9580919252699502e-05, "loss": 0.1963, "step": 4400 }, { "epoch": 0.4480341359341664, "grad_norm": 7.3125, "learning_rate": 2.9501648892198984e-05, "loss": 0.2882, "step": 4410 }, { "epoch": 0.44905008635578586, "grad_norm": 3.03125, "learning_rate": 2.942233172741693e-05, "loss": 0.2154, "step": 4420 }, { "epoch": 0.45006603677740525, "grad_norm": 2.421875, "learning_rate": 2.934296858302515e-05, "loss": 0.2228, "step": 4430 }, { "epoch": 0.4510819871990247, "grad_norm": 1.6015625, "learning_rate": 2.9263560284173485e-05, "loss": 0.1637, "step": 4440 }, { "epoch": 0.4520979376206441, "grad_norm": 4.5, "learning_rate": 2.91841076564813e-05, "loss": 0.1396, "step": 4450 }, { "epoch": 0.45311388804226355, "grad_norm": 1.9609375, "learning_rate": 2.9104611526028808e-05, "loss": 0.186, "step": 4460 }, { "epoch": 0.45412983846388294, "grad_norm": 2.046875, "learning_rate": 2.902507271934855e-05, "loss": 0.1706, "step": 4470 }, { "epoch": 0.4551457888855024, "grad_norm": 2.390625, "learning_rate": 2.8945492063416768e-05, "loss": 0.2191, "step": 4480 }, { "epoch": 0.45616173930712184, "grad_norm": 2.734375, "learning_rate": 2.8865870385644823e-05, "loss": 0.1651, "step": 4490 }, { "epoch": 0.45717768972874123, "grad_norm": 4.4375, "learning_rate": 2.8786208513870583e-05, "loss": 0.1907, "step": 4500 }, { "epoch": 0.4581936401503607, "grad_norm": 1.9609375, "learning_rate": 2.8706507276349815e-05, "loss": 0.2256, "step": 4510 }, { "epoch": 0.4592095905719801, "grad_norm": 3.375, "learning_rate": 2.8626767501747588e-05, "loss": 0.215, "step": 4520 }, { "epoch": 0.4602255409935995, "grad_norm": 2.296875, "learning_rate": 2.854699001912964e-05, "loss": 0.2241, "step": 4530 }, { "epoch": 0.4612414914152189, "grad_norm": 2.078125, "learning_rate": 2.846717565795376e-05, "loss": 0.1541, "step": 4540 }, { "epoch": 0.46225744183683837, "grad_norm": 0.81640625, "learning_rate": 2.8387325248061164e-05, "loss": 0.1718, "step": 4550 }, { "epoch": 0.46327339225845776, "grad_norm": 5.6875, "learning_rate": 2.8307439619667897e-05, "loss": 0.259, "step": 4560 }, { "epoch": 0.4642893426800772, "grad_norm": 1.78125, "learning_rate": 2.8227519603356157e-05, "loss": 0.2205, "step": 4570 }, { "epoch": 0.46530529310169666, "grad_norm": 4.78125, "learning_rate": 2.8147566030065677e-05, "loss": 0.2256, "step": 4580 }, { "epoch": 0.46632124352331605, "grad_norm": 3.296875, "learning_rate": 2.8067579731085085e-05, "loss": 0.1671, "step": 4590 }, { "epoch": 0.4673371939449355, "grad_norm": 3.265625, "learning_rate": 2.7987561538043273e-05, "loss": 0.2471, "step": 4600 }, { "epoch": 0.4683531443665549, "grad_norm": 3.390625, "learning_rate": 2.7907512282900727e-05, "loss": 0.1749, "step": 4610 }, { "epoch": 0.46936909478817435, "grad_norm": 3.140625, "learning_rate": 2.782743279794091e-05, "loss": 0.2276, "step": 4620 }, { "epoch": 0.47038504520979374, "grad_norm": 2.921875, "learning_rate": 2.7747323915761574e-05, "loss": 0.1971, "step": 4630 }, { "epoch": 0.4714009956314132, "grad_norm": 4.15625, "learning_rate": 2.7667186469266122e-05, "loss": 0.1951, "step": 4640 }, { "epoch": 0.47241694605303264, "grad_norm": 2.953125, "learning_rate": 2.7587021291654924e-05, "loss": 0.2045, "step": 4650 }, { "epoch": 0.47343289647465203, "grad_norm": 1.6640625, "learning_rate": 2.750682921641672e-05, "loss": 0.155, "step": 4660 }, { "epoch": 0.4744488468962715, "grad_norm": 4.375, "learning_rate": 2.7426611077319864e-05, "loss": 0.2038, "step": 4670 }, { "epoch": 0.4754647973178909, "grad_norm": 5.5, "learning_rate": 2.734636770840372e-05, "loss": 0.159, "step": 4680 }, { "epoch": 0.4764807477395103, "grad_norm": 1.703125, "learning_rate": 2.7266099943969976e-05, "loss": 0.1566, "step": 4690 }, { "epoch": 0.4774966981611297, "grad_norm": 0.81640625, "learning_rate": 2.7185808618573943e-05, "loss": 0.1927, "step": 4700 }, { "epoch": 0.47851264858274917, "grad_norm": 0.81640625, "learning_rate": 2.710549456701592e-05, "loss": 0.1873, "step": 4710 }, { "epoch": 0.47952859900436856, "grad_norm": 3.828125, "learning_rate": 2.702515862433247e-05, "loss": 0.2474, "step": 4720 }, { "epoch": 0.480544549425988, "grad_norm": 1.1640625, "learning_rate": 2.6944801625787795e-05, "loss": 0.204, "step": 4730 }, { "epoch": 0.48156049984760746, "grad_norm": 2.953125, "learning_rate": 2.6864424406864984e-05, "loss": 0.1758, "step": 4740 }, { "epoch": 0.48257645026922685, "grad_norm": 3.265625, "learning_rate": 2.6784027803257377e-05, "loss": 0.161, "step": 4750 }, { "epoch": 0.4835924006908463, "grad_norm": 2.046875, "learning_rate": 2.6703612650859848e-05, "loss": 0.1469, "step": 4760 }, { "epoch": 0.4846083511124657, "grad_norm": 4.03125, "learning_rate": 2.6623179785760148e-05, "loss": 0.1858, "step": 4770 }, { "epoch": 0.48562430153408515, "grad_norm": 2.65625, "learning_rate": 2.6542730044230175e-05, "loss": 0.176, "step": 4780 }, { "epoch": 0.48664025195570454, "grad_norm": 2.59375, "learning_rate": 2.6462264262717278e-05, "loss": 0.1657, "step": 4790 }, { "epoch": 0.487656202377324, "grad_norm": 4.78125, "learning_rate": 2.6381783277835605e-05, "loss": 0.2705, "step": 4800 }, { "epoch": 0.48867215279894344, "grad_norm": 3.65625, "learning_rate": 2.6301287926357355e-05, "loss": 0.2252, "step": 4810 }, { "epoch": 0.48968810322056283, "grad_norm": 0.734375, "learning_rate": 2.622077904520411e-05, "loss": 0.2141, "step": 4820 }, { "epoch": 0.4907040536421823, "grad_norm": 5.15625, "learning_rate": 2.6140257471438108e-05, "loss": 0.1935, "step": 4830 }, { "epoch": 0.4917200040638017, "grad_norm": 3.625, "learning_rate": 2.6059724042253574e-05, "loss": 0.2121, "step": 4840 }, { "epoch": 0.4927359544854211, "grad_norm": 1.2890625, "learning_rate": 2.5979179594967983e-05, "loss": 0.1221, "step": 4850 }, { "epoch": 0.4937519049070405, "grad_norm": 3.4375, "learning_rate": 2.5898624967013367e-05, "loss": 0.2208, "step": 4860 }, { "epoch": 0.49476785532865997, "grad_norm": 2.40625, "learning_rate": 2.5818060995927607e-05, "loss": 0.1904, "step": 4870 }, { "epoch": 0.49578380575027936, "grad_norm": 2.921875, "learning_rate": 2.573748851934574e-05, "loss": 0.1658, "step": 4880 }, { "epoch": 0.4967997561718988, "grad_norm": 1.6640625, "learning_rate": 2.5656908374991213e-05, "loss": 0.1626, "step": 4890 }, { "epoch": 0.49781570659351826, "grad_norm": 1.8046875, "learning_rate": 2.557632140066721e-05, "loss": 0.1905, "step": 4900 }, { "epoch": 0.49883165701513765, "grad_norm": 4.875, "learning_rate": 2.5495728434247917e-05, "loss": 0.2591, "step": 4910 }, { "epoch": 0.4998476074367571, "grad_norm": 1.4453125, "learning_rate": 2.5415130313669845e-05, "loss": 0.1359, "step": 4920 }, { "epoch": 0.5008635578583766, "grad_norm": 2.109375, "learning_rate": 2.5334527876923063e-05, "loss": 0.2353, "step": 4930 }, { "epoch": 0.501879508279996, "grad_norm": 3.546875, "learning_rate": 2.5253921962042525e-05, "loss": 0.2173, "step": 4940 }, { "epoch": 0.5028954587016153, "grad_norm": 1.8125, "learning_rate": 2.5173313407099373e-05, "loss": 0.1631, "step": 4950 }, { "epoch": 0.5039114091232347, "grad_norm": 2.671875, "learning_rate": 2.5092703050192163e-05, "loss": 0.1884, "step": 4960 }, { "epoch": 0.5049273595448542, "grad_norm": 2.5625, "learning_rate": 2.501209172943819e-05, "loss": 0.217, "step": 4970 }, { "epoch": 0.5059433099664736, "grad_norm": 4.375, "learning_rate": 2.49314802829648e-05, "loss": 0.1854, "step": 4980 }, { "epoch": 0.506959260388093, "grad_norm": 2.3125, "learning_rate": 2.4850869548900628e-05, "loss": 0.2049, "step": 4990 }, { "epoch": 0.5079752108097125, "grad_norm": 3.859375, "learning_rate": 2.477026036536688e-05, "loss": 0.2093, "step": 5000 }, { "epoch": 0.5089911612313319, "grad_norm": 1.09375, "learning_rate": 2.4689653570468677e-05, "loss": 0.164, "step": 5010 }, { "epoch": 0.5100071116529513, "grad_norm": 3.40625, "learning_rate": 2.460905000228628e-05, "loss": 0.1649, "step": 5020 }, { "epoch": 0.5110230620745707, "grad_norm": 3.546875, "learning_rate": 2.4528450498866428e-05, "loss": 0.1777, "step": 5030 }, { "epoch": 0.5120390124961902, "grad_norm": 3.0, "learning_rate": 2.444785589821356e-05, "loss": 0.1505, "step": 5040 }, { "epoch": 0.5130549629178096, "grad_norm": 1.6484375, "learning_rate": 2.436726703828118e-05, "loss": 0.2672, "step": 5050 }, { "epoch": 0.514070913339429, "grad_norm": 4.34375, "learning_rate": 2.428668475696308e-05, "loss": 0.1756, "step": 5060 }, { "epoch": 0.5150868637610485, "grad_norm": 2.78125, "learning_rate": 2.420610989208465e-05, "loss": 0.1655, "step": 5070 }, { "epoch": 0.5161028141826679, "grad_norm": 1.4609375, "learning_rate": 2.412554328139419e-05, "loss": 0.1579, "step": 5080 }, { "epoch": 0.5171187646042873, "grad_norm": 2.28125, "learning_rate": 2.404498576255416e-05, "loss": 0.1599, "step": 5090 }, { "epoch": 0.5181347150259067, "grad_norm": 0.6484375, "learning_rate": 2.3964438173132522e-05, "loss": 0.1508, "step": 5100 }, { "epoch": 0.5191506654475262, "grad_norm": 3.390625, "learning_rate": 2.388390135059395e-05, "loss": 0.1578, "step": 5110 }, { "epoch": 0.5201666158691456, "grad_norm": 1.21875, "learning_rate": 2.3803376132291226e-05, "loss": 0.1374, "step": 5120 }, { "epoch": 0.521182566290765, "grad_norm": 4.0625, "learning_rate": 2.3722863355456436e-05, "loss": 0.1854, "step": 5130 }, { "epoch": 0.5221985167123845, "grad_norm": 4.71875, "learning_rate": 2.364236385719236e-05, "loss": 0.1391, "step": 5140 }, { "epoch": 0.5232144671340039, "grad_norm": 3.296875, "learning_rate": 2.356187847446366e-05, "loss": 0.2106, "step": 5150 }, { "epoch": 0.5242304175556233, "grad_norm": 3.296875, "learning_rate": 2.348140804408829e-05, "loss": 0.2383, "step": 5160 }, { "epoch": 0.5252463679772427, "grad_norm": 3.359375, "learning_rate": 2.3400953402728713e-05, "loss": 0.1537, "step": 5170 }, { "epoch": 0.5262623183988622, "grad_norm": 1.4921875, "learning_rate": 2.332051538688322e-05, "loss": 0.1841, "step": 5180 }, { "epoch": 0.5272782688204816, "grad_norm": 3.25, "learning_rate": 2.3240094832877287e-05, "loss": 0.1855, "step": 5190 }, { "epoch": 0.528294219242101, "grad_norm": 3.34375, "learning_rate": 2.3159692576854793e-05, "loss": 0.2625, "step": 5200 }, { "epoch": 0.5293101696637205, "grad_norm": 3.6875, "learning_rate": 2.3079309454769413e-05, "loss": 0.1292, "step": 5210 }, { "epoch": 0.5303261200853399, "grad_norm": 1.1171875, "learning_rate": 2.2998946302375827e-05, "loss": 0.1263, "step": 5220 }, { "epoch": 0.5313420705069593, "grad_norm": 2.71875, "learning_rate": 2.2918603955221148e-05, "loss": 0.2296, "step": 5230 }, { "epoch": 0.5323580209285786, "grad_norm": 2.015625, "learning_rate": 2.283828324863613e-05, "loss": 0.1231, "step": 5240 }, { "epoch": 0.5333739713501982, "grad_norm": 3.671875, "learning_rate": 2.2757985017726557e-05, "loss": 0.1939, "step": 5250 }, { "epoch": 0.5343899217718175, "grad_norm": 1.9765625, "learning_rate": 2.2677710097364495e-05, "loss": 0.168, "step": 5260 }, { "epoch": 0.5354058721934369, "grad_norm": 2.609375, "learning_rate": 2.259745932217969e-05, "loss": 0.1883, "step": 5270 }, { "epoch": 0.5364218226150563, "grad_norm": 2.8125, "learning_rate": 2.2517233526550817e-05, "loss": 0.1898, "step": 5280 }, { "epoch": 0.5374377730366758, "grad_norm": 3.125, "learning_rate": 2.2437033544596837e-05, "loss": 0.1838, "step": 5290 }, { "epoch": 0.5384537234582952, "grad_norm": 4.90625, "learning_rate": 2.2356860210168336e-05, "loss": 0.1553, "step": 5300 }, { "epoch": 0.5394696738799146, "grad_norm": 3.171875, "learning_rate": 2.2276714356838824e-05, "loss": 0.2248, "step": 5310 }, { "epoch": 0.5404856243015341, "grad_norm": 1.34375, "learning_rate": 2.2196596817896118e-05, "loss": 0.1421, "step": 5320 }, { "epoch": 0.5415015747231535, "grad_norm": 3.28125, "learning_rate": 2.2116508426333596e-05, "loss": 0.1947, "step": 5330 }, { "epoch": 0.5425175251447729, "grad_norm": 1.9296875, "learning_rate": 2.2036450014841652e-05, "loss": 0.2207, "step": 5340 }, { "epoch": 0.5435334755663923, "grad_norm": 0.5703125, "learning_rate": 2.19564224157989e-05, "loss": 0.2208, "step": 5350 }, { "epoch": 0.5445494259880118, "grad_norm": 7.5625, "learning_rate": 2.1876426461263654e-05, "loss": 0.1739, "step": 5360 }, { "epoch": 0.5455653764096312, "grad_norm": 2.15625, "learning_rate": 2.179646298296519e-05, "loss": 0.1938, "step": 5370 }, { "epoch": 0.5465813268312506, "grad_norm": 4.1875, "learning_rate": 2.171653281229511e-05, "loss": 0.1736, "step": 5380 }, { "epoch": 0.5475972772528701, "grad_norm": 4.65625, "learning_rate": 2.1636636780298732e-05, "loss": 0.2167, "step": 5390 }, { "epoch": 0.5486132276744895, "grad_norm": 1.84375, "learning_rate": 2.1556775717666427e-05, "loss": 0.1711, "step": 5400 }, { "epoch": 0.5496291780961089, "grad_norm": 5.125, "learning_rate": 2.147695045472499e-05, "loss": 0.1789, "step": 5410 }, { "epoch": 0.5506451285177283, "grad_norm": 3.859375, "learning_rate": 2.1397161821428973e-05, "loss": 0.2187, "step": 5420 }, { "epoch": 0.5516610789393478, "grad_norm": 2.25, "learning_rate": 2.131741064735212e-05, "loss": 0.1367, "step": 5430 }, { "epoch": 0.5526770293609672, "grad_norm": 4.65625, "learning_rate": 2.1237697761678684e-05, "loss": 0.1574, "step": 5440 }, { "epoch": 0.5536929797825866, "grad_norm": 1.2265625, "learning_rate": 2.1158023993194848e-05, "loss": 0.1301, "step": 5450 }, { "epoch": 0.5547089302042061, "grad_norm": 4.21875, "learning_rate": 2.107839017028005e-05, "loss": 0.2782, "step": 5460 }, { "epoch": 0.5557248806258255, "grad_norm": 0.52734375, "learning_rate": 2.0998797120898457e-05, "loss": 0.2024, "step": 5470 }, { "epoch": 0.5567408310474449, "grad_norm": 1.46875, "learning_rate": 2.0919245672590277e-05, "loss": 0.1755, "step": 5480 }, { "epoch": 0.5577567814690643, "grad_norm": 2.140625, "learning_rate": 2.083973665246318e-05, "loss": 0.2058, "step": 5490 }, { "epoch": 0.5587727318906838, "grad_norm": 1.5390625, "learning_rate": 2.076027088718373e-05, "loss": 0.2159, "step": 5500 }, { "epoch": 0.5597886823123032, "grad_norm": 1.9921875, "learning_rate": 2.0680849202968743e-05, "loss": 0.2139, "step": 5510 }, { "epoch": 0.5608046327339226, "grad_norm": 2.4375, "learning_rate": 2.060147242557674e-05, "loss": 0.183, "step": 5520 }, { "epoch": 0.5618205831555421, "grad_norm": 5.5, "learning_rate": 2.0522141380299308e-05, "loss": 0.1673, "step": 5530 }, { "epoch": 0.5628365335771615, "grad_norm": 4.25, "learning_rate": 2.044285689195258e-05, "loss": 0.1674, "step": 5540 }, { "epoch": 0.5638524839987809, "grad_norm": 2.109375, "learning_rate": 2.0363619784868604e-05, "loss": 0.1531, "step": 5550 }, { "epoch": 0.5648684344204002, "grad_norm": 2.59375, "learning_rate": 2.0284430882886836e-05, "loss": 0.1665, "step": 5560 }, { "epoch": 0.5658843848420197, "grad_norm": 3.984375, "learning_rate": 2.020529100934549e-05, "loss": 0.1717, "step": 5570 }, { "epoch": 0.5669003352636391, "grad_norm": 1.6015625, "learning_rate": 2.012620098707306e-05, "loss": 0.1167, "step": 5580 }, { "epoch": 0.5679162856852585, "grad_norm": 6.0625, "learning_rate": 2.004716163837972e-05, "loss": 0.2084, "step": 5590 }, { "epoch": 0.5689322361068779, "grad_norm": 2.5625, "learning_rate": 1.996817378504876e-05, "loss": 0.1939, "step": 5600 }, { "epoch": 0.5699481865284974, "grad_norm": 3.109375, "learning_rate": 1.9889238248328108e-05, "loss": 0.1241, "step": 5610 }, { "epoch": 0.5709641369501168, "grad_norm": 4.875, "learning_rate": 1.981035584892171e-05, "loss": 0.1865, "step": 5620 }, { "epoch": 0.5719800873717362, "grad_norm": 2.984375, "learning_rate": 1.9731527406981072e-05, "loss": 0.1639, "step": 5630 }, { "epoch": 0.5729960377933557, "grad_norm": 4.4375, "learning_rate": 1.9652753742096655e-05, "loss": 0.2019, "step": 5640 }, { "epoch": 0.5740119882149751, "grad_norm": 4.3125, "learning_rate": 1.9574035673289432e-05, "loss": 0.1829, "step": 5650 }, { "epoch": 0.5750279386365945, "grad_norm": 3.203125, "learning_rate": 1.9495374019002312e-05, "loss": 0.2267, "step": 5660 }, { "epoch": 0.5760438890582139, "grad_norm": 1.765625, "learning_rate": 1.9416769597091673e-05, "loss": 0.1411, "step": 5670 }, { "epoch": 0.5770598394798334, "grad_norm": 2.640625, "learning_rate": 1.9338223224818818e-05, "loss": 0.1476, "step": 5680 }, { "epoch": 0.5780757899014528, "grad_norm": 4.84375, "learning_rate": 1.9259735718841524e-05, "loss": 0.1417, "step": 5690 }, { "epoch": 0.5790917403230722, "grad_norm": 2.421875, "learning_rate": 1.918130789520551e-05, "loss": 0.1592, "step": 5700 }, { "epoch": 0.5801076907446917, "grad_norm": 2.984375, "learning_rate": 1.9102940569335963e-05, "loss": 0.161, "step": 5710 }, { "epoch": 0.5811236411663111, "grad_norm": 1.0234375, "learning_rate": 1.9024634556029093e-05, "loss": 0.1614, "step": 5720 }, { "epoch": 0.5821395915879305, "grad_norm": 2.90625, "learning_rate": 1.89463906694436e-05, "loss": 0.1505, "step": 5730 }, { "epoch": 0.5831555420095499, "grad_norm": 2.875, "learning_rate": 1.8868209723092286e-05, "loss": 0.1674, "step": 5740 }, { "epoch": 0.5841714924311694, "grad_norm": 0.408203125, "learning_rate": 1.8790092529833508e-05, "loss": 0.1468, "step": 5750 }, { "epoch": 0.5851874428527888, "grad_norm": 5.1875, "learning_rate": 1.871203990186281e-05, "loss": 0.1903, "step": 5760 }, { "epoch": 0.5862033932744082, "grad_norm": 0.5546875, "learning_rate": 1.8634052650704415e-05, "loss": 0.2644, "step": 5770 }, { "epoch": 0.5872193436960277, "grad_norm": 3.203125, "learning_rate": 1.8556131587202848e-05, "loss": 0.1968, "step": 5780 }, { "epoch": 0.5882352941176471, "grad_norm": 2.484375, "learning_rate": 1.8478277521514424e-05, "loss": 0.2249, "step": 5790 }, { "epoch": 0.5892512445392665, "grad_norm": 4.0, "learning_rate": 1.8400491263098906e-05, "loss": 0.1881, "step": 5800 }, { "epoch": 0.5902671949608859, "grad_norm": 1.90625, "learning_rate": 1.832277362071106e-05, "loss": 0.1352, "step": 5810 }, { "epoch": 0.5912831453825054, "grad_norm": 2.765625, "learning_rate": 1.824512540239221e-05, "loss": 0.2737, "step": 5820 }, { "epoch": 0.5922990958041248, "grad_norm": 2.609375, "learning_rate": 1.81675474154619e-05, "loss": 0.1566, "step": 5830 }, { "epoch": 0.5933150462257442, "grad_norm": 2.6875, "learning_rate": 1.8090040466509444e-05, "loss": 0.1999, "step": 5840 }, { "epoch": 0.5943309966473637, "grad_norm": 2.609375, "learning_rate": 1.8012605361385592e-05, "loss": 0.2372, "step": 5850 }, { "epoch": 0.5953469470689831, "grad_norm": 8.125, "learning_rate": 1.7935242905194087e-05, "loss": 0.2411, "step": 5860 }, { "epoch": 0.5963628974906025, "grad_norm": 3.46875, "learning_rate": 1.785795390228336e-05, "loss": 0.138, "step": 5870 }, { "epoch": 0.5973788479122218, "grad_norm": 2.3125, "learning_rate": 1.7780739156238125e-05, "loss": 0.1867, "step": 5880 }, { "epoch": 0.5983947983338413, "grad_norm": 4.0625, "learning_rate": 1.770359946987105e-05, "loss": 0.2091, "step": 5890 }, { "epoch": 0.5994107487554607, "grad_norm": 5.21875, "learning_rate": 1.7626535645214378e-05, "loss": 0.2091, "step": 5900 }, { "epoch": 0.6004266991770801, "grad_norm": 3.15625, "learning_rate": 1.7549548483511614e-05, "loss": 0.1927, "step": 5910 }, { "epoch": 0.6014426495986995, "grad_norm": 4.71875, "learning_rate": 1.7472638785209198e-05, "loss": 0.1893, "step": 5920 }, { "epoch": 0.602458600020319, "grad_norm": 3.015625, "learning_rate": 1.7395807349948145e-05, "loss": 0.1557, "step": 5930 }, { "epoch": 0.6034745504419384, "grad_norm": 2.9375, "learning_rate": 1.73190549765558e-05, "loss": 0.1717, "step": 5940 }, { "epoch": 0.6044905008635578, "grad_norm": 3.109375, "learning_rate": 1.724238246303745e-05, "loss": 0.1879, "step": 5950 }, { "epoch": 0.6055064512851773, "grad_norm": 3.875, "learning_rate": 1.71657906065681e-05, "loss": 0.1908, "step": 5960 }, { "epoch": 0.6065224017067967, "grad_norm": 5.09375, "learning_rate": 1.7089280203484115e-05, "loss": 0.1712, "step": 5970 }, { "epoch": 0.6075383521284161, "grad_norm": 3.015625, "learning_rate": 1.701285204927502e-05, "loss": 0.1454, "step": 5980 }, { "epoch": 0.6085543025500355, "grad_norm": 3.265625, "learning_rate": 1.693650693857515e-05, "loss": 0.2283, "step": 5990 }, { "epoch": 0.609570252971655, "grad_norm": 3.40625, "learning_rate": 1.6860245665155466e-05, "loss": 0.2188, "step": 6000 }, { "epoch": 0.6105862033932744, "grad_norm": 2.5625, "learning_rate": 1.678406902191521e-05, "loss": 0.1605, "step": 6010 }, { "epoch": 0.6116021538148938, "grad_norm": 0.6796875, "learning_rate": 1.670797780087374e-05, "loss": 0.1472, "step": 6020 }, { "epoch": 0.6126181042365133, "grad_norm": 2.234375, "learning_rate": 1.6631972793162288e-05, "loss": 0.1676, "step": 6030 }, { "epoch": 0.6136340546581327, "grad_norm": 1.25, "learning_rate": 1.6556054789015662e-05, "loss": 0.1508, "step": 6040 }, { "epoch": 0.6146500050797521, "grad_norm": 4.78125, "learning_rate": 1.6480224577764132e-05, "loss": 0.1981, "step": 6050 }, { "epoch": 0.6156659555013715, "grad_norm": 3.46875, "learning_rate": 1.6404482947825137e-05, "loss": 0.2514, "step": 6060 }, { "epoch": 0.616681905922991, "grad_norm": 1.265625, "learning_rate": 1.6328830686695154e-05, "loss": 0.2397, "step": 6070 }, { "epoch": 0.6176978563446104, "grad_norm": 1.953125, "learning_rate": 1.625326858094144e-05, "loss": 0.1523, "step": 6080 }, { "epoch": 0.6187138067662298, "grad_norm": 3.484375, "learning_rate": 1.6177797416193953e-05, "loss": 0.218, "step": 6090 }, { "epoch": 0.6197297571878493, "grad_norm": 3.484375, "learning_rate": 1.6102417977137052e-05, "loss": 0.1476, "step": 6100 }, { "epoch": 0.6207457076094687, "grad_norm": 4.90625, "learning_rate": 1.602713104750147e-05, "loss": 0.1818, "step": 6110 }, { "epoch": 0.6217616580310881, "grad_norm": 4.375, "learning_rate": 1.5951937410056087e-05, "loss": 0.2061, "step": 6120 }, { "epoch": 0.6227776084527075, "grad_norm": 6.3125, "learning_rate": 1.587683784659979e-05, "loss": 0.1566, "step": 6130 }, { "epoch": 0.623793558874327, "grad_norm": 2.828125, "learning_rate": 1.58018331379534e-05, "loss": 0.1376, "step": 6140 }, { "epoch": 0.6248095092959464, "grad_norm": 2.40625, "learning_rate": 1.572692406395149e-05, "loss": 0.1655, "step": 6150 }, { "epoch": 0.6258254597175658, "grad_norm": 4.34375, "learning_rate": 1.5652111403434338e-05, "loss": 0.2363, "step": 6160 }, { "epoch": 0.6268414101391853, "grad_norm": 2.453125, "learning_rate": 1.5577395934239757e-05, "loss": 0.2464, "step": 6170 }, { "epoch": 0.6278573605608047, "grad_norm": 2.53125, "learning_rate": 1.5502778433195085e-05, "loss": 0.1898, "step": 6180 }, { "epoch": 0.628873310982424, "grad_norm": 2.28125, "learning_rate": 1.5428259676109048e-05, "loss": 0.1804, "step": 6190 }, { "epoch": 0.6298892614040434, "grad_norm": 4.3125, "learning_rate": 1.5353840437763732e-05, "loss": 0.1409, "step": 6200 }, { "epoch": 0.630905211825663, "grad_norm": 2.5625, "learning_rate": 1.5279521491906496e-05, "loss": 0.2449, "step": 6210 }, { "epoch": 0.6319211622472823, "grad_norm": 3.0625, "learning_rate": 1.520530361124195e-05, "loss": 0.2103, "step": 6220 }, { "epoch": 0.6329371126689017, "grad_norm": 2.609375, "learning_rate": 1.5131187567423937e-05, "loss": 0.2156, "step": 6230 }, { "epoch": 0.6339530630905211, "grad_norm": 2.703125, "learning_rate": 1.5057174131047446e-05, "loss": 0.161, "step": 6240 }, { "epoch": 0.6349690135121406, "grad_norm": 3.265625, "learning_rate": 1.4983264071640679e-05, "loss": 0.1757, "step": 6250 }, { "epoch": 0.63598496393376, "grad_norm": 3.15625, "learning_rate": 1.490945815765699e-05, "loss": 0.2011, "step": 6260 }, { "epoch": 0.6370009143553794, "grad_norm": 5.375, "learning_rate": 1.4835757156466945e-05, "loss": 0.1658, "step": 6270 }, { "epoch": 0.6380168647769989, "grad_norm": 2.984375, "learning_rate": 1.4762161834350271e-05, "loss": 0.1754, "step": 6280 }, { "epoch": 0.6390328151986183, "grad_norm": 2.015625, "learning_rate": 1.4688672956487987e-05, "loss": 0.1427, "step": 6290 }, { "epoch": 0.6400487656202377, "grad_norm": 3.78125, "learning_rate": 1.4615291286954352e-05, "loss": 0.1517, "step": 6300 }, { "epoch": 0.6410647160418571, "grad_norm": 2.859375, "learning_rate": 1.4542017588709005e-05, "loss": 0.2348, "step": 6310 }, { "epoch": 0.6420806664634766, "grad_norm": 2.421875, "learning_rate": 1.4468852623588961e-05, "loss": 0.2089, "step": 6320 }, { "epoch": 0.643096616885096, "grad_norm": 2.15625, "learning_rate": 1.4395797152300719e-05, "loss": 0.1702, "step": 6330 }, { "epoch": 0.6441125673067154, "grad_norm": 1.53125, "learning_rate": 1.4322851934412382e-05, "loss": 0.1017, "step": 6340 }, { "epoch": 0.6451285177283349, "grad_norm": 1.90625, "learning_rate": 1.4250017728345716e-05, "loss": 0.1813, "step": 6350 }, { "epoch": 0.6461444681499543, "grad_norm": 2.015625, "learning_rate": 1.4177295291368292e-05, "loss": 0.1095, "step": 6360 }, { "epoch": 0.6471604185715737, "grad_norm": 2.625, "learning_rate": 1.410468537958558e-05, "loss": 0.2259, "step": 6370 }, { "epoch": 0.6481763689931931, "grad_norm": 3.5, "learning_rate": 1.4032188747933136e-05, "loss": 0.1595, "step": 6380 }, { "epoch": 0.6491923194148126, "grad_norm": 5.21875, "learning_rate": 1.39598061501687e-05, "loss": 0.2226, "step": 6390 }, { "epoch": 0.650208269836432, "grad_norm": 5.34375, "learning_rate": 1.388753833886442e-05, "loss": 0.2132, "step": 6400 }, { "epoch": 0.6512242202580514, "grad_norm": 3.640625, "learning_rate": 1.3815386065398945e-05, "loss": 0.1227, "step": 6410 }, { "epoch": 0.6522401706796709, "grad_norm": 1.0, "learning_rate": 1.3743350079949705e-05, "loss": 0.1755, "step": 6420 }, { "epoch": 0.6532561211012903, "grad_norm": 2.359375, "learning_rate": 1.3671431131485057e-05, "loss": 0.1552, "step": 6430 }, { "epoch": 0.6542720715229097, "grad_norm": 5.3125, "learning_rate": 1.3599629967756483e-05, "loss": 0.1917, "step": 6440 }, { "epoch": 0.6552880219445291, "grad_norm": 4.625, "learning_rate": 1.3527947335290877e-05, "loss": 0.1812, "step": 6450 }, { "epoch": 0.6563039723661486, "grad_norm": 1.234375, "learning_rate": 1.3456383979382708e-05, "loss": 0.1896, "step": 6460 }, { "epoch": 0.657319922787768, "grad_norm": 3.984375, "learning_rate": 1.3384940644086352e-05, "loss": 0.1484, "step": 6470 }, { "epoch": 0.6583358732093874, "grad_norm": 2.40625, "learning_rate": 1.3313618072208268e-05, "loss": 0.1334, "step": 6480 }, { "epoch": 0.6593518236310069, "grad_norm": 4.375, "learning_rate": 1.3242417005299357e-05, "loss": 0.1351, "step": 6490 }, { "epoch": 0.6603677740526263, "grad_norm": 2.640625, "learning_rate": 1.31713381836472e-05, "loss": 0.1717, "step": 6500 }, { "epoch": 0.6613837244742456, "grad_norm": 2.640625, "learning_rate": 1.3100382346268392e-05, "loss": 0.1867, "step": 6510 }, { "epoch": 0.662399674895865, "grad_norm": 1.734375, "learning_rate": 1.3029550230900812e-05, "loss": 0.1997, "step": 6520 }, { "epoch": 0.6634156253174845, "grad_norm": 3.609375, "learning_rate": 1.2958842573996016e-05, "loss": 0.1969, "step": 6530 }, { "epoch": 0.6644315757391039, "grad_norm": 3.578125, "learning_rate": 1.2888260110711525e-05, "loss": 0.1469, "step": 6540 }, { "epoch": 0.6654475261607233, "grad_norm": 1.3515625, "learning_rate": 1.2817803574903212e-05, "loss": 0.1524, "step": 6550 }, { "epoch": 0.6664634765823427, "grad_norm": 2.109375, "learning_rate": 1.2747473699117668e-05, "loss": 0.159, "step": 6560 }, { "epoch": 0.6674794270039622, "grad_norm": 1.53125, "learning_rate": 1.267727121458458e-05, "loss": 0.1999, "step": 6570 }, { "epoch": 0.6684953774255816, "grad_norm": 1.7265625, "learning_rate": 1.2607196851209137e-05, "loss": 0.2216, "step": 6580 }, { "epoch": 0.669511327847201, "grad_norm": 3.125, "learning_rate": 1.2537251337564412e-05, "loss": 0.1607, "step": 6590 }, { "epoch": 0.6705272782688205, "grad_norm": 2.421875, "learning_rate": 1.2467435400883839e-05, "loss": 0.2187, "step": 6600 }, { "epoch": 0.6715432286904399, "grad_norm": 1.5078125, "learning_rate": 1.239774976705359e-05, "loss": 0.1753, "step": 6610 }, { "epoch": 0.6725591791120593, "grad_norm": 1.140625, "learning_rate": 1.2328195160605092e-05, "loss": 0.194, "step": 6620 }, { "epoch": 0.6735751295336787, "grad_norm": 4.9375, "learning_rate": 1.225877230470743e-05, "loss": 0.1485, "step": 6630 }, { "epoch": 0.6745910799552982, "grad_norm": 3.65625, "learning_rate": 1.218948192115988e-05, "loss": 0.1847, "step": 6640 }, { "epoch": 0.6756070303769176, "grad_norm": 3.875, "learning_rate": 1.21203247303844e-05, "loss": 0.1874, "step": 6650 }, { "epoch": 0.676622980798537, "grad_norm": 2.65625, "learning_rate": 1.2051301451418073e-05, "loss": 0.2377, "step": 6660 }, { "epoch": 0.6776389312201565, "grad_norm": 2.09375, "learning_rate": 1.198241280190574e-05, "loss": 0.1508, "step": 6670 }, { "epoch": 0.6786548816417759, "grad_norm": 2.203125, "learning_rate": 1.1913659498092431e-05, "loss": 0.1537, "step": 6680 }, { "epoch": 0.6796708320633953, "grad_norm": 2.484375, "learning_rate": 1.184504225481601e-05, "loss": 0.2339, "step": 6690 }, { "epoch": 0.6806867824850147, "grad_norm": 5.625, "learning_rate": 1.177656178549966e-05, "loss": 0.2102, "step": 6700 }, { "epoch": 0.6817027329066342, "grad_norm": 2.5, "learning_rate": 1.1708218802144536e-05, "loss": 0.1435, "step": 6710 }, { "epoch": 0.6827186833282536, "grad_norm": 3.84375, "learning_rate": 1.1640014015322323e-05, "loss": 0.1823, "step": 6720 }, { "epoch": 0.683734633749873, "grad_norm": 2.359375, "learning_rate": 1.1571948134167862e-05, "loss": 0.1154, "step": 6730 }, { "epoch": 0.6847505841714925, "grad_norm": 2.90625, "learning_rate": 1.1504021866371761e-05, "loss": 0.2105, "step": 6740 }, { "epoch": 0.6857665345931119, "grad_norm": 5.46875, "learning_rate": 1.143623591817304e-05, "loss": 0.1317, "step": 6750 }, { "epoch": 0.6867824850147313, "grad_norm": 3.34375, "learning_rate": 1.1368590994351835e-05, "loss": 0.1406, "step": 6760 }, { "epoch": 0.6877984354363507, "grad_norm": 3.78125, "learning_rate": 1.130108779822198e-05, "loss": 0.1425, "step": 6770 }, { "epoch": 0.6888143858579702, "grad_norm": 0.77734375, "learning_rate": 1.1233727031623783e-05, "loss": 0.1623, "step": 6780 }, { "epoch": 0.6898303362795896, "grad_norm": 4.625, "learning_rate": 1.1166509394916682e-05, "loss": 0.1591, "step": 6790 }, { "epoch": 0.690846286701209, "grad_norm": 3.84375, "learning_rate": 1.1099435586971982e-05, "loss": 0.1758, "step": 6800 }, { "epoch": 0.6918622371228285, "grad_norm": 2.4375, "learning_rate": 1.1032506305165555e-05, "loss": 0.1018, "step": 6810 }, { "epoch": 0.6928781875444479, "grad_norm": 3.203125, "learning_rate": 1.0965722245370641e-05, "loss": 0.1485, "step": 6820 }, { "epoch": 0.6938941379660672, "grad_norm": 0.7109375, "learning_rate": 1.0899084101950561e-05, "loss": 0.1762, "step": 6830 }, { "epoch": 0.6949100883876866, "grad_norm": 1.9765625, "learning_rate": 1.0832592567751555e-05, "loss": 0.1402, "step": 6840 }, { "epoch": 0.6959260388093061, "grad_norm": 1.4609375, "learning_rate": 1.0766248334095505e-05, "loss": 0.2278, "step": 6850 }, { "epoch": 0.6969419892309255, "grad_norm": 3.953125, "learning_rate": 1.0700052090772828e-05, "loss": 0.1969, "step": 6860 }, { "epoch": 0.6979579396525449, "grad_norm": 2.453125, "learning_rate": 1.0634004526035249e-05, "loss": 0.2073, "step": 6870 }, { "epoch": 0.6989738900741643, "grad_norm": 1.6171875, "learning_rate": 1.0568106326588645e-05, "loss": 0.1902, "step": 6880 }, { "epoch": 0.6999898404957838, "grad_norm": 1.2734375, "learning_rate": 1.0502358177585953e-05, "loss": 0.2165, "step": 6890 }, { "epoch": 0.7010057909174032, "grad_norm": 1.671875, "learning_rate": 1.0436760762619977e-05, "loss": 0.1952, "step": 6900 }, { "epoch": 0.7020217413390226, "grad_norm": 2.8125, "learning_rate": 1.0371314763716347e-05, "loss": 0.1422, "step": 6910 }, { "epoch": 0.7030376917606421, "grad_norm": 2.53125, "learning_rate": 1.0306020861326388e-05, "loss": 0.0961, "step": 6920 }, { "epoch": 0.7040536421822615, "grad_norm": 3.046875, "learning_rate": 1.0240879734320068e-05, "loss": 0.1542, "step": 6930 }, { "epoch": 0.7050695926038809, "grad_norm": 2.859375, "learning_rate": 1.0175892059978901e-05, "loss": 0.1748, "step": 6940 }, { "epoch": 0.7060855430255003, "grad_norm": 2.671875, "learning_rate": 1.0111058513988958e-05, "loss": 0.0819, "step": 6950 }, { "epoch": 0.7071014934471198, "grad_norm": 3.5625, "learning_rate": 1.0046379770433803e-05, "loss": 0.1933, "step": 6960 }, { "epoch": 0.7081174438687392, "grad_norm": 2.859375, "learning_rate": 9.98185650178749e-06, "loss": 0.1891, "step": 6970 }, { "epoch": 0.7091333942903586, "grad_norm": 3.15625, "learning_rate": 9.917489378907591e-06, "loss": 0.2102, "step": 6980 }, { "epoch": 0.7101493447119781, "grad_norm": 6.40625, "learning_rate": 9.853279071028212e-06, "loss": 0.1714, "step": 6990 }, { "epoch": 0.7111652951335975, "grad_norm": 2.375, "learning_rate": 9.78922624575303e-06, "loss": 0.1299, "step": 7000 }, { "epoch": 0.7121812455552169, "grad_norm": 2.078125, "learning_rate": 9.72533156904833e-06, "loss": 0.1914, "step": 7010 }, { "epoch": 0.7131971959768363, "grad_norm": 3.859375, "learning_rate": 9.661595705236137e-06, "loss": 0.2377, "step": 7020 }, { "epoch": 0.7142131463984558, "grad_norm": 1.171875, "learning_rate": 9.598019316987244e-06, "loss": 0.1851, "step": 7030 }, { "epoch": 0.7152290968200752, "grad_norm": 1.078125, "learning_rate": 9.53460306531439e-06, "loss": 0.2661, "step": 7040 }, { "epoch": 0.7162450472416946, "grad_norm": 1.6484375, "learning_rate": 9.471347609565311e-06, "loss": 0.1669, "step": 7050 }, { "epoch": 0.7172609976633141, "grad_norm": 4.59375, "learning_rate": 9.408253607415957e-06, "loss": 0.2487, "step": 7060 }, { "epoch": 0.7182769480849335, "grad_norm": 3.09375, "learning_rate": 9.345321714863614e-06, "loss": 0.186, "step": 7070 }, { "epoch": 0.7192928985065529, "grad_norm": 6.0625, "learning_rate": 9.282552586220075e-06, "loss": 0.2249, "step": 7080 }, { "epoch": 0.7203088489281723, "grad_norm": 1.5703125, "learning_rate": 9.219946874104885e-06, "loss": 0.1255, "step": 7090 }, { "epoch": 0.7213247993497918, "grad_norm": 1.9453125, "learning_rate": 9.157505229438481e-06, "loss": 0.1999, "step": 7100 }, { "epoch": 0.7223407497714112, "grad_norm": 5.1875, "learning_rate": 9.095228301435518e-06, "loss": 0.199, "step": 7110 }, { "epoch": 0.7233567001930306, "grad_norm": 2.078125, "learning_rate": 9.03311673759802e-06, "loss": 0.2182, "step": 7120 }, { "epoch": 0.7243726506146501, "grad_norm": 6.46875, "learning_rate": 8.971171183708733e-06, "loss": 0.1573, "step": 7130 }, { "epoch": 0.7253886010362695, "grad_norm": 3.015625, "learning_rate": 8.909392283824353e-06, "loss": 0.2044, "step": 7140 }, { "epoch": 0.7264045514578888, "grad_norm": 2.921875, "learning_rate": 8.847780680268872e-06, "loss": 0.11, "step": 7150 }, { "epoch": 0.7274205018795082, "grad_norm": 2.96875, "learning_rate": 8.786337013626853e-06, "loss": 0.1897, "step": 7160 }, { "epoch": 0.7284364523011277, "grad_norm": 1.7578125, "learning_rate": 8.725061922736799e-06, "loss": 0.153, "step": 7170 }, { "epoch": 0.7294524027227471, "grad_norm": 1.609375, "learning_rate": 8.663956044684532e-06, "loss": 0.1746, "step": 7180 }, { "epoch": 0.7304683531443665, "grad_norm": 1.9375, "learning_rate": 8.603020014796507e-06, "loss": 0.2284, "step": 7190 }, { "epoch": 0.7314843035659859, "grad_norm": 1.515625, "learning_rate": 8.542254466633273e-06, "loss": 0.1186, "step": 7200 }, { "epoch": 0.7325002539876054, "grad_norm": 1.671875, "learning_rate": 8.481660031982844e-06, "loss": 0.1971, "step": 7210 }, { "epoch": 0.7335162044092248, "grad_norm": 1.453125, "learning_rate": 8.421237340854157e-06, "loss": 0.196, "step": 7220 }, { "epoch": 0.7345321548308442, "grad_norm": 0.65234375, "learning_rate": 8.360987021470479e-06, "loss": 0.1724, "step": 7230 }, { "epoch": 0.7355481052524637, "grad_norm": 2.84375, "learning_rate": 8.300909700262929e-06, "loss": 0.175, "step": 7240 }, { "epoch": 0.7365640556740831, "grad_norm": 3.109375, "learning_rate": 8.241006001863924e-06, "loss": 0.2276, "step": 7250 }, { "epoch": 0.7375800060957025, "grad_norm": 4.8125, "learning_rate": 8.181276549100714e-06, "loss": 0.2029, "step": 7260 }, { "epoch": 0.7385959565173219, "grad_norm": 4.03125, "learning_rate": 8.12172196298887e-06, "loss": 0.175, "step": 7270 }, { "epoch": 0.7396119069389414, "grad_norm": 3.046875, "learning_rate": 8.062342862725878e-06, "loss": 0.1662, "step": 7280 }, { "epoch": 0.7406278573605608, "grad_norm": 3.375, "learning_rate": 8.003139865684662e-06, "loss": 0.1616, "step": 7290 }, { "epoch": 0.7416438077821802, "grad_norm": 2.5625, "learning_rate": 7.944113587407157e-06, "loss": 0.2448, "step": 7300 }, { "epoch": 0.7426597582037997, "grad_norm": 4.125, "learning_rate": 7.885264641597961e-06, "loss": 0.1618, "step": 7310 }, { "epoch": 0.7436757086254191, "grad_norm": 3.5, "learning_rate": 7.826593640117889e-06, "loss": 0.1134, "step": 7320 }, { "epoch": 0.7446916590470385, "grad_norm": 2.6875, "learning_rate": 7.76810119297767e-06, "loss": 0.1795, "step": 7330 }, { "epoch": 0.7457076094686579, "grad_norm": 4.34375, "learning_rate": 7.709787908331556e-06, "loss": 0.2736, "step": 7340 }, { "epoch": 0.7467235598902774, "grad_norm": 1.21875, "learning_rate": 7.651654392471038e-06, "loss": 0.139, "step": 7350 }, { "epoch": 0.7477395103118968, "grad_norm": 3.578125, "learning_rate": 7.593701249818521e-06, "loss": 0.2023, "step": 7360 }, { "epoch": 0.7487554607335162, "grad_norm": 2.15625, "learning_rate": 7.535929082921048e-06, "loss": 0.1702, "step": 7370 }, { "epoch": 0.7497714111551357, "grad_norm": 1.96875, "learning_rate": 7.47833849244402e-06, "loss": 0.1835, "step": 7380 }, { "epoch": 0.7507873615767551, "grad_norm": 2.796875, "learning_rate": 7.420930077164959e-06, "loss": 0.1713, "step": 7390 }, { "epoch": 0.7518033119983745, "grad_norm": 4.46875, "learning_rate": 7.363704433967311e-06, "loss": 0.1906, "step": 7400 }, { "epoch": 0.7528192624199939, "grad_norm": 1.75, "learning_rate": 7.306662157834185e-06, "loss": 0.1421, "step": 7410 }, { "epoch": 0.7538352128416134, "grad_norm": 1.140625, "learning_rate": 7.2498038418422145e-06, "loss": 0.1793, "step": 7420 }, { "epoch": 0.7548511632632328, "grad_norm": 2.578125, "learning_rate": 7.193130077155374e-06, "loss": 0.1603, "step": 7430 }, { "epoch": 0.7558671136848522, "grad_norm": 4.3125, "learning_rate": 7.13664145301883e-06, "loss": 0.2169, "step": 7440 }, { "epoch": 0.7568830641064717, "grad_norm": 3.078125, "learning_rate": 7.0803385567528025e-06, "loss": 0.1685, "step": 7450 }, { "epoch": 0.757899014528091, "grad_norm": 3.5625, "learning_rate": 7.024221973746495e-06, "loss": 0.2282, "step": 7460 }, { "epoch": 0.7589149649497104, "grad_norm": 2.265625, "learning_rate": 6.968292287451961e-06, "loss": 0.1786, "step": 7470 }, { "epoch": 0.7599309153713298, "grad_norm": 4.71875, "learning_rate": 6.912550079378091e-06, "loss": 0.1811, "step": 7480 }, { "epoch": 0.7609468657929493, "grad_norm": 2.328125, "learning_rate": 6.856995929084506e-06, "loss": 0.1747, "step": 7490 }, { "epoch": 0.7619628162145687, "grad_norm": 5.21875, "learning_rate": 6.801630414175589e-06, "loss": 0.2028, "step": 7500 }, { "epoch": 0.7629787666361881, "grad_norm": 3.78125, "learning_rate": 6.746454110294451e-06, "loss": 0.2255, "step": 7510 }, { "epoch": 0.7639947170578075, "grad_norm": 1.625, "learning_rate": 6.691467591116931e-06, "loss": 0.1604, "step": 7520 }, { "epoch": 0.765010667479427, "grad_norm": 1.7734375, "learning_rate": 6.6366714283456755e-06, "loss": 0.2559, "step": 7530 }, { "epoch": 0.7660266179010464, "grad_norm": 4.59375, "learning_rate": 6.582066191704142e-06, "loss": 0.2034, "step": 7540 }, { "epoch": 0.7670425683226658, "grad_norm": 1.578125, "learning_rate": 6.527652448930724e-06, "loss": 0.148, "step": 7550 }, { "epoch": 0.7680585187442853, "grad_norm": 1.7109375, "learning_rate": 6.4734307657728e-06, "loss": 0.1811, "step": 7560 }, { "epoch": 0.7690744691659047, "grad_norm": 1.2734375, "learning_rate": 6.419401705980924e-06, "loss": 0.1407, "step": 7570 }, { "epoch": 0.7700904195875241, "grad_norm": 2.25, "learning_rate": 6.365565831302869e-06, "loss": 0.1893, "step": 7580 }, { "epoch": 0.7711063700091435, "grad_norm": 1.625, "learning_rate": 6.311923701477854e-06, "loss": 0.1835, "step": 7590 }, { "epoch": 0.772122320430763, "grad_norm": 2.375, "learning_rate": 6.258475874230713e-06, "loss": 0.1579, "step": 7600 }, { "epoch": 0.7731382708523824, "grad_norm": 4.5, "learning_rate": 6.205222905266067e-06, "loss": 0.1794, "step": 7610 }, { "epoch": 0.7741542212740018, "grad_norm": 4.25, "learning_rate": 6.152165348262598e-06, "loss": 0.1477, "step": 7620 }, { "epoch": 0.7751701716956213, "grad_norm": 1.9765625, "learning_rate": 6.0993037548672246e-06, "loss": 0.2396, "step": 7630 }, { "epoch": 0.7761861221172407, "grad_norm": 2.671875, "learning_rate": 6.046638674689454e-06, "loss": 0.1717, "step": 7640 }, { "epoch": 0.7772020725388601, "grad_norm": 3.671875, "learning_rate": 5.994170655295567e-06, "loss": 0.2646, "step": 7650 }, { "epoch": 0.7782180229604795, "grad_norm": 1.3046875, "learning_rate": 5.9419002422030106e-06, "loss": 0.1553, "step": 7660 }, { "epoch": 0.779233973382099, "grad_norm": 3.734375, "learning_rate": 5.889827978874665e-06, "loss": 0.1854, "step": 7670 }, { "epoch": 0.7802499238037184, "grad_norm": 2.140625, "learning_rate": 5.837954406713245e-06, "loss": 0.1857, "step": 7680 }, { "epoch": 0.7812658742253378, "grad_norm": 3.34375, "learning_rate": 5.786280065055619e-06, "loss": 0.1797, "step": 7690 }, { "epoch": 0.7822818246469573, "grad_norm": 0.97265625, "learning_rate": 5.734805491167244e-06, "loss": 0.1488, "step": 7700 }, { "epoch": 0.7832977750685767, "grad_norm": 2.078125, "learning_rate": 5.683531220236576e-06, "loss": 0.1688, "step": 7710 }, { "epoch": 0.7843137254901961, "grad_norm": 3.046875, "learning_rate": 5.632457785369455e-06, "loss": 0.1503, "step": 7720 }, { "epoch": 0.7853296759118155, "grad_norm": 1.6875, "learning_rate": 5.581585717583637e-06, "loss": 0.1658, "step": 7730 }, { "epoch": 0.786345626333435, "grad_norm": 3.421875, "learning_rate": 5.530915545803209e-06, "loss": 0.2112, "step": 7740 }, { "epoch": 0.7873615767550544, "grad_norm": 4.1875, "learning_rate": 5.480447796853141e-06, "loss": 0.165, "step": 7750 }, { "epoch": 0.7883775271766738, "grad_norm": 5.3125, "learning_rate": 5.430182995453756e-06, "loss": 0.1499, "step": 7760 }, { "epoch": 0.7893934775982933, "grad_norm": 2.1875, "learning_rate": 5.380121664215329e-06, "loss": 0.1559, "step": 7770 }, { "epoch": 0.7904094280199127, "grad_norm": 1.46875, "learning_rate": 5.330264323632611e-06, "loss": 0.2098, "step": 7780 }, { "epoch": 0.791425378441532, "grad_norm": 4.65625, "learning_rate": 5.280611492079449e-06, "loss": 0.1776, "step": 7790 }, { "epoch": 0.7924413288631514, "grad_norm": 1.3359375, "learning_rate": 5.231163685803361e-06, "loss": 0.1497, "step": 7800 }, { "epoch": 0.7934572792847709, "grad_norm": 2.640625, "learning_rate": 5.181921418920191e-06, "loss": 0.12, "step": 7810 }, { "epoch": 0.7944732297063903, "grad_norm": 2.328125, "learning_rate": 5.13288520340878e-06, "loss": 0.1981, "step": 7820 }, { "epoch": 0.7954891801280097, "grad_norm": 3.0625, "learning_rate": 5.084055549105596e-06, "loss": 0.1389, "step": 7830 }, { "epoch": 0.7965051305496291, "grad_norm": 2.796875, "learning_rate": 5.035432963699479e-06, "loss": 0.2293, "step": 7840 }, { "epoch": 0.7975210809712486, "grad_norm": 5.0625, "learning_rate": 4.98701795272635e-06, "loss": 0.1618, "step": 7850 }, { "epoch": 0.798537031392868, "grad_norm": 5.09375, "learning_rate": 4.938811019563938e-06, "loss": 0.1755, "step": 7860 }, { "epoch": 0.7995529818144874, "grad_norm": 2.140625, "learning_rate": 4.8908126654265475e-06, "loss": 0.1565, "step": 7870 }, { "epoch": 0.8005689322361069, "grad_norm": 0.76171875, "learning_rate": 4.843023389359885e-06, "loss": 0.2176, "step": 7880 }, { "epoch": 0.8015848826577263, "grad_norm": 2.625, "learning_rate": 4.79544368823581e-06, "loss": 0.2013, "step": 7890 }, { "epoch": 0.8026008330793457, "grad_norm": 2.078125, "learning_rate": 4.748074056747234e-06, "loss": 0.1246, "step": 7900 }, { "epoch": 0.8036167835009651, "grad_norm": 3.5, "learning_rate": 4.700914987402919e-06, "loss": 0.1638, "step": 7910 }, { "epoch": 0.8046327339225846, "grad_norm": 3.4375, "learning_rate": 4.6539669705223916e-06, "loss": 0.2213, "step": 7920 }, { "epoch": 0.805648684344204, "grad_norm": 2.96875, "learning_rate": 4.607230494230849e-06, "loss": 0.1822, "step": 7930 }, { "epoch": 0.8066646347658234, "grad_norm": 2.359375, "learning_rate": 4.560706044454047e-06, "loss": 0.1763, "step": 7940 }, { "epoch": 0.8076805851874429, "grad_norm": 4.59375, "learning_rate": 4.514394104913291e-06, "loss": 0.234, "step": 7950 }, { "epoch": 0.8086965356090623, "grad_norm": 1.96875, "learning_rate": 4.468295157120372e-06, "loss": 0.1939, "step": 7960 }, { "epoch": 0.8097124860306817, "grad_norm": 2.578125, "learning_rate": 4.422409680372594e-06, "loss": 0.174, "step": 7970 }, { "epoch": 0.8107284364523011, "grad_norm": 4.5625, "learning_rate": 4.3767381517477505e-06, "loss": 0.2375, "step": 7980 }, { "epoch": 0.8117443868739206, "grad_norm": 0.9609375, "learning_rate": 4.331281046099203e-06, "loss": 0.2076, "step": 7990 }, { "epoch": 0.81276033729554, "grad_norm": 6.0625, "learning_rate": 4.286038836050929e-06, "loss": 0.2504, "step": 8000 }, { "epoch": 0.8137762877171594, "grad_norm": 3.484375, "learning_rate": 4.241011991992586e-06, "loss": 0.2102, "step": 8010 }, { "epoch": 0.8147922381387789, "grad_norm": 1.9765625, "learning_rate": 4.1962009820746635e-06, "loss": 0.1846, "step": 8020 }, { "epoch": 0.8158081885603983, "grad_norm": 1.875, "learning_rate": 4.15160627220357e-06, "loss": 0.1741, "step": 8030 }, { "epoch": 0.8168241389820177, "grad_norm": 5.5625, "learning_rate": 4.107228326036838e-06, "loss": 0.2078, "step": 8040 }, { "epoch": 0.8178400894036371, "grad_norm": 1.7578125, "learning_rate": 4.063067604978252e-06, "loss": 0.212, "step": 8050 }, { "epoch": 0.8188560398252566, "grad_norm": 4.09375, "learning_rate": 4.019124568173094e-06, "loss": 0.1831, "step": 8060 }, { "epoch": 0.819871990246876, "grad_norm": 6.625, "learning_rate": 3.975399672503341e-06, "loss": 0.2196, "step": 8070 }, { "epoch": 0.8208879406684954, "grad_norm": 2.78125, "learning_rate": 3.931893372582943e-06, "loss": 0.2002, "step": 8080 }, { "epoch": 0.8219038910901149, "grad_norm": 6.90625, "learning_rate": 3.888606120753047e-06, "loss": 0.2138, "step": 8090 }, { "epoch": 0.8229198415117343, "grad_norm": 4.09375, "learning_rate": 3.845538367077362e-06, "loss": 0.2593, "step": 8100 }, { "epoch": 0.8239357919333536, "grad_norm": 1.859375, "learning_rate": 3.8026905593374213e-06, "loss": 0.2062, "step": 8110 }, { "epoch": 0.824951742354973, "grad_norm": 4.3125, "learning_rate": 3.760063143027945e-06, "loss": 0.1343, "step": 8120 }, { "epoch": 0.8259676927765925, "grad_norm": 1.984375, "learning_rate": 3.7176565613522313e-06, "loss": 0.2494, "step": 8130 }, { "epoch": 0.8269836431982119, "grad_norm": 3.71875, "learning_rate": 3.675471255217516e-06, "loss": 0.1502, "step": 8140 }, { "epoch": 0.8279995936198313, "grad_norm": 2.359375, "learning_rate": 3.6335076632304175e-06, "loss": 0.1256, "step": 8150 }, { "epoch": 0.8290155440414507, "grad_norm": 1.46875, "learning_rate": 3.5917662216923332e-06, "loss": 0.1709, "step": 8160 }, { "epoch": 0.8300314944630702, "grad_norm": 2.78125, "learning_rate": 3.550247364594958e-06, "loss": 0.1881, "step": 8170 }, { "epoch": 0.8310474448846896, "grad_norm": 1.0703125, "learning_rate": 3.508951523615725e-06, "loss": 0.1998, "step": 8180 }, { "epoch": 0.832063395306309, "grad_norm": 2.40625, "learning_rate": 3.467879128113352e-06, "loss": 0.2429, "step": 8190 }, { "epoch": 0.8330793457279285, "grad_norm": 2.609375, "learning_rate": 3.427030605123352e-06, "loss": 0.1942, "step": 8200 }, { "epoch": 0.8340952961495479, "grad_norm": 1.6015625, "learning_rate": 3.3864063793536043e-06, "loss": 0.1898, "step": 8210 }, { "epoch": 0.8351112465711673, "grad_norm": 5.375, "learning_rate": 3.3460068731799577e-06, "loss": 0.1919, "step": 8220 }, { "epoch": 0.8361271969927867, "grad_norm": 3.3125, "learning_rate": 3.3058325066417818e-06, "loss": 0.1516, "step": 8230 }, { "epoch": 0.8371431474144062, "grad_norm": 0.76171875, "learning_rate": 3.26588369743768e-06, "loss": 0.1068, "step": 8240 }, { "epoch": 0.8381590978360256, "grad_norm": 3.171875, "learning_rate": 3.2261608609210653e-06, "loss": 0.1203, "step": 8250 }, { "epoch": 0.839175048257645, "grad_norm": 2.359375, "learning_rate": 3.186664410095913e-06, "loss": 0.2172, "step": 8260 }, { "epoch": 0.8401909986792645, "grad_norm": 3.328125, "learning_rate": 3.1473947556124093e-06, "loss": 0.1249, "step": 8270 }, { "epoch": 0.8412069491008839, "grad_norm": 2.484375, "learning_rate": 3.1083523057627213e-06, "loss": 0.1744, "step": 8280 }, { "epoch": 0.8422228995225033, "grad_norm": 4.46875, "learning_rate": 3.0695374664767353e-06, "loss": 0.1772, "step": 8290 }, { "epoch": 0.8432388499441227, "grad_norm": 0.59375, "learning_rate": 3.0309506413178397e-06, "loss": 0.2302, "step": 8300 }, { "epoch": 0.8442548003657422, "grad_norm": 2.390625, "learning_rate": 2.9925922314787136e-06, "loss": 0.1635, "step": 8310 }, { "epoch": 0.8452707507873616, "grad_norm": 2.34375, "learning_rate": 2.954462635777194e-06, "loss": 0.1573, "step": 8320 }, { "epoch": 0.846286701208981, "grad_norm": 2.015625, "learning_rate": 2.916562250652083e-06, "loss": 0.1608, "step": 8330 }, { "epoch": 0.8473026516306005, "grad_norm": 4.125, "learning_rate": 2.878891470159048e-06, "loss": 0.184, "step": 8340 }, { "epoch": 0.8483186020522199, "grad_norm": 2.515625, "learning_rate": 2.8414506859665514e-06, "loss": 0.2141, "step": 8350 }, { "epoch": 0.8493345524738393, "grad_norm": 3.375, "learning_rate": 2.8042402873517197e-06, "loss": 0.1729, "step": 8360 }, { "epoch": 0.8503505028954587, "grad_norm": 3.078125, "learning_rate": 2.76726066119635e-06, "loss": 0.2252, "step": 8370 }, { "epoch": 0.8513664533170782, "grad_norm": 1.5390625, "learning_rate": 2.730512191982845e-06, "loss": 0.1644, "step": 8380 }, { "epoch": 0.8523824037386976, "grad_norm": 1.9296875, "learning_rate": 2.693995261790261e-06, "loss": 0.1822, "step": 8390 }, { "epoch": 0.853398354160317, "grad_norm": 3.3125, "learning_rate": 2.657710250290285e-06, "loss": 0.2068, "step": 8400 }, { "epoch": 0.8544143045819365, "grad_norm": 0.640625, "learning_rate": 2.621657534743327e-06, "loss": 0.1224, "step": 8410 }, { "epoch": 0.8554302550035559, "grad_norm": 3.421875, "learning_rate": 2.5858374899945804e-06, "loss": 0.179, "step": 8420 }, { "epoch": 0.8564462054251752, "grad_norm": 3.484375, "learning_rate": 2.550250488470135e-06, "loss": 0.1873, "step": 8430 }, { "epoch": 0.8574621558467946, "grad_norm": 3.984375, "learning_rate": 2.5148969001730806e-06, "loss": 0.1799, "step": 8440 }, { "epoch": 0.8584781062684141, "grad_norm": 1.375, "learning_rate": 2.4797770926796858e-06, "loss": 0.176, "step": 8450 }, { "epoch": 0.8594940566900335, "grad_norm": 1.8984375, "learning_rate": 2.444891431135571e-06, "loss": 0.1664, "step": 8460 }, { "epoch": 0.8605100071116529, "grad_norm": 4.15625, "learning_rate": 2.4102402782518936e-06, "loss": 0.1512, "step": 8470 }, { "epoch": 0.8615259575332723, "grad_norm": 1.34375, "learning_rate": 2.3758239943016096e-06, "loss": 0.1629, "step": 8480 }, { "epoch": 0.8625419079548918, "grad_norm": 5.3125, "learning_rate": 2.3416429371157013e-06, "loss": 0.2099, "step": 8490 }, { "epoch": 0.8635578583765112, "grad_norm": 5.9375, "learning_rate": 2.307697462079464e-06, "loss": 0.2221, "step": 8500 }, { "epoch": 0.8645738087981306, "grad_norm": 5.4375, "learning_rate": 2.273987922128809e-06, "loss": 0.2191, "step": 8510 }, { "epoch": 0.8655897592197501, "grad_norm": 2.171875, "learning_rate": 2.240514667746607e-06, "loss": 0.1843, "step": 8520 }, { "epoch": 0.8666057096413695, "grad_norm": 2.5625, "learning_rate": 2.2072780469590245e-06, "loss": 0.2494, "step": 8530 }, { "epoch": 0.8676216600629889, "grad_norm": 2.25, "learning_rate": 2.1742784053319116e-06, "loss": 0.1712, "step": 8540 }, { "epoch": 0.8686376104846083, "grad_norm": 4.5625, "learning_rate": 2.141516085967224e-06, "loss": 0.1169, "step": 8550 }, { "epoch": 0.8696535609062278, "grad_norm": 4.25, "learning_rate": 2.1089914294994434e-06, "loss": 0.1374, "step": 8560 }, { "epoch": 0.8706695113278472, "grad_norm": 3.265625, "learning_rate": 2.0767047740920336e-06, "loss": 0.2162, "step": 8570 }, { "epoch": 0.8716854617494666, "grad_norm": 1.8203125, "learning_rate": 2.0446564554339187e-06, "loss": 0.1593, "step": 8580 }, { "epoch": 0.8727014121710861, "grad_norm": 2.671875, "learning_rate": 2.0128468067360185e-06, "loss": 0.1857, "step": 8590 }, { "epoch": 0.8737173625927055, "grad_norm": 2.765625, "learning_rate": 1.981276158727749e-06, "loss": 0.1989, "step": 8600 }, { "epoch": 0.8747333130143249, "grad_norm": 2.65625, "learning_rate": 1.949944839653625e-06, "loss": 0.2077, "step": 8610 }, { "epoch": 0.8757492634359443, "grad_norm": 2.625, "learning_rate": 1.918853175269797e-06, "loss": 0.2003, "step": 8620 }, { "epoch": 0.8767652138575638, "grad_norm": 0.71875, "learning_rate": 1.8880014888407127e-06, "loss": 0.2486, "step": 8630 }, { "epoch": 0.8777811642791832, "grad_norm": 4.71875, "learning_rate": 1.8573901011357336e-06, "loss": 0.1896, "step": 8640 }, { "epoch": 0.8787971147008026, "grad_norm": 5.0625, "learning_rate": 1.8270193304257887e-06, "loss": 0.1727, "step": 8650 }, { "epoch": 0.8798130651224221, "grad_norm": 1.75, "learning_rate": 1.7968894924800916e-06, "loss": 0.1687, "step": 8660 }, { "epoch": 0.8808290155440415, "grad_norm": 2.65625, "learning_rate": 1.7670009005628291e-06, "loss": 0.166, "step": 8670 }, { "epoch": 0.8818449659656609, "grad_norm": 4.71875, "learning_rate": 1.737353865429936e-06, "loss": 0.1471, "step": 8680 }, { "epoch": 0.8828609163872803, "grad_norm": 0.546875, "learning_rate": 1.7079486953258283e-06, "loss": 0.1075, "step": 8690 }, { "epoch": 0.8838768668088998, "grad_norm": 1.640625, "learning_rate": 1.6787856959802367e-06, "loss": 0.2113, "step": 8700 }, { "epoch": 0.8848928172305192, "grad_norm": 2.953125, "learning_rate": 1.6498651706049945e-06, "loss": 0.1412, "step": 8710 }, { "epoch": 0.8859087676521386, "grad_norm": 3.796875, "learning_rate": 1.6211874198909072e-06, "loss": 0.1701, "step": 8720 }, { "epoch": 0.8869247180737581, "grad_norm": 3.734375, "learning_rate": 1.592752742004605e-06, "loss": 0.1348, "step": 8730 }, { "epoch": 0.8879406684953774, "grad_norm": 2.21875, "learning_rate": 1.5645614325854735e-06, "loss": 0.1931, "step": 8740 }, { "epoch": 0.8889566189169968, "grad_norm": 3.4375, "learning_rate": 1.5366137847425466e-06, "loss": 0.1705, "step": 8750 }, { "epoch": 0.8899725693386162, "grad_norm": 3.5625, "learning_rate": 1.5089100890514769e-06, "loss": 0.1889, "step": 8760 }, { "epoch": 0.8909885197602357, "grad_norm": 2.65625, "learning_rate": 1.4814506335515176e-06, "loss": 0.1837, "step": 8770 }, { "epoch": 0.8920044701818551, "grad_norm": 1.421875, "learning_rate": 1.4542357037425207e-06, "loss": 0.1728, "step": 8780 }, { "epoch": 0.8930204206034745, "grad_norm": 1.625, "learning_rate": 1.4272655825819713e-06, "loss": 0.1562, "step": 8790 }, { "epoch": 0.8940363710250939, "grad_norm": 4.0625, "learning_rate": 1.4005405504820351e-06, "loss": 0.1681, "step": 8800 }, { "epoch": 0.8950523214467134, "grad_norm": 2.328125, "learning_rate": 1.3740608853066634e-06, "loss": 0.1449, "step": 8810 }, { "epoch": 0.8960682718683328, "grad_norm": 4.0625, "learning_rate": 1.347826862368684e-06, "loss": 0.2418, "step": 8820 }, { "epoch": 0.8970842222899522, "grad_norm": 0.55859375, "learning_rate": 1.3218387544269545e-06, "loss": 0.2473, "step": 8830 }, { "epoch": 0.8981001727115717, "grad_norm": 4.78125, "learning_rate": 1.2960968316835132e-06, "loss": 0.194, "step": 8840 }, { "epoch": 0.8991161231331911, "grad_norm": 3.921875, "learning_rate": 1.2706013617807822e-06, "loss": 0.2109, "step": 8850 }, { "epoch": 0.9001320735548105, "grad_norm": 5.03125, "learning_rate": 1.2453526097987778e-06, "loss": 0.151, "step": 8860 }, { "epoch": 0.9011480239764299, "grad_norm": 5.96875, "learning_rate": 1.2203508382523431e-06, "loss": 0.1811, "step": 8870 }, { "epoch": 0.9021639743980494, "grad_norm": 3.828125, "learning_rate": 1.1955963070884534e-06, "loss": 0.2004, "step": 8880 }, { "epoch": 0.9031799248196688, "grad_norm": 1.9765625, "learning_rate": 1.171089273683465e-06, "loss": 0.1395, "step": 8890 }, { "epoch": 0.9041958752412882, "grad_norm": 2.328125, "learning_rate": 1.1468299928404868e-06, "loss": 0.1915, "step": 8900 }, { "epoch": 0.9052118256629077, "grad_norm": 1.265625, "learning_rate": 1.1228187167866943e-06, "loss": 0.1281, "step": 8910 }, { "epoch": 0.9062277760845271, "grad_norm": 1.4375, "learning_rate": 1.099055695170728e-06, "loss": 0.1627, "step": 8920 }, { "epoch": 0.9072437265061465, "grad_norm": 0.6953125, "learning_rate": 1.0755411750600962e-06, "loss": 0.1768, "step": 8930 }, { "epoch": 0.9082596769277659, "grad_norm": 1.046875, "learning_rate": 1.052275400938596e-06, "loss": 0.1544, "step": 8940 }, { "epoch": 0.9092756273493854, "grad_norm": 2.71875, "learning_rate": 1.0292586147037764e-06, "loss": 0.2498, "step": 8950 }, { "epoch": 0.9102915777710048, "grad_norm": 3.0625, "learning_rate": 1.0064910556644214e-06, "loss": 0.1918, "step": 8960 }, { "epoch": 0.9113075281926242, "grad_norm": 4.0, "learning_rate": 9.839729605380766e-07, "loss": 0.2388, "step": 8970 }, { "epoch": 0.9123234786142437, "grad_norm": 3.765625, "learning_rate": 9.61704563448565e-07, "loss": 0.1944, "step": 8980 }, { "epoch": 0.9133394290358631, "grad_norm": 2.90625, "learning_rate": 9.396860959235671e-07, "loss": 0.1667, "step": 8990 }, { "epoch": 0.9143553794574825, "grad_norm": 2.4375, "learning_rate": 9.179177868922085e-07, "loss": 0.2143, "step": 9000 } ], "logging_steps": 10, "max_steps": 9843, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 7, "trial_name": null, "trial_params": null }