{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5949, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016809547823163557, "grad_norm": 79552.359375, "learning_rate": 4.9915952260884186e-05, "loss": 1.0656, "step": 10 }, { "epoch": 0.0033619095646327114, "grad_norm": 79056.6640625, "learning_rate": 4.983190452176837e-05, "loss": 1.0215, "step": 20 }, { "epoch": 0.005042864346949067, "grad_norm": 85323.828125, "learning_rate": 4.9747856782652546e-05, "loss": 0.9578, "step": 30 }, { "epoch": 0.006723819129265423, "grad_norm": 71872.125, "learning_rate": 4.966380904353673e-05, "loss": 0.9151, "step": 40 }, { "epoch": 0.008404773911581778, "grad_norm": 71638.46875, "learning_rate": 4.957976130442091e-05, "loss": 0.8803, "step": 50 }, { "epoch": 0.010085728693898134, "grad_norm": 74304.5234375, "learning_rate": 4.9495713565305096e-05, "loss": 0.9141, "step": 60 }, { "epoch": 0.01176668347621449, "grad_norm": 73676.1953125, "learning_rate": 4.941166582618928e-05, "loss": 0.9949, "step": 70 }, { "epoch": 0.013447638258530846, "grad_norm": 76955.7890625, "learning_rate": 4.9327618087073463e-05, "loss": 0.8852, "step": 80 }, { "epoch": 0.015128593040847202, "grad_norm": 69415.2578125, "learning_rate": 4.924357034795764e-05, "loss": 0.8891, "step": 90 }, { "epoch": 0.016809547823163556, "grad_norm": 84991.7734375, "learning_rate": 4.9159522608841824e-05, "loss": 0.939, "step": 100 }, { "epoch": 0.01849050260547991, "grad_norm": 72803.203125, "learning_rate": 4.907547486972601e-05, "loss": 0.8879, "step": 110 }, { "epoch": 0.020171457387796268, "grad_norm": 73581.6640625, "learning_rate": 4.899142713061019e-05, "loss": 0.923, "step": 120 }, { "epoch": 0.021852412170112624, "grad_norm": 80610.7421875, "learning_rate": 4.8907379391494374e-05, "loss": 0.8933, "step": 130 }, { "epoch": 0.02353336695242898, "grad_norm": 80112.125, "learning_rate": 4.882333165237855e-05, "loss": 0.8761, "step": 140 }, { "epoch": 0.025214321734745335, "grad_norm": 73625.0546875, "learning_rate": 4.8739283913262734e-05, "loss": 0.8442, "step": 150 }, { "epoch": 0.02689527651706169, "grad_norm": 79199.546875, "learning_rate": 4.865523617414692e-05, "loss": 0.9312, "step": 160 }, { "epoch": 0.028576231299378047, "grad_norm": 69096.6484375, "learning_rate": 4.85711884350311e-05, "loss": 0.9435, "step": 170 }, { "epoch": 0.030257186081694403, "grad_norm": 66993.53125, "learning_rate": 4.8487140695915285e-05, "loss": 0.8933, "step": 180 }, { "epoch": 0.031938140864010756, "grad_norm": 72366.515625, "learning_rate": 4.840309295679946e-05, "loss": 0.8513, "step": 190 }, { "epoch": 0.03361909564632711, "grad_norm": 70981.46875, "learning_rate": 4.8319045217683645e-05, "loss": 0.9126, "step": 200 }, { "epoch": 0.03530005042864347, "grad_norm": 73961.296875, "learning_rate": 4.823499747856783e-05, "loss": 0.908, "step": 210 }, { "epoch": 0.03698100521095982, "grad_norm": 71402.046875, "learning_rate": 4.815094973945201e-05, "loss": 0.915, "step": 220 }, { "epoch": 0.03866195999327618, "grad_norm": 69597.8984375, "learning_rate": 4.8066902000336195e-05, "loss": 0.892, "step": 230 }, { "epoch": 0.040342914775592535, "grad_norm": 77749.3359375, "learning_rate": 4.798285426122038e-05, "loss": 0.9004, "step": 240 }, { "epoch": 0.04202386955790889, "grad_norm": 69176.328125, "learning_rate": 4.7898806522104555e-05, "loss": 0.9878, "step": 250 }, { "epoch": 0.04370482434022525, "grad_norm": 76940.0625, "learning_rate": 4.781475878298874e-05, "loss": 0.8873, "step": 260 }, { "epoch": 0.0453857791225416, "grad_norm": 81460.0546875, "learning_rate": 4.773071104387292e-05, "loss": 0.9022, "step": 270 }, { "epoch": 0.04706673390485796, "grad_norm": 69520.1953125, "learning_rate": 4.7646663304757106e-05, "loss": 0.8729, "step": 280 }, { "epoch": 0.048747688687174315, "grad_norm": 76301.21875, "learning_rate": 4.756261556564129e-05, "loss": 0.9464, "step": 290 }, { "epoch": 0.05042864346949067, "grad_norm": 72831.2265625, "learning_rate": 4.7478567826525466e-05, "loss": 0.8346, "step": 300 }, { "epoch": 0.05210959825180703, "grad_norm": 73389.6328125, "learning_rate": 4.739452008740965e-05, "loss": 0.8794, "step": 310 }, { "epoch": 0.05379055303412338, "grad_norm": 78976.9453125, "learning_rate": 4.731047234829383e-05, "loss": 0.9318, "step": 320 }, { "epoch": 0.05547150781643974, "grad_norm": 72745.15625, "learning_rate": 4.7226424609178016e-05, "loss": 0.923, "step": 330 }, { "epoch": 0.057152462598756094, "grad_norm": 65032.38671875, "learning_rate": 4.71423768700622e-05, "loss": 0.9431, "step": 340 }, { "epoch": 0.05883341738107245, "grad_norm": 67867.875, "learning_rate": 4.705832913094638e-05, "loss": 0.853, "step": 350 }, { "epoch": 0.060514372163388806, "grad_norm": 132520.328125, "learning_rate": 4.697428139183056e-05, "loss": 0.8772, "step": 360 }, { "epoch": 0.06219532694570516, "grad_norm": 71777.140625, "learning_rate": 4.6890233652714743e-05, "loss": 0.841, "step": 370 }, { "epoch": 0.06387628172802151, "grad_norm": 71626.65625, "learning_rate": 4.680618591359893e-05, "loss": 0.8281, "step": 380 }, { "epoch": 0.06555723651033787, "grad_norm": 66127.0703125, "learning_rate": 4.672213817448311e-05, "loss": 0.9221, "step": 390 }, { "epoch": 0.06723819129265422, "grad_norm": 66148.8203125, "learning_rate": 4.6638090435367294e-05, "loss": 0.8607, "step": 400 }, { "epoch": 0.06891914607497059, "grad_norm": 75503.5078125, "learning_rate": 4.655404269625147e-05, "loss": 0.9062, "step": 410 }, { "epoch": 0.07060010085728693, "grad_norm": 61276.5625, "learning_rate": 4.6469994957135654e-05, "loss": 0.8669, "step": 420 }, { "epoch": 0.0722810556396033, "grad_norm": 71196.9453125, "learning_rate": 4.638594721801984e-05, "loss": 0.8686, "step": 430 }, { "epoch": 0.07396201042191965, "grad_norm": 66918.8671875, "learning_rate": 4.630189947890402e-05, "loss": 0.9121, "step": 440 }, { "epoch": 0.07564296520423601, "grad_norm": 64304.73828125, "learning_rate": 4.6217851739788204e-05, "loss": 0.8663, "step": 450 }, { "epoch": 0.07732391998655236, "grad_norm": 62031.97265625, "learning_rate": 4.613380400067238e-05, "loss": 0.8258, "step": 460 }, { "epoch": 0.07900487476886872, "grad_norm": 63083.9140625, "learning_rate": 4.6049756261556565e-05, "loss": 0.8657, "step": 470 }, { "epoch": 0.08068582955118507, "grad_norm": 67486.75, "learning_rate": 4.596570852244075e-05, "loss": 0.9225, "step": 480 }, { "epoch": 0.08236678433350143, "grad_norm": 68711.7890625, "learning_rate": 4.588166078332493e-05, "loss": 0.8172, "step": 490 }, { "epoch": 0.08404773911581778, "grad_norm": 69319.46875, "learning_rate": 4.5797613044209115e-05, "loss": 0.8034, "step": 500 }, { "epoch": 0.08572869389813415, "grad_norm": 67749.515625, "learning_rate": 4.57135653050933e-05, "loss": 0.9285, "step": 510 }, { "epoch": 0.0874096486804505, "grad_norm": 78357.3203125, "learning_rate": 4.5629517565977475e-05, "loss": 0.8566, "step": 520 }, { "epoch": 0.08909060346276686, "grad_norm": 59291.59375, "learning_rate": 4.554546982686166e-05, "loss": 0.8088, "step": 530 }, { "epoch": 0.0907715582450832, "grad_norm": 72898.515625, "learning_rate": 4.546142208774584e-05, "loss": 0.8411, "step": 540 }, { "epoch": 0.09245251302739957, "grad_norm": 65205.4296875, "learning_rate": 4.5377374348630026e-05, "loss": 0.9546, "step": 550 }, { "epoch": 0.09413346780971592, "grad_norm": 104009.6953125, "learning_rate": 4.529332660951421e-05, "loss": 0.8406, "step": 560 }, { "epoch": 0.09581442259203228, "grad_norm": 68204.453125, "learning_rate": 4.5209278870398386e-05, "loss": 0.897, "step": 570 }, { "epoch": 0.09749537737434863, "grad_norm": 70454.1484375, "learning_rate": 4.512523113128257e-05, "loss": 0.839, "step": 580 }, { "epoch": 0.09917633215666499, "grad_norm": 67070.96875, "learning_rate": 4.504118339216675e-05, "loss": 0.9008, "step": 590 }, { "epoch": 0.10085728693898134, "grad_norm": 70355.5234375, "learning_rate": 4.4957135653050936e-05, "loss": 0.9318, "step": 600 }, { "epoch": 0.10253824172129769, "grad_norm": 68497.7109375, "learning_rate": 4.487308791393512e-05, "loss": 0.9127, "step": 610 }, { "epoch": 0.10421919650361405, "grad_norm": 81705.2578125, "learning_rate": 4.47890401748193e-05, "loss": 0.8575, "step": 620 }, { "epoch": 0.1059001512859304, "grad_norm": 72572.75, "learning_rate": 4.470499243570348e-05, "loss": 0.8387, "step": 630 }, { "epoch": 0.10758110606824677, "grad_norm": 66429.328125, "learning_rate": 4.462094469658766e-05, "loss": 0.8322, "step": 640 }, { "epoch": 0.10926206085056311, "grad_norm": 73876.7265625, "learning_rate": 4.453689695747185e-05, "loss": 0.8788, "step": 650 }, { "epoch": 0.11094301563287948, "grad_norm": 73238.40625, "learning_rate": 4.445284921835603e-05, "loss": 0.8983, "step": 660 }, { "epoch": 0.11262397041519583, "grad_norm": 65609.2578125, "learning_rate": 4.4368801479240214e-05, "loss": 0.8406, "step": 670 }, { "epoch": 0.11430492519751219, "grad_norm": 79022.8515625, "learning_rate": 4.428475374012439e-05, "loss": 0.8204, "step": 680 }, { "epoch": 0.11598587997982854, "grad_norm": 63224.625, "learning_rate": 4.4200706001008574e-05, "loss": 0.8252, "step": 690 }, { "epoch": 0.1176668347621449, "grad_norm": 61681.51171875, "learning_rate": 4.411665826189276e-05, "loss": 0.9041, "step": 700 }, { "epoch": 0.11934778954446125, "grad_norm": 65631.578125, "learning_rate": 4.403261052277694e-05, "loss": 0.8541, "step": 710 }, { "epoch": 0.12102874432677761, "grad_norm": 70977.875, "learning_rate": 4.3948562783661124e-05, "loss": 0.9705, "step": 720 }, { "epoch": 0.12270969910909396, "grad_norm": 67375.078125, "learning_rate": 4.38645150445453e-05, "loss": 0.8542, "step": 730 }, { "epoch": 0.12439065389141032, "grad_norm": 60750.97265625, "learning_rate": 4.3780467305429484e-05, "loss": 0.7832, "step": 740 }, { "epoch": 0.1260716086737267, "grad_norm": 62710.984375, "learning_rate": 4.369641956631367e-05, "loss": 0.855, "step": 750 }, { "epoch": 0.12775256345604302, "grad_norm": 56731.50390625, "learning_rate": 4.361237182719785e-05, "loss": 0.8591, "step": 760 }, { "epoch": 0.12943351823835939, "grad_norm": 67084.78125, "learning_rate": 4.3528324088082035e-05, "loss": 0.8406, "step": 770 }, { "epoch": 0.13111447302067575, "grad_norm": 66152.15625, "learning_rate": 4.344427634896622e-05, "loss": 0.9062, "step": 780 }, { "epoch": 0.1327954278029921, "grad_norm": 71438.4765625, "learning_rate": 4.3360228609850395e-05, "loss": 0.8206, "step": 790 }, { "epoch": 0.13447638258530845, "grad_norm": 60924.875, "learning_rate": 4.327618087073458e-05, "loss": 0.8164, "step": 800 }, { "epoch": 0.1361573373676248, "grad_norm": 70178.5390625, "learning_rate": 4.319213313161876e-05, "loss": 0.8303, "step": 810 }, { "epoch": 0.13783829214994117, "grad_norm": 64838.84765625, "learning_rate": 4.3108085392502945e-05, "loss": 0.8503, "step": 820 }, { "epoch": 0.13951924693225753, "grad_norm": 72861.828125, "learning_rate": 4.302403765338713e-05, "loss": 0.9165, "step": 830 }, { "epoch": 0.14120020171457387, "grad_norm": 74359.4140625, "learning_rate": 4.2939989914271306e-05, "loss": 0.9102, "step": 840 }, { "epoch": 0.14288115649689023, "grad_norm": 70118.2109375, "learning_rate": 4.285594217515549e-05, "loss": 0.8703, "step": 850 }, { "epoch": 0.1445621112792066, "grad_norm": 63720.28125, "learning_rate": 4.277189443603967e-05, "loss": 0.893, "step": 860 }, { "epoch": 0.14624306606152296, "grad_norm": 77761.90625, "learning_rate": 4.2687846696923856e-05, "loss": 0.9613, "step": 870 }, { "epoch": 0.1479240208438393, "grad_norm": 64969.37890625, "learning_rate": 4.260379895780804e-05, "loss": 0.8691, "step": 880 }, { "epoch": 0.14960497562615566, "grad_norm": 70299.8203125, "learning_rate": 4.2519751218692216e-05, "loss": 0.8719, "step": 890 }, { "epoch": 0.15128593040847202, "grad_norm": 65577.1171875, "learning_rate": 4.24357034795764e-05, "loss": 0.8554, "step": 900 }, { "epoch": 0.15296688519078835, "grad_norm": 71740.8671875, "learning_rate": 4.235165574046058e-05, "loss": 0.9948, "step": 910 }, { "epoch": 0.15464783997310472, "grad_norm": 59819.0234375, "learning_rate": 4.226760800134477e-05, "loss": 0.8605, "step": 920 }, { "epoch": 0.15632879475542108, "grad_norm": 57238.7109375, "learning_rate": 4.218356026222895e-05, "loss": 0.8281, "step": 930 }, { "epoch": 0.15800974953773744, "grad_norm": 70315.765625, "learning_rate": 4.2099512523113134e-05, "loss": 0.9126, "step": 940 }, { "epoch": 0.15969070432005378, "grad_norm": 63374.6796875, "learning_rate": 4.201546478399731e-05, "loss": 0.8744, "step": 950 }, { "epoch": 0.16137165910237014, "grad_norm": 75860.09375, "learning_rate": 4.1931417044881494e-05, "loss": 0.8078, "step": 960 }, { "epoch": 0.1630526138846865, "grad_norm": 58454.1484375, "learning_rate": 4.184736930576568e-05, "loss": 0.8084, "step": 970 }, { "epoch": 0.16473356866700287, "grad_norm": 68843.625, "learning_rate": 4.176332156664986e-05, "loss": 0.8458, "step": 980 }, { "epoch": 0.1664145234493192, "grad_norm": 58082.43359375, "learning_rate": 4.1679273827534044e-05, "loss": 0.8148, "step": 990 }, { "epoch": 0.16809547823163556, "grad_norm": 58417.78125, "learning_rate": 4.159522608841822e-05, "loss": 0.8373, "step": 1000 }, { "epoch": 0.16977643301395193, "grad_norm": 61807.9140625, "learning_rate": 4.1511178349302404e-05, "loss": 0.8633, "step": 1010 }, { "epoch": 0.1714573877962683, "grad_norm": 61544.55078125, "learning_rate": 4.142713061018659e-05, "loss": 0.7921, "step": 1020 }, { "epoch": 0.17313834257858463, "grad_norm": 78909.9921875, "learning_rate": 4.134308287107077e-05, "loss": 0.8422, "step": 1030 }, { "epoch": 0.174819297360901, "grad_norm": 75892.40625, "learning_rate": 4.1259035131954955e-05, "loss": 0.8307, "step": 1040 }, { "epoch": 0.17650025214321735, "grad_norm": 62162.703125, "learning_rate": 4.117498739283914e-05, "loss": 0.8243, "step": 1050 }, { "epoch": 0.1781812069255337, "grad_norm": 66981.4609375, "learning_rate": 4.1090939653723315e-05, "loss": 0.819, "step": 1060 }, { "epoch": 0.17986216170785005, "grad_norm": 69593.6796875, "learning_rate": 4.10068919146075e-05, "loss": 0.8817, "step": 1070 }, { "epoch": 0.1815431164901664, "grad_norm": 67435.6640625, "learning_rate": 4.092284417549168e-05, "loss": 0.8077, "step": 1080 }, { "epoch": 0.18322407127248277, "grad_norm": 66041.1640625, "learning_rate": 4.0838796436375865e-05, "loss": 0.841, "step": 1090 }, { "epoch": 0.18490502605479914, "grad_norm": 65792.2890625, "learning_rate": 4.075474869726005e-05, "loss": 0.8742, "step": 1100 }, { "epoch": 0.18658598083711547, "grad_norm": 61461.2265625, "learning_rate": 4.0670700958144226e-05, "loss": 0.8381, "step": 1110 }, { "epoch": 0.18826693561943184, "grad_norm": 62316.87109375, "learning_rate": 4.058665321902841e-05, "loss": 0.8629, "step": 1120 }, { "epoch": 0.1899478904017482, "grad_norm": 60252.7421875, "learning_rate": 4.050260547991259e-05, "loss": 0.8003, "step": 1130 }, { "epoch": 0.19162884518406456, "grad_norm": 133503.328125, "learning_rate": 4.0418557740796776e-05, "loss": 0.8469, "step": 1140 }, { "epoch": 0.1933097999663809, "grad_norm": 64984.43359375, "learning_rate": 4.033451000168096e-05, "loss": 0.8024, "step": 1150 }, { "epoch": 0.19499075474869726, "grad_norm": 64012.15234375, "learning_rate": 4.0250462262565136e-05, "loss": 0.8655, "step": 1160 }, { "epoch": 0.19667170953101362, "grad_norm": 80887.5859375, "learning_rate": 4.016641452344932e-05, "loss": 0.8273, "step": 1170 }, { "epoch": 0.19835266431332998, "grad_norm": 67626.34375, "learning_rate": 4.00823667843335e-05, "loss": 0.7986, "step": 1180 }, { "epoch": 0.20003361909564632, "grad_norm": 71404.109375, "learning_rate": 3.9998319045217687e-05, "loss": 0.796, "step": 1190 }, { "epoch": 0.20171457387796268, "grad_norm": 69063.609375, "learning_rate": 3.991427130610187e-05, "loss": 0.8819, "step": 1200 }, { "epoch": 0.20339552866027905, "grad_norm": 64813.16796875, "learning_rate": 3.9830223566986053e-05, "loss": 0.8451, "step": 1210 }, { "epoch": 0.20507648344259538, "grad_norm": 76470.9765625, "learning_rate": 3.974617582787023e-05, "loss": 0.8079, "step": 1220 }, { "epoch": 0.20675743822491174, "grad_norm": 57304.13671875, "learning_rate": 3.9662128088754414e-05, "loss": 0.7686, "step": 1230 }, { "epoch": 0.2084383930072281, "grad_norm": 58822.703125, "learning_rate": 3.95780803496386e-05, "loss": 0.8457, "step": 1240 }, { "epoch": 0.21011934778954447, "grad_norm": 61174.65234375, "learning_rate": 3.949403261052278e-05, "loss": 0.752, "step": 1250 }, { "epoch": 0.2118003025718608, "grad_norm": 72221.9296875, "learning_rate": 3.9409984871406964e-05, "loss": 0.8431, "step": 1260 }, { "epoch": 0.21348125735417717, "grad_norm": 64120.45703125, "learning_rate": 3.932593713229114e-05, "loss": 0.8505, "step": 1270 }, { "epoch": 0.21516221213649353, "grad_norm": 67320.4140625, "learning_rate": 3.9241889393175324e-05, "loss": 0.9042, "step": 1280 }, { "epoch": 0.2168431669188099, "grad_norm": 64738.58203125, "learning_rate": 3.915784165405951e-05, "loss": 0.8681, "step": 1290 }, { "epoch": 0.21852412170112623, "grad_norm": 87432.8515625, "learning_rate": 3.907379391494369e-05, "loss": 0.8661, "step": 1300 }, { "epoch": 0.2202050764834426, "grad_norm": 56789.4453125, "learning_rate": 3.8989746175827875e-05, "loss": 0.8058, "step": 1310 }, { "epoch": 0.22188603126575895, "grad_norm": 55766.0234375, "learning_rate": 3.890569843671206e-05, "loss": 0.8627, "step": 1320 }, { "epoch": 0.22356698604807532, "grad_norm": 68595.6640625, "learning_rate": 3.8821650697596235e-05, "loss": 0.8299, "step": 1330 }, { "epoch": 0.22524794083039165, "grad_norm": 67234.25, "learning_rate": 3.873760295848042e-05, "loss": 0.8106, "step": 1340 }, { "epoch": 0.22692889561270801, "grad_norm": 67707.8828125, "learning_rate": 3.86535552193646e-05, "loss": 0.8399, "step": 1350 }, { "epoch": 0.22860985039502438, "grad_norm": 60259.1015625, "learning_rate": 3.8569507480248785e-05, "loss": 0.8484, "step": 1360 }, { "epoch": 0.23029080517734074, "grad_norm": 68358.1484375, "learning_rate": 3.848545974113297e-05, "loss": 0.8264, "step": 1370 }, { "epoch": 0.23197175995965708, "grad_norm": 62014.7109375, "learning_rate": 3.8401412002017145e-05, "loss": 0.8507, "step": 1380 }, { "epoch": 0.23365271474197344, "grad_norm": 61060.28125, "learning_rate": 3.831736426290133e-05, "loss": 0.8586, "step": 1390 }, { "epoch": 0.2353336695242898, "grad_norm": 57990.08984375, "learning_rate": 3.823331652378551e-05, "loss": 0.8011, "step": 1400 }, { "epoch": 0.23701462430660616, "grad_norm": 69242.015625, "learning_rate": 3.8149268784669696e-05, "loss": 0.7943, "step": 1410 }, { "epoch": 0.2386955790889225, "grad_norm": 85857.8046875, "learning_rate": 3.806522104555388e-05, "loss": 0.864, "step": 1420 }, { "epoch": 0.24037653387123886, "grad_norm": 69873.5234375, "learning_rate": 3.7981173306438056e-05, "loss": 0.8578, "step": 1430 }, { "epoch": 0.24205748865355523, "grad_norm": 59830.73046875, "learning_rate": 3.789712556732224e-05, "loss": 0.8517, "step": 1440 }, { "epoch": 0.2437384434358716, "grad_norm": 59296.34765625, "learning_rate": 3.781307782820642e-05, "loss": 0.864, "step": 1450 }, { "epoch": 0.24541939821818792, "grad_norm": 63888.60546875, "learning_rate": 3.7729030089090606e-05, "loss": 0.8161, "step": 1460 }, { "epoch": 0.24710035300050429, "grad_norm": 63231.421875, "learning_rate": 3.764498234997479e-05, "loss": 0.8901, "step": 1470 }, { "epoch": 0.24878130778282065, "grad_norm": 62661.12890625, "learning_rate": 3.756093461085897e-05, "loss": 0.8638, "step": 1480 }, { "epoch": 0.250462262565137, "grad_norm": 69614.8984375, "learning_rate": 3.747688687174315e-05, "loss": 0.8644, "step": 1490 }, { "epoch": 0.2521432173474534, "grad_norm": 67442.9296875, "learning_rate": 3.7392839132627334e-05, "loss": 0.7659, "step": 1500 }, { "epoch": 0.2538241721297697, "grad_norm": 67846.0390625, "learning_rate": 3.730879139351152e-05, "loss": 0.8243, "step": 1510 }, { "epoch": 0.25550512691208604, "grad_norm": 65967.703125, "learning_rate": 3.72247436543957e-05, "loss": 0.8573, "step": 1520 }, { "epoch": 0.25718608169440244, "grad_norm": 66188.1484375, "learning_rate": 3.7140695915279884e-05, "loss": 0.8491, "step": 1530 }, { "epoch": 0.25886703647671877, "grad_norm": 55857.7890625, "learning_rate": 3.705664817616406e-05, "loss": 0.8314, "step": 1540 }, { "epoch": 0.2605479912590351, "grad_norm": 78828.6328125, "learning_rate": 3.6972600437048244e-05, "loss": 0.7589, "step": 1550 }, { "epoch": 0.2622289460413515, "grad_norm": 64696.65234375, "learning_rate": 3.688855269793243e-05, "loss": 0.774, "step": 1560 }, { "epoch": 0.26390990082366783, "grad_norm": 71397.7265625, "learning_rate": 3.680450495881661e-05, "loss": 0.8351, "step": 1570 }, { "epoch": 0.2655908556059842, "grad_norm": 66192.203125, "learning_rate": 3.6720457219700795e-05, "loss": 0.8099, "step": 1580 }, { "epoch": 0.26727181038830056, "grad_norm": 70641.421875, "learning_rate": 3.663640948058498e-05, "loss": 0.8571, "step": 1590 }, { "epoch": 0.2689527651706169, "grad_norm": 75064.75, "learning_rate": 3.6552361741469155e-05, "loss": 0.8681, "step": 1600 }, { "epoch": 0.2706337199529333, "grad_norm": 75823.125, "learning_rate": 3.646831400235334e-05, "loss": 0.8258, "step": 1610 }, { "epoch": 0.2723146747352496, "grad_norm": 72296.765625, "learning_rate": 3.638426626323752e-05, "loss": 0.8561, "step": 1620 }, { "epoch": 0.27399562951756595, "grad_norm": 64309.3125, "learning_rate": 3.6300218524121705e-05, "loss": 0.856, "step": 1630 }, { "epoch": 0.27567658429988234, "grad_norm": 73902.8359375, "learning_rate": 3.621617078500589e-05, "loss": 0.8424, "step": 1640 }, { "epoch": 0.2773575390821987, "grad_norm": 63552.06640625, "learning_rate": 3.6132123045890065e-05, "loss": 0.861, "step": 1650 }, { "epoch": 0.27903849386451507, "grad_norm": 74067.7265625, "learning_rate": 3.604807530677425e-05, "loss": 0.7832, "step": 1660 }, { "epoch": 0.2807194486468314, "grad_norm": 68881.3046875, "learning_rate": 3.596402756765843e-05, "loss": 0.8443, "step": 1670 }, { "epoch": 0.28240040342914774, "grad_norm": 62996.19921875, "learning_rate": 3.5879979828542616e-05, "loss": 0.8075, "step": 1680 }, { "epoch": 0.28408135821146413, "grad_norm": 73364.8125, "learning_rate": 3.57959320894268e-05, "loss": 0.8425, "step": 1690 }, { "epoch": 0.28576231299378047, "grad_norm": 67509.296875, "learning_rate": 3.5711884350310976e-05, "loss": 0.755, "step": 1700 }, { "epoch": 0.2874432677760968, "grad_norm": 66616.984375, "learning_rate": 3.562783661119516e-05, "loss": 0.8679, "step": 1710 }, { "epoch": 0.2891242225584132, "grad_norm": 74004.359375, "learning_rate": 3.554378887207934e-05, "loss": 0.8793, "step": 1720 }, { "epoch": 0.2908051773407295, "grad_norm": 59084.9609375, "learning_rate": 3.5459741132963526e-05, "loss": 0.8322, "step": 1730 }, { "epoch": 0.2924861321230459, "grad_norm": 74027.28125, "learning_rate": 3.537569339384771e-05, "loss": 0.8661, "step": 1740 }, { "epoch": 0.29416708690536225, "grad_norm": 64524.97265625, "learning_rate": 3.529164565473189e-05, "loss": 0.7334, "step": 1750 }, { "epoch": 0.2958480416876786, "grad_norm": 74809.4921875, "learning_rate": 3.520759791561607e-05, "loss": 0.8091, "step": 1760 }, { "epoch": 0.297528996469995, "grad_norm": 66084.609375, "learning_rate": 3.5123550176500253e-05, "loss": 0.8715, "step": 1770 }, { "epoch": 0.2992099512523113, "grad_norm": 62919.05859375, "learning_rate": 3.503950243738444e-05, "loss": 0.8326, "step": 1780 }, { "epoch": 0.30089090603462765, "grad_norm": 68219.046875, "learning_rate": 3.495545469826862e-05, "loss": 0.8387, "step": 1790 }, { "epoch": 0.30257186081694404, "grad_norm": 61066.58203125, "learning_rate": 3.4871406959152804e-05, "loss": 0.8113, "step": 1800 }, { "epoch": 0.3042528155992604, "grad_norm": 64199.75390625, "learning_rate": 3.478735922003698e-05, "loss": 0.8359, "step": 1810 }, { "epoch": 0.3059337703815767, "grad_norm": 64780.81640625, "learning_rate": 3.4703311480921164e-05, "loss": 0.8528, "step": 1820 }, { "epoch": 0.3076147251638931, "grad_norm": 66866.6875, "learning_rate": 3.461926374180535e-05, "loss": 0.7795, "step": 1830 }, { "epoch": 0.30929567994620943, "grad_norm": 74536.5078125, "learning_rate": 3.453521600268953e-05, "loss": 0.8432, "step": 1840 }, { "epoch": 0.3109766347285258, "grad_norm": 63969.90234375, "learning_rate": 3.4451168263573714e-05, "loss": 0.8459, "step": 1850 }, { "epoch": 0.31265758951084216, "grad_norm": 68175.203125, "learning_rate": 3.43671205244579e-05, "loss": 0.8437, "step": 1860 }, { "epoch": 0.3143385442931585, "grad_norm": 61941.46484375, "learning_rate": 3.4283072785342075e-05, "loss": 0.8182, "step": 1870 }, { "epoch": 0.3160194990754749, "grad_norm": 67861.203125, "learning_rate": 3.419902504622626e-05, "loss": 0.8742, "step": 1880 }, { "epoch": 0.3177004538577912, "grad_norm": 66184.25, "learning_rate": 3.411497730711044e-05, "loss": 0.8312, "step": 1890 }, { "epoch": 0.31938140864010756, "grad_norm": 63603.37109375, "learning_rate": 3.4030929567994625e-05, "loss": 0.8162, "step": 1900 }, { "epoch": 0.32106236342242395, "grad_norm": 76040.2421875, "learning_rate": 3.394688182887881e-05, "loss": 0.8555, "step": 1910 }, { "epoch": 0.3227433182047403, "grad_norm": 62280.6328125, "learning_rate": 3.3862834089762985e-05, "loss": 0.8086, "step": 1920 }, { "epoch": 0.32442427298705667, "grad_norm": 68005.015625, "learning_rate": 3.377878635064717e-05, "loss": 0.7433, "step": 1930 }, { "epoch": 0.326105227769373, "grad_norm": 58576.92578125, "learning_rate": 3.369473861153135e-05, "loss": 0.7728, "step": 1940 }, { "epoch": 0.32778618255168934, "grad_norm": 64847.859375, "learning_rate": 3.3610690872415536e-05, "loss": 0.7596, "step": 1950 }, { "epoch": 0.32946713733400573, "grad_norm": 63781.38671875, "learning_rate": 3.352664313329972e-05, "loss": 0.7308, "step": 1960 }, { "epoch": 0.33114809211632207, "grad_norm": 66446.5859375, "learning_rate": 3.3442595394183896e-05, "loss": 0.8058, "step": 1970 }, { "epoch": 0.3328290468986384, "grad_norm": 61722.046875, "learning_rate": 3.335854765506808e-05, "loss": 0.869, "step": 1980 }, { "epoch": 0.3345100016809548, "grad_norm": 66245.3359375, "learning_rate": 3.327449991595226e-05, "loss": 0.8153, "step": 1990 }, { "epoch": 0.33619095646327113, "grad_norm": 60609.90625, "learning_rate": 3.3190452176836446e-05, "loss": 0.8484, "step": 2000 }, { "epoch": 0.3378719112455875, "grad_norm": 70234.8828125, "learning_rate": 3.310640443772063e-05, "loss": 0.8249, "step": 2010 }, { "epoch": 0.33955286602790385, "grad_norm": 52655.56640625, "learning_rate": 3.302235669860481e-05, "loss": 0.7768, "step": 2020 }, { "epoch": 0.3412338208102202, "grad_norm": 73065.375, "learning_rate": 3.293830895948899e-05, "loss": 0.8911, "step": 2030 }, { "epoch": 0.3429147755925366, "grad_norm": 59607.45703125, "learning_rate": 3.285426122037317e-05, "loss": 0.7773, "step": 2040 }, { "epoch": 0.3445957303748529, "grad_norm": 64399.9375, "learning_rate": 3.277021348125736e-05, "loss": 0.8161, "step": 2050 }, { "epoch": 0.34627668515716925, "grad_norm": 75411.3359375, "learning_rate": 3.268616574214154e-05, "loss": 0.7615, "step": 2060 }, { "epoch": 0.34795763993948564, "grad_norm": 69012.03125, "learning_rate": 3.2602118003025724e-05, "loss": 0.8245, "step": 2070 }, { "epoch": 0.349638594721802, "grad_norm": 69290.5625, "learning_rate": 3.25180702639099e-05, "loss": 0.786, "step": 2080 }, { "epoch": 0.35131954950411837, "grad_norm": 68525.0859375, "learning_rate": 3.2434022524794084e-05, "loss": 0.7572, "step": 2090 }, { "epoch": 0.3530005042864347, "grad_norm": 61886.20703125, "learning_rate": 3.234997478567827e-05, "loss": 0.8115, "step": 2100 }, { "epoch": 0.35468145906875104, "grad_norm": 73001.8828125, "learning_rate": 3.226592704656245e-05, "loss": 0.8336, "step": 2110 }, { "epoch": 0.3563624138510674, "grad_norm": 65161.6484375, "learning_rate": 3.2181879307446634e-05, "loss": 0.8124, "step": 2120 }, { "epoch": 0.35804336863338376, "grad_norm": 61001.9140625, "learning_rate": 3.209783156833082e-05, "loss": 0.8067, "step": 2130 }, { "epoch": 0.3597243234157001, "grad_norm": 56932.25, "learning_rate": 3.2013783829214994e-05, "loss": 0.77, "step": 2140 }, { "epoch": 0.3614052781980165, "grad_norm": 68792.859375, "learning_rate": 3.192973609009918e-05, "loss": 0.8217, "step": 2150 }, { "epoch": 0.3630862329803328, "grad_norm": 61329.98828125, "learning_rate": 3.184568835098336e-05, "loss": 0.7714, "step": 2160 }, { "epoch": 0.36476718776264916, "grad_norm": 66838.0078125, "learning_rate": 3.1761640611867545e-05, "loss": 0.8314, "step": 2170 }, { "epoch": 0.36644814254496555, "grad_norm": 73817.578125, "learning_rate": 3.167759287275173e-05, "loss": 0.8413, "step": 2180 }, { "epoch": 0.3681290973272819, "grad_norm": 67156.03125, "learning_rate": 3.1593545133635905e-05, "loss": 0.7996, "step": 2190 }, { "epoch": 0.3698100521095983, "grad_norm": 83176.359375, "learning_rate": 3.150949739452009e-05, "loss": 0.7875, "step": 2200 }, { "epoch": 0.3714910068919146, "grad_norm": 68843.5859375, "learning_rate": 3.142544965540427e-05, "loss": 0.8156, "step": 2210 }, { "epoch": 0.37317196167423095, "grad_norm": 61444.25, "learning_rate": 3.1341401916288455e-05, "loss": 0.7747, "step": 2220 }, { "epoch": 0.37485291645654734, "grad_norm": 70228.59375, "learning_rate": 3.125735417717264e-05, "loss": 0.8088, "step": 2230 }, { "epoch": 0.37653387123886367, "grad_norm": 56036.578125, "learning_rate": 3.1173306438056816e-05, "loss": 0.7834, "step": 2240 }, { "epoch": 0.37821482602118, "grad_norm": 62951.3828125, "learning_rate": 3.1089258698941e-05, "loss": 0.7642, "step": 2250 }, { "epoch": 0.3798957808034964, "grad_norm": 66556.96875, "learning_rate": 3.100521095982518e-05, "loss": 0.8115, "step": 2260 }, { "epoch": 0.38157673558581273, "grad_norm": 83066.890625, "learning_rate": 3.0921163220709366e-05, "loss": 0.8311, "step": 2270 }, { "epoch": 0.3832576903681291, "grad_norm": 68849.5078125, "learning_rate": 3.083711548159355e-05, "loss": 0.7935, "step": 2280 }, { "epoch": 0.38493864515044546, "grad_norm": 68215.6953125, "learning_rate": 3.075306774247773e-05, "loss": 0.8071, "step": 2290 }, { "epoch": 0.3866195999327618, "grad_norm": 65878.6484375, "learning_rate": 3.066902000336191e-05, "loss": 0.9158, "step": 2300 }, { "epoch": 0.3883005547150782, "grad_norm": 59912.58203125, "learning_rate": 3.058497226424609e-05, "loss": 0.8217, "step": 2310 }, { "epoch": 0.3899815094973945, "grad_norm": 65707.3828125, "learning_rate": 3.0500924525130277e-05, "loss": 0.7693, "step": 2320 }, { "epoch": 0.39166246427971085, "grad_norm": 75819.90625, "learning_rate": 3.041687678601446e-05, "loss": 0.8007, "step": 2330 }, { "epoch": 0.39334341906202724, "grad_norm": 66755.734375, "learning_rate": 3.0332829046898644e-05, "loss": 0.8416, "step": 2340 }, { "epoch": 0.3950243738443436, "grad_norm": 68420.984375, "learning_rate": 3.024878130778282e-05, "loss": 0.8119, "step": 2350 }, { "epoch": 0.39670532862665997, "grad_norm": 70407.578125, "learning_rate": 3.0164733568667004e-05, "loss": 0.7907, "step": 2360 }, { "epoch": 0.3983862834089763, "grad_norm": 70415.9453125, "learning_rate": 3.0080685829551187e-05, "loss": 0.7508, "step": 2370 }, { "epoch": 0.40006723819129264, "grad_norm": 64331.296875, "learning_rate": 2.999663809043537e-05, "loss": 0.7812, "step": 2380 }, { "epoch": 0.40174819297360903, "grad_norm": 55358.15625, "learning_rate": 2.9912590351319554e-05, "loss": 0.8044, "step": 2390 }, { "epoch": 0.40342914775592537, "grad_norm": 74381.0078125, "learning_rate": 2.9828542612203734e-05, "loss": 0.7862, "step": 2400 }, { "epoch": 0.4051101025382417, "grad_norm": 69503.421875, "learning_rate": 2.9744494873087914e-05, "loss": 0.8414, "step": 2410 }, { "epoch": 0.4067910573205581, "grad_norm": 63817.671875, "learning_rate": 2.9660447133972098e-05, "loss": 0.8239, "step": 2420 }, { "epoch": 0.4084720121028744, "grad_norm": 64829.07421875, "learning_rate": 2.957639939485628e-05, "loss": 0.8409, "step": 2430 }, { "epoch": 0.41015296688519076, "grad_norm": 90130.75, "learning_rate": 2.9492351655740465e-05, "loss": 0.7979, "step": 2440 }, { "epoch": 0.41183392166750715, "grad_norm": 65037.46484375, "learning_rate": 2.9408303916624648e-05, "loss": 0.7724, "step": 2450 }, { "epoch": 0.4135148764498235, "grad_norm": 74054.9765625, "learning_rate": 2.9324256177508825e-05, "loss": 0.9302, "step": 2460 }, { "epoch": 0.4151958312321399, "grad_norm": 65572.390625, "learning_rate": 2.924020843839301e-05, "loss": 0.8096, "step": 2470 }, { "epoch": 0.4168767860144562, "grad_norm": 70729.125, "learning_rate": 2.9156160699277192e-05, "loss": 0.7749, "step": 2480 }, { "epoch": 0.41855774079677255, "grad_norm": 60806.2265625, "learning_rate": 2.9072112960161375e-05, "loss": 0.7964, "step": 2490 }, { "epoch": 0.42023869557908894, "grad_norm": 66842.796875, "learning_rate": 2.898806522104556e-05, "loss": 0.7956, "step": 2500 }, { "epoch": 0.4219196503614053, "grad_norm": 65062.66015625, "learning_rate": 2.8904017481929735e-05, "loss": 0.765, "step": 2510 }, { "epoch": 0.4236006051437216, "grad_norm": 70468.4921875, "learning_rate": 2.881996974281392e-05, "loss": 0.8029, "step": 2520 }, { "epoch": 0.425281559926038, "grad_norm": 63679.37109375, "learning_rate": 2.8735922003698102e-05, "loss": 0.8116, "step": 2530 }, { "epoch": 0.42696251470835433, "grad_norm": 79301.3984375, "learning_rate": 2.8651874264582286e-05, "loss": 0.8561, "step": 2540 }, { "epoch": 0.4286434694906707, "grad_norm": 66358.953125, "learning_rate": 2.856782652546647e-05, "loss": 0.8753, "step": 2550 }, { "epoch": 0.43032442427298706, "grad_norm": 64068.6875, "learning_rate": 2.848377878635065e-05, "loss": 0.7874, "step": 2560 }, { "epoch": 0.4320053790553034, "grad_norm": 70842.203125, "learning_rate": 2.839973104723483e-05, "loss": 0.8036, "step": 2570 }, { "epoch": 0.4336863338376198, "grad_norm": 69636.0390625, "learning_rate": 2.8315683308119013e-05, "loss": 0.8069, "step": 2580 }, { "epoch": 0.4353672886199361, "grad_norm": 89295.2421875, "learning_rate": 2.8231635569003196e-05, "loss": 0.8322, "step": 2590 }, { "epoch": 0.43704824340225246, "grad_norm": 70937.7265625, "learning_rate": 2.814758782988738e-05, "loss": 0.9161, "step": 2600 }, { "epoch": 0.43872919818456885, "grad_norm": 88488.953125, "learning_rate": 2.8063540090771563e-05, "loss": 0.7753, "step": 2610 }, { "epoch": 0.4404101529668852, "grad_norm": 68679.078125, "learning_rate": 2.797949235165574e-05, "loss": 0.8489, "step": 2620 }, { "epoch": 0.4420911077492016, "grad_norm": 62188.109375, "learning_rate": 2.7895444612539924e-05, "loss": 0.7826, "step": 2630 }, { "epoch": 0.4437720625315179, "grad_norm": 72500.828125, "learning_rate": 2.7811396873424107e-05, "loss": 0.8122, "step": 2640 }, { "epoch": 0.44545301731383424, "grad_norm": 61758.7421875, "learning_rate": 2.772734913430829e-05, "loss": 0.7645, "step": 2650 }, { "epoch": 0.44713397209615063, "grad_norm": 59873.4375, "learning_rate": 2.7643301395192474e-05, "loss": 0.7859, "step": 2660 }, { "epoch": 0.44881492687846697, "grad_norm": 78891.734375, "learning_rate": 2.755925365607665e-05, "loss": 0.7696, "step": 2670 }, { "epoch": 0.4504958816607833, "grad_norm": 69256.2578125, "learning_rate": 2.7475205916960834e-05, "loss": 0.8161, "step": 2680 }, { "epoch": 0.4521768364430997, "grad_norm": 68757.21875, "learning_rate": 2.7391158177845018e-05, "loss": 0.7662, "step": 2690 }, { "epoch": 0.45385779122541603, "grad_norm": 80412.984375, "learning_rate": 2.73071104387292e-05, "loss": 0.7875, "step": 2700 }, { "epoch": 0.45553874600773236, "grad_norm": 74670.53125, "learning_rate": 2.7223062699613385e-05, "loss": 0.7664, "step": 2710 }, { "epoch": 0.45721970079004876, "grad_norm": 69425.640625, "learning_rate": 2.7139014960497565e-05, "loss": 0.8521, "step": 2720 }, { "epoch": 0.4589006555723651, "grad_norm": 70010.1484375, "learning_rate": 2.7054967221381745e-05, "loss": 0.7819, "step": 2730 }, { "epoch": 0.4605816103546815, "grad_norm": 76622.5078125, "learning_rate": 2.6970919482265928e-05, "loss": 0.8632, "step": 2740 }, { "epoch": 0.4622625651369978, "grad_norm": 63262.62890625, "learning_rate": 2.6886871743150112e-05, "loss": 0.7303, "step": 2750 }, { "epoch": 0.46394351991931415, "grad_norm": 63295.68359375, "learning_rate": 2.6802824004034295e-05, "loss": 0.773, "step": 2760 }, { "epoch": 0.46562447470163054, "grad_norm": 61289.41015625, "learning_rate": 2.6718776264918475e-05, "loss": 0.7776, "step": 2770 }, { "epoch": 0.4673054294839469, "grad_norm": 75931.3671875, "learning_rate": 2.6634728525802655e-05, "loss": 0.826, "step": 2780 }, { "epoch": 0.4689863842662632, "grad_norm": 69288.890625, "learning_rate": 2.655068078668684e-05, "loss": 0.7978, "step": 2790 }, { "epoch": 0.4706673390485796, "grad_norm": 61528.40234375, "learning_rate": 2.6466633047571022e-05, "loss": 0.7891, "step": 2800 }, { "epoch": 0.47234829383089594, "grad_norm": 67145.6796875, "learning_rate": 2.6382585308455206e-05, "loss": 0.7483, "step": 2810 }, { "epoch": 0.47402924861321233, "grad_norm": 62958.0703125, "learning_rate": 2.629853756933939e-05, "loss": 0.8407, "step": 2820 }, { "epoch": 0.47571020339552866, "grad_norm": 71565.0078125, "learning_rate": 2.621448983022357e-05, "loss": 0.8093, "step": 2830 }, { "epoch": 0.477391158177845, "grad_norm": 70295.3125, "learning_rate": 2.613044209110775e-05, "loss": 0.8328, "step": 2840 }, { "epoch": 0.4790721129601614, "grad_norm": 66831.65625, "learning_rate": 2.6046394351991933e-05, "loss": 0.7844, "step": 2850 }, { "epoch": 0.4807530677424777, "grad_norm": 62661.7421875, "learning_rate": 2.5962346612876116e-05, "loss": 0.7925, "step": 2860 }, { "epoch": 0.48243402252479406, "grad_norm": 61856.66796875, "learning_rate": 2.58782988737603e-05, "loss": 0.7488, "step": 2870 }, { "epoch": 0.48411497730711045, "grad_norm": 73633.4765625, "learning_rate": 2.579425113464448e-05, "loss": 0.7881, "step": 2880 }, { "epoch": 0.4857959320894268, "grad_norm": 69115.28125, "learning_rate": 2.571020339552866e-05, "loss": 0.8175, "step": 2890 }, { "epoch": 0.4874768868717432, "grad_norm": 61891.3828125, "learning_rate": 2.5626155656412843e-05, "loss": 0.8784, "step": 2900 }, { "epoch": 0.4891578416540595, "grad_norm": 77769.9921875, "learning_rate": 2.5542107917297027e-05, "loss": 0.8684, "step": 2910 }, { "epoch": 0.49083879643637585, "grad_norm": 68048.734375, "learning_rate": 2.545806017818121e-05, "loss": 0.7853, "step": 2920 }, { "epoch": 0.49251975121869224, "grad_norm": 61355.94140625, "learning_rate": 2.537401243906539e-05, "loss": 0.7845, "step": 2930 }, { "epoch": 0.49420070600100857, "grad_norm": 69287.953125, "learning_rate": 2.528996469994957e-05, "loss": 0.8007, "step": 2940 }, { "epoch": 0.4958816607833249, "grad_norm": 68851.8359375, "learning_rate": 2.5205916960833754e-05, "loss": 0.8159, "step": 2950 }, { "epoch": 0.4975626155656413, "grad_norm": 69794.859375, "learning_rate": 2.5121869221717938e-05, "loss": 0.7958, "step": 2960 }, { "epoch": 0.49924357034795763, "grad_norm": 73403.1953125, "learning_rate": 2.503782148260212e-05, "loss": 0.7855, "step": 2970 }, { "epoch": 0.500924525130274, "grad_norm": 74224.1328125, "learning_rate": 2.49537737434863e-05, "loss": 0.7728, "step": 2980 }, { "epoch": 0.5026054799125903, "grad_norm": 63414.859375, "learning_rate": 2.4869726004370485e-05, "loss": 0.8105, "step": 2990 }, { "epoch": 0.5042864346949067, "grad_norm": 77378.109375, "learning_rate": 2.4785678265254668e-05, "loss": 0.8636, "step": 3000 }, { "epoch": 0.5059673894772231, "grad_norm": 70468.2734375, "learning_rate": 2.4701630526138848e-05, "loss": 0.7242, "step": 3010 }, { "epoch": 0.5076483442595394, "grad_norm": 60809.28515625, "learning_rate": 2.461758278702303e-05, "loss": 0.8078, "step": 3020 }, { "epoch": 0.5093292990418558, "grad_norm": 51373.625, "learning_rate": 2.4533535047907215e-05, "loss": 0.8366, "step": 3030 }, { "epoch": 0.5110102538241721, "grad_norm": 59719.765625, "learning_rate": 2.4449487308791395e-05, "loss": 0.8665, "step": 3040 }, { "epoch": 0.5126912086064885, "grad_norm": 64678.41015625, "learning_rate": 2.436543956967558e-05, "loss": 0.7776, "step": 3050 }, { "epoch": 0.5143721633888049, "grad_norm": 74790.40625, "learning_rate": 2.4281391830559762e-05, "loss": 0.7729, "step": 3060 }, { "epoch": 0.5160531181711212, "grad_norm": 71427.921875, "learning_rate": 2.4197344091443942e-05, "loss": 0.7956, "step": 3070 }, { "epoch": 0.5177340729534375, "grad_norm": 66004.3671875, "learning_rate": 2.4113296352328126e-05, "loss": 0.7588, "step": 3080 }, { "epoch": 0.5194150277357539, "grad_norm": 64083.765625, "learning_rate": 2.4029248613212306e-05, "loss": 0.7931, "step": 3090 }, { "epoch": 0.5210959825180702, "grad_norm": 54726.55078125, "learning_rate": 2.394520087409649e-05, "loss": 0.7163, "step": 3100 }, { "epoch": 0.5227769373003867, "grad_norm": 73129.3125, "learning_rate": 2.3861153134980673e-05, "loss": 0.8364, "step": 3110 }, { "epoch": 0.524457892082703, "grad_norm": 62834.08984375, "learning_rate": 2.3777105395864853e-05, "loss": 0.7431, "step": 3120 }, { "epoch": 0.5261388468650193, "grad_norm": 61612.23828125, "learning_rate": 2.3693057656749036e-05, "loss": 0.7975, "step": 3130 }, { "epoch": 0.5278198016473357, "grad_norm": 71137.6640625, "learning_rate": 2.3609009917633216e-05, "loss": 0.783, "step": 3140 }, { "epoch": 0.529500756429652, "grad_norm": 63406.10546875, "learning_rate": 2.35249621785174e-05, "loss": 0.824, "step": 3150 }, { "epoch": 0.5311817112119684, "grad_norm": 64795.73828125, "learning_rate": 2.3440914439401583e-05, "loss": 0.8715, "step": 3160 }, { "epoch": 0.5328626659942848, "grad_norm": 59525.79296875, "learning_rate": 2.3356866700285763e-05, "loss": 0.7934, "step": 3170 }, { "epoch": 0.5345436207766011, "grad_norm": 72914.4609375, "learning_rate": 2.3272818961169947e-05, "loss": 0.7243, "step": 3180 }, { "epoch": 0.5362245755589174, "grad_norm": 62966.078125, "learning_rate": 2.318877122205413e-05, "loss": 0.712, "step": 3190 }, { "epoch": 0.5379055303412338, "grad_norm": 66647.0546875, "learning_rate": 2.310472348293831e-05, "loss": 0.7674, "step": 3200 }, { "epoch": 0.5395864851235502, "grad_norm": 65908.0703125, "learning_rate": 2.3020675743822494e-05, "loss": 0.8484, "step": 3210 }, { "epoch": 0.5412674399058666, "grad_norm": 60565.56640625, "learning_rate": 2.2936628004706674e-05, "loss": 0.7844, "step": 3220 }, { "epoch": 0.5429483946881829, "grad_norm": 67560.3828125, "learning_rate": 2.2852580265590857e-05, "loss": 0.9071, "step": 3230 }, { "epoch": 0.5446293494704992, "grad_norm": 66084.265625, "learning_rate": 2.276853252647504e-05, "loss": 0.7323, "step": 3240 }, { "epoch": 0.5463103042528156, "grad_norm": 78780.078125, "learning_rate": 2.268448478735922e-05, "loss": 0.9259, "step": 3250 }, { "epoch": 0.5479912590351319, "grad_norm": 67810.703125, "learning_rate": 2.2600437048243404e-05, "loss": 0.7813, "step": 3260 }, { "epoch": 0.5496722138174484, "grad_norm": 62699.97265625, "learning_rate": 2.2516389309127588e-05, "loss": 0.7831, "step": 3270 }, { "epoch": 0.5513531685997647, "grad_norm": 66553.1640625, "learning_rate": 2.2432341570011768e-05, "loss": 0.8185, "step": 3280 }, { "epoch": 0.553034123382081, "grad_norm": 60711.96875, "learning_rate": 2.234829383089595e-05, "loss": 0.7551, "step": 3290 }, { "epoch": 0.5547150781643974, "grad_norm": 64588.28515625, "learning_rate": 2.226424609178013e-05, "loss": 0.6919, "step": 3300 }, { "epoch": 0.5563960329467137, "grad_norm": 73545.9765625, "learning_rate": 2.2180198352664315e-05, "loss": 0.7834, "step": 3310 }, { "epoch": 0.5580769877290301, "grad_norm": 66515.1796875, "learning_rate": 2.20961506135485e-05, "loss": 0.8135, "step": 3320 }, { "epoch": 0.5597579425113465, "grad_norm": 66021.4140625, "learning_rate": 2.201210287443268e-05, "loss": 0.7767, "step": 3330 }, { "epoch": 0.5614388972936628, "grad_norm": 72027.734375, "learning_rate": 2.1928055135316862e-05, "loss": 0.7623, "step": 3340 }, { "epoch": 0.5631198520759791, "grad_norm": 69724.7109375, "learning_rate": 2.1844007396201046e-05, "loss": 0.8539, "step": 3350 }, { "epoch": 0.5648008068582955, "grad_norm": 70565.2890625, "learning_rate": 2.1759959657085226e-05, "loss": 0.8081, "step": 3360 }, { "epoch": 0.5664817616406118, "grad_norm": 69982.8515625, "learning_rate": 2.167591191796941e-05, "loss": 0.766, "step": 3370 }, { "epoch": 0.5681627164229283, "grad_norm": 73730.5859375, "learning_rate": 2.159186417885359e-05, "loss": 0.7858, "step": 3380 }, { "epoch": 0.5698436712052446, "grad_norm": 68504.3359375, "learning_rate": 2.1507816439737773e-05, "loss": 0.7774, "step": 3390 }, { "epoch": 0.5715246259875609, "grad_norm": 68371.71875, "learning_rate": 2.1423768700621956e-05, "loss": 0.7397, "step": 3400 }, { "epoch": 0.5732055807698773, "grad_norm": 62352.87890625, "learning_rate": 2.1339720961506136e-05, "loss": 0.7727, "step": 3410 }, { "epoch": 0.5748865355521936, "grad_norm": 67821.3671875, "learning_rate": 2.125567322239032e-05, "loss": 0.7494, "step": 3420 }, { "epoch": 0.57656749033451, "grad_norm": 75150.0703125, "learning_rate": 2.1171625483274503e-05, "loss": 0.738, "step": 3430 }, { "epoch": 0.5782484451168264, "grad_norm": 71489.2109375, "learning_rate": 2.1087577744158683e-05, "loss": 0.7666, "step": 3440 }, { "epoch": 0.5799293998991427, "grad_norm": 77000.265625, "learning_rate": 2.1003530005042867e-05, "loss": 0.8051, "step": 3450 }, { "epoch": 0.581610354681459, "grad_norm": 63612.04296875, "learning_rate": 2.0919482265927047e-05, "loss": 0.7408, "step": 3460 }, { "epoch": 0.5832913094637754, "grad_norm": 65412.390625, "learning_rate": 2.083543452681123e-05, "loss": 0.7585, "step": 3470 }, { "epoch": 0.5849722642460918, "grad_norm": 63992.859375, "learning_rate": 2.0751386787695414e-05, "loss": 0.8036, "step": 3480 }, { "epoch": 0.5866532190284082, "grad_norm": 67541.6015625, "learning_rate": 2.0667339048579594e-05, "loss": 0.7835, "step": 3490 }, { "epoch": 0.5883341738107245, "grad_norm": 87275.59375, "learning_rate": 2.0583291309463777e-05, "loss": 0.843, "step": 3500 }, { "epoch": 0.5900151285930408, "grad_norm": 62353.66015625, "learning_rate": 2.0499243570347957e-05, "loss": 0.8229, "step": 3510 }, { "epoch": 0.5916960833753572, "grad_norm": 76160.890625, "learning_rate": 2.041519583123214e-05, "loss": 0.839, "step": 3520 }, { "epoch": 0.5933770381576735, "grad_norm": 71393.7109375, "learning_rate": 2.0331148092116324e-05, "loss": 0.7136, "step": 3530 }, { "epoch": 0.59505799293999, "grad_norm": 70572.75, "learning_rate": 2.0247100353000504e-05, "loss": 0.7987, "step": 3540 }, { "epoch": 0.5967389477223063, "grad_norm": 66998.0703125, "learning_rate": 2.0163052613884688e-05, "loss": 0.7874, "step": 3550 }, { "epoch": 0.5984199025046226, "grad_norm": 53749.26953125, "learning_rate": 2.007900487476887e-05, "loss": 0.767, "step": 3560 }, { "epoch": 0.600100857286939, "grad_norm": 71350.6328125, "learning_rate": 1.999495713565305e-05, "loss": 0.8136, "step": 3570 }, { "epoch": 0.6017818120692553, "grad_norm": 64163.25390625, "learning_rate": 1.9910909396537235e-05, "loss": 0.7565, "step": 3580 }, { "epoch": 0.6034627668515717, "grad_norm": 61004.05078125, "learning_rate": 1.9826861657421415e-05, "loss": 0.7502, "step": 3590 }, { "epoch": 0.6051437216338881, "grad_norm": 66495.4375, "learning_rate": 1.97428139183056e-05, "loss": 0.7485, "step": 3600 }, { "epoch": 0.6068246764162044, "grad_norm": 62442.1015625, "learning_rate": 1.9658766179189782e-05, "loss": 0.7484, "step": 3610 }, { "epoch": 0.6085056311985207, "grad_norm": 66803.453125, "learning_rate": 1.9574718440073962e-05, "loss": 0.8459, "step": 3620 }, { "epoch": 0.6101865859808371, "grad_norm": 71394.6875, "learning_rate": 1.9490670700958145e-05, "loss": 0.8108, "step": 3630 }, { "epoch": 0.6118675407631534, "grad_norm": 72386.0859375, "learning_rate": 1.940662296184233e-05, "loss": 0.8039, "step": 3640 }, { "epoch": 0.6135484955454699, "grad_norm": 69155.015625, "learning_rate": 1.932257522272651e-05, "loss": 0.823, "step": 3650 }, { "epoch": 0.6152294503277862, "grad_norm": 60343.54296875, "learning_rate": 1.9238527483610693e-05, "loss": 0.7284, "step": 3660 }, { "epoch": 0.6169104051101025, "grad_norm": 70258.7734375, "learning_rate": 1.9154479744494873e-05, "loss": 0.7925, "step": 3670 }, { "epoch": 0.6185913598924189, "grad_norm": 68638.21875, "learning_rate": 1.9070432005379056e-05, "loss": 0.7802, "step": 3680 }, { "epoch": 0.6202723146747352, "grad_norm": 67208.890625, "learning_rate": 1.898638426626324e-05, "loss": 0.7921, "step": 3690 }, { "epoch": 0.6219532694570516, "grad_norm": 56271.5703125, "learning_rate": 1.890233652714742e-05, "loss": 0.7752, "step": 3700 }, { "epoch": 0.623634224239368, "grad_norm": 84835.21875, "learning_rate": 1.8818288788031603e-05, "loss": 0.8012, "step": 3710 }, { "epoch": 0.6253151790216843, "grad_norm": 58683.70703125, "learning_rate": 1.8734241048915787e-05, "loss": 0.7547, "step": 3720 }, { "epoch": 0.6269961338040007, "grad_norm": 66791.8828125, "learning_rate": 1.8650193309799967e-05, "loss": 0.782, "step": 3730 }, { "epoch": 0.628677088586317, "grad_norm": 60903.69140625, "learning_rate": 1.856614557068415e-05, "loss": 0.7304, "step": 3740 }, { "epoch": 0.6303580433686334, "grad_norm": 63727.53515625, "learning_rate": 1.848209783156833e-05, "loss": 0.7591, "step": 3750 }, { "epoch": 0.6320389981509498, "grad_norm": 65615.4140625, "learning_rate": 1.8398050092452514e-05, "loss": 0.7483, "step": 3760 }, { "epoch": 0.6337199529332661, "grad_norm": 72326.921875, "learning_rate": 1.8314002353336697e-05, "loss": 0.7795, "step": 3770 }, { "epoch": 0.6354009077155824, "grad_norm": 65443.81640625, "learning_rate": 1.8229954614220877e-05, "loss": 0.8154, "step": 3780 }, { "epoch": 0.6370818624978988, "grad_norm": 66016.125, "learning_rate": 1.814590687510506e-05, "loss": 0.8109, "step": 3790 }, { "epoch": 0.6387628172802151, "grad_norm": 70201.703125, "learning_rate": 1.8061859135989244e-05, "loss": 0.8039, "step": 3800 }, { "epoch": 0.6404437720625316, "grad_norm": 72543.0859375, "learning_rate": 1.7977811396873424e-05, "loss": 0.8201, "step": 3810 }, { "epoch": 0.6421247268448479, "grad_norm": 65741.5703125, "learning_rate": 1.7893763657757608e-05, "loss": 0.7919, "step": 3820 }, { "epoch": 0.6438056816271642, "grad_norm": 67635.5078125, "learning_rate": 1.7809715918641788e-05, "loss": 0.8174, "step": 3830 }, { "epoch": 0.6454866364094806, "grad_norm": 64830.48046875, "learning_rate": 1.772566817952597e-05, "loss": 0.7375, "step": 3840 }, { "epoch": 0.6471675911917969, "grad_norm": 66888.5234375, "learning_rate": 1.7641620440410155e-05, "loss": 0.7547, "step": 3850 }, { "epoch": 0.6488485459741133, "grad_norm": 67550.5234375, "learning_rate": 1.7557572701294335e-05, "loss": 0.7532, "step": 3860 }, { "epoch": 0.6505295007564297, "grad_norm": 66258.5859375, "learning_rate": 1.747352496217852e-05, "loss": 0.7623, "step": 3870 }, { "epoch": 0.652210455538746, "grad_norm": 70212.1875, "learning_rate": 1.7389477223062702e-05, "loss": 0.8022, "step": 3880 }, { "epoch": 0.6538914103210623, "grad_norm": 69388.6171875, "learning_rate": 1.7305429483946882e-05, "loss": 0.7672, "step": 3890 }, { "epoch": 0.6555723651033787, "grad_norm": 61498.984375, "learning_rate": 1.7221381744831065e-05, "loss": 0.7243, "step": 3900 }, { "epoch": 0.657253319885695, "grad_norm": 73348.640625, "learning_rate": 1.7137334005715245e-05, "loss": 0.7699, "step": 3910 }, { "epoch": 0.6589342746680115, "grad_norm": 76186.703125, "learning_rate": 1.705328626659943e-05, "loss": 0.8293, "step": 3920 }, { "epoch": 0.6606152294503278, "grad_norm": 72204.1171875, "learning_rate": 1.6969238527483612e-05, "loss": 0.8325, "step": 3930 }, { "epoch": 0.6622961842326441, "grad_norm": 68690.1328125, "learning_rate": 1.6885190788367792e-05, "loss": 0.8174, "step": 3940 }, { "epoch": 0.6639771390149605, "grad_norm": 69304.6015625, "learning_rate": 1.6801143049251976e-05, "loss": 0.7395, "step": 3950 }, { "epoch": 0.6656580937972768, "grad_norm": 67933.7890625, "learning_rate": 1.6717095310136156e-05, "loss": 0.7976, "step": 3960 }, { "epoch": 0.6673390485795933, "grad_norm": 74347.6953125, "learning_rate": 1.663304757102034e-05, "loss": 0.8003, "step": 3970 }, { "epoch": 0.6690200033619096, "grad_norm": 74672.625, "learning_rate": 1.6548999831904523e-05, "loss": 0.7809, "step": 3980 }, { "epoch": 0.6707009581442259, "grad_norm": 69611.8984375, "learning_rate": 1.6464952092788703e-05, "loss": 0.7884, "step": 3990 }, { "epoch": 0.6723819129265423, "grad_norm": 61687.546875, "learning_rate": 1.6380904353672887e-05, "loss": 0.7726, "step": 4000 }, { "epoch": 0.6740628677088586, "grad_norm": 57029.6953125, "learning_rate": 1.629685661455707e-05, "loss": 0.7537, "step": 4010 }, { "epoch": 0.675743822491175, "grad_norm": 62528.2265625, "learning_rate": 1.621280887544125e-05, "loss": 0.7542, "step": 4020 }, { "epoch": 0.6774247772734914, "grad_norm": 69657.46875, "learning_rate": 1.6128761136325434e-05, "loss": 0.7846, "step": 4030 }, { "epoch": 0.6791057320558077, "grad_norm": 61733.50390625, "learning_rate": 1.6044713397209614e-05, "loss": 0.7342, "step": 4040 }, { "epoch": 0.680786686838124, "grad_norm": 63571.60546875, "learning_rate": 1.5960665658093797e-05, "loss": 0.7046, "step": 4050 }, { "epoch": 0.6824676416204404, "grad_norm": 80437.828125, "learning_rate": 1.587661791897798e-05, "loss": 0.8413, "step": 4060 }, { "epoch": 0.6841485964027567, "grad_norm": 107205.921875, "learning_rate": 1.579257017986216e-05, "loss": 0.8052, "step": 4070 }, { "epoch": 0.6858295511850732, "grad_norm": 71237.375, "learning_rate": 1.5708522440746344e-05, "loss": 0.7729, "step": 4080 }, { "epoch": 0.6875105059673895, "grad_norm": 61227.4296875, "learning_rate": 1.5624474701630528e-05, "loss": 0.7972, "step": 4090 }, { "epoch": 0.6891914607497058, "grad_norm": 81108.9296875, "learning_rate": 1.5540426962514708e-05, "loss": 0.7173, "step": 4100 }, { "epoch": 0.6908724155320222, "grad_norm": 58387.5703125, "learning_rate": 1.545637922339889e-05, "loss": 0.8341, "step": 4110 }, { "epoch": 0.6925533703143385, "grad_norm": 77154.796875, "learning_rate": 1.537233148428307e-05, "loss": 0.7459, "step": 4120 }, { "epoch": 0.694234325096655, "grad_norm": 66626.140625, "learning_rate": 1.5288283745167255e-05, "loss": 0.7965, "step": 4130 }, { "epoch": 0.6959152798789713, "grad_norm": 65993.8671875, "learning_rate": 1.5204236006051437e-05, "loss": 0.8077, "step": 4140 }, { "epoch": 0.6975962346612876, "grad_norm": 65909.390625, "learning_rate": 1.512018826693562e-05, "loss": 0.7933, "step": 4150 }, { "epoch": 0.699277189443604, "grad_norm": 73940.3671875, "learning_rate": 1.5036140527819803e-05, "loss": 0.7937, "step": 4160 }, { "epoch": 0.7009581442259203, "grad_norm": 80516.8203125, "learning_rate": 1.4952092788703984e-05, "loss": 0.7921, "step": 4170 }, { "epoch": 0.7026390990082367, "grad_norm": 62047.8046875, "learning_rate": 1.4868045049588167e-05, "loss": 0.7405, "step": 4180 }, { "epoch": 0.7043200537905531, "grad_norm": 68648.3828125, "learning_rate": 1.478399731047235e-05, "loss": 0.7956, "step": 4190 }, { "epoch": 0.7060010085728694, "grad_norm": 74798.203125, "learning_rate": 1.469994957135653e-05, "loss": 0.7026, "step": 4200 }, { "epoch": 0.7076819633551857, "grad_norm": 65563.265625, "learning_rate": 1.4615901832240714e-05, "loss": 0.7846, "step": 4210 }, { "epoch": 0.7093629181375021, "grad_norm": 76866.96875, "learning_rate": 1.4531854093124898e-05, "loss": 0.829, "step": 4220 }, { "epoch": 0.7110438729198184, "grad_norm": 64904.796875, "learning_rate": 1.4447806354009078e-05, "loss": 0.8053, "step": 4230 }, { "epoch": 0.7127248277021349, "grad_norm": 72151.6484375, "learning_rate": 1.4363758614893261e-05, "loss": 0.7583, "step": 4240 }, { "epoch": 0.7144057824844512, "grad_norm": 57045.8203125, "learning_rate": 1.4279710875777441e-05, "loss": 0.7584, "step": 4250 }, { "epoch": 0.7160867372667675, "grad_norm": 74145.6953125, "learning_rate": 1.4195663136661625e-05, "loss": 0.7637, "step": 4260 }, { "epoch": 0.7177676920490839, "grad_norm": 63434.96875, "learning_rate": 1.4111615397545808e-05, "loss": 0.8278, "step": 4270 }, { "epoch": 0.7194486468314002, "grad_norm": 70511.5546875, "learning_rate": 1.4027567658429988e-05, "loss": 0.7512, "step": 4280 }, { "epoch": 0.7211296016137166, "grad_norm": 65691.6640625, "learning_rate": 1.3943519919314172e-05, "loss": 0.7509, "step": 4290 }, { "epoch": 0.722810556396033, "grad_norm": 72238.515625, "learning_rate": 1.3859472180198355e-05, "loss": 0.7977, "step": 4300 }, { "epoch": 0.7244915111783493, "grad_norm": 79115.5625, "learning_rate": 1.3775424441082535e-05, "loss": 0.8284, "step": 4310 }, { "epoch": 0.7261724659606656, "grad_norm": 64790.1484375, "learning_rate": 1.3691376701966719e-05, "loss": 0.7124, "step": 4320 }, { "epoch": 0.727853420742982, "grad_norm": 75220.65625, "learning_rate": 1.3607328962850899e-05, "loss": 0.7687, "step": 4330 }, { "epoch": 0.7295343755252983, "grad_norm": 60745.61328125, "learning_rate": 1.3523281223735082e-05, "loss": 0.7283, "step": 4340 }, { "epoch": 0.7312153303076148, "grad_norm": 54197.08203125, "learning_rate": 1.3439233484619266e-05, "loss": 0.768, "step": 4350 }, { "epoch": 0.7328962850899311, "grad_norm": 61422.91015625, "learning_rate": 1.3355185745503446e-05, "loss": 0.7324, "step": 4360 }, { "epoch": 0.7345772398722474, "grad_norm": 68666.9765625, "learning_rate": 1.327113800638763e-05, "loss": 0.8131, "step": 4370 }, { "epoch": 0.7362581946545638, "grad_norm": 54904.734375, "learning_rate": 1.3187090267271813e-05, "loss": 0.751, "step": 4380 }, { "epoch": 0.7379391494368801, "grad_norm": 74403.234375, "learning_rate": 1.3103042528155993e-05, "loss": 0.7564, "step": 4390 }, { "epoch": 0.7396201042191966, "grad_norm": 83832.09375, "learning_rate": 1.3018994789040176e-05, "loss": 0.7588, "step": 4400 }, { "epoch": 0.7413010590015129, "grad_norm": 65015.13671875, "learning_rate": 1.2934947049924356e-05, "loss": 0.7333, "step": 4410 }, { "epoch": 0.7429820137838292, "grad_norm": 70224.0234375, "learning_rate": 1.285089931080854e-05, "loss": 0.7437, "step": 4420 }, { "epoch": 0.7446629685661456, "grad_norm": 65735.6015625, "learning_rate": 1.2766851571692723e-05, "loss": 0.7311, "step": 4430 }, { "epoch": 0.7463439233484619, "grad_norm": 64739.11328125, "learning_rate": 1.2682803832576903e-05, "loss": 0.752, "step": 4440 }, { "epoch": 0.7480248781307783, "grad_norm": 84499.734375, "learning_rate": 1.2598756093461087e-05, "loss": 0.7085, "step": 4450 }, { "epoch": 0.7497058329130947, "grad_norm": 58072.46484375, "learning_rate": 1.251470835434527e-05, "loss": 0.7869, "step": 4460 }, { "epoch": 0.751386787695411, "grad_norm": 60343.64453125, "learning_rate": 1.243066061522945e-05, "loss": 0.748, "step": 4470 }, { "epoch": 0.7530677424777273, "grad_norm": 73976.90625, "learning_rate": 1.2346612876113634e-05, "loss": 0.8125, "step": 4480 }, { "epoch": 0.7547486972600437, "grad_norm": 83017.234375, "learning_rate": 1.2262565136997816e-05, "loss": 0.7647, "step": 4490 }, { "epoch": 0.75642965204236, "grad_norm": 63707.15625, "learning_rate": 1.2178517397881997e-05, "loss": 0.689, "step": 4500 }, { "epoch": 0.7581106068246765, "grad_norm": 62471.03125, "learning_rate": 1.2094469658766181e-05, "loss": 0.7642, "step": 4510 }, { "epoch": 0.7597915616069928, "grad_norm": 71938.2421875, "learning_rate": 1.2010421919650363e-05, "loss": 0.7018, "step": 4520 }, { "epoch": 0.7614725163893091, "grad_norm": 62137.34375, "learning_rate": 1.1926374180534544e-05, "loss": 0.7742, "step": 4530 }, { "epoch": 0.7631534711716255, "grad_norm": 74779.3515625, "learning_rate": 1.1842326441418726e-05, "loss": 0.76, "step": 4540 }, { "epoch": 0.7648344259539418, "grad_norm": 65167.984375, "learning_rate": 1.175827870230291e-05, "loss": 0.7814, "step": 4550 }, { "epoch": 0.7665153807362582, "grad_norm": 64530.6015625, "learning_rate": 1.1674230963187092e-05, "loss": 0.7496, "step": 4560 }, { "epoch": 0.7681963355185746, "grad_norm": 69663.7109375, "learning_rate": 1.1590183224071273e-05, "loss": 0.794, "step": 4570 }, { "epoch": 0.7698772903008909, "grad_norm": 58575.296875, "learning_rate": 1.1506135484955455e-05, "loss": 0.7438, "step": 4580 }, { "epoch": 0.7715582450832073, "grad_norm": 60153.59375, "learning_rate": 1.1422087745839639e-05, "loss": 0.7246, "step": 4590 }, { "epoch": 0.7732391998655236, "grad_norm": 63029.4140625, "learning_rate": 1.133804000672382e-05, "loss": 0.7449, "step": 4600 }, { "epoch": 0.7749201546478399, "grad_norm": 70431.859375, "learning_rate": 1.1253992267608002e-05, "loss": 0.8162, "step": 4610 }, { "epoch": 0.7766011094301564, "grad_norm": 56848.18359375, "learning_rate": 1.1169944528492184e-05, "loss": 0.8272, "step": 4620 }, { "epoch": 0.7782820642124727, "grad_norm": 63638.44921875, "learning_rate": 1.1085896789376367e-05, "loss": 0.6735, "step": 4630 }, { "epoch": 0.779963018994789, "grad_norm": 81583.1328125, "learning_rate": 1.1001849050260549e-05, "loss": 0.7195, "step": 4640 }, { "epoch": 0.7816439737771054, "grad_norm": 65090.41796875, "learning_rate": 1.0917801311144731e-05, "loss": 0.7348, "step": 4650 }, { "epoch": 0.7833249285594217, "grad_norm": 81298.7265625, "learning_rate": 1.0833753572028913e-05, "loss": 0.7036, "step": 4660 }, { "epoch": 0.7850058833417382, "grad_norm": 72416.609375, "learning_rate": 1.0749705832913096e-05, "loss": 0.759, "step": 4670 }, { "epoch": 0.7866868381240545, "grad_norm": 62808.859375, "learning_rate": 1.0665658093797278e-05, "loss": 0.8043, "step": 4680 }, { "epoch": 0.7883677929063708, "grad_norm": 57125.68359375, "learning_rate": 1.058161035468146e-05, "loss": 0.695, "step": 4690 }, { "epoch": 0.7900487476886872, "grad_norm": 70024.90625, "learning_rate": 1.0497562615565642e-05, "loss": 0.8026, "step": 4700 }, { "epoch": 0.7917297024710035, "grad_norm": 67909.5, "learning_rate": 1.0413514876449825e-05, "loss": 0.7139, "step": 4710 }, { "epoch": 0.7934106572533199, "grad_norm": 63104.14453125, "learning_rate": 1.0329467137334007e-05, "loss": 0.7578, "step": 4720 }, { "epoch": 0.7950916120356363, "grad_norm": 59612.7578125, "learning_rate": 1.0245419398218189e-05, "loss": 0.8141, "step": 4730 }, { "epoch": 0.7967725668179526, "grad_norm": 61766.25, "learning_rate": 1.016137165910237e-05, "loss": 0.7066, "step": 4740 }, { "epoch": 0.798453521600269, "grad_norm": 61020.91015625, "learning_rate": 1.0077323919986554e-05, "loss": 0.7129, "step": 4750 }, { "epoch": 0.8001344763825853, "grad_norm": 73282.953125, "learning_rate": 9.993276180870736e-06, "loss": 0.7443, "step": 4760 }, { "epoch": 0.8018154311649016, "grad_norm": 61251.37890625, "learning_rate": 9.909228441754917e-06, "loss": 0.7202, "step": 4770 }, { "epoch": 0.8034963859472181, "grad_norm": 57658.30859375, "learning_rate": 9.825180702639099e-06, "loss": 0.7261, "step": 4780 }, { "epoch": 0.8051773407295344, "grad_norm": 58032.66796875, "learning_rate": 9.741132963523283e-06, "loss": 0.7772, "step": 4790 }, { "epoch": 0.8068582955118507, "grad_norm": 67425.0703125, "learning_rate": 9.657085224407464e-06, "loss": 0.75, "step": 4800 }, { "epoch": 0.8085392502941671, "grad_norm": 61734.875, "learning_rate": 9.573037485291646e-06, "loss": 0.7046, "step": 4810 }, { "epoch": 0.8102202050764834, "grad_norm": 68259.4453125, "learning_rate": 9.48898974617583e-06, "loss": 0.7702, "step": 4820 }, { "epoch": 0.8119011598587998, "grad_norm": 62056.20703125, "learning_rate": 9.404942007060011e-06, "loss": 0.7043, "step": 4830 }, { "epoch": 0.8135821146411162, "grad_norm": 65011.296875, "learning_rate": 9.320894267944193e-06, "loss": 0.7163, "step": 4840 }, { "epoch": 0.8152630694234325, "grad_norm": 74655.8203125, "learning_rate": 9.236846528828375e-06, "loss": 0.8321, "step": 4850 }, { "epoch": 0.8169440242057489, "grad_norm": 75672.0703125, "learning_rate": 9.152798789712558e-06, "loss": 0.7922, "step": 4860 }, { "epoch": 0.8186249789880652, "grad_norm": 61180.29296875, "learning_rate": 9.06875105059674e-06, "loss": 0.7159, "step": 4870 }, { "epoch": 0.8203059337703815, "grad_norm": 87931.515625, "learning_rate": 8.984703311480922e-06, "loss": 0.6781, "step": 4880 }, { "epoch": 0.821986888552698, "grad_norm": 73175.0390625, "learning_rate": 8.900655572365104e-06, "loss": 0.7326, "step": 4890 }, { "epoch": 0.8236678433350143, "grad_norm": 63801.890625, "learning_rate": 8.816607833249287e-06, "loss": 0.7975, "step": 4900 }, { "epoch": 0.8253487981173306, "grad_norm": 80039.4375, "learning_rate": 8.732560094133469e-06, "loss": 0.6984, "step": 4910 }, { "epoch": 0.827029752899647, "grad_norm": 71785.921875, "learning_rate": 8.64851235501765e-06, "loss": 0.7481, "step": 4920 }, { "epoch": 0.8287107076819633, "grad_norm": 72605.5234375, "learning_rate": 8.564464615901833e-06, "loss": 0.7317, "step": 4930 }, { "epoch": 0.8303916624642798, "grad_norm": 72289.1171875, "learning_rate": 8.480416876786016e-06, "loss": 0.752, "step": 4940 }, { "epoch": 0.8320726172465961, "grad_norm": 69316.015625, "learning_rate": 8.396369137670198e-06, "loss": 0.722, "step": 4950 }, { "epoch": 0.8337535720289124, "grad_norm": 68252.515625, "learning_rate": 8.31232139855438e-06, "loss": 0.7674, "step": 4960 }, { "epoch": 0.8354345268112288, "grad_norm": 66645.7421875, "learning_rate": 8.228273659438561e-06, "loss": 0.7386, "step": 4970 }, { "epoch": 0.8371154815935451, "grad_norm": 64145.96875, "learning_rate": 8.144225920322745e-06, "loss": 0.7213, "step": 4980 }, { "epoch": 0.8387964363758615, "grad_norm": 62728.8125, "learning_rate": 8.060178181206927e-06, "loss": 0.6535, "step": 4990 }, { "epoch": 0.8404773911581779, "grad_norm": 72365.625, "learning_rate": 7.976130442091108e-06, "loss": 0.6833, "step": 5000 }, { "epoch": 0.8421583459404942, "grad_norm": 62805.015625, "learning_rate": 7.89208270297529e-06, "loss": 0.8574, "step": 5010 }, { "epoch": 0.8438393007228105, "grad_norm": 65521.5703125, "learning_rate": 7.808034963859474e-06, "loss": 0.7408, "step": 5020 }, { "epoch": 0.8455202555051269, "grad_norm": 67055.6171875, "learning_rate": 7.723987224743655e-06, "loss": 0.6515, "step": 5030 }, { "epoch": 0.8472012102874432, "grad_norm": 56498.43359375, "learning_rate": 7.639939485627837e-06, "loss": 0.8291, "step": 5040 }, { "epoch": 0.8488821650697597, "grad_norm": 68116.96875, "learning_rate": 7.555891746512018e-06, "loss": 0.7886, "step": 5050 }, { "epoch": 0.850563119852076, "grad_norm": 64178.546875, "learning_rate": 7.471844007396202e-06, "loss": 0.7746, "step": 5060 }, { "epoch": 0.8522440746343923, "grad_norm": 63573.09375, "learning_rate": 7.387796268280383e-06, "loss": 0.7769, "step": 5070 }, { "epoch": 0.8539250294167087, "grad_norm": 57395.2421875, "learning_rate": 7.303748529164565e-06, "loss": 0.7334, "step": 5080 }, { "epoch": 0.855605984199025, "grad_norm": 74868.375, "learning_rate": 7.219700790048749e-06, "loss": 0.7575, "step": 5090 }, { "epoch": 0.8572869389813415, "grad_norm": 69290.953125, "learning_rate": 7.1356530509329304e-06, "loss": 0.6967, "step": 5100 }, { "epoch": 0.8589678937636578, "grad_norm": 59573.5234375, "learning_rate": 7.051605311817112e-06, "loss": 0.8442, "step": 5110 }, { "epoch": 0.8606488485459741, "grad_norm": 61021.09375, "learning_rate": 6.967557572701294e-06, "loss": 0.7069, "step": 5120 }, { "epoch": 0.8623298033282905, "grad_norm": 71908.3359375, "learning_rate": 6.8835098335854775e-06, "loss": 0.6972, "step": 5130 }, { "epoch": 0.8640107581106068, "grad_norm": 71261.375, "learning_rate": 6.799462094469659e-06, "loss": 0.7308, "step": 5140 }, { "epoch": 0.8656917128929231, "grad_norm": 76563.328125, "learning_rate": 6.715414355353841e-06, "loss": 0.8391, "step": 5150 }, { "epoch": 0.8673726676752396, "grad_norm": 73876.8515625, "learning_rate": 6.631366616238023e-06, "loss": 0.6593, "step": 5160 }, { "epoch": 0.8690536224575559, "grad_norm": 58473.9140625, "learning_rate": 6.547318877122206e-06, "loss": 0.7766, "step": 5170 }, { "epoch": 0.8707345772398722, "grad_norm": 61153.5390625, "learning_rate": 6.463271138006388e-06, "loss": 0.7679, "step": 5180 }, { "epoch": 0.8724155320221886, "grad_norm": 58491.39453125, "learning_rate": 6.37922339889057e-06, "loss": 0.7115, "step": 5190 }, { "epoch": 0.8740964868045049, "grad_norm": 64786.53125, "learning_rate": 6.295175659774752e-06, "loss": 0.6976, "step": 5200 }, { "epoch": 0.8757774415868214, "grad_norm": 68189.5546875, "learning_rate": 6.211127920658934e-06, "loss": 0.8054, "step": 5210 }, { "epoch": 0.8774583963691377, "grad_norm": 66427.9921875, "learning_rate": 6.127080181543117e-06, "loss": 0.7512, "step": 5220 }, { "epoch": 0.879139351151454, "grad_norm": 57290.70703125, "learning_rate": 6.043032442427299e-06, "loss": 0.6927, "step": 5230 }, { "epoch": 0.8808203059337704, "grad_norm": 72431.390625, "learning_rate": 5.958984703311481e-06, "loss": 0.8137, "step": 5240 }, { "epoch": 0.8825012607160867, "grad_norm": 70580.7265625, "learning_rate": 5.874936964195663e-06, "loss": 0.7712, "step": 5250 }, { "epoch": 0.8841822154984031, "grad_norm": 73078.3046875, "learning_rate": 5.790889225079846e-06, "loss": 0.6761, "step": 5260 }, { "epoch": 0.8858631702807195, "grad_norm": 78010.0703125, "learning_rate": 5.7068414859640274e-06, "loss": 0.7715, "step": 5270 }, { "epoch": 0.8875441250630358, "grad_norm": 66805.8671875, "learning_rate": 5.62279374684821e-06, "loss": 0.7067, "step": 5280 }, { "epoch": 0.8892250798453522, "grad_norm": 68583.328125, "learning_rate": 5.538746007732392e-06, "loss": 0.7403, "step": 5290 }, { "epoch": 0.8909060346276685, "grad_norm": 80909.6796875, "learning_rate": 5.4546982686165745e-06, "loss": 0.7469, "step": 5300 }, { "epoch": 0.8925869894099848, "grad_norm": 62168.92578125, "learning_rate": 5.370650529500756e-06, "loss": 0.7284, "step": 5310 }, { "epoch": 0.8942679441923013, "grad_norm": 70689.0546875, "learning_rate": 5.286602790384939e-06, "loss": 0.7804, "step": 5320 }, { "epoch": 0.8959488989746176, "grad_norm": 70834.4609375, "learning_rate": 5.202555051269121e-06, "loss": 0.7504, "step": 5330 }, { "epoch": 0.8976298537569339, "grad_norm": 72244.8828125, "learning_rate": 5.118507312153303e-06, "loss": 0.6909, "step": 5340 }, { "epoch": 0.8993108085392503, "grad_norm": 68406.7734375, "learning_rate": 5.034459573037485e-06, "loss": 0.7554, "step": 5350 }, { "epoch": 0.9009917633215666, "grad_norm": 59602.8515625, "learning_rate": 4.950411833921668e-06, "loss": 0.7673, "step": 5360 }, { "epoch": 0.902672718103883, "grad_norm": 61461.9375, "learning_rate": 4.8663640948058495e-06, "loss": 0.7082, "step": 5370 }, { "epoch": 0.9043536728861994, "grad_norm": 64041.15625, "learning_rate": 4.782316355690032e-06, "loss": 0.682, "step": 5380 }, { "epoch": 0.9060346276685157, "grad_norm": 67531.3046875, "learning_rate": 4.698268616574215e-06, "loss": 0.8184, "step": 5390 }, { "epoch": 0.9077155824508321, "grad_norm": 64709.15625, "learning_rate": 4.6142208774583965e-06, "loss": 0.7433, "step": 5400 }, { "epoch": 0.9093965372331484, "grad_norm": 73473.203125, "learning_rate": 4.530173138342579e-06, "loss": 0.7386, "step": 5410 }, { "epoch": 0.9110774920154647, "grad_norm": 66414.84375, "learning_rate": 4.446125399226761e-06, "loss": 0.7839, "step": 5420 }, { "epoch": 0.9127584467977812, "grad_norm": 73395.6015625, "learning_rate": 4.3620776601109435e-06, "loss": 0.7699, "step": 5430 }, { "epoch": 0.9144394015800975, "grad_norm": 60129.15625, "learning_rate": 4.278029920995125e-06, "loss": 0.7111, "step": 5440 }, { "epoch": 0.9161203563624138, "grad_norm": 81137.3203125, "learning_rate": 4.193982181879308e-06, "loss": 0.7419, "step": 5450 }, { "epoch": 0.9178013111447302, "grad_norm": 69321.53125, "learning_rate": 4.10993444276349e-06, "loss": 0.6546, "step": 5460 }, { "epoch": 0.9194822659270465, "grad_norm": 73445.609375, "learning_rate": 4.025886703647672e-06, "loss": 0.7571, "step": 5470 }, { "epoch": 0.921163220709363, "grad_norm": 55211.56640625, "learning_rate": 3.941838964531854e-06, "loss": 0.7361, "step": 5480 }, { "epoch": 0.9228441754916793, "grad_norm": 60575.71484375, "learning_rate": 3.857791225416037e-06, "loss": 0.7854, "step": 5490 }, { "epoch": 0.9245251302739956, "grad_norm": 70186.140625, "learning_rate": 3.7737434863002185e-06, "loss": 0.8255, "step": 5500 }, { "epoch": 0.926206085056312, "grad_norm": 73100.078125, "learning_rate": 3.689695747184401e-06, "loss": 0.7469, "step": 5510 }, { "epoch": 0.9278870398386283, "grad_norm": 75805.078125, "learning_rate": 3.605648008068583e-06, "loss": 0.7616, "step": 5520 }, { "epoch": 0.9295679946209447, "grad_norm": 77849.140625, "learning_rate": 3.5216002689527655e-06, "loss": 0.7562, "step": 5530 }, { "epoch": 0.9312489494032611, "grad_norm": 68207.265625, "learning_rate": 3.4375525298369473e-06, "loss": 0.6813, "step": 5540 }, { "epoch": 0.9329299041855774, "grad_norm": 71734.1640625, "learning_rate": 3.35350479072113e-06, "loss": 0.6668, "step": 5550 }, { "epoch": 0.9346108589678938, "grad_norm": 80375.4375, "learning_rate": 3.2694570516053117e-06, "loss": 0.7542, "step": 5560 }, { "epoch": 0.9362918137502101, "grad_norm": 76297.75, "learning_rate": 3.1854093124894943e-06, "loss": 0.7539, "step": 5570 }, { "epoch": 0.9379727685325264, "grad_norm": 59915.84765625, "learning_rate": 3.1013615733736765e-06, "loss": 0.6965, "step": 5580 }, { "epoch": 0.9396537233148429, "grad_norm": 66482.078125, "learning_rate": 3.0173138342578587e-06, "loss": 0.7416, "step": 5590 }, { "epoch": 0.9413346780971592, "grad_norm": 73498.7578125, "learning_rate": 2.933266095142041e-06, "loss": 0.7404, "step": 5600 }, { "epoch": 0.9430156328794755, "grad_norm": 56677.17578125, "learning_rate": 2.849218356026223e-06, "loss": 0.7199, "step": 5610 }, { "epoch": 0.9446965876617919, "grad_norm": 63067.734375, "learning_rate": 2.7651706169104053e-06, "loss": 0.738, "step": 5620 }, { "epoch": 0.9463775424441082, "grad_norm": 65398.22265625, "learning_rate": 2.6811228777945875e-06, "loss": 0.7177, "step": 5630 }, { "epoch": 0.9480584972264247, "grad_norm": 56372.125, "learning_rate": 2.5970751386787698e-06, "loss": 0.7594, "step": 5640 }, { "epoch": 0.949739452008741, "grad_norm": 66133.2109375, "learning_rate": 2.513027399562952e-06, "loss": 0.755, "step": 5650 }, { "epoch": 0.9514204067910573, "grad_norm": 73855.5859375, "learning_rate": 2.428979660447134e-06, "loss": 0.7286, "step": 5660 }, { "epoch": 0.9531013615733737, "grad_norm": 80979.5703125, "learning_rate": 2.3449319213313164e-06, "loss": 0.7508, "step": 5670 }, { "epoch": 0.95478231635569, "grad_norm": 59096.21484375, "learning_rate": 2.2608841822154986e-06, "loss": 0.7273, "step": 5680 }, { "epoch": 0.9564632711380063, "grad_norm": 73199.34375, "learning_rate": 2.1768364430996808e-06, "loss": 0.8463, "step": 5690 }, { "epoch": 0.9581442259203228, "grad_norm": 70557.265625, "learning_rate": 2.092788703983863e-06, "loss": 0.7518, "step": 5700 }, { "epoch": 0.9598251807026391, "grad_norm": 64123.15234375, "learning_rate": 2.008740964868045e-06, "loss": 0.6743, "step": 5710 }, { "epoch": 0.9615061354849554, "grad_norm": 70495.03125, "learning_rate": 1.9246932257522274e-06, "loss": 0.7179, "step": 5720 }, { "epoch": 0.9631870902672718, "grad_norm": 62604.77734375, "learning_rate": 1.8406454866364096e-06, "loss": 0.7073, "step": 5730 }, { "epoch": 0.9648680450495881, "grad_norm": 69528.75, "learning_rate": 1.7565977475205918e-06, "loss": 0.7738, "step": 5740 }, { "epoch": 0.9665489998319046, "grad_norm": 64379.51953125, "learning_rate": 1.672550008404774e-06, "loss": 0.774, "step": 5750 }, { "epoch": 0.9682299546142209, "grad_norm": 70070.359375, "learning_rate": 1.5885022692889562e-06, "loss": 0.7902, "step": 5760 }, { "epoch": 0.9699109093965372, "grad_norm": 75525.7265625, "learning_rate": 1.5044545301731386e-06, "loss": 0.754, "step": 5770 }, { "epoch": 0.9715918641788536, "grad_norm": 72437.4375, "learning_rate": 1.4204067910573208e-06, "loss": 0.7122, "step": 5780 }, { "epoch": 0.9732728189611699, "grad_norm": 55177.8125, "learning_rate": 1.3363590519415028e-06, "loss": 0.7083, "step": 5790 }, { "epoch": 0.9749537737434864, "grad_norm": 74118.140625, "learning_rate": 1.252311312825685e-06, "loss": 0.7124, "step": 5800 }, { "epoch": 0.9766347285258027, "grad_norm": 75796.015625, "learning_rate": 1.1682635737098672e-06, "loss": 0.7563, "step": 5810 }, { "epoch": 0.978315683308119, "grad_norm": 66329.46875, "learning_rate": 1.0842158345940494e-06, "loss": 0.7379, "step": 5820 }, { "epoch": 0.9799966380904354, "grad_norm": 71971.1796875, "learning_rate": 1.0001680954782316e-06, "loss": 0.7237, "step": 5830 }, { "epoch": 0.9816775928727517, "grad_norm": 70406.546875, "learning_rate": 9.161203563624139e-07, "loss": 0.7518, "step": 5840 }, { "epoch": 0.983358547655068, "grad_norm": 65358.0, "learning_rate": 8.320726172465961e-07, "loss": 0.7263, "step": 5850 }, { "epoch": 0.9850395024373845, "grad_norm": 56430.7265625, "learning_rate": 7.480248781307783e-07, "loss": 0.7174, "step": 5860 }, { "epoch": 0.9867204572197008, "grad_norm": 69264.21875, "learning_rate": 6.639771390149606e-07, "loss": 0.7782, "step": 5870 }, { "epoch": 0.9884014120020171, "grad_norm": 60301.59765625, "learning_rate": 5.799293998991428e-07, "loss": 0.7487, "step": 5880 }, { "epoch": 0.9900823667843335, "grad_norm": 66026.0078125, "learning_rate": 4.95881660783325e-07, "loss": 0.7565, "step": 5890 }, { "epoch": 0.9917633215666498, "grad_norm": 63975.43359375, "learning_rate": 4.1183392166750716e-07, "loss": 0.7349, "step": 5900 }, { "epoch": 0.9934442763489663, "grad_norm": 72585.8046875, "learning_rate": 3.2778618255168936e-07, "loss": 0.7754, "step": 5910 }, { "epoch": 0.9951252311312826, "grad_norm": 67646.4296875, "learning_rate": 2.4373844343587156e-07, "loss": 0.7622, "step": 5920 }, { "epoch": 0.9968061859135989, "grad_norm": 74414.9375, "learning_rate": 1.596907043200538e-07, "loss": 0.7312, "step": 5930 }, { "epoch": 0.9984871406959153, "grad_norm": 69075.203125, "learning_rate": 7.5642965204236e-08, "loss": 0.7293, "step": 5940 }, { "epoch": 1.0, "step": 5949, "total_flos": 9.074524143432499e+17, "train_loss": 0.8014113598860578, "train_runtime": 82603.7739, "train_samples_per_second": 0.864, "train_steps_per_second": 0.072 } ], "logging_steps": 10, "max_steps": 5949, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5949, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.074524143432499e+17, "train_batch_size": 12, "trial_name": null, "trial_params": null }