{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 17790, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002810567734682406, "grad_norm": 96.63301086425781, "learning_rate": 1.1242270938729624e-07, "loss": 17.9839, "step": 1 }, { "epoch": 0.001405283867341203, "grad_norm": 115.6960678100586, "learning_rate": 5.621135469364812e-07, "loss": 18.4829, "step": 5 }, { "epoch": 0.002810567734682406, "grad_norm": 96.61837005615234, "learning_rate": 1.1242270938729624e-06, "loss": 18.1935, "step": 10 }, { "epoch": 0.0042158516020236085, "grad_norm": 132.02969360351562, "learning_rate": 1.6863406408094434e-06, "loss": 18.6642, "step": 15 }, { "epoch": 0.005621135469364812, "grad_norm": 80.8836441040039, "learning_rate": 2.248454187745925e-06, "loss": 17.9493, "step": 20 }, { "epoch": 0.0070264193367060145, "grad_norm": 87.12368774414062, "learning_rate": 2.810567734682406e-06, "loss": 17.8325, "step": 25 }, { "epoch": 0.008431703204047217, "grad_norm": 97.9262924194336, "learning_rate": 3.372681281618887e-06, "loss": 17.3621, "step": 30 }, { "epoch": 0.00983698707138842, "grad_norm": 75.57032012939453, "learning_rate": 3.9347948285553685e-06, "loss": 16.4306, "step": 35 }, { "epoch": 0.011242270938729624, "grad_norm": 71.48660278320312, "learning_rate": 4.49690837549185e-06, "loss": 15.7961, "step": 40 }, { "epoch": 0.012647554806070826, "grad_norm": 70.25821685791016, "learning_rate": 5.059021922428331e-06, "loss": 14.0696, "step": 45 }, { "epoch": 0.014052838673412029, "grad_norm": 68.05973052978516, "learning_rate": 5.621135469364812e-06, "loss": 12.9431, "step": 50 }, { "epoch": 0.015458122540753232, "grad_norm": 52.36540222167969, "learning_rate": 6.183249016301293e-06, "loss": 11.8773, "step": 55 }, { "epoch": 0.016863406408094434, "grad_norm": 52.9713134765625, "learning_rate": 6.745362563237774e-06, "loss": 10.3712, "step": 60 }, { "epoch": 0.018268690275435637, "grad_norm": 34.965667724609375, "learning_rate": 7.307476110174255e-06, "loss": 9.4828, "step": 65 }, { "epoch": 0.01967397414277684, "grad_norm": 29.373140335083008, "learning_rate": 7.869589657110737e-06, "loss": 8.2649, "step": 70 }, { "epoch": 0.021079258010118045, "grad_norm": 23.469764709472656, "learning_rate": 8.431703204047219e-06, "loss": 7.4431, "step": 75 }, { "epoch": 0.022484541877459248, "grad_norm": 15.5948486328125, "learning_rate": 8.9938167509837e-06, "loss": 6.6997, "step": 80 }, { "epoch": 0.02388982574480045, "grad_norm": 14.63684368133545, "learning_rate": 9.555930297920181e-06, "loss": 6.0383, "step": 85 }, { "epoch": 0.025295109612141653, "grad_norm": 9.950777053833008, "learning_rate": 1.0118043844856662e-05, "loss": 5.5526, "step": 90 }, { "epoch": 0.026700393479482856, "grad_norm": 8.490776062011719, "learning_rate": 1.0680157391793142e-05, "loss": 5.0148, "step": 95 }, { "epoch": 0.028105677346824058, "grad_norm": 5.514277935028076, "learning_rate": 1.1242270938729624e-05, "loss": 4.7405, "step": 100 }, { "epoch": 0.02951096121416526, "grad_norm": 3.681926965713501, "learning_rate": 1.1804384485666105e-05, "loss": 4.3935, "step": 105 }, { "epoch": 0.030916245081506463, "grad_norm": 2.6629223823547363, "learning_rate": 1.2366498032602587e-05, "loss": 4.2049, "step": 110 }, { "epoch": 0.03232152894884767, "grad_norm": 2.426866054534912, "learning_rate": 1.2928611579539069e-05, "loss": 3.9441, "step": 115 }, { "epoch": 0.03372681281618887, "grad_norm": 1.9169517755508423, "learning_rate": 1.3490725126475547e-05, "loss": 3.8311, "step": 120 }, { "epoch": 0.035132096683530074, "grad_norm": 3.5230212211608887, "learning_rate": 1.4052838673412031e-05, "loss": 3.6553, "step": 125 }, { "epoch": 0.03653738055087127, "grad_norm": 2.1056201457977295, "learning_rate": 1.461495222034851e-05, "loss": 3.67, "step": 130 }, { "epoch": 0.03794266441821248, "grad_norm": 2.1320669651031494, "learning_rate": 1.5177065767284992e-05, "loss": 3.5765, "step": 135 }, { "epoch": 0.03934794828555368, "grad_norm": 2.591682195663452, "learning_rate": 1.5739179314221474e-05, "loss": 3.5598, "step": 140 }, { "epoch": 0.040753232152894885, "grad_norm": 2.4693403244018555, "learning_rate": 1.6301292861157954e-05, "loss": 3.3373, "step": 145 }, { "epoch": 0.04215851602023609, "grad_norm": 2.9078850746154785, "learning_rate": 1.6863406408094438e-05, "loss": 3.1877, "step": 150 }, { "epoch": 0.04356379988757729, "grad_norm": 4.213356971740723, "learning_rate": 1.742551995503092e-05, "loss": 3.1931, "step": 155 }, { "epoch": 0.044969083754918496, "grad_norm": 6.048213958740234, "learning_rate": 1.79876335019674e-05, "loss": 3.0154, "step": 160 }, { "epoch": 0.046374367622259695, "grad_norm": 8.296866416931152, "learning_rate": 1.854974704890388e-05, "loss": 2.7614, "step": 165 }, { "epoch": 0.0477796514896009, "grad_norm": 11.323473930358887, "learning_rate": 1.9111860595840363e-05, "loss": 2.5561, "step": 170 }, { "epoch": 0.0491849353569421, "grad_norm": 8.23714828491211, "learning_rate": 1.967397414277684e-05, "loss": 2.0463, "step": 175 }, { "epoch": 0.050590219224283306, "grad_norm": 8.242648124694824, "learning_rate": 2.0236087689713324e-05, "loss": 1.7472, "step": 180 }, { "epoch": 0.051995503091624505, "grad_norm": 3.206883668899536, "learning_rate": 2.0798201236649804e-05, "loss": 1.5051, "step": 185 }, { "epoch": 0.05340078695896571, "grad_norm": 4.381938934326172, "learning_rate": 2.1360314783586284e-05, "loss": 1.3826, "step": 190 }, { "epoch": 0.05480607082630692, "grad_norm": 3.7853753566741943, "learning_rate": 2.1922428330522768e-05, "loss": 1.3835, "step": 195 }, { "epoch": 0.056211354693648116, "grad_norm": 3.806912422180176, "learning_rate": 2.248454187745925e-05, "loss": 1.3331, "step": 200 }, { "epoch": 0.05761663856098932, "grad_norm": 4.964791297912598, "learning_rate": 2.304665542439573e-05, "loss": 1.2944, "step": 205 }, { "epoch": 0.05902192242833052, "grad_norm": 2.8260233402252197, "learning_rate": 2.360876897133221e-05, "loss": 1.2893, "step": 210 }, { "epoch": 0.06042720629567173, "grad_norm": 4.206593036651611, "learning_rate": 2.4170882518268693e-05, "loss": 1.2806, "step": 215 }, { "epoch": 0.061832490163012926, "grad_norm": 3.243504762649536, "learning_rate": 2.4732996065205173e-05, "loss": 1.2596, "step": 220 }, { "epoch": 0.06323777403035413, "grad_norm": 3.7753634452819824, "learning_rate": 2.5295109612141654e-05, "loss": 1.2309, "step": 225 }, { "epoch": 0.06464305789769534, "grad_norm": 2.576805353164673, "learning_rate": 2.5857223159078137e-05, "loss": 1.2858, "step": 230 }, { "epoch": 0.06604834176503653, "grad_norm": 4.091485023498535, "learning_rate": 2.6419336706014614e-05, "loss": 1.2199, "step": 235 }, { "epoch": 0.06745362563237774, "grad_norm": 2.256748676300049, "learning_rate": 2.6981450252951095e-05, "loss": 1.1955, "step": 240 }, { "epoch": 0.06885890949971894, "grad_norm": 2.6037867069244385, "learning_rate": 2.754356379988758e-05, "loss": 1.1988, "step": 245 }, { "epoch": 0.07026419336706015, "grad_norm": 4.143032073974609, "learning_rate": 2.8105677346824062e-05, "loss": 1.2066, "step": 250 }, { "epoch": 0.07166947723440135, "grad_norm": 3.437196731567383, "learning_rate": 2.8667790893760543e-05, "loss": 1.1911, "step": 255 }, { "epoch": 0.07307476110174255, "grad_norm": 2.532320022583008, "learning_rate": 2.922990444069702e-05, "loss": 1.169, "step": 260 }, { "epoch": 0.07448004496908375, "grad_norm": 2.057971954345703, "learning_rate": 2.9792017987633503e-05, "loss": 1.1756, "step": 265 }, { "epoch": 0.07588532883642496, "grad_norm": 6.0402984619140625, "learning_rate": 3.0354131534569984e-05, "loss": 1.172, "step": 270 }, { "epoch": 0.07729061270376616, "grad_norm": 11.470712661743164, "learning_rate": 3.091624508150647e-05, "loss": 1.1545, "step": 275 }, { "epoch": 0.07869589657110736, "grad_norm": 3.5605509281158447, "learning_rate": 3.147835862844295e-05, "loss": 1.1581, "step": 280 }, { "epoch": 0.08010118043844856, "grad_norm": 10.173096656799316, "learning_rate": 3.204047217537943e-05, "loss": 1.1358, "step": 285 }, { "epoch": 0.08150646430578977, "grad_norm": 6.1135945320129395, "learning_rate": 3.260258572231591e-05, "loss": 1.1421, "step": 290 }, { "epoch": 0.08291174817313098, "grad_norm": 2.5845179557800293, "learning_rate": 3.316469926925239e-05, "loss": 1.1333, "step": 295 }, { "epoch": 0.08431703204047218, "grad_norm": 3.8196470737457275, "learning_rate": 3.3726812816188876e-05, "loss": 1.1352, "step": 300 }, { "epoch": 0.08572231590781337, "grad_norm": 1.9211612939834595, "learning_rate": 3.428892636312535e-05, "loss": 1.1224, "step": 305 }, { "epoch": 0.08712759977515458, "grad_norm": 7.396655559539795, "learning_rate": 3.485103991006184e-05, "loss": 1.1093, "step": 310 }, { "epoch": 0.08853288364249579, "grad_norm": 8.079272270202637, "learning_rate": 3.541315345699832e-05, "loss": 1.1327, "step": 315 }, { "epoch": 0.08993816750983699, "grad_norm": 1.7263989448547363, "learning_rate": 3.59752670039348e-05, "loss": 1.1074, "step": 320 }, { "epoch": 0.09134345137717818, "grad_norm": 6.237440586090088, "learning_rate": 3.653738055087128e-05, "loss": 1.1028, "step": 325 }, { "epoch": 0.09274873524451939, "grad_norm": 7.730904579162598, "learning_rate": 3.709949409780776e-05, "loss": 1.1231, "step": 330 }, { "epoch": 0.0941540191118606, "grad_norm": 7.369045734405518, "learning_rate": 3.766160764474424e-05, "loss": 1.1023, "step": 335 }, { "epoch": 0.0955593029792018, "grad_norm": 8.358131408691406, "learning_rate": 3.8223721191680726e-05, "loss": 1.1214, "step": 340 }, { "epoch": 0.09696458684654301, "grad_norm": 10.260603904724121, "learning_rate": 3.8785834738617206e-05, "loss": 1.1068, "step": 345 }, { "epoch": 0.0983698707138842, "grad_norm": 2.3183040618896484, "learning_rate": 3.934794828555368e-05, "loss": 1.0982, "step": 350 }, { "epoch": 0.0997751545812254, "grad_norm": 6.748996734619141, "learning_rate": 3.991006183249017e-05, "loss": 1.1149, "step": 355 }, { "epoch": 0.10118043844856661, "grad_norm": 3.9615724086761475, "learning_rate": 4.047217537942665e-05, "loss": 1.0757, "step": 360 }, { "epoch": 0.10258572231590782, "grad_norm": 6.453889846801758, "learning_rate": 4.103428892636313e-05, "loss": 1.0829, "step": 365 }, { "epoch": 0.10399100618324901, "grad_norm": 2.474149227142334, "learning_rate": 4.159640247329961e-05, "loss": 1.0964, "step": 370 }, { "epoch": 0.10539629005059022, "grad_norm": 4.072728633880615, "learning_rate": 4.215851602023609e-05, "loss": 1.0866, "step": 375 }, { "epoch": 0.10680157391793142, "grad_norm": 3.009608745574951, "learning_rate": 4.272062956717257e-05, "loss": 1.0807, "step": 380 }, { "epoch": 0.10820685778527263, "grad_norm": 8.37180233001709, "learning_rate": 4.3282743114109056e-05, "loss": 1.0558, "step": 385 }, { "epoch": 0.10961214165261383, "grad_norm": 5.847390651702881, "learning_rate": 4.3844856661045536e-05, "loss": 1.0805, "step": 390 }, { "epoch": 0.11101742551995503, "grad_norm": 5.546202182769775, "learning_rate": 4.440697020798201e-05, "loss": 1.0752, "step": 395 }, { "epoch": 0.11242270938729623, "grad_norm": 5.740989685058594, "learning_rate": 4.49690837549185e-05, "loss": 1.0631, "step": 400 }, { "epoch": 0.11382799325463744, "grad_norm": 2.483320713043213, "learning_rate": 4.553119730185498e-05, "loss": 1.0556, "step": 405 }, { "epoch": 0.11523327712197864, "grad_norm": 157.97256469726562, "learning_rate": 4.609331084879146e-05, "loss": 1.0953, "step": 410 }, { "epoch": 0.11663856098931984, "grad_norm": 6.035214900970459, "learning_rate": 4.665542439572794e-05, "loss": 1.0634, "step": 415 }, { "epoch": 0.11804384485666104, "grad_norm": 5.549474239349365, "learning_rate": 4.721753794266442e-05, "loss": 1.1014, "step": 420 }, { "epoch": 0.11944912872400225, "grad_norm": 7.824397563934326, "learning_rate": 4.77796514896009e-05, "loss": 1.0848, "step": 425 }, { "epoch": 0.12085441259134345, "grad_norm": 5.783429145812988, "learning_rate": 4.8341765036537386e-05, "loss": 1.0663, "step": 430 }, { "epoch": 0.12225969645868466, "grad_norm": 2.586413621902466, "learning_rate": 4.8903878583473866e-05, "loss": 1.0761, "step": 435 }, { "epoch": 0.12366498032602585, "grad_norm": 1.9556853771209717, "learning_rate": 4.9465992130410346e-05, "loss": 1.0505, "step": 440 }, { "epoch": 0.12507026419336706, "grad_norm": 2.7155544757843018, "learning_rate": 5.002810567734683e-05, "loss": 1.0534, "step": 445 }, { "epoch": 0.12647554806070826, "grad_norm": 2.0361337661743164, "learning_rate": 5.059021922428331e-05, "loss": 1.0671, "step": 450 }, { "epoch": 0.12788083192804947, "grad_norm": 3.4316651821136475, "learning_rate": 5.115233277121979e-05, "loss": 1.0728, "step": 455 }, { "epoch": 0.12928611579539068, "grad_norm": 10.915202140808105, "learning_rate": 5.1714446318156275e-05, "loss": 1.0571, "step": 460 }, { "epoch": 0.13069139966273188, "grad_norm": 8.766178131103516, "learning_rate": 5.2276559865092755e-05, "loss": 1.0604, "step": 465 }, { "epoch": 0.13209668353007306, "grad_norm": 6.660830497741699, "learning_rate": 5.283867341202923e-05, "loss": 1.0545, "step": 470 }, { "epoch": 0.13350196739741427, "grad_norm": 3.1207127571105957, "learning_rate": 5.340078695896571e-05, "loss": 1.0424, "step": 475 }, { "epoch": 0.13490725126475547, "grad_norm": 5.818568229675293, "learning_rate": 5.396290050590219e-05, "loss": 1.0461, "step": 480 }, { "epoch": 0.13631253513209668, "grad_norm": 5.6313910484313965, "learning_rate": 5.4525014052838676e-05, "loss": 1.0602, "step": 485 }, { "epoch": 0.13771781899943789, "grad_norm": 8.082501411437988, "learning_rate": 5.508712759977516e-05, "loss": 1.0362, "step": 490 }, { "epoch": 0.1391231028667791, "grad_norm": 5.104698181152344, "learning_rate": 5.564924114671164e-05, "loss": 1.0365, "step": 495 }, { "epoch": 0.1405283867341203, "grad_norm": 2.717695951461792, "learning_rate": 5.6211354693648124e-05, "loss": 1.0217, "step": 500 }, { "epoch": 0.1419336706014615, "grad_norm": 7.406291961669922, "learning_rate": 5.6773468240584605e-05, "loss": 1.0462, "step": 505 }, { "epoch": 0.1433389544688027, "grad_norm": 12.639225006103516, "learning_rate": 5.7335581787521085e-05, "loss": 1.0307, "step": 510 }, { "epoch": 0.1447442383361439, "grad_norm": 2.30743670463562, "learning_rate": 5.789769533445756e-05, "loss": 1.121, "step": 515 }, { "epoch": 0.1461495222034851, "grad_norm": 6.556262016296387, "learning_rate": 5.845980888139404e-05, "loss": 1.0203, "step": 520 }, { "epoch": 0.1475548060708263, "grad_norm": 7.790770530700684, "learning_rate": 5.9021922428330526e-05, "loss": 1.0727, "step": 525 }, { "epoch": 0.1489600899381675, "grad_norm": 2.6849281787872314, "learning_rate": 5.9584035975267006e-05, "loss": 1.0281, "step": 530 }, { "epoch": 0.1503653738055087, "grad_norm": 2.0983083248138428, "learning_rate": 6.014614952220349e-05, "loss": 1.0341, "step": 535 }, { "epoch": 0.15177065767284992, "grad_norm": 3.3170852661132812, "learning_rate": 6.070826306913997e-05, "loss": 1.049, "step": 540 }, { "epoch": 0.15317594154019112, "grad_norm": 2.881958484649658, "learning_rate": 6.127037661607645e-05, "loss": 1.0369, "step": 545 }, { "epoch": 0.15458122540753233, "grad_norm": 1.7803939580917358, "learning_rate": 6.183249016301293e-05, "loss": 1.048, "step": 550 }, { "epoch": 0.15598650927487354, "grad_norm": 5.816596984863281, "learning_rate": 6.239460370994942e-05, "loss": 1.0305, "step": 555 }, { "epoch": 0.15739179314221471, "grad_norm": 10.560365676879883, "learning_rate": 6.29567172568859e-05, "loss": 1.0275, "step": 560 }, { "epoch": 0.15879707700955592, "grad_norm": 7.882879734039307, "learning_rate": 6.351883080382237e-05, "loss": 1.0507, "step": 565 }, { "epoch": 0.16020236087689713, "grad_norm": 2.731837034225464, "learning_rate": 6.408094435075886e-05, "loss": 1.0264, "step": 570 }, { "epoch": 0.16160764474423833, "grad_norm": 12.15988540649414, "learning_rate": 6.464305789769533e-05, "loss": 1.0295, "step": 575 }, { "epoch": 0.16301292861157954, "grad_norm": 5.824089527130127, "learning_rate": 6.520517144463182e-05, "loss": 1.0193, "step": 580 }, { "epoch": 0.16441821247892074, "grad_norm": 5.727222442626953, "learning_rate": 6.57672849915683e-05, "loss": 1.0167, "step": 585 }, { "epoch": 0.16582349634626195, "grad_norm": 5.926555633544922, "learning_rate": 6.632939853850478e-05, "loss": 1.017, "step": 590 }, { "epoch": 0.16722878021360316, "grad_norm": 5.063398838043213, "learning_rate": 6.689151208544126e-05, "loss": 0.9991, "step": 595 }, { "epoch": 0.16863406408094436, "grad_norm": 3.315136671066284, "learning_rate": 6.745362563237775e-05, "loss": 1.027, "step": 600 }, { "epoch": 0.17003934794828554, "grad_norm": 8.511547088623047, "learning_rate": 6.801573917931423e-05, "loss": 1.0229, "step": 605 }, { "epoch": 0.17144463181562675, "grad_norm": 3.2332677841186523, "learning_rate": 6.85778527262507e-05, "loss": 1.0153, "step": 610 }, { "epoch": 0.17284991568296795, "grad_norm": 4.09757661819458, "learning_rate": 6.913996627318719e-05, "loss": 1.0119, "step": 615 }, { "epoch": 0.17425519955030916, "grad_norm": 40.794124603271484, "learning_rate": 6.970207982012367e-05, "loss": 1.0135, "step": 620 }, { "epoch": 0.17566048341765036, "grad_norm": 3.564362049102783, "learning_rate": 7.026419336706015e-05, "loss": 1.0159, "step": 625 }, { "epoch": 0.17706576728499157, "grad_norm": 3.3208792209625244, "learning_rate": 7.082630691399663e-05, "loss": 1.0091, "step": 630 }, { "epoch": 0.17847105115233278, "grad_norm": 2.5637195110321045, "learning_rate": 7.138842046093311e-05, "loss": 1.0092, "step": 635 }, { "epoch": 0.17987633501967398, "grad_norm": 4.020427703857422, "learning_rate": 7.19505340078696e-05, "loss": 1.0202, "step": 640 }, { "epoch": 0.1812816188870152, "grad_norm": 4.912067413330078, "learning_rate": 7.251264755480608e-05, "loss": 0.9985, "step": 645 }, { "epoch": 0.18268690275435637, "grad_norm": 2.0089938640594482, "learning_rate": 7.307476110174256e-05, "loss": 1.0217, "step": 650 }, { "epoch": 0.18409218662169757, "grad_norm": 10.243757247924805, "learning_rate": 7.363687464867903e-05, "loss": 1.0169, "step": 655 }, { "epoch": 0.18549747048903878, "grad_norm": 4.835886001586914, "learning_rate": 7.419898819561552e-05, "loss": 1.017, "step": 660 }, { "epoch": 0.18690275435637999, "grad_norm": 3.1483078002929688, "learning_rate": 7.4761101742552e-05, "loss": 1.0273, "step": 665 }, { "epoch": 0.1883080382237212, "grad_norm": 4.911044597625732, "learning_rate": 7.532321528948848e-05, "loss": 1.0251, "step": 670 }, { "epoch": 0.1897133220910624, "grad_norm": 3.3334503173828125, "learning_rate": 7.588532883642496e-05, "loss": 1.0054, "step": 675 }, { "epoch": 0.1911186059584036, "grad_norm": 9.200891494750977, "learning_rate": 7.644744238336145e-05, "loss": 1.0488, "step": 680 }, { "epoch": 0.1925238898257448, "grad_norm": 1.7150282859802246, "learning_rate": 7.700955593029792e-05, "loss": 1.0303, "step": 685 }, { "epoch": 0.19392917369308602, "grad_norm": 4.619067668914795, "learning_rate": 7.757166947723441e-05, "loss": 1.0174, "step": 690 }, { "epoch": 0.1953344575604272, "grad_norm": 2.54264760017395, "learning_rate": 7.813378302417089e-05, "loss": 1.0034, "step": 695 }, { "epoch": 0.1967397414277684, "grad_norm": 1.559621810913086, "learning_rate": 7.869589657110736e-05, "loss": 0.995, "step": 700 }, { "epoch": 0.1981450252951096, "grad_norm": 3.9408676624298096, "learning_rate": 7.925801011804385e-05, "loss": 1.006, "step": 705 }, { "epoch": 0.1995503091624508, "grad_norm": 5.564051151275635, "learning_rate": 7.982012366498033e-05, "loss": 1.0215, "step": 710 }, { "epoch": 0.20095559302979202, "grad_norm": 3.5051417350769043, "learning_rate": 8.038223721191681e-05, "loss": 1.0055, "step": 715 }, { "epoch": 0.20236087689713322, "grad_norm": 5.896759510040283, "learning_rate": 8.09443507588533e-05, "loss": 1.0162, "step": 720 }, { "epoch": 0.20376616076447443, "grad_norm": 2.0476253032684326, "learning_rate": 8.150646430578978e-05, "loss": 1.0338, "step": 725 }, { "epoch": 0.20517144463181564, "grad_norm": 4.3316545486450195, "learning_rate": 8.206857785272625e-05, "loss": 1.0276, "step": 730 }, { "epoch": 0.20657672849915684, "grad_norm": 3.6653268337249756, "learning_rate": 8.263069139966274e-05, "loss": 0.9991, "step": 735 }, { "epoch": 0.20798201236649802, "grad_norm": 2.070199966430664, "learning_rate": 8.319280494659922e-05, "loss": 0.999, "step": 740 }, { "epoch": 0.20938729623383923, "grad_norm": 5.528500556945801, "learning_rate": 8.375491849353569e-05, "loss": 0.9958, "step": 745 }, { "epoch": 0.21079258010118043, "grad_norm": 5.513944149017334, "learning_rate": 8.431703204047218e-05, "loss": 0.9946, "step": 750 }, { "epoch": 0.21219786396852164, "grad_norm": 2.3669729232788086, "learning_rate": 8.487914558740866e-05, "loss": 0.9898, "step": 755 }, { "epoch": 0.21360314783586284, "grad_norm": 6.63640022277832, "learning_rate": 8.544125913434514e-05, "loss": 1.0013, "step": 760 }, { "epoch": 0.21500843170320405, "grad_norm": 4.0495219230651855, "learning_rate": 8.600337268128162e-05, "loss": 1.0017, "step": 765 }, { "epoch": 0.21641371557054526, "grad_norm": 2.7395238876342773, "learning_rate": 8.656548622821811e-05, "loss": 0.999, "step": 770 }, { "epoch": 0.21781899943788646, "grad_norm": 3.2042696475982666, "learning_rate": 8.712759977515458e-05, "loss": 0.9674, "step": 775 }, { "epoch": 0.21922428330522767, "grad_norm": 4.542521953582764, "learning_rate": 8.768971332209107e-05, "loss": 0.9904, "step": 780 }, { "epoch": 0.22062956717256885, "grad_norm": 2.6508102416992188, "learning_rate": 8.825182686902755e-05, "loss": 0.9767, "step": 785 }, { "epoch": 0.22203485103991005, "grad_norm": 2.0267133712768555, "learning_rate": 8.881394041596402e-05, "loss": 0.9847, "step": 790 }, { "epoch": 0.22344013490725126, "grad_norm": 1.6072683334350586, "learning_rate": 8.93760539629005e-05, "loss": 0.9746, "step": 795 }, { "epoch": 0.22484541877459246, "grad_norm": 17.596202850341797, "learning_rate": 8.9938167509837e-05, "loss": 0.9854, "step": 800 }, { "epoch": 0.22625070264193367, "grad_norm": 3.559966564178467, "learning_rate": 9.050028105677347e-05, "loss": 1.0136, "step": 805 }, { "epoch": 0.22765598650927488, "grad_norm": 4.577120780944824, "learning_rate": 9.106239460370995e-05, "loss": 1.0072, "step": 810 }, { "epoch": 0.22906127037661608, "grad_norm": 5.16311502456665, "learning_rate": 9.162450815064644e-05, "loss": 1.0027, "step": 815 }, { "epoch": 0.2304665542439573, "grad_norm": 6.010768413543701, "learning_rate": 9.218662169758291e-05, "loss": 1.0, "step": 820 }, { "epoch": 0.2318718381112985, "grad_norm": 3.002239942550659, "learning_rate": 9.27487352445194e-05, "loss": 0.9899, "step": 825 }, { "epoch": 0.23327712197863967, "grad_norm": 4.010767459869385, "learning_rate": 9.331084879145588e-05, "loss": 0.9793, "step": 830 }, { "epoch": 0.23468240584598088, "grad_norm": 2.6313583850860596, "learning_rate": 9.387296233839236e-05, "loss": 0.9974, "step": 835 }, { "epoch": 0.23608768971332209, "grad_norm": 2.2778332233428955, "learning_rate": 9.443507588532884e-05, "loss": 0.9862, "step": 840 }, { "epoch": 0.2374929735806633, "grad_norm": 2.9743850231170654, "learning_rate": 9.499718943226532e-05, "loss": 0.988, "step": 845 }, { "epoch": 0.2388982574480045, "grad_norm": 38.51722717285156, "learning_rate": 9.55593029792018e-05, "loss": 1.0115, "step": 850 }, { "epoch": 0.2403035413153457, "grad_norm": 2.403254508972168, "learning_rate": 9.612141652613828e-05, "loss": 1.0015, "step": 855 }, { "epoch": 0.2417088251826869, "grad_norm": 5.464376449584961, "learning_rate": 9.668353007307477e-05, "loss": 1.0364, "step": 860 }, { "epoch": 0.24311410905002812, "grad_norm": 3.9224069118499756, "learning_rate": 9.724564362001124e-05, "loss": 0.996, "step": 865 }, { "epoch": 0.24451939291736932, "grad_norm": 3.181598424911499, "learning_rate": 9.780775716694773e-05, "loss": 0.9846, "step": 870 }, { "epoch": 0.2459246767847105, "grad_norm": 3.1867926120758057, "learning_rate": 9.83698707138842e-05, "loss": 0.976, "step": 875 }, { "epoch": 0.2473299606520517, "grad_norm": 3.488450765609741, "learning_rate": 9.893198426082069e-05, "loss": 0.9664, "step": 880 }, { "epoch": 0.2487352445193929, "grad_norm": 3.7854244709014893, "learning_rate": 9.949409780775717e-05, "loss": 0.9857, "step": 885 }, { "epoch": 0.2501405283867341, "grad_norm": 23.101707458496094, "learning_rate": 0.00010005621135469365, "loss": 0.9901, "step": 890 }, { "epoch": 0.2515458122540753, "grad_norm": 1.0758060216903687, "learning_rate": 0.00010061832490163013, "loss": 0.9794, "step": 895 }, { "epoch": 0.25295109612141653, "grad_norm": 2.6756820678710938, "learning_rate": 0.00010118043844856661, "loss": 1.006, "step": 900 }, { "epoch": 0.2543563799887577, "grad_norm": 3.2079479694366455, "learning_rate": 0.00010174255199550309, "loss": 0.9816, "step": 905 }, { "epoch": 0.25576166385609894, "grad_norm": 2.79168963432312, "learning_rate": 0.00010230466554243957, "loss": 0.9726, "step": 910 }, { "epoch": 0.2571669477234401, "grad_norm": 3.4075028896331787, "learning_rate": 0.00010286677908937605, "loss": 1.0016, "step": 915 }, { "epoch": 0.25857223159078135, "grad_norm": 2.6918251514434814, "learning_rate": 0.00010342889263631255, "loss": 0.9852, "step": 920 }, { "epoch": 0.25997751545812253, "grad_norm": 1.3752806186676025, "learning_rate": 0.00010399100618324902, "loss": 0.997, "step": 925 }, { "epoch": 0.26138279932546377, "grad_norm": 1.351408839225769, "learning_rate": 0.00010455311973018551, "loss": 0.9827, "step": 930 }, { "epoch": 0.26278808319280494, "grad_norm": 2.1732945442199707, "learning_rate": 0.00010511523327712198, "loss": 0.9762, "step": 935 }, { "epoch": 0.2641933670601461, "grad_norm": 2.2650272846221924, "learning_rate": 0.00010567734682405846, "loss": 0.9887, "step": 940 }, { "epoch": 0.26559865092748736, "grad_norm": 1.916999101638794, "learning_rate": 0.00010623946037099494, "loss": 0.9847, "step": 945 }, { "epoch": 0.26700393479482853, "grad_norm": 2.3614659309387207, "learning_rate": 0.00010680157391793142, "loss": 0.9843, "step": 950 }, { "epoch": 0.26840921866216977, "grad_norm": 0.9917680621147156, "learning_rate": 0.00010736368746486792, "loss": 0.9775, "step": 955 }, { "epoch": 0.26981450252951095, "grad_norm": 5.091005802154541, "learning_rate": 0.00010792580101180438, "loss": 0.9921, "step": 960 }, { "epoch": 0.2712197863968522, "grad_norm": 4.194843769073486, "learning_rate": 0.00010848791455874088, "loss": 0.9906, "step": 965 }, { "epoch": 0.27262507026419336, "grad_norm": 12.555365562438965, "learning_rate": 0.00010905002810567735, "loss": 0.983, "step": 970 }, { "epoch": 0.2740303541315346, "grad_norm": 6.976730823516846, "learning_rate": 0.00010961214165261384, "loss": 0.9863, "step": 975 }, { "epoch": 0.27543563799887577, "grad_norm": 5.128290176391602, "learning_rate": 0.00011017425519955031, "loss": 0.9908, "step": 980 }, { "epoch": 0.27684092186621695, "grad_norm": 1.3355897665023804, "learning_rate": 0.00011073636874648679, "loss": 0.9693, "step": 985 }, { "epoch": 0.2782462057335582, "grad_norm": 5.7703680992126465, "learning_rate": 0.00011129848229342327, "loss": 0.9829, "step": 990 }, { "epoch": 0.27965148960089936, "grad_norm": 3.747619867324829, "learning_rate": 0.00011186059584035975, "loss": 0.9776, "step": 995 }, { "epoch": 0.2810567734682406, "grad_norm": 3.929070472717285, "learning_rate": 0.00011242270938729625, "loss": 0.9846, "step": 1000 }, { "epoch": 0.2824620573355818, "grad_norm": 1.7742552757263184, "learning_rate": 0.00011298482293423271, "loss": 0.9908, "step": 1005 }, { "epoch": 0.283867341202923, "grad_norm": 6.783897399902344, "learning_rate": 0.00011354693648116921, "loss": 0.963, "step": 1010 }, { "epoch": 0.2852726250702642, "grad_norm": 1.7316850423812866, "learning_rate": 0.00011410905002810568, "loss": 0.9804, "step": 1015 }, { "epoch": 0.2866779089376054, "grad_norm": 1.1217191219329834, "learning_rate": 0.00011467116357504217, "loss": 0.9709, "step": 1020 }, { "epoch": 0.2880831928049466, "grad_norm": 5.882577896118164, "learning_rate": 0.00011523327712197864, "loss": 1.0065, "step": 1025 }, { "epoch": 0.2894884766722878, "grad_norm": 2.939326524734497, "learning_rate": 0.00011579539066891512, "loss": 0.9849, "step": 1030 }, { "epoch": 0.290893760539629, "grad_norm": 3.1381020545959473, "learning_rate": 0.0001163575042158516, "loss": 1.0186, "step": 1035 }, { "epoch": 0.2922990444069702, "grad_norm": 0.9430264234542847, "learning_rate": 0.00011691961776278808, "loss": 0.9546, "step": 1040 }, { "epoch": 0.2937043282743114, "grad_norm": 1.3062149286270142, "learning_rate": 0.00011748173130972458, "loss": 0.9848, "step": 1045 }, { "epoch": 0.2951096121416526, "grad_norm": 1.2571016550064087, "learning_rate": 0.00011804384485666105, "loss": 0.9755, "step": 1050 }, { "epoch": 0.29651489600899383, "grad_norm": 0.9796061515808105, "learning_rate": 0.00011860595840359754, "loss": 0.984, "step": 1055 }, { "epoch": 0.297920179876335, "grad_norm": 1.193790316581726, "learning_rate": 0.00011916807195053401, "loss": 0.964, "step": 1060 }, { "epoch": 0.29932546374367625, "grad_norm": 4.409701347351074, "learning_rate": 0.0001197301854974705, "loss": 0.9847, "step": 1065 }, { "epoch": 0.3007307476110174, "grad_norm": 1.948830246925354, "learning_rate": 0.00012029229904440697, "loss": 1.0549, "step": 1070 }, { "epoch": 0.3021360314783586, "grad_norm": 3.3599255084991455, "learning_rate": 0.00012085441259134345, "loss": 0.9967, "step": 1075 }, { "epoch": 0.30354131534569984, "grad_norm": 2.4269542694091797, "learning_rate": 0.00012141652613827993, "loss": 0.983, "step": 1080 }, { "epoch": 0.304946599213041, "grad_norm": 0.9867331385612488, "learning_rate": 0.00012197863968521641, "loss": 0.9672, "step": 1085 }, { "epoch": 0.30635188308038225, "grad_norm": 1.6596901416778564, "learning_rate": 0.0001225407532321529, "loss": 0.9783, "step": 1090 }, { "epoch": 0.3077571669477234, "grad_norm": 3.4487783908843994, "learning_rate": 0.00012310286677908938, "loss": 0.9693, "step": 1095 }, { "epoch": 0.30916245081506466, "grad_norm": 1.8991751670837402, "learning_rate": 0.00012366498032602587, "loss": 0.9756, "step": 1100 }, { "epoch": 0.31056773468240584, "grad_norm": 1.1805132627487183, "learning_rate": 0.00012422709387296233, "loss": 0.9629, "step": 1105 }, { "epoch": 0.31197301854974707, "grad_norm": 2.110664129257202, "learning_rate": 0.00012478920741989884, "loss": 0.9692, "step": 1110 }, { "epoch": 0.31337830241708825, "grad_norm": 1.4803400039672852, "learning_rate": 0.0001253513209668353, "loss": 0.9595, "step": 1115 }, { "epoch": 0.31478358628442943, "grad_norm": 4.135382652282715, "learning_rate": 0.0001259134345137718, "loss": 0.9667, "step": 1120 }, { "epoch": 0.31618887015177066, "grad_norm": 1.764575719833374, "learning_rate": 0.00012647554806070828, "loss": 0.9922, "step": 1125 }, { "epoch": 0.31759415401911184, "grad_norm": 1.1572849750518799, "learning_rate": 0.00012703766160764474, "loss": 0.9887, "step": 1130 }, { "epoch": 0.3189994378864531, "grad_norm": 1.766047477722168, "learning_rate": 0.00012759977515458123, "loss": 0.9518, "step": 1135 }, { "epoch": 0.32040472175379425, "grad_norm": 0.9366393685340881, "learning_rate": 0.0001281618887015177, "loss": 0.9745, "step": 1140 }, { "epoch": 0.3218100056211355, "grad_norm": 2.0685994625091553, "learning_rate": 0.0001287240022484542, "loss": 0.9711, "step": 1145 }, { "epoch": 0.32321528948847666, "grad_norm": 1.1759870052337646, "learning_rate": 0.00012928611579539066, "loss": 0.9752, "step": 1150 }, { "epoch": 0.3246205733558179, "grad_norm": 1.212844967842102, "learning_rate": 0.00012984822934232717, "loss": 0.9411, "step": 1155 }, { "epoch": 0.3260258572231591, "grad_norm": 3.3538694381713867, "learning_rate": 0.00013041034288926363, "loss": 0.9753, "step": 1160 }, { "epoch": 0.32743114109050026, "grad_norm": 2.4922068119049072, "learning_rate": 0.00013097245643620012, "loss": 0.9722, "step": 1165 }, { "epoch": 0.3288364249578415, "grad_norm": 38.42106628417969, "learning_rate": 0.0001315345699831366, "loss": 0.9676, "step": 1170 }, { "epoch": 0.33024170882518267, "grad_norm": 1.5579969882965088, "learning_rate": 0.00013209668353007307, "loss": 1.0097, "step": 1175 }, { "epoch": 0.3316469926925239, "grad_norm": 4.966186046600342, "learning_rate": 0.00013265879707700956, "loss": 0.9647, "step": 1180 }, { "epoch": 0.3330522765598651, "grad_norm": 3.0127198696136475, "learning_rate": 0.00013322091062394604, "loss": 0.9513, "step": 1185 }, { "epoch": 0.3344575604272063, "grad_norm": 1.1111674308776855, "learning_rate": 0.00013378302417088253, "loss": 0.9383, "step": 1190 }, { "epoch": 0.3358628442945475, "grad_norm": 1.101528525352478, "learning_rate": 0.000134345137717819, "loss": 0.9508, "step": 1195 }, { "epoch": 0.3372681281618887, "grad_norm": 4.363356113433838, "learning_rate": 0.0001349072512647555, "loss": 0.9496, "step": 1200 }, { "epoch": 0.3386734120292299, "grad_norm": 1.4446074962615967, "learning_rate": 0.00013546936481169196, "loss": 0.9652, "step": 1205 }, { "epoch": 0.3400786958965711, "grad_norm": 5.832637786865234, "learning_rate": 0.00013603147835862845, "loss": 0.976, "step": 1210 }, { "epoch": 0.3414839797639123, "grad_norm": 3.8764944076538086, "learning_rate": 0.00013659359190556494, "loss": 0.9526, "step": 1215 }, { "epoch": 0.3428892636312535, "grad_norm": 6.050147533416748, "learning_rate": 0.0001371557054525014, "loss": 0.9262, "step": 1220 }, { "epoch": 0.3442945474985947, "grad_norm": 2.3198440074920654, "learning_rate": 0.00013771781899943789, "loss": 0.9513, "step": 1225 }, { "epoch": 0.3456998313659359, "grad_norm": 10.792181015014648, "learning_rate": 0.00013827993254637437, "loss": 0.9447, "step": 1230 }, { "epoch": 0.34710511523327714, "grad_norm": 2.717088222503662, "learning_rate": 0.00013884204609331086, "loss": 0.9519, "step": 1235 }, { "epoch": 0.3485103991006183, "grad_norm": 1.6102532148361206, "learning_rate": 0.00013940415964024735, "loss": 0.9635, "step": 1240 }, { "epoch": 0.34991568296795955, "grad_norm": 8.83648681640625, "learning_rate": 0.00013996627318718383, "loss": 1.0115, "step": 1245 }, { "epoch": 0.35132096683530073, "grad_norm": 2.410649299621582, "learning_rate": 0.0001405283867341203, "loss": 0.9562, "step": 1250 }, { "epoch": 0.3527262507026419, "grad_norm": 1.7195380926132202, "learning_rate": 0.00014109050028105678, "loss": 0.9423, "step": 1255 }, { "epoch": 0.35413153456998314, "grad_norm": 2.283205270767212, "learning_rate": 0.00014165261382799327, "loss": 0.9423, "step": 1260 }, { "epoch": 0.3555368184373243, "grad_norm": 1.9860799312591553, "learning_rate": 0.00014221472737492973, "loss": 0.9557, "step": 1265 }, { "epoch": 0.35694210230466555, "grad_norm": 2.0284266471862793, "learning_rate": 0.00014277684092186622, "loss": 0.9422, "step": 1270 }, { "epoch": 0.35834738617200673, "grad_norm": 2.489513635635376, "learning_rate": 0.0001433389544688027, "loss": 0.9776, "step": 1275 }, { "epoch": 0.35975267003934797, "grad_norm": 1.5750313997268677, "learning_rate": 0.0001439010680157392, "loss": 0.9573, "step": 1280 }, { "epoch": 0.36115795390668914, "grad_norm": 3.0681686401367188, "learning_rate": 0.00014446318156267568, "loss": 0.9514, "step": 1285 }, { "epoch": 0.3625632377740304, "grad_norm": 3.1011362075805664, "learning_rate": 0.00014502529510961216, "loss": 0.9721, "step": 1290 }, { "epoch": 0.36396852164137156, "grad_norm": 1.5543673038482666, "learning_rate": 0.00014558740865654862, "loss": 0.9467, "step": 1295 }, { "epoch": 0.36537380550871273, "grad_norm": 2.299344539642334, "learning_rate": 0.0001461495222034851, "loss": 0.9351, "step": 1300 }, { "epoch": 0.36677908937605397, "grad_norm": 3.671466588973999, "learning_rate": 0.0001467116357504216, "loss": 0.967, "step": 1305 }, { "epoch": 0.36818437324339515, "grad_norm": 1.6322294473648071, "learning_rate": 0.00014727374929735806, "loss": 0.9511, "step": 1310 }, { "epoch": 0.3695896571107364, "grad_norm": 1.7458432912826538, "learning_rate": 0.00014783586284429457, "loss": 1.0559, "step": 1315 }, { "epoch": 0.37099494097807756, "grad_norm": 1.6651675701141357, "learning_rate": 0.00014839797639123103, "loss": 0.9355, "step": 1320 }, { "epoch": 0.3724002248454188, "grad_norm": 3.2860312461853027, "learning_rate": 0.00014896008993816752, "loss": 0.9926, "step": 1325 }, { "epoch": 0.37380550871275997, "grad_norm": 1.6739774942398071, "learning_rate": 0.000149522203485104, "loss": 0.9617, "step": 1330 }, { "epoch": 0.3752107925801012, "grad_norm": 1.423102855682373, "learning_rate": 0.0001500843170320405, "loss": 0.9352, "step": 1335 }, { "epoch": 0.3766160764474424, "grad_norm": 4.285151958465576, "learning_rate": 0.00015064643057897695, "loss": 0.9744, "step": 1340 }, { "epoch": 0.37802136031478356, "grad_norm": 0.7504451870918274, "learning_rate": 0.00015120854412591344, "loss": 0.9643, "step": 1345 }, { "epoch": 0.3794266441821248, "grad_norm": 2.98614764213562, "learning_rate": 0.00015177065767284993, "loss": 0.941, "step": 1350 }, { "epoch": 0.380831928049466, "grad_norm": 2.592519521713257, "learning_rate": 0.0001523327712197864, "loss": 1.0328, "step": 1355 }, { "epoch": 0.3822372119168072, "grad_norm": 1.2264142036437988, "learning_rate": 0.0001528948847667229, "loss": 0.9446, "step": 1360 }, { "epoch": 0.3836424957841484, "grad_norm": 1.3170299530029297, "learning_rate": 0.00015345699831365936, "loss": 0.9687, "step": 1365 }, { "epoch": 0.3850477796514896, "grad_norm": 1.306921362876892, "learning_rate": 0.00015401911186059585, "loss": 0.9547, "step": 1370 }, { "epoch": 0.3864530635188308, "grad_norm": 1.5683096647262573, "learning_rate": 0.00015458122540753234, "loss": 0.9549, "step": 1375 }, { "epoch": 0.38785834738617203, "grad_norm": 1.1363381147384644, "learning_rate": 0.00015514333895446882, "loss": 0.9694, "step": 1380 }, { "epoch": 0.3892636312535132, "grad_norm": 1.0660864114761353, "learning_rate": 0.00015570545250140528, "loss": 0.9643, "step": 1385 }, { "epoch": 0.3906689151208544, "grad_norm": 1.0931308269500732, "learning_rate": 0.00015626756604834177, "loss": 0.9533, "step": 1390 }, { "epoch": 0.3920741989881956, "grad_norm": 1.278201937675476, "learning_rate": 0.00015682967959527826, "loss": 0.9554, "step": 1395 }, { "epoch": 0.3934794828555368, "grad_norm": 0.7979600429534912, "learning_rate": 0.00015739179314221472, "loss": 0.9313, "step": 1400 }, { "epoch": 0.39488476672287803, "grad_norm": 2.599208116531372, "learning_rate": 0.00015795390668915123, "loss": 0.942, "step": 1405 }, { "epoch": 0.3962900505902192, "grad_norm": 1.375581979751587, "learning_rate": 0.0001585160202360877, "loss": 0.9513, "step": 1410 }, { "epoch": 0.39769533445756045, "grad_norm": 1.6056209802627563, "learning_rate": 0.00015907813378302418, "loss": 0.9683, "step": 1415 }, { "epoch": 0.3991006183249016, "grad_norm": 2.402937173843384, "learning_rate": 0.00015964024732996067, "loss": 0.9309, "step": 1420 }, { "epoch": 0.40050590219224286, "grad_norm": 4.955761909484863, "learning_rate": 0.00016020236087689715, "loss": 0.9383, "step": 1425 }, { "epoch": 0.40191118605958404, "grad_norm": 0.937556266784668, "learning_rate": 0.00016076447442383361, "loss": 0.9778, "step": 1430 }, { "epoch": 0.4033164699269252, "grad_norm": 0.992476761341095, "learning_rate": 0.0001613265879707701, "loss": 0.9425, "step": 1435 }, { "epoch": 0.40472175379426645, "grad_norm": 0.7373234629631042, "learning_rate": 0.0001618887015177066, "loss": 0.9552, "step": 1440 }, { "epoch": 0.4061270376616076, "grad_norm": 2.2464535236358643, "learning_rate": 0.00016245081506464305, "loss": 0.9572, "step": 1445 }, { "epoch": 0.40753232152894886, "grad_norm": 2.751627206802368, "learning_rate": 0.00016301292861157956, "loss": 0.9556, "step": 1450 }, { "epoch": 0.40893760539629004, "grad_norm": 0.9762445688247681, "learning_rate": 0.00016357504215851602, "loss": 0.9433, "step": 1455 }, { "epoch": 0.41034288926363127, "grad_norm": 2.10992431640625, "learning_rate": 0.0001641371557054525, "loss": 0.95, "step": 1460 }, { "epoch": 0.41174817313097245, "grad_norm": 2.1031289100646973, "learning_rate": 0.000164699269252389, "loss": 0.9414, "step": 1465 }, { "epoch": 0.4131534569983137, "grad_norm": 1.6147500276565552, "learning_rate": 0.00016526138279932548, "loss": 0.9454, "step": 1470 }, { "epoch": 0.41455874086565486, "grad_norm": 2.729752779006958, "learning_rate": 0.00016582349634626194, "loss": 0.9483, "step": 1475 }, { "epoch": 0.41596402473299604, "grad_norm": 0.8766548037528992, "learning_rate": 0.00016638560989319843, "loss": 0.9263, "step": 1480 }, { "epoch": 0.4173693086003373, "grad_norm": 1.7011388540267944, "learning_rate": 0.00016694772344013492, "loss": 0.985, "step": 1485 }, { "epoch": 0.41877459246767845, "grad_norm": 1.1635528802871704, "learning_rate": 0.00016750983698707138, "loss": 0.9511, "step": 1490 }, { "epoch": 0.4201798763350197, "grad_norm": 1.0107743740081787, "learning_rate": 0.0001680719505340079, "loss": 0.9545, "step": 1495 }, { "epoch": 0.42158516020236086, "grad_norm": 1.4937266111373901, "learning_rate": 0.00016863406408094435, "loss": 0.9632, "step": 1500 }, { "epoch": 0.4229904440697021, "grad_norm": 0.9577686190605164, "learning_rate": 0.00016919617762788084, "loss": 0.9236, "step": 1505 }, { "epoch": 0.4243957279370433, "grad_norm": 1.1507699489593506, "learning_rate": 0.00016975829117481733, "loss": 0.9276, "step": 1510 }, { "epoch": 0.4258010118043845, "grad_norm": 1.34715735912323, "learning_rate": 0.00017032040472175381, "loss": 0.9482, "step": 1515 }, { "epoch": 0.4272062956717257, "grad_norm": 1.894851803779602, "learning_rate": 0.00017088251826869027, "loss": 0.9504, "step": 1520 }, { "epoch": 0.42861157953906687, "grad_norm": 2.9545974731445312, "learning_rate": 0.00017144463181562676, "loss": 0.9504, "step": 1525 }, { "epoch": 0.4300168634064081, "grad_norm": 1.8173575401306152, "learning_rate": 0.00017200674536256325, "loss": 0.9181, "step": 1530 }, { "epoch": 0.4314221472737493, "grad_norm": 0.6965020298957825, "learning_rate": 0.0001725688589094997, "loss": 0.9325, "step": 1535 }, { "epoch": 0.4328274311410905, "grad_norm": 0.732700526714325, "learning_rate": 0.00017313097245643622, "loss": 0.963, "step": 1540 }, { "epoch": 0.4342327150084317, "grad_norm": 0.7434464693069458, "learning_rate": 0.00017369308600337268, "loss": 0.9319, "step": 1545 }, { "epoch": 0.4356379988757729, "grad_norm": 0.7588825821876526, "learning_rate": 0.00017425519955030917, "loss": 0.9438, "step": 1550 }, { "epoch": 0.4370432827431141, "grad_norm": 1.028361201286316, "learning_rate": 0.00017481731309724566, "loss": 0.9543, "step": 1555 }, { "epoch": 0.43844856661045534, "grad_norm": 0.9480350613594055, "learning_rate": 0.00017537942664418214, "loss": 0.9612, "step": 1560 }, { "epoch": 0.4398538504777965, "grad_norm": 0.8604362607002258, "learning_rate": 0.0001759415401911186, "loss": 0.9319, "step": 1565 }, { "epoch": 0.4412591343451377, "grad_norm": 1.40911066532135, "learning_rate": 0.0001765036537380551, "loss": 0.9613, "step": 1570 }, { "epoch": 0.4426644182124789, "grad_norm": 0.7593716979026794, "learning_rate": 0.00017706576728499158, "loss": 0.9468, "step": 1575 }, { "epoch": 0.4440697020798201, "grad_norm": 1.2685699462890625, "learning_rate": 0.00017762788083192804, "loss": 0.9466, "step": 1580 }, { "epoch": 0.44547498594716134, "grad_norm": 0.813873291015625, "learning_rate": 0.00017818999437886455, "loss": 0.9277, "step": 1585 }, { "epoch": 0.4468802698145025, "grad_norm": 2.493716239929199, "learning_rate": 0.000178752107925801, "loss": 0.9486, "step": 1590 }, { "epoch": 0.44828555368184375, "grad_norm": 4.12313175201416, "learning_rate": 0.0001793142214727375, "loss": 0.9498, "step": 1595 }, { "epoch": 0.44969083754918493, "grad_norm": 2.0010995864868164, "learning_rate": 0.000179876335019674, "loss": 0.9319, "step": 1600 }, { "epoch": 0.45109612141652616, "grad_norm": 1.3007354736328125, "learning_rate": 0.00018043844856661047, "loss": 0.9427, "step": 1605 }, { "epoch": 0.45250140528386734, "grad_norm": 2.6882872581481934, "learning_rate": 0.00018100056211354693, "loss": 0.9784, "step": 1610 }, { "epoch": 0.4539066891512085, "grad_norm": 0.7096717953681946, "learning_rate": 0.00018156267566048342, "loss": 0.9584, "step": 1615 }, { "epoch": 0.45531197301854975, "grad_norm": 0.6443789601325989, "learning_rate": 0.0001821247892074199, "loss": 0.939, "step": 1620 }, { "epoch": 0.45671725688589093, "grad_norm": 0.7829910516738892, "learning_rate": 0.00018268690275435637, "loss": 0.9273, "step": 1625 }, { "epoch": 0.45812254075323217, "grad_norm": 1.2124425172805786, "learning_rate": 0.00018324901630129288, "loss": 0.9321, "step": 1630 }, { "epoch": 0.45952782462057334, "grad_norm": 7.380026340484619, "learning_rate": 0.00018381112984822934, "loss": 1.0049, "step": 1635 }, { "epoch": 0.4609331084879146, "grad_norm": 1.8242307901382446, "learning_rate": 0.00018437324339516583, "loss": 0.9445, "step": 1640 }, { "epoch": 0.46233839235525576, "grad_norm": 1.4180879592895508, "learning_rate": 0.00018493535694210232, "loss": 0.946, "step": 1645 }, { "epoch": 0.463743676222597, "grad_norm": 2.3717854022979736, "learning_rate": 0.0001854974704890388, "loss": 0.9333, "step": 1650 }, { "epoch": 0.46514896008993817, "grad_norm": 4.793551921844482, "learning_rate": 0.00018605958403597526, "loss": 0.9427, "step": 1655 }, { "epoch": 0.46655424395727935, "grad_norm": 1.434067726135254, "learning_rate": 0.00018662169758291175, "loss": 0.9378, "step": 1660 }, { "epoch": 0.4679595278246206, "grad_norm": 1.8079577684402466, "learning_rate": 0.00018718381112984824, "loss": 0.9234, "step": 1665 }, { "epoch": 0.46936481169196176, "grad_norm": 2.289116144180298, "learning_rate": 0.00018774592467678473, "loss": 0.9345, "step": 1670 }, { "epoch": 0.470770095559303, "grad_norm": 1.7819665670394897, "learning_rate": 0.0001883080382237212, "loss": 0.9202, "step": 1675 }, { "epoch": 0.47217537942664417, "grad_norm": 1.405173897743225, "learning_rate": 0.00018887015177065767, "loss": 0.9222, "step": 1680 }, { "epoch": 0.4735806632939854, "grad_norm": 1.151394009590149, "learning_rate": 0.00018943226531759416, "loss": 0.9552, "step": 1685 }, { "epoch": 0.4749859471613266, "grad_norm": 1.7987416982650757, "learning_rate": 0.00018999437886453065, "loss": 0.9193, "step": 1690 }, { "epoch": 0.4763912310286678, "grad_norm": 3.1489951610565186, "learning_rate": 0.00019055649241146713, "loss": 0.973, "step": 1695 }, { "epoch": 0.477796514896009, "grad_norm": 1.2289990186691284, "learning_rate": 0.0001911186059584036, "loss": 0.9227, "step": 1700 }, { "epoch": 0.4792017987633502, "grad_norm": 1.305069923400879, "learning_rate": 0.00019168071950534008, "loss": 0.9307, "step": 1705 }, { "epoch": 0.4806070826306914, "grad_norm": 1.7787383794784546, "learning_rate": 0.00019224283305227657, "loss": 0.922, "step": 1710 }, { "epoch": 0.4820123664980326, "grad_norm": 1.4966998100280762, "learning_rate": 0.00019280494659921306, "loss": 0.9326, "step": 1715 }, { "epoch": 0.4834176503653738, "grad_norm": 1.7701624631881714, "learning_rate": 0.00019336706014614954, "loss": 0.95, "step": 1720 }, { "epoch": 0.484822934232715, "grad_norm": 3.242124319076538, "learning_rate": 0.000193929173693086, "loss": 1.0283, "step": 1725 }, { "epoch": 0.48622821810005623, "grad_norm": 1.4034018516540527, "learning_rate": 0.0001944912872400225, "loss": 0.9584, "step": 1730 }, { "epoch": 0.4876335019673974, "grad_norm": 1.9737906455993652, "learning_rate": 0.00019505340078695898, "loss": 0.9374, "step": 1735 }, { "epoch": 0.48903878583473864, "grad_norm": 0.9555981159210205, "learning_rate": 0.00019561551433389546, "loss": 0.9341, "step": 1740 }, { "epoch": 0.4904440697020798, "grad_norm": 1.5899726152420044, "learning_rate": 0.00019617762788083195, "loss": 0.9383, "step": 1745 }, { "epoch": 0.491849353569421, "grad_norm": 0.6762555837631226, "learning_rate": 0.0001967397414277684, "loss": 0.9336, "step": 1750 }, { "epoch": 0.49325463743676223, "grad_norm": 0.9318333268165588, "learning_rate": 0.0001973018549747049, "loss": 0.9246, "step": 1755 }, { "epoch": 0.4946599213041034, "grad_norm": 1.3311002254486084, "learning_rate": 0.00019786396852164139, "loss": 0.9285, "step": 1760 }, { "epoch": 0.49606520517144465, "grad_norm": 2.389007806777954, "learning_rate": 0.00019842608206857787, "loss": 0.923, "step": 1765 }, { "epoch": 0.4974704890387858, "grad_norm": 2.03153657913208, "learning_rate": 0.00019898819561551433, "loss": 0.9424, "step": 1770 }, { "epoch": 0.49887577290612706, "grad_norm": 2.2828330993652344, "learning_rate": 0.00019955030916245082, "loss": 0.9245, "step": 1775 }, { "epoch": 0.5002810567734682, "grad_norm": 0.7001500129699707, "learning_rate": 0.0001999999980749907, "loss": 1.002, "step": 1780 }, { "epoch": 0.5016863406408094, "grad_norm": 0.7825088500976562, "learning_rate": 0.0001999999306996728, "loss": 0.9381, "step": 1785 }, { "epoch": 0.5030916245081506, "grad_norm": 1.0128787755966187, "learning_rate": 0.0001999997670739637, "loss": 0.9247, "step": 1790 }, { "epoch": 0.5044969083754919, "grad_norm": 5.00535249710083, "learning_rate": 0.0001999995071980209, "loss": 0.9202, "step": 1795 }, { "epoch": 0.5059021922428331, "grad_norm": 1.0808401107788086, "learning_rate": 0.00019999915107209458, "loss": 0.9324, "step": 1800 }, { "epoch": 0.5073074761101742, "grad_norm": 2.2071526050567627, "learning_rate": 0.00019999869869652748, "loss": 0.9561, "step": 1805 }, { "epoch": 0.5087127599775154, "grad_norm": 3.5948903560638428, "learning_rate": 0.00019999815007175502, "loss": 0.937, "step": 1810 }, { "epoch": 0.5101180438448567, "grad_norm": 0.7620937824249268, "learning_rate": 0.00019999750519830522, "loss": 0.9281, "step": 1815 }, { "epoch": 0.5115233277121979, "grad_norm": 2.3497204780578613, "learning_rate": 0.00019999676407679885, "loss": 0.9289, "step": 1820 }, { "epoch": 0.5129286115795391, "grad_norm": 1.0564337968826294, "learning_rate": 0.00019999592670794916, "loss": 0.9256, "step": 1825 }, { "epoch": 0.5143338954468802, "grad_norm": 0.8735659122467041, "learning_rate": 0.00019999499309256215, "loss": 0.9301, "step": 1830 }, { "epoch": 0.5157391793142214, "grad_norm": 1.4924262762069702, "learning_rate": 0.00019999396323153645, "loss": 0.9426, "step": 1835 }, { "epoch": 0.5171444631815627, "grad_norm": 2.184305429458618, "learning_rate": 0.00019999283712586328, "loss": 0.9199, "step": 1840 }, { "epoch": 0.5185497470489039, "grad_norm": 0.9658164381980896, "learning_rate": 0.00019999161477662653, "loss": 0.9344, "step": 1845 }, { "epoch": 0.5199550309162451, "grad_norm": 0.8796664476394653, "learning_rate": 0.00019999029618500273, "loss": 0.9335, "step": 1850 }, { "epoch": 0.5213603147835862, "grad_norm": 1.1257429122924805, "learning_rate": 0.00019998888135226104, "loss": 0.9265, "step": 1855 }, { "epoch": 0.5227655986509275, "grad_norm": 1.2149803638458252, "learning_rate": 0.00019998737027976323, "loss": 0.9255, "step": 1860 }, { "epoch": 0.5241708825182687, "grad_norm": 1.6585772037506104, "learning_rate": 0.00019998576296896366, "loss": 0.9232, "step": 1865 }, { "epoch": 0.5255761663856099, "grad_norm": 1.0203379392623901, "learning_rate": 0.00019998405942140942, "loss": 0.9343, "step": 1870 }, { "epoch": 0.5269814502529511, "grad_norm": 1.212179183959961, "learning_rate": 0.00019998225963874022, "loss": 0.9311, "step": 1875 }, { "epoch": 0.5283867341202922, "grad_norm": 2.862008571624756, "learning_rate": 0.00019998036362268832, "loss": 0.9289, "step": 1880 }, { "epoch": 0.5297920179876335, "grad_norm": 1.0918841361999512, "learning_rate": 0.00019997837137507865, "loss": 0.948, "step": 1885 }, { "epoch": 0.5311973018549747, "grad_norm": 1.3842644691467285, "learning_rate": 0.00019997628289782874, "loss": 0.9262, "step": 1890 }, { "epoch": 0.5326025857223159, "grad_norm": 1.543491244316101, "learning_rate": 0.0001999740981929488, "loss": 0.9329, "step": 1895 }, { "epoch": 0.5340078695896571, "grad_norm": 1.2093950510025024, "learning_rate": 0.00019997181726254154, "loss": 0.9714, "step": 1900 }, { "epoch": 0.5354131534569984, "grad_norm": 0.9563074111938477, "learning_rate": 0.00019996944010880247, "loss": 0.9445, "step": 1905 }, { "epoch": 0.5368184373243395, "grad_norm": 0.952589750289917, "learning_rate": 0.00019996696673401954, "loss": 0.9308, "step": 1910 }, { "epoch": 0.5382237211916807, "grad_norm": 1.4094700813293457, "learning_rate": 0.0001999643971405734, "loss": 0.9215, "step": 1915 }, { "epoch": 0.5396290050590219, "grad_norm": 0.819480299949646, "learning_rate": 0.0001999617313309373, "loss": 0.9293, "step": 1920 }, { "epoch": 0.5410342889263631, "grad_norm": 0.8585676550865173, "learning_rate": 0.0001999589693076771, "loss": 0.9155, "step": 1925 }, { "epoch": 0.5424395727937044, "grad_norm": 1.154471755027771, "learning_rate": 0.00019995611107345127, "loss": 0.9389, "step": 1930 }, { "epoch": 0.5438448566610455, "grad_norm": 1.1028889417648315, "learning_rate": 0.00019995315663101082, "loss": 0.9127, "step": 1935 }, { "epoch": 0.5452501405283867, "grad_norm": 0.968929648399353, "learning_rate": 0.00019995010598319947, "loss": 0.9408, "step": 1940 }, { "epoch": 0.5466554243957279, "grad_norm": 1.0982134342193604, "learning_rate": 0.00019994695913295348, "loss": 0.9062, "step": 1945 }, { "epoch": 0.5480607082630692, "grad_norm": 0.6998859643936157, "learning_rate": 0.00019994371608330166, "loss": 0.9204, "step": 1950 }, { "epoch": 0.5494659921304104, "grad_norm": 1.3865907192230225, "learning_rate": 0.0001999403768373655, "loss": 0.9142, "step": 1955 }, { "epoch": 0.5508712759977515, "grad_norm": 1.1197766065597534, "learning_rate": 0.00019993694139835904, "loss": 0.9232, "step": 1960 }, { "epoch": 0.5522765598650927, "grad_norm": 0.7113555669784546, "learning_rate": 0.0001999334097695889, "loss": 0.9303, "step": 1965 }, { "epoch": 0.5536818437324339, "grad_norm": 0.5910270810127258, "learning_rate": 0.0001999297819544543, "loss": 0.9236, "step": 1970 }, { "epoch": 0.5550871275997752, "grad_norm": 0.5368698835372925, "learning_rate": 0.000199926057956447, "loss": 0.932, "step": 1975 }, { "epoch": 0.5564924114671164, "grad_norm": 2.1805481910705566, "learning_rate": 0.00019992223777915132, "loss": 0.9225, "step": 1980 }, { "epoch": 0.5578976953344575, "grad_norm": 2.1168668270111084, "learning_rate": 0.00019991832142624434, "loss": 0.9148, "step": 1985 }, { "epoch": 0.5593029792017987, "grad_norm": 1.4886637926101685, "learning_rate": 0.00019991430890149549, "loss": 0.9378, "step": 1990 }, { "epoch": 0.56070826306914, "grad_norm": 0.647662878036499, "learning_rate": 0.00019991020020876675, "loss": 0.9283, "step": 1995 }, { "epoch": 0.5621135469364812, "grad_norm": 1.1095281839370728, "learning_rate": 0.00019990599535201292, "loss": 0.9062, "step": 2000 }, { "epoch": 0.5635188308038224, "grad_norm": 0.6007787585258484, "learning_rate": 0.0001999016943352811, "loss": 0.9207, "step": 2005 }, { "epoch": 0.5649241146711635, "grad_norm": 1.0246546268463135, "learning_rate": 0.00019989729716271106, "loss": 0.9243, "step": 2010 }, { "epoch": 0.5663293985385047, "grad_norm": 0.9852647185325623, "learning_rate": 0.0001998928038385351, "loss": 0.9044, "step": 2015 }, { "epoch": 0.567734682405846, "grad_norm": 0.6966019868850708, "learning_rate": 0.00019988821436707805, "loss": 0.9343, "step": 2020 }, { "epoch": 0.5691399662731872, "grad_norm": 1.1939654350280762, "learning_rate": 0.0001998835287527573, "loss": 0.9349, "step": 2025 }, { "epoch": 0.5705452501405284, "grad_norm": 0.592685341835022, "learning_rate": 0.00019987874700008282, "loss": 0.939, "step": 2030 }, { "epoch": 0.5719505340078695, "grad_norm": 0.609995424747467, "learning_rate": 0.000199873869113657, "loss": 0.9275, "step": 2035 }, { "epoch": 0.5733558178752108, "grad_norm": 1.3798760175704956, "learning_rate": 0.00019986889509817485, "loss": 0.9265, "step": 2040 }, { "epoch": 0.574761101742552, "grad_norm": 0.6716069579124451, "learning_rate": 0.00019986382495842394, "loss": 0.918, "step": 2045 }, { "epoch": 0.5761663856098932, "grad_norm": 2.2000043392181396, "learning_rate": 0.0001998586586992842, "loss": 0.9108, "step": 2050 }, { "epoch": 0.5775716694772344, "grad_norm": 0.9085923433303833, "learning_rate": 0.00019985339632572826, "loss": 0.9138, "step": 2055 }, { "epoch": 0.5789769533445756, "grad_norm": 0.5850256085395813, "learning_rate": 0.00019984803784282116, "loss": 0.9131, "step": 2060 }, { "epoch": 0.5803822372119168, "grad_norm": 0.6118605136871338, "learning_rate": 0.00019984258325572043, "loss": 0.9301, "step": 2065 }, { "epoch": 0.581787521079258, "grad_norm": 0.936461865901947, "learning_rate": 0.0001998370325696762, "loss": 0.9257, "step": 2070 }, { "epoch": 0.5831928049465992, "grad_norm": 1.579474687576294, "learning_rate": 0.00019983138579003095, "loss": 1.0169, "step": 2075 }, { "epoch": 0.5845980888139404, "grad_norm": 1.5684884786605835, "learning_rate": 0.0001998256429222198, "loss": 0.932, "step": 2080 }, { "epoch": 0.5860033726812817, "grad_norm": 0.9524665474891663, "learning_rate": 0.00019981980397177024, "loss": 0.9334, "step": 2085 }, { "epoch": 0.5874086565486228, "grad_norm": 1.5445525646209717, "learning_rate": 0.00019981386894430233, "loss": 0.9331, "step": 2090 }, { "epoch": 0.588813940415964, "grad_norm": 0.8801035284996033, "learning_rate": 0.00019980783784552853, "loss": 0.9187, "step": 2095 }, { "epoch": 0.5902192242833052, "grad_norm": 1.2468377351760864, "learning_rate": 0.0001998017106812538, "loss": 0.9256, "step": 2100 }, { "epoch": 0.5916245081506464, "grad_norm": 1.2207618951797485, "learning_rate": 0.00019979548745737558, "loss": 0.9445, "step": 2105 }, { "epoch": 0.5930297920179877, "grad_norm": 0.573489785194397, "learning_rate": 0.00019978916817988375, "loss": 0.9224, "step": 2110 }, { "epoch": 0.5944350758853288, "grad_norm": 0.6274116635322571, "learning_rate": 0.00019978275285486064, "loss": 0.9221, "step": 2115 }, { "epoch": 0.59584035975267, "grad_norm": 0.6306831240653992, "learning_rate": 0.000199776241488481, "loss": 0.8899, "step": 2120 }, { "epoch": 0.5972456436200112, "grad_norm": 0.6260164976119995, "learning_rate": 0.00019976963408701207, "loss": 0.9108, "step": 2125 }, { "epoch": 0.5986509274873525, "grad_norm": 1.0233713388442993, "learning_rate": 0.00019976293065681353, "loss": 0.9266, "step": 2130 }, { "epoch": 0.6000562113546937, "grad_norm": 0.7524725198745728, "learning_rate": 0.00019975613120433745, "loss": 0.9121, "step": 2135 }, { "epoch": 0.6014614952220348, "grad_norm": 0.9613284468650818, "learning_rate": 0.0001997492357361283, "loss": 0.9071, "step": 2140 }, { "epoch": 0.602866779089376, "grad_norm": 1.719527244567871, "learning_rate": 0.00019974224425882306, "loss": 0.9086, "step": 2145 }, { "epoch": 0.6042720629567172, "grad_norm": 0.6603277921676636, "learning_rate": 0.00019973515677915103, "loss": 0.9066, "step": 2150 }, { "epoch": 0.6056773468240585, "grad_norm": 0.7629093527793884, "learning_rate": 0.0001997279733039339, "loss": 0.9185, "step": 2155 }, { "epoch": 0.6070826306913997, "grad_norm": 1.0620144605636597, "learning_rate": 0.00019972069384008588, "loss": 0.9076, "step": 2160 }, { "epoch": 0.6084879145587408, "grad_norm": 0.8223029971122742, "learning_rate": 0.00019971331839461337, "loss": 0.9276, "step": 2165 }, { "epoch": 0.609893198426082, "grad_norm": 0.9483692049980164, "learning_rate": 0.00019970584697461542, "loss": 0.8958, "step": 2170 }, { "epoch": 0.6112984822934233, "grad_norm": 1.230619192123413, "learning_rate": 0.00019969827958728317, "loss": 0.9166, "step": 2175 }, { "epoch": 0.6127037661607645, "grad_norm": 0.8268842697143555, "learning_rate": 0.00019969061623990037, "loss": 0.9181, "step": 2180 }, { "epoch": 0.6141090500281057, "grad_norm": 1.1452637910842896, "learning_rate": 0.00019968285693984297, "loss": 0.9154, "step": 2185 }, { "epoch": 0.6155143338954469, "grad_norm": 0.6976009607315063, "learning_rate": 0.0001996750016945793, "loss": 0.905, "step": 2190 }, { "epoch": 0.616919617762788, "grad_norm": 1.1496474742889404, "learning_rate": 0.00019966705051167015, "loss": 0.9063, "step": 2195 }, { "epoch": 0.6183249016301293, "grad_norm": 1.9882307052612305, "learning_rate": 0.0001996590033987685, "loss": 0.9169, "step": 2200 }, { "epoch": 0.6197301854974705, "grad_norm": 0.7984516620635986, "learning_rate": 0.00019965086036361979, "loss": 0.9283, "step": 2205 }, { "epoch": 0.6211354693648117, "grad_norm": 0.7119618654251099, "learning_rate": 0.0001996426214140617, "loss": 0.922, "step": 2210 }, { "epoch": 0.6225407532321529, "grad_norm": 1.7469879388809204, "learning_rate": 0.00019963428655802426, "loss": 0.9503, "step": 2215 }, { "epoch": 0.6239460370994941, "grad_norm": 1.0765151977539062, "learning_rate": 0.0001996258558035298, "loss": 0.9176, "step": 2220 }, { "epoch": 0.6253513209668353, "grad_norm": 1.2708961963653564, "learning_rate": 0.000199617329158693, "loss": 0.9053, "step": 2225 }, { "epoch": 0.6267566048341765, "grad_norm": 1.076857566833496, "learning_rate": 0.00019960870663172074, "loss": 0.9068, "step": 2230 }, { "epoch": 0.6281618887015177, "grad_norm": 2.503718614578247, "learning_rate": 0.00019959998823091226, "loss": 0.9279, "step": 2235 }, { "epoch": 0.6295671725688589, "grad_norm": 0.9719533920288086, "learning_rate": 0.00019959117396465905, "loss": 0.9305, "step": 2240 }, { "epoch": 0.6309724564362001, "grad_norm": 1.205297827720642, "learning_rate": 0.00019958226384144488, "loss": 0.9147, "step": 2245 }, { "epoch": 0.6323777403035413, "grad_norm": 1.4823075532913208, "learning_rate": 0.00019957325786984585, "loss": 0.9127, "step": 2250 }, { "epoch": 0.6337830241708825, "grad_norm": 0.7677815556526184, "learning_rate": 0.0001995641560585302, "loss": 0.9114, "step": 2255 }, { "epoch": 0.6351883080382237, "grad_norm": 0.8798638582229614, "learning_rate": 0.00019955495841625842, "loss": 0.8995, "step": 2260 }, { "epoch": 0.636593591905565, "grad_norm": 1.329459309577942, "learning_rate": 0.00019954566495188332, "loss": 0.9068, "step": 2265 }, { "epoch": 0.6379988757729061, "grad_norm": 0.6494102478027344, "learning_rate": 0.00019953627567434996, "loss": 0.8993, "step": 2270 }, { "epoch": 0.6394041596402473, "grad_norm": 1.5372142791748047, "learning_rate": 0.00019952679059269545, "loss": 0.9268, "step": 2275 }, { "epoch": 0.6408094435075885, "grad_norm": 0.6430231928825378, "learning_rate": 0.00019951720971604932, "loss": 0.9612, "step": 2280 }, { "epoch": 0.6422147273749297, "grad_norm": 0.6014605760574341, "learning_rate": 0.0001995075330536332, "loss": 0.8996, "step": 2285 }, { "epoch": 0.643620011242271, "grad_norm": 0.5937612652778625, "learning_rate": 0.00019949776061476088, "loss": 0.9167, "step": 2290 }, { "epoch": 0.6450252951096122, "grad_norm": 3.626796245574951, "learning_rate": 0.00019948789240883835, "loss": 0.9205, "step": 2295 }, { "epoch": 0.6464305789769533, "grad_norm": 0.9523254036903381, "learning_rate": 0.00019947792844536387, "loss": 0.9335, "step": 2300 }, { "epoch": 0.6478358628442945, "grad_norm": 1.2352759838104248, "learning_rate": 0.0001994678687339278, "loss": 0.9045, "step": 2305 }, { "epoch": 0.6492411467116358, "grad_norm": 0.8574492931365967, "learning_rate": 0.00019945771328421262, "loss": 0.9146, "step": 2310 }, { "epoch": 0.650646430578977, "grad_norm": 0.9836457967758179, "learning_rate": 0.00019944746210599301, "loss": 0.9032, "step": 2315 }, { "epoch": 0.6520517144463182, "grad_norm": 0.8709744811058044, "learning_rate": 0.00019943711520913575, "loss": 0.9209, "step": 2320 }, { "epoch": 0.6534569983136593, "grad_norm": 1.0688056945800781, "learning_rate": 0.00019942667260359985, "loss": 0.9149, "step": 2325 }, { "epoch": 0.6548622821810005, "grad_norm": 1.94585120677948, "learning_rate": 0.0001994161342994363, "loss": 0.9644, "step": 2330 }, { "epoch": 0.6562675660483418, "grad_norm": 1.7208789587020874, "learning_rate": 0.00019940550030678826, "loss": 0.9037, "step": 2335 }, { "epoch": 0.657672849915683, "grad_norm": 1.9082733392715454, "learning_rate": 0.00019939477063589105, "loss": 0.9122, "step": 2340 }, { "epoch": 0.6590781337830242, "grad_norm": 0.9194398522377014, "learning_rate": 0.00019938394529707198, "loss": 0.9127, "step": 2345 }, { "epoch": 0.6604834176503653, "grad_norm": 4.11295223236084, "learning_rate": 0.00019937302430075052, "loss": 0.9152, "step": 2350 }, { "epoch": 0.6618887015177066, "grad_norm": 1.0777262449264526, "learning_rate": 0.00019936200765743815, "loss": 0.8949, "step": 2355 }, { "epoch": 0.6632939853850478, "grad_norm": 0.7004035711288452, "learning_rate": 0.00019935089537773847, "loss": 0.9256, "step": 2360 }, { "epoch": 0.664699269252389, "grad_norm": 0.8367961049079895, "learning_rate": 0.00019933968747234707, "loss": 0.9059, "step": 2365 }, { "epoch": 0.6661045531197302, "grad_norm": 0.8426024317741394, "learning_rate": 0.00019932838395205166, "loss": 0.8936, "step": 2370 }, { "epoch": 0.6675098369870713, "grad_norm": 1.0446916818618774, "learning_rate": 0.00019931698482773187, "loss": 0.9106, "step": 2375 }, { "epoch": 0.6689151208544126, "grad_norm": 0.6767590045928955, "learning_rate": 0.00019930549011035943, "loss": 0.9095, "step": 2380 }, { "epoch": 0.6703204047217538, "grad_norm": 0.834368884563446, "learning_rate": 0.00019929389981099806, "loss": 0.8967, "step": 2385 }, { "epoch": 0.671725688589095, "grad_norm": 0.6920280456542969, "learning_rate": 0.0001992822139408035, "loss": 0.9059, "step": 2390 }, { "epoch": 0.6731309724564362, "grad_norm": 0.7495502233505249, "learning_rate": 0.00019927043251102342, "loss": 1.0464, "step": 2395 }, { "epoch": 0.6745362563237775, "grad_norm": 0.8813453316688538, "learning_rate": 0.00019925855553299752, "loss": 0.9374, "step": 2400 }, { "epoch": 0.6759415401911186, "grad_norm": 0.6671605110168457, "learning_rate": 0.00019924658301815744, "loss": 0.9172, "step": 2405 }, { "epoch": 0.6773468240584598, "grad_norm": 1.1641546487808228, "learning_rate": 0.00019923451497802676, "loss": 0.9068, "step": 2410 }, { "epoch": 0.678752107925801, "grad_norm": 1.446001648902893, "learning_rate": 0.0001992223514242211, "loss": 0.9051, "step": 2415 }, { "epoch": 0.6801573917931422, "grad_norm": 0.8141966462135315, "learning_rate": 0.0001992100923684478, "loss": 0.9603, "step": 2420 }, { "epoch": 0.6815626756604835, "grad_norm": 0.8345810174942017, "learning_rate": 0.00019919773782250638, "loss": 0.9314, "step": 2425 }, { "epoch": 0.6829679595278246, "grad_norm": 3.239243268966675, "learning_rate": 0.0001991852877982881, "loss": 0.9192, "step": 2430 }, { "epoch": 0.6843732433951658, "grad_norm": 1.1311649084091187, "learning_rate": 0.00019917274230777618, "loss": 0.8953, "step": 2435 }, { "epoch": 0.685778527262507, "grad_norm": 1.2739644050598145, "learning_rate": 0.00019916010136304565, "loss": 0.9324, "step": 2440 }, { "epoch": 0.6871838111298483, "grad_norm": 1.0117378234863281, "learning_rate": 0.0001991473649762636, "loss": 0.8924, "step": 2445 }, { "epoch": 0.6885890949971895, "grad_norm": 0.7392624616622925, "learning_rate": 0.00019913453315968874, "loss": 0.9007, "step": 2450 }, { "epoch": 0.6899943788645306, "grad_norm": 0.6434834599494934, "learning_rate": 0.00019912160592567183, "loss": 0.9242, "step": 2455 }, { "epoch": 0.6913996627318718, "grad_norm": 0.6763484477996826, "learning_rate": 0.0001991085832866553, "loss": 0.8886, "step": 2460 }, { "epoch": 0.692804946599213, "grad_norm": 0.5814651250839233, "learning_rate": 0.00019909546525517365, "loss": 0.9238, "step": 2465 }, { "epoch": 0.6942102304665543, "grad_norm": 0.5961918234825134, "learning_rate": 0.00019908225184385293, "loss": 0.9185, "step": 2470 }, { "epoch": 0.6956155143338955, "grad_norm": 0.7034104466438293, "learning_rate": 0.00019906894306541108, "loss": 0.9004, "step": 2475 }, { "epoch": 0.6970207982012366, "grad_norm": 1.2677810192108154, "learning_rate": 0.00019905553893265798, "loss": 0.9185, "step": 2480 }, { "epoch": 0.6984260820685778, "grad_norm": 0.9194796085357666, "learning_rate": 0.0001990420394584951, "loss": 0.9258, "step": 2485 }, { "epoch": 0.6998313659359191, "grad_norm": 0.6065542697906494, "learning_rate": 0.00019902844465591573, "loss": 0.9223, "step": 2490 }, { "epoch": 0.7012366498032603, "grad_norm": 0.5745678544044495, "learning_rate": 0.00019901475453800496, "loss": 0.918, "step": 2495 }, { "epoch": 0.7026419336706015, "grad_norm": 0.8046496510505676, "learning_rate": 0.00019900096911793958, "loss": 0.9037, "step": 2500 }, { "epoch": 0.7040472175379426, "grad_norm": 0.7904361486434937, "learning_rate": 0.0001989870884089881, "loss": 0.9048, "step": 2505 }, { "epoch": 0.7054525014052838, "grad_norm": 1.3024152517318726, "learning_rate": 0.00019897311242451086, "loss": 0.9052, "step": 2510 }, { "epoch": 0.7068577852726251, "grad_norm": 1.1783087253570557, "learning_rate": 0.00019895904117795966, "loss": 0.9047, "step": 2515 }, { "epoch": 0.7082630691399663, "grad_norm": 1.1952921152114868, "learning_rate": 0.00019894487468287826, "loss": 0.9032, "step": 2520 }, { "epoch": 0.7096683530073075, "grad_norm": 1.389844536781311, "learning_rate": 0.00019893061295290192, "loss": 0.9283, "step": 2525 }, { "epoch": 0.7110736368746486, "grad_norm": 1.3843488693237305, "learning_rate": 0.00019891625600175763, "loss": 0.8915, "step": 2530 }, { "epoch": 0.7124789207419899, "grad_norm": 1.0372074842453003, "learning_rate": 0.00019890180384326403, "loss": 0.9074, "step": 2535 }, { "epoch": 0.7138842046093311, "grad_norm": 0.9712080359458923, "learning_rate": 0.00019888725649133137, "loss": 0.9281, "step": 2540 }, { "epoch": 0.7152894884766723, "grad_norm": 0.5139784812927246, "learning_rate": 0.00019887261395996157, "loss": 0.9132, "step": 2545 }, { "epoch": 0.7166947723440135, "grad_norm": 1.2253260612487793, "learning_rate": 0.00019885787626324812, "loss": 0.899, "step": 2550 }, { "epoch": 0.7181000562113546, "grad_norm": 0.992481529712677, "learning_rate": 0.00019884304341537615, "loss": 0.9, "step": 2555 }, { "epoch": 0.7195053400786959, "grad_norm": 0.6645762324333191, "learning_rate": 0.00019882811543062227, "loss": 0.9244, "step": 2560 }, { "epoch": 0.7209106239460371, "grad_norm": 0.7887224555015564, "learning_rate": 0.0001988130923233548, "loss": 0.9072, "step": 2565 }, { "epoch": 0.7223159078133783, "grad_norm": 0.7903003692626953, "learning_rate": 0.0001987979741080335, "loss": 0.9112, "step": 2570 }, { "epoch": 0.7237211916807195, "grad_norm": 0.5832539796829224, "learning_rate": 0.00019878276079920979, "loss": 0.915, "step": 2575 }, { "epoch": 0.7251264755480608, "grad_norm": 0.7505447864532471, "learning_rate": 0.00019876745241152648, "loss": 0.9067, "step": 2580 }, { "epoch": 0.7265317594154019, "grad_norm": 1.1223492622375488, "learning_rate": 0.00019875204895971802, "loss": 0.9111, "step": 2585 }, { "epoch": 0.7279370432827431, "grad_norm": 1.6072221994400024, "learning_rate": 0.00019873655045861023, "loss": 0.8981, "step": 2590 }, { "epoch": 0.7293423271500843, "grad_norm": 0.6952162384986877, "learning_rate": 0.00019872095692312057, "loss": 0.9382, "step": 2595 }, { "epoch": 0.7307476110174255, "grad_norm": 0.7969158291816711, "learning_rate": 0.00019870526836825785, "loss": 0.9029, "step": 2600 }, { "epoch": 0.7321528948847668, "grad_norm": 0.9538679718971252, "learning_rate": 0.00019868948480912234, "loss": 0.9033, "step": 2605 }, { "epoch": 0.7335581787521079, "grad_norm": 1.2282218933105469, "learning_rate": 0.00019867360626090586, "loss": 0.8992, "step": 2610 }, { "epoch": 0.7349634626194491, "grad_norm": 0.6724523305892944, "learning_rate": 0.00019865763273889156, "loss": 0.9907, "step": 2615 }, { "epoch": 0.7363687464867903, "grad_norm": 1.1378065347671509, "learning_rate": 0.000198641564258454, "loss": 0.9116, "step": 2620 }, { "epoch": 0.7377740303541316, "grad_norm": 1.1677615642547607, "learning_rate": 0.00019862540083505917, "loss": 0.9044, "step": 2625 }, { "epoch": 0.7391793142214728, "grad_norm": 0.8698350787162781, "learning_rate": 0.00019860914248426447, "loss": 0.9051, "step": 2630 }, { "epoch": 0.7405845980888139, "grad_norm": 2.929654598236084, "learning_rate": 0.00019859278922171864, "loss": 0.9174, "step": 2635 }, { "epoch": 0.7419898819561551, "grad_norm": 0.8416985869407654, "learning_rate": 0.00019857634106316174, "loss": 0.9164, "step": 2640 }, { "epoch": 0.7433951658234963, "grad_norm": 0.8570914268493652, "learning_rate": 0.00019855979802442522, "loss": 0.9186, "step": 2645 }, { "epoch": 0.7448004496908376, "grad_norm": 1.2417738437652588, "learning_rate": 0.00019854316012143182, "loss": 0.898, "step": 2650 }, { "epoch": 0.7462057335581788, "grad_norm": 1.1438087224960327, "learning_rate": 0.00019852642737019558, "loss": 0.9072, "step": 2655 }, { "epoch": 0.7476110174255199, "grad_norm": 0.5742961764335632, "learning_rate": 0.00019850959978682186, "loss": 0.8934, "step": 2660 }, { "epoch": 0.7490163012928611, "grad_norm": 0.7306331396102905, "learning_rate": 0.00019849267738750732, "loss": 0.9112, "step": 2665 }, { "epoch": 0.7504215851602024, "grad_norm": 0.6199610233306885, "learning_rate": 0.0001984756601885398, "loss": 0.8977, "step": 2670 }, { "epoch": 0.7518268690275436, "grad_norm": 1.0061787366867065, "learning_rate": 0.00019845854820629846, "loss": 0.91, "step": 2675 }, { "epoch": 0.7532321528948848, "grad_norm": 0.7906721830368042, "learning_rate": 0.00019844134145725363, "loss": 0.9033, "step": 2680 }, { "epoch": 0.7546374367622259, "grad_norm": 0.9115892052650452, "learning_rate": 0.00019842403995796697, "loss": 0.9001, "step": 2685 }, { "epoch": 0.7560427206295671, "grad_norm": 0.7275146245956421, "learning_rate": 0.00019840664372509115, "loss": 0.927, "step": 2690 }, { "epoch": 0.7574480044969084, "grad_norm": 1.2060136795043945, "learning_rate": 0.00019838915277537017, "loss": 0.9192, "step": 2695 }, { "epoch": 0.7588532883642496, "grad_norm": 1.1314098834991455, "learning_rate": 0.00019837156712563912, "loss": 0.8778, "step": 2700 }, { "epoch": 0.7602585722315908, "grad_norm": 0.6868102550506592, "learning_rate": 0.00019835388679282433, "loss": 0.9251, "step": 2705 }, { "epoch": 0.761663856098932, "grad_norm": 1.133839726448059, "learning_rate": 0.00019833611179394313, "loss": 0.9157, "step": 2710 }, { "epoch": 0.7630691399662732, "grad_norm": 1.597622036933899, "learning_rate": 0.0001983182421461041, "loss": 0.9015, "step": 2715 }, { "epoch": 0.7644744238336144, "grad_norm": 1.8652737140655518, "learning_rate": 0.0001983002778665068, "loss": 0.904, "step": 2720 }, { "epoch": 0.7658797077009556, "grad_norm": 0.7736433148384094, "learning_rate": 0.000198282218972442, "loss": 0.9139, "step": 2725 }, { "epoch": 0.7672849915682968, "grad_norm": 0.6170510053634644, "learning_rate": 0.0001982640654812914, "loss": 0.9061, "step": 2730 }, { "epoch": 0.768690275435638, "grad_norm": 0.88065505027771, "learning_rate": 0.00019824581741052785, "loss": 0.897, "step": 2735 }, { "epoch": 0.7700955593029792, "grad_norm": 0.6491146683692932, "learning_rate": 0.0001982274747777152, "loss": 0.9058, "step": 2740 }, { "epoch": 0.7715008431703204, "grad_norm": 0.5739811062812805, "learning_rate": 0.00019820903760050832, "loss": 0.8854, "step": 2745 }, { "epoch": 0.7729061270376616, "grad_norm": 0.8427909016609192, "learning_rate": 0.00019819050589665307, "loss": 0.8983, "step": 2750 }, { "epoch": 0.7743114109050028, "grad_norm": 0.6211098432540894, "learning_rate": 0.0001981718796839863, "loss": 0.8902, "step": 2755 }, { "epoch": 0.7757166947723441, "grad_norm": 1.1936466693878174, "learning_rate": 0.00019815315898043582, "loss": 0.9144, "step": 2760 }, { "epoch": 0.7771219786396852, "grad_norm": 0.8049221038818359, "learning_rate": 0.00019813434380402045, "loss": 0.8962, "step": 2765 }, { "epoch": 0.7785272625070264, "grad_norm": 0.6825897097587585, "learning_rate": 0.00019811543417284978, "loss": 0.8929, "step": 2770 }, { "epoch": 0.7799325463743676, "grad_norm": 0.7605122923851013, "learning_rate": 0.0001980964301051245, "loss": 0.8846, "step": 2775 }, { "epoch": 0.7813378302417088, "grad_norm": 0.6697847843170166, "learning_rate": 0.00019807733161913608, "loss": 0.9627, "step": 2780 }, { "epoch": 0.7827431141090501, "grad_norm": 0.7112193703651428, "learning_rate": 0.0001980581387332669, "loss": 0.9225, "step": 2785 }, { "epoch": 0.7841483979763912, "grad_norm": 0.9057043790817261, "learning_rate": 0.0001980388514659902, "loss": 0.9215, "step": 2790 }, { "epoch": 0.7855536818437324, "grad_norm": 0.7113947868347168, "learning_rate": 0.00019801946983587007, "loss": 0.9021, "step": 2795 }, { "epoch": 0.7869589657110736, "grad_norm": 0.5814126133918762, "learning_rate": 0.00019799999386156146, "loss": 0.9, "step": 2800 }, { "epoch": 0.7883642495784149, "grad_norm": 0.7124185562133789, "learning_rate": 0.00019798042356181, "loss": 0.8951, "step": 2805 }, { "epoch": 0.7897695334457561, "grad_norm": 0.7653499841690063, "learning_rate": 0.00019796075895545223, "loss": 0.9109, "step": 2810 }, { "epoch": 0.7911748173130972, "grad_norm": 0.5366164445877075, "learning_rate": 0.00019794100006141543, "loss": 0.8895, "step": 2815 }, { "epoch": 0.7925801011804384, "grad_norm": 0.949338436126709, "learning_rate": 0.0001979211468987176, "loss": 0.9306, "step": 2820 }, { "epoch": 0.7939853850477796, "grad_norm": 0.5122391581535339, "learning_rate": 0.00019790119948646755, "loss": 0.8965, "step": 2825 }, { "epoch": 0.7953906689151209, "grad_norm": 0.8764130473136902, "learning_rate": 0.00019788115784386473, "loss": 0.9049, "step": 2830 }, { "epoch": 0.7967959527824621, "grad_norm": 0.5787503719329834, "learning_rate": 0.00019786102199019932, "loss": 0.9092, "step": 2835 }, { "epoch": 0.7982012366498032, "grad_norm": 0.7566272616386414, "learning_rate": 0.00019784079194485213, "loss": 0.8874, "step": 2840 }, { "epoch": 0.7996065205171444, "grad_norm": 0.8594691157341003, "learning_rate": 0.00019782046772729475, "loss": 0.9042, "step": 2845 }, { "epoch": 0.8010118043844857, "grad_norm": 1.1679846048355103, "learning_rate": 0.00019780004935708925, "loss": 0.9016, "step": 2850 }, { "epoch": 0.8024170882518269, "grad_norm": 1.2150373458862305, "learning_rate": 0.00019777953685388844, "loss": 0.9022, "step": 2855 }, { "epoch": 0.8038223721191681, "grad_norm": 0.7408333420753479, "learning_rate": 0.00019775893023743572, "loss": 0.9224, "step": 2860 }, { "epoch": 0.8052276559865092, "grad_norm": 1.2019459009170532, "learning_rate": 0.00019773822952756501, "loss": 0.8983, "step": 2865 }, { "epoch": 0.8066329398538504, "grad_norm": 0.6662552356719971, "learning_rate": 0.00019771743474420088, "loss": 0.906, "step": 2870 }, { "epoch": 0.8080382237211917, "grad_norm": 0.6724129319190979, "learning_rate": 0.00019769654590735838, "loss": 0.8981, "step": 2875 }, { "epoch": 0.8094435075885329, "grad_norm": 2.850919246673584, "learning_rate": 0.0001976755630371431, "loss": 0.9585, "step": 2880 }, { "epoch": 0.8108487914558741, "grad_norm": 1.244378924369812, "learning_rate": 0.00019765448615375117, "loss": 0.9517, "step": 2885 }, { "epoch": 0.8122540753232153, "grad_norm": 0.7747802138328552, "learning_rate": 0.0001976333152774692, "loss": 0.8957, "step": 2890 }, { "epoch": 0.8136593591905565, "grad_norm": 0.544975221157074, "learning_rate": 0.00019761205042867423, "loss": 0.9665, "step": 2895 }, { "epoch": 0.8150646430578977, "grad_norm": 0.5579664707183838, "learning_rate": 0.00019759069162783376, "loss": 0.9038, "step": 2900 }, { "epoch": 0.8164699269252389, "grad_norm": 10.984373092651367, "learning_rate": 0.00019756923889550579, "loss": 0.9749, "step": 2905 }, { "epoch": 0.8178752107925801, "grad_norm": 1.2287412881851196, "learning_rate": 0.00019754769225233863, "loss": 0.8992, "step": 2910 }, { "epoch": 0.8192804946599213, "grad_norm": 0.8453310132026672, "learning_rate": 0.00019752605171907098, "loss": 0.9039, "step": 2915 }, { "epoch": 0.8206857785272625, "grad_norm": 0.6380810141563416, "learning_rate": 0.00019750431731653206, "loss": 0.9129, "step": 2920 }, { "epoch": 0.8220910623946037, "grad_norm": 0.7804015278816223, "learning_rate": 0.00019748248906564125, "loss": 0.8979, "step": 2925 }, { "epoch": 0.8234963462619449, "grad_norm": 6.066610813140869, "learning_rate": 0.00019746056698740835, "loss": 0.8857, "step": 2930 }, { "epoch": 0.8249016301292861, "grad_norm": 0.9343296885490417, "learning_rate": 0.00019743855110293353, "loss": 0.8939, "step": 2935 }, { "epoch": 0.8263069139966274, "grad_norm": 0.5537464022636414, "learning_rate": 0.00019741644143340706, "loss": 0.9067, "step": 2940 }, { "epoch": 0.8277121978639685, "grad_norm": 0.8258460164070129, "learning_rate": 0.0001973942380001097, "loss": 0.8952, "step": 2945 }, { "epoch": 0.8291174817313097, "grad_norm": 0.9482399225234985, "learning_rate": 0.0001973719408244123, "loss": 0.9046, "step": 2950 }, { "epoch": 0.8305227655986509, "grad_norm": 1.2801766395568848, "learning_rate": 0.00019734954992777604, "loss": 0.8946, "step": 2955 }, { "epoch": 0.8319280494659921, "grad_norm": 1.2622408866882324, "learning_rate": 0.00019732706533175223, "loss": 0.9179, "step": 2960 }, { "epoch": 0.8333333333333334, "grad_norm": 1.2289254665374756, "learning_rate": 0.00019730448705798239, "loss": 0.8904, "step": 2965 }, { "epoch": 0.8347386172006745, "grad_norm": 0.6249719262123108, "learning_rate": 0.00019728181512819823, "loss": 0.9245, "step": 2970 }, { "epoch": 0.8361439010680157, "grad_norm": 0.7410894632339478, "learning_rate": 0.00019725904956422157, "loss": 0.8945, "step": 2975 }, { "epoch": 0.8375491849353569, "grad_norm": 0.609602689743042, "learning_rate": 0.0001972361903879644, "loss": 0.9139, "step": 2980 }, { "epoch": 0.8389544688026982, "grad_norm": 0.763276219367981, "learning_rate": 0.00019721323762142873, "loss": 0.8985, "step": 2985 }, { "epoch": 0.8403597526700394, "grad_norm": 0.5820448994636536, "learning_rate": 0.00019719019128670677, "loss": 0.9034, "step": 2990 }, { "epoch": 0.8417650365373806, "grad_norm": 0.8953343033790588, "learning_rate": 0.00019716705140598067, "loss": 0.9607, "step": 2995 }, { "epoch": 0.8431703204047217, "grad_norm": 0.5899667143821716, "learning_rate": 0.00019714381800152268, "loss": 0.8925, "step": 3000 }, { "epoch": 0.8445756042720629, "grad_norm": 0.7428545951843262, "learning_rate": 0.00019712049109569507, "loss": 0.8898, "step": 3005 }, { "epoch": 0.8459808881394042, "grad_norm": 0.5695815682411194, "learning_rate": 0.00019709707071095006, "loss": 0.8838, "step": 3010 }, { "epoch": 0.8473861720067454, "grad_norm": 0.6270226836204529, "learning_rate": 0.00019707355686982995, "loss": 0.9089, "step": 3015 }, { "epoch": 0.8487914558740866, "grad_norm": 0.5706961750984192, "learning_rate": 0.00019704994959496687, "loss": 0.8895, "step": 3020 }, { "epoch": 0.8501967397414277, "grad_norm": 0.6293683052062988, "learning_rate": 0.00019702624890908293, "loss": 0.899, "step": 3025 }, { "epoch": 0.851602023608769, "grad_norm": 0.8228982090950012, "learning_rate": 0.00019700245483499017, "loss": 0.8767, "step": 3030 }, { "epoch": 0.8530073074761102, "grad_norm": 0.7604111433029175, "learning_rate": 0.00019697856739559044, "loss": 0.8926, "step": 3035 }, { "epoch": 0.8544125913434514, "grad_norm": 1.0019569396972656, "learning_rate": 0.00019695458661387558, "loss": 0.9085, "step": 3040 }, { "epoch": 0.8558178752107926, "grad_norm": 0.6671732664108276, "learning_rate": 0.00019693051251292717, "loss": 0.8854, "step": 3045 }, { "epoch": 0.8572231590781337, "grad_norm": 0.9283923506736755, "learning_rate": 0.00019690634511591664, "loss": 0.9482, "step": 3050 }, { "epoch": 0.858628442945475, "grad_norm": 1.4978197813034058, "learning_rate": 0.00019688208444610522, "loss": 0.9213, "step": 3055 }, { "epoch": 0.8600337268128162, "grad_norm": 1.9540295600891113, "learning_rate": 0.00019685773052684392, "loss": 0.9069, "step": 3060 }, { "epoch": 0.8614390106801574, "grad_norm": 0.8743392825126648, "learning_rate": 0.00019683328338157354, "loss": 0.9185, "step": 3065 }, { "epoch": 0.8628442945474986, "grad_norm": 0.5670438408851624, "learning_rate": 0.0001968087430338245, "loss": 0.9206, "step": 3070 }, { "epoch": 0.8642495784148398, "grad_norm": 1.187651515007019, "learning_rate": 0.00019678410950721702, "loss": 0.9046, "step": 3075 }, { "epoch": 0.865654862282181, "grad_norm": 0.6912389397621155, "learning_rate": 0.000196759382825461, "loss": 0.9016, "step": 3080 }, { "epoch": 0.8670601461495222, "grad_norm": 0.7588440775871277, "learning_rate": 0.00019673456301235595, "loss": 0.8968, "step": 3085 }, { "epoch": 0.8684654300168634, "grad_norm": 0.5993475317955017, "learning_rate": 0.0001967096500917911, "loss": 0.9102, "step": 3090 }, { "epoch": 0.8698707138842046, "grad_norm": 0.815632164478302, "learning_rate": 0.00019668464408774522, "loss": 0.9001, "step": 3095 }, { "epoch": 0.8712759977515458, "grad_norm": 0.8173598647117615, "learning_rate": 0.0001966595450242867, "loss": 0.8948, "step": 3100 }, { "epoch": 0.872681281618887, "grad_norm": 0.6007003784179688, "learning_rate": 0.00019663435292557356, "loss": 0.8947, "step": 3105 }, { "epoch": 0.8740865654862282, "grad_norm": 0.5965673923492432, "learning_rate": 0.0001966090678158532, "loss": 0.9009, "step": 3110 }, { "epoch": 0.8754918493535694, "grad_norm": 0.7668277621269226, "learning_rate": 0.00019658368971946276, "loss": 0.9043, "step": 3115 }, { "epoch": 0.8768971332209107, "grad_norm": 0.6460701823234558, "learning_rate": 0.0001965582186608287, "loss": 0.9009, "step": 3120 }, { "epoch": 0.8783024170882519, "grad_norm": 0.6337530612945557, "learning_rate": 0.00019653265466446708, "loss": 0.944, "step": 3125 }, { "epoch": 0.879707700955593, "grad_norm": 0.8764949440956116, "learning_rate": 0.00019650699775498334, "loss": 0.9032, "step": 3130 }, { "epoch": 0.8811129848229342, "grad_norm": 0.5437586307525635, "learning_rate": 0.0001964812479570724, "loss": 0.8881, "step": 3135 }, { "epoch": 0.8825182686902754, "grad_norm": 1.294757604598999, "learning_rate": 0.0001964554052955185, "loss": 0.9081, "step": 3140 }, { "epoch": 0.8839235525576167, "grad_norm": 0.7840426564216614, "learning_rate": 0.0001964294697951954, "loss": 0.9523, "step": 3145 }, { "epoch": 0.8853288364249579, "grad_norm": 1.4854894876480103, "learning_rate": 0.00019640344148106606, "loss": 0.9636, "step": 3150 }, { "epoch": 0.886734120292299, "grad_norm": 0.5375068187713623, "learning_rate": 0.0001963773203781829, "loss": 0.9046, "step": 3155 }, { "epoch": 0.8881394041596402, "grad_norm": 0.5959550738334656, "learning_rate": 0.0001963511065116876, "loss": 0.9146, "step": 3160 }, { "epoch": 0.8895446880269815, "grad_norm": 0.5686089396476746, "learning_rate": 0.0001963247999068111, "loss": 0.8997, "step": 3165 }, { "epoch": 0.8909499718943227, "grad_norm": 0.491180956363678, "learning_rate": 0.00019629840058887362, "loss": 0.8926, "step": 3170 }, { "epoch": 0.8923552557616639, "grad_norm": 0.9157688021659851, "learning_rate": 0.0001962719085832847, "loss": 0.891, "step": 3175 }, { "epoch": 0.893760539629005, "grad_norm": 0.5665452480316162, "learning_rate": 0.00019624532391554294, "loss": 0.8966, "step": 3180 }, { "epoch": 0.8951658234963462, "grad_norm": 0.5333157777786255, "learning_rate": 0.00019621864661123622, "loss": 0.909, "step": 3185 }, { "epoch": 0.8965711073636875, "grad_norm": 0.5403627157211304, "learning_rate": 0.00019619187669604155, "loss": 0.8982, "step": 3190 }, { "epoch": 0.8979763912310287, "grad_norm": 0.5004389882087708, "learning_rate": 0.00019616501419572515, "loss": 0.8957, "step": 3195 }, { "epoch": 0.8993816750983699, "grad_norm": 0.6234089732170105, "learning_rate": 0.00019613805913614227, "loss": 0.8909, "step": 3200 }, { "epoch": 0.900786958965711, "grad_norm": 0.5197539925575256, "learning_rate": 0.00019611101154323727, "loss": 0.8762, "step": 3205 }, { "epoch": 0.9021922428330523, "grad_norm": 0.8515856266021729, "learning_rate": 0.00019608387144304362, "loss": 0.9055, "step": 3210 }, { "epoch": 0.9035975267003935, "grad_norm": 0.5120190382003784, "learning_rate": 0.0001960566388616837, "loss": 0.8963, "step": 3215 }, { "epoch": 0.9050028105677347, "grad_norm": 0.6141207218170166, "learning_rate": 0.0001960293138253691, "loss": 0.8766, "step": 3220 }, { "epoch": 0.9064080944350759, "grad_norm": 0.6554030179977417, "learning_rate": 0.00019600189636040025, "loss": 0.8858, "step": 3225 }, { "epoch": 0.907813378302417, "grad_norm": 1.2978062629699707, "learning_rate": 0.00019597438649316656, "loss": 0.8902, "step": 3230 }, { "epoch": 0.9092186621697583, "grad_norm": 0.5541256666183472, "learning_rate": 0.00019594678425014644, "loss": 0.9083, "step": 3235 }, { "epoch": 0.9106239460370995, "grad_norm": 0.6985655426979065, "learning_rate": 0.0001959190896579072, "loss": 0.8784, "step": 3240 }, { "epoch": 0.9120292299044407, "grad_norm": 0.6532580852508545, "learning_rate": 0.00019589130274310493, "loss": 0.9066, "step": 3245 }, { "epoch": 0.9134345137717819, "grad_norm": 0.9649792909622192, "learning_rate": 0.0001958634235324847, "loss": 0.8924, "step": 3250 }, { "epoch": 0.9148397976391232, "grad_norm": 0.8815706968307495, "learning_rate": 0.0001958354520528804, "loss": 0.9032, "step": 3255 }, { "epoch": 0.9162450815064643, "grad_norm": 1.1214852333068848, "learning_rate": 0.00019580738833121467, "loss": 0.8915, "step": 3260 }, { "epoch": 0.9176503653738055, "grad_norm": 0.9138022661209106, "learning_rate": 0.00019577923239449905, "loss": 0.9029, "step": 3265 }, { "epoch": 0.9190556492411467, "grad_norm": 0.6111552715301514, "learning_rate": 0.00019575098426983365, "loss": 0.9131, "step": 3270 }, { "epoch": 0.9204609331084879, "grad_norm": 0.8963708877563477, "learning_rate": 0.0001957226439844075, "loss": 0.8875, "step": 3275 }, { "epoch": 0.9218662169758292, "grad_norm": 0.6350470781326294, "learning_rate": 0.0001956942115654982, "loss": 0.8741, "step": 3280 }, { "epoch": 0.9232715008431703, "grad_norm": 1.011514663696289, "learning_rate": 0.0001956656870404721, "loss": 0.8876, "step": 3285 }, { "epoch": 0.9246767847105115, "grad_norm": 0.5660095810890198, "learning_rate": 0.0001956370704367842, "loss": 0.9137, "step": 3290 }, { "epoch": 0.9260820685778527, "grad_norm": 0.5166494250297546, "learning_rate": 0.00019560836178197813, "loss": 0.8968, "step": 3295 }, { "epoch": 0.927487352445194, "grad_norm": 0.7625125050544739, "learning_rate": 0.00019557956110368606, "loss": 0.8933, "step": 3300 }, { "epoch": 0.9288926363125352, "grad_norm": 0.6975813508033752, "learning_rate": 0.0001955506684296288, "loss": 0.8998, "step": 3305 }, { "epoch": 0.9302979201798763, "grad_norm": 0.719968318939209, "learning_rate": 0.00019552168378761565, "loss": 0.913, "step": 3310 }, { "epoch": 0.9317032040472175, "grad_norm": 0.5239852070808411, "learning_rate": 0.00019549260720554452, "loss": 0.9068, "step": 3315 }, { "epoch": 0.9331084879145587, "grad_norm": 0.5270377993583679, "learning_rate": 0.0001954634387114017, "loss": 0.8918, "step": 3320 }, { "epoch": 0.9345137717819, "grad_norm": 0.49702611565589905, "learning_rate": 0.000195434178333262, "loss": 0.8893, "step": 3325 }, { "epoch": 0.9359190556492412, "grad_norm": 0.590557873249054, "learning_rate": 0.0001954048260992887, "loss": 0.8901, "step": 3330 }, { "epoch": 0.9373243395165823, "grad_norm": 0.5189043283462524, "learning_rate": 0.00019537538203773344, "loss": 0.8826, "step": 3335 }, { "epoch": 0.9387296233839235, "grad_norm": 0.6192963719367981, "learning_rate": 0.0001953458461769363, "loss": 0.8801, "step": 3340 }, { "epoch": 0.9401349072512648, "grad_norm": 0.5685038566589355, "learning_rate": 0.00019531621854532562, "loss": 0.8944, "step": 3345 }, { "epoch": 0.941540191118606, "grad_norm": 0.49915850162506104, "learning_rate": 0.00019528649917141815, "loss": 0.8948, "step": 3350 }, { "epoch": 0.9429454749859472, "grad_norm": 0.5442516207695007, "learning_rate": 0.00019525668808381897, "loss": 0.9089, "step": 3355 }, { "epoch": 0.9443507588532883, "grad_norm": 0.5647527575492859, "learning_rate": 0.0001952267853112213, "loss": 0.9057, "step": 3360 }, { "epoch": 0.9457560427206295, "grad_norm": 0.7556710243225098, "learning_rate": 0.00019519679088240679, "loss": 0.8948, "step": 3365 }, { "epoch": 0.9471613265879708, "grad_norm": 0.578629195690155, "learning_rate": 0.00019516670482624515, "loss": 0.9092, "step": 3370 }, { "epoch": 0.948566610455312, "grad_norm": 0.6450733542442322, "learning_rate": 0.00019513652717169437, "loss": 0.8919, "step": 3375 }, { "epoch": 0.9499718943226532, "grad_norm": 1.024837613105774, "learning_rate": 0.0001951062579478006, "loss": 0.8965, "step": 3380 }, { "epoch": 0.9513771781899943, "grad_norm": 0.6713098287582397, "learning_rate": 0.0001950758971836981, "loss": 0.9124, "step": 3385 }, { "epoch": 0.9527824620573356, "grad_norm": 0.5165430903434753, "learning_rate": 0.00019504544490860917, "loss": 0.9042, "step": 3390 }, { "epoch": 0.9541877459246768, "grad_norm": 0.7243596911430359, "learning_rate": 0.0001950149011518444, "loss": 0.8922, "step": 3395 }, { "epoch": 0.955593029792018, "grad_norm": 0.5755138993263245, "learning_rate": 0.00019498426594280214, "loss": 0.8886, "step": 3400 }, { "epoch": 0.9569983136593592, "grad_norm": 0.5063710808753967, "learning_rate": 0.00019495353931096908, "loss": 0.8976, "step": 3405 }, { "epoch": 0.9584035975267003, "grad_norm": 0.9493336081504822, "learning_rate": 0.0001949227212859196, "loss": 0.897, "step": 3410 }, { "epoch": 0.9598088813940416, "grad_norm": 0.6509354710578918, "learning_rate": 0.0001948918118973163, "loss": 0.8797, "step": 3415 }, { "epoch": 0.9612141652613828, "grad_norm": 1.024837613105774, "learning_rate": 0.0001948608111749095, "loss": 0.8834, "step": 3420 }, { "epoch": 0.962619449128724, "grad_norm": 0.8351315855979919, "learning_rate": 0.00019482971914853766, "loss": 0.9357, "step": 3425 }, { "epoch": 0.9640247329960652, "grad_norm": 0.6943332552909851, "learning_rate": 0.00019479853584812693, "loss": 0.897, "step": 3430 }, { "epoch": 0.9654300168634065, "grad_norm": 0.8710034489631653, "learning_rate": 0.00019476726130369137, "loss": 0.8862, "step": 3435 }, { "epoch": 0.9668353007307476, "grad_norm": 1.5271203517913818, "learning_rate": 0.0001947358955453329, "loss": 0.9136, "step": 3440 }, { "epoch": 0.9682405845980888, "grad_norm": 0.9000211358070374, "learning_rate": 0.00019470443860324118, "loss": 0.9069, "step": 3445 }, { "epoch": 0.96964586846543, "grad_norm": 0.8771162033081055, "learning_rate": 0.0001946728905076937, "loss": 0.9507, "step": 3450 }, { "epoch": 0.9710511523327712, "grad_norm": 0.5751308798789978, "learning_rate": 0.0001946412512890556, "loss": 0.9047, "step": 3455 }, { "epoch": 0.9724564362001125, "grad_norm": 1.101432204246521, "learning_rate": 0.0001946095209777798, "loss": 0.9037, "step": 3460 }, { "epoch": 0.9738617200674536, "grad_norm": 0.8276761174201965, "learning_rate": 0.00019457769960440685, "loss": 0.8907, "step": 3465 }, { "epoch": 0.9752670039347948, "grad_norm": 1.6798173189163208, "learning_rate": 0.00019454578719956502, "loss": 0.9013, "step": 3470 }, { "epoch": 0.976672287802136, "grad_norm": 0.5530985593795776, "learning_rate": 0.0001945137837939701, "loss": 0.8757, "step": 3475 }, { "epoch": 0.9780775716694773, "grad_norm": 0.9847890138626099, "learning_rate": 0.00019448168941842552, "loss": 0.8859, "step": 3480 }, { "epoch": 0.9794828555368185, "grad_norm": 0.7510066628456116, "learning_rate": 0.00019444950410382226, "loss": 0.8919, "step": 3485 }, { "epoch": 0.9808881394041596, "grad_norm": 0.5439227819442749, "learning_rate": 0.00019441722788113882, "loss": 0.9016, "step": 3490 }, { "epoch": 0.9822934232715008, "grad_norm": 0.5271068811416626, "learning_rate": 0.00019438486078144124, "loss": 0.8879, "step": 3495 }, { "epoch": 0.983698707138842, "grad_norm": 0.6192128658294678, "learning_rate": 0.00019435240283588302, "loss": 0.889, "step": 3500 }, { "epoch": 0.9851039910061833, "grad_norm": 0.838772177696228, "learning_rate": 0.00019431985407570502, "loss": 0.895, "step": 3505 }, { "epoch": 0.9865092748735245, "grad_norm": 0.5664814710617065, "learning_rate": 0.0001942872145322356, "loss": 0.8984, "step": 3510 }, { "epoch": 0.9879145587408656, "grad_norm": 0.617550253868103, "learning_rate": 0.0001942544842368905, "loss": 0.9046, "step": 3515 }, { "epoch": 0.9893198426082068, "grad_norm": 0.690051794052124, "learning_rate": 0.00019422166322117276, "loss": 0.8811, "step": 3520 }, { "epoch": 0.9907251264755481, "grad_norm": 0.896776020526886, "learning_rate": 0.00019418875151667276, "loss": 0.8935, "step": 3525 }, { "epoch": 0.9921304103428893, "grad_norm": 0.8291999697685242, "learning_rate": 0.0001941557491550681, "loss": 0.9204, "step": 3530 }, { "epoch": 0.9935356942102305, "grad_norm": 0.6615550518035889, "learning_rate": 0.0001941226561681238, "loss": 0.8859, "step": 3535 }, { "epoch": 0.9949409780775716, "grad_norm": 0.469942569732666, "learning_rate": 0.00019408947258769198, "loss": 0.8806, "step": 3540 }, { "epoch": 0.9963462619449128, "grad_norm": 0.6039283275604248, "learning_rate": 0.00019405619844571197, "loss": 0.8735, "step": 3545 }, { "epoch": 0.9977515458122541, "grad_norm": 0.6278392672538757, "learning_rate": 0.0001940228337742103, "loss": 0.8934, "step": 3550 }, { "epoch": 0.9991568296795953, "grad_norm": 0.4628046452999115, "learning_rate": 0.0001939893786053006, "loss": 0.8916, "step": 3555 }, { "epoch": 1.0, "eval_loss": 0.8928501009941101, "eval_runtime": 641.2073, "eval_samples_per_second": 7.013, "eval_steps_per_second": 0.585, "step": 3558 }, { "epoch": 1.0005621135469365, "grad_norm": 0.7678861021995544, "learning_rate": 0.00019395583297118367, "loss": 0.885, "step": 3560 }, { "epoch": 1.0019673974142778, "grad_norm": 0.6337043642997742, "learning_rate": 0.00019392219690414727, "loss": 0.8735, "step": 3565 }, { "epoch": 1.0033726812816188, "grad_norm": 0.5203379988670349, "learning_rate": 0.00019388847043656633, "loss": 0.8724, "step": 3570 }, { "epoch": 1.0047779651489601, "grad_norm": 0.5606422424316406, "learning_rate": 0.00019385465360090268, "loss": 0.8682, "step": 3575 }, { "epoch": 1.0061832490163012, "grad_norm": 0.5460006594657898, "learning_rate": 0.00019382074642970522, "loss": 0.8634, "step": 3580 }, { "epoch": 1.0075885328836425, "grad_norm": 0.616584062576294, "learning_rate": 0.00019378674895560973, "loss": 0.8573, "step": 3585 }, { "epoch": 1.0089938167509838, "grad_norm": 0.5995298624038696, "learning_rate": 0.00019375266121133896, "loss": 0.8787, "step": 3590 }, { "epoch": 1.0103991006183248, "grad_norm": 0.5080907940864563, "learning_rate": 0.00019371848322970249, "loss": 0.8603, "step": 3595 }, { "epoch": 1.0118043844856661, "grad_norm": 0.5254037976264954, "learning_rate": 0.00019368421504359676, "loss": 0.8648, "step": 3600 }, { "epoch": 1.0132096683530074, "grad_norm": 0.7366768717765808, "learning_rate": 0.00019364985668600515, "loss": 0.8591, "step": 3605 }, { "epoch": 1.0146149522203485, "grad_norm": 0.9682057499885559, "learning_rate": 0.00019361540818999765, "loss": 0.8659, "step": 3610 }, { "epoch": 1.0160202360876898, "grad_norm": 0.572347104549408, "learning_rate": 0.00019358086958873113, "loss": 0.8721, "step": 3615 }, { "epoch": 1.0174255199550308, "grad_norm": 0.5305222868919373, "learning_rate": 0.00019354624091544916, "loss": 0.884, "step": 3620 }, { "epoch": 1.0188308038223721, "grad_norm": 0.4776608347892761, "learning_rate": 0.00019351152220348198, "loss": 0.8733, "step": 3625 }, { "epoch": 1.0202360876897134, "grad_norm": 0.6774052977561951, "learning_rate": 0.0001934767134862465, "loss": 0.9416, "step": 3630 }, { "epoch": 1.0216413715570545, "grad_norm": 0.6616668701171875, "learning_rate": 0.00019344181479724628, "loss": 0.8712, "step": 3635 }, { "epoch": 1.0230466554243958, "grad_norm": 0.6705074310302734, "learning_rate": 0.00019340682617007148, "loss": 0.8677, "step": 3640 }, { "epoch": 1.0244519392917368, "grad_norm": 0.5559976100921631, "learning_rate": 0.0001933717476383988, "loss": 0.8658, "step": 3645 }, { "epoch": 1.0258572231590781, "grad_norm": 0.5060365796089172, "learning_rate": 0.00019333657923599148, "loss": 0.8655, "step": 3650 }, { "epoch": 1.0272625070264194, "grad_norm": 0.5650534629821777, "learning_rate": 0.0001933013209966993, "loss": 0.8694, "step": 3655 }, { "epoch": 1.0286677908937605, "grad_norm": 0.6062630414962769, "learning_rate": 0.00019326597295445848, "loss": 0.8927, "step": 3660 }, { "epoch": 1.0300730747611018, "grad_norm": 0.7026441693305969, "learning_rate": 0.00019323053514329162, "loss": 0.8869, "step": 3665 }, { "epoch": 1.0314783586284428, "grad_norm": 0.6207774877548218, "learning_rate": 0.0001931950075973078, "loss": 0.883, "step": 3670 }, { "epoch": 1.0328836424957841, "grad_norm": 0.5569882988929749, "learning_rate": 0.00019315939035070246, "loss": 0.8604, "step": 3675 }, { "epoch": 1.0342889263631254, "grad_norm": 0.8226413726806641, "learning_rate": 0.00019312368343775733, "loss": 0.8624, "step": 3680 }, { "epoch": 1.0356942102304665, "grad_norm": 0.7316590547561646, "learning_rate": 0.00019308788689284052, "loss": 0.8663, "step": 3685 }, { "epoch": 1.0370994940978078, "grad_norm": 0.7959001064300537, "learning_rate": 0.00019305200075040634, "loss": 0.8617, "step": 3690 }, { "epoch": 1.038504777965149, "grad_norm": 0.7762426733970642, "learning_rate": 0.0001930160250449954, "loss": 0.8708, "step": 3695 }, { "epoch": 1.0399100618324901, "grad_norm": 1.2157001495361328, "learning_rate": 0.00019297995981123442, "loss": 0.8887, "step": 3700 }, { "epoch": 1.0413153456998314, "grad_norm": 0.8485284447669983, "learning_rate": 0.00019294380508383643, "loss": 0.8734, "step": 3705 }, { "epoch": 1.0427206295671725, "grad_norm": 0.6756306886672974, "learning_rate": 0.00019290756089760045, "loss": 0.8539, "step": 3710 }, { "epoch": 1.0441259134345138, "grad_norm": 0.6776533126831055, "learning_rate": 0.00019287122728741171, "loss": 0.8867, "step": 3715 }, { "epoch": 1.045531197301855, "grad_norm": 1.2012450695037842, "learning_rate": 0.00019283480428824147, "loss": 0.8837, "step": 3720 }, { "epoch": 1.0469364811691961, "grad_norm": 0.7898838520050049, "learning_rate": 0.00019279829193514706, "loss": 0.871, "step": 3725 }, { "epoch": 1.0483417650365374, "grad_norm": 0.6537933349609375, "learning_rate": 0.0001927616902632717, "loss": 0.8774, "step": 3730 }, { "epoch": 1.0497470489038785, "grad_norm": 0.8855597376823425, "learning_rate": 0.00019272499930784477, "loss": 0.8722, "step": 3735 }, { "epoch": 1.0511523327712198, "grad_norm": 1.1047497987747192, "learning_rate": 0.00019268821910418146, "loss": 0.8883, "step": 3740 }, { "epoch": 1.052557616638561, "grad_norm": 0.6723616719245911, "learning_rate": 0.00019265134968768285, "loss": 0.8685, "step": 3745 }, { "epoch": 1.0539629005059021, "grad_norm": 0.5157325863838196, "learning_rate": 0.00019261439109383591, "loss": 0.8733, "step": 3750 }, { "epoch": 1.0553681843732434, "grad_norm": 0.5530291199684143, "learning_rate": 0.0001925773433582135, "loss": 0.868, "step": 3755 }, { "epoch": 1.0567734682405847, "grad_norm": 0.47156935930252075, "learning_rate": 0.00019254020651647427, "loss": 0.868, "step": 3760 }, { "epoch": 1.0581787521079258, "grad_norm": 0.9338856935501099, "learning_rate": 0.00019250298060436246, "loss": 0.8783, "step": 3765 }, { "epoch": 1.059584035975267, "grad_norm": 0.5520219206809998, "learning_rate": 0.00019246566565770835, "loss": 0.8671, "step": 3770 }, { "epoch": 1.0609893198426081, "grad_norm": 0.5825735926628113, "learning_rate": 0.0001924282617124276, "loss": 0.8721, "step": 3775 }, { "epoch": 1.0623946037099494, "grad_norm": 0.9456936120986938, "learning_rate": 0.0001923907688045218, "loss": 0.8805, "step": 3780 }, { "epoch": 1.0637998875772907, "grad_norm": 0.8601976037025452, "learning_rate": 0.00019235318697007796, "loss": 0.8667, "step": 3785 }, { "epoch": 1.0652051714446318, "grad_norm": 0.6083686351776123, "learning_rate": 0.00019231551624526881, "loss": 0.8591, "step": 3790 }, { "epoch": 1.066610455311973, "grad_norm": 0.5371261239051819, "learning_rate": 0.00019227775666635257, "loss": 0.8917, "step": 3795 }, { "epoch": 1.0680157391793141, "grad_norm": 0.481442391872406, "learning_rate": 0.00019223990826967304, "loss": 0.8697, "step": 3800 }, { "epoch": 1.0694210230466554, "grad_norm": 0.5719467401504517, "learning_rate": 0.00019220197109165942, "loss": 0.8707, "step": 3805 }, { "epoch": 1.0708263069139967, "grad_norm": 0.7797774076461792, "learning_rate": 0.0001921639451688265, "loss": 0.8836, "step": 3810 }, { "epoch": 1.0722315907813378, "grad_norm": 0.6479620337486267, "learning_rate": 0.00019212583053777432, "loss": 0.88, "step": 3815 }, { "epoch": 1.073636874648679, "grad_norm": 0.5058332085609436, "learning_rate": 0.00019208762723518845, "loss": 0.8698, "step": 3820 }, { "epoch": 1.0750421585160201, "grad_norm": 0.7176181077957153, "learning_rate": 0.00019204933529783972, "loss": 0.8777, "step": 3825 }, { "epoch": 1.0764474423833614, "grad_norm": 0.8829546570777893, "learning_rate": 0.0001920109547625843, "loss": 0.8804, "step": 3830 }, { "epoch": 1.0778527262507027, "grad_norm": 0.6037710905075073, "learning_rate": 0.00019197248566636362, "loss": 0.8713, "step": 3835 }, { "epoch": 1.0792580101180438, "grad_norm": 0.8079067468643188, "learning_rate": 0.00019193392804620434, "loss": 0.8765, "step": 3840 }, { "epoch": 1.080663293985385, "grad_norm": 0.5948657989501953, "learning_rate": 0.0001918952819392184, "loss": 0.8682, "step": 3845 }, { "epoch": 1.0820685778527261, "grad_norm": 0.5095741748809814, "learning_rate": 0.0001918565473826028, "loss": 0.8579, "step": 3850 }, { "epoch": 1.0834738617200674, "grad_norm": 0.6867838501930237, "learning_rate": 0.00019181772441363978, "loss": 0.8651, "step": 3855 }, { "epoch": 1.0848791455874087, "grad_norm": 0.7166353464126587, "learning_rate": 0.0001917788130696966, "loss": 0.8687, "step": 3860 }, { "epoch": 1.0862844294547498, "grad_norm": 0.8581190705299377, "learning_rate": 0.0001917398133882256, "loss": 0.8882, "step": 3865 }, { "epoch": 1.087689713322091, "grad_norm": 0.6934407353401184, "learning_rate": 0.00019170072540676417, "loss": 0.8818, "step": 3870 }, { "epoch": 1.0890949971894324, "grad_norm": 0.5701424479484558, "learning_rate": 0.00019166154916293464, "loss": 0.8772, "step": 3875 }, { "epoch": 1.0905002810567734, "grad_norm": 0.6658897995948792, "learning_rate": 0.00019162228469444433, "loss": 0.8657, "step": 3880 }, { "epoch": 1.0919055649241147, "grad_norm": 0.5013540983200073, "learning_rate": 0.00019158293203908551, "loss": 0.8829, "step": 3885 }, { "epoch": 1.0933108487914558, "grad_norm": 0.5879671573638916, "learning_rate": 0.00019154349123473528, "loss": 0.8775, "step": 3890 }, { "epoch": 1.094716132658797, "grad_norm": 0.580640971660614, "learning_rate": 0.0001915039623193556, "loss": 0.8547, "step": 3895 }, { "epoch": 1.0961214165261384, "grad_norm": 0.5695182085037231, "learning_rate": 0.00019146434533099318, "loss": 0.8644, "step": 3900 }, { "epoch": 1.0975267003934794, "grad_norm": 0.5622773766517639, "learning_rate": 0.00019142464030777958, "loss": 0.868, "step": 3905 }, { "epoch": 1.0989319842608207, "grad_norm": 0.5157364010810852, "learning_rate": 0.00019138484728793107, "loss": 0.8713, "step": 3910 }, { "epoch": 1.1003372681281618, "grad_norm": 0.6461440920829773, "learning_rate": 0.00019134496630974864, "loss": 0.8821, "step": 3915 }, { "epoch": 1.101742551995503, "grad_norm": 0.5542477965354919, "learning_rate": 0.0001913049974116179, "loss": 0.8773, "step": 3920 }, { "epoch": 1.1031478358628444, "grad_norm": 0.5190820097923279, "learning_rate": 0.00019126494063200907, "loss": 0.8856, "step": 3925 }, { "epoch": 1.1045531197301854, "grad_norm": 0.4922696053981781, "learning_rate": 0.00019122479600947699, "loss": 0.8911, "step": 3930 }, { "epoch": 1.1059584035975267, "grad_norm": 0.6155653595924377, "learning_rate": 0.00019118456358266107, "loss": 0.8843, "step": 3935 }, { "epoch": 1.107363687464868, "grad_norm": 0.8045044541358948, "learning_rate": 0.00019114424339028516, "loss": 0.8715, "step": 3940 }, { "epoch": 1.108768971332209, "grad_norm": 0.5465171933174133, "learning_rate": 0.0001911038354711577, "loss": 0.8755, "step": 3945 }, { "epoch": 1.1101742551995504, "grad_norm": 0.5287275314331055, "learning_rate": 0.00019106333986417142, "loss": 0.8802, "step": 3950 }, { "epoch": 1.1115795390668914, "grad_norm": 0.7569435834884644, "learning_rate": 0.0001910227566083036, "loss": 0.8553, "step": 3955 }, { "epoch": 1.1129848229342327, "grad_norm": 0.9262810945510864, "learning_rate": 0.00019098208574261575, "loss": 0.8663, "step": 3960 }, { "epoch": 1.114390106801574, "grad_norm": 1.0431913137435913, "learning_rate": 0.00019094132730625377, "loss": 0.9002, "step": 3965 }, { "epoch": 1.115795390668915, "grad_norm": 0.577294111251831, "learning_rate": 0.0001909004813384479, "loss": 0.8888, "step": 3970 }, { "epoch": 1.1172006745362564, "grad_norm": 0.6003448963165283, "learning_rate": 0.0001908595478785125, "loss": 0.8726, "step": 3975 }, { "epoch": 1.1186059584035974, "grad_norm": 0.5789997577667236, "learning_rate": 0.00019081852696584627, "loss": 0.8669, "step": 3980 }, { "epoch": 1.1200112422709387, "grad_norm": 0.5823659896850586, "learning_rate": 0.00019077741863993199, "loss": 0.8812, "step": 3985 }, { "epoch": 1.12141652613828, "grad_norm": 0.8449714183807373, "learning_rate": 0.00019073622294033663, "loss": 0.8714, "step": 3990 }, { "epoch": 1.122821810005621, "grad_norm": 1.3145679235458374, "learning_rate": 0.00019069493990671118, "loss": 0.9059, "step": 3995 }, { "epoch": 1.1242270938729624, "grad_norm": 0.8457211852073669, "learning_rate": 0.00019065356957879086, "loss": 0.8518, "step": 4000 }, { "epoch": 1.1256323777403034, "grad_norm": 1.5860358476638794, "learning_rate": 0.00019061211199639474, "loss": 0.8808, "step": 4005 }, { "epoch": 1.1270376616076447, "grad_norm": 0.7184464335441589, "learning_rate": 0.00019057056719942587, "loss": 0.8753, "step": 4010 }, { "epoch": 1.128442945474986, "grad_norm": 0.585971474647522, "learning_rate": 0.00019052893522787144, "loss": 0.8672, "step": 4015 }, { "epoch": 1.129848229342327, "grad_norm": 0.551020622253418, "learning_rate": 0.00019048721612180232, "loss": 0.8659, "step": 4020 }, { "epoch": 1.1312535132096684, "grad_norm": 0.5553367137908936, "learning_rate": 0.00019044540992137337, "loss": 0.8753, "step": 4025 }, { "epoch": 1.1326587970770094, "grad_norm": 0.5330313444137573, "learning_rate": 0.00019040351666682322, "loss": 0.8818, "step": 4030 }, { "epoch": 1.1340640809443507, "grad_norm": 1.2441614866256714, "learning_rate": 0.00019036153639847433, "loss": 0.8695, "step": 4035 }, { "epoch": 1.135469364811692, "grad_norm": 0.6410412192344666, "learning_rate": 0.00019031946915673293, "loss": 0.8793, "step": 4040 }, { "epoch": 1.136874648679033, "grad_norm": 0.4777545928955078, "learning_rate": 0.00019027731498208895, "loss": 0.8738, "step": 4045 }, { "epoch": 1.1382799325463744, "grad_norm": 0.5763461589813232, "learning_rate": 0.00019023507391511591, "loss": 0.8783, "step": 4050 }, { "epoch": 1.1396852164137155, "grad_norm": 0.5704853534698486, "learning_rate": 0.00019019274599647106, "loss": 0.8611, "step": 4055 }, { "epoch": 1.1410905002810567, "grad_norm": 0.4842630922794342, "learning_rate": 0.00019015033126689522, "loss": 0.8543, "step": 4060 }, { "epoch": 1.142495784148398, "grad_norm": 0.6521385312080383, "learning_rate": 0.00019010782976721277, "loss": 0.8695, "step": 4065 }, { "epoch": 1.143901068015739, "grad_norm": 0.7000013589859009, "learning_rate": 0.00019006524153833158, "loss": 0.8659, "step": 4070 }, { "epoch": 1.1453063518830804, "grad_norm": 0.6031846404075623, "learning_rate": 0.000190022566621243, "loss": 0.8598, "step": 4075 }, { "epoch": 1.1467116357504217, "grad_norm": 0.5788977742195129, "learning_rate": 0.0001899798050570219, "loss": 0.8682, "step": 4080 }, { "epoch": 1.1481169196177627, "grad_norm": 0.6468327641487122, "learning_rate": 0.00018993695688682643, "loss": 0.874, "step": 4085 }, { "epoch": 1.149522203485104, "grad_norm": 0.5688002109527588, "learning_rate": 0.00018989402215189812, "loss": 0.8514, "step": 4090 }, { "epoch": 1.150927487352445, "grad_norm": 0.6317186951637268, "learning_rate": 0.00018985100089356194, "loss": 0.8683, "step": 4095 }, { "epoch": 1.1523327712197864, "grad_norm": 0.8182606101036072, "learning_rate": 0.00018980789315322595, "loss": 0.8882, "step": 4100 }, { "epoch": 1.1537380550871277, "grad_norm": 0.615042507648468, "learning_rate": 0.00018976469897238158, "loss": 0.88, "step": 4105 }, { "epoch": 1.1551433389544687, "grad_norm": 0.5782546401023865, "learning_rate": 0.00018972141839260348, "loss": 0.8685, "step": 4110 }, { "epoch": 1.15654862282181, "grad_norm": 1.0364948511123657, "learning_rate": 0.00018967805145554936, "loss": 0.8658, "step": 4115 }, { "epoch": 1.1579539066891513, "grad_norm": 0.7199181914329529, "learning_rate": 0.0001896345982029601, "loss": 0.8865, "step": 4120 }, { "epoch": 1.1593591905564924, "grad_norm": 0.8875741958618164, "learning_rate": 0.0001895910586766596, "loss": 0.878, "step": 4125 }, { "epoch": 1.1607644744238337, "grad_norm": 0.6755944490432739, "learning_rate": 0.00018954743291855496, "loss": 0.8666, "step": 4130 }, { "epoch": 1.1621697582911747, "grad_norm": 0.516245424747467, "learning_rate": 0.0001895037209706361, "loss": 0.8648, "step": 4135 }, { "epoch": 1.163575042158516, "grad_norm": 0.5283127427101135, "learning_rate": 0.00018945992287497601, "loss": 0.8684, "step": 4140 }, { "epoch": 1.1649803260258573, "grad_norm": 0.8151612281799316, "learning_rate": 0.00018941603867373054, "loss": 0.8662, "step": 4145 }, { "epoch": 1.1663856098931984, "grad_norm": 0.8823288083076477, "learning_rate": 0.00018937206840913842, "loss": 0.8853, "step": 4150 }, { "epoch": 1.1677908937605397, "grad_norm": 0.6180223822593689, "learning_rate": 0.00018932801212352124, "loss": 0.874, "step": 4155 }, { "epoch": 1.1691961776278808, "grad_norm": 0.531110942363739, "learning_rate": 0.00018928386985928337, "loss": 0.8658, "step": 4160 }, { "epoch": 1.170601461495222, "grad_norm": 0.4815613925457001, "learning_rate": 0.00018923964165891197, "loss": 0.8654, "step": 4165 }, { "epoch": 1.1720067453625633, "grad_norm": 0.5188429951667786, "learning_rate": 0.00018919532756497687, "loss": 0.8691, "step": 4170 }, { "epoch": 1.1734120292299044, "grad_norm": 0.5245307087898254, "learning_rate": 0.00018915092762013055, "loss": 0.8914, "step": 4175 }, { "epoch": 1.1748173130972457, "grad_norm": 0.6985578536987305, "learning_rate": 0.00018910644186710825, "loss": 0.8538, "step": 4180 }, { "epoch": 1.1762225969645868, "grad_norm": 0.8950197100639343, "learning_rate": 0.00018906187034872763, "loss": 0.8551, "step": 4185 }, { "epoch": 1.177627880831928, "grad_norm": 0.5236636996269226, "learning_rate": 0.00018901721310788898, "loss": 0.8947, "step": 4190 }, { "epoch": 1.1790331646992693, "grad_norm": 0.5204771161079407, "learning_rate": 0.00018897247018757516, "loss": 0.8729, "step": 4195 }, { "epoch": 1.1804384485666104, "grad_norm": 0.47652414441108704, "learning_rate": 0.0001889276416308514, "loss": 0.8644, "step": 4200 }, { "epoch": 1.1818437324339517, "grad_norm": 0.5452648401260376, "learning_rate": 0.00018888272748086537, "loss": 0.87, "step": 4205 }, { "epoch": 1.1832490163012928, "grad_norm": 0.7756819725036621, "learning_rate": 0.0001888377277808472, "loss": 0.8555, "step": 4210 }, { "epoch": 1.184654300168634, "grad_norm": 0.5953937768936157, "learning_rate": 0.00018879264257410926, "loss": 0.8778, "step": 4215 }, { "epoch": 1.1860595840359753, "grad_norm": 0.6475582718849182, "learning_rate": 0.00018874747190404624, "loss": 0.8938, "step": 4220 }, { "epoch": 1.1874648679033164, "grad_norm": 0.5985028743743896, "learning_rate": 0.0001887022158141352, "loss": 0.8741, "step": 4225 }, { "epoch": 1.1888701517706577, "grad_norm": 0.5418311953544617, "learning_rate": 0.0001886568743479353, "loss": 0.8513, "step": 4230 }, { "epoch": 1.1902754356379988, "grad_norm": 0.5689842104911804, "learning_rate": 0.0001886114475490879, "loss": 0.8737, "step": 4235 }, { "epoch": 1.19168071950534, "grad_norm": 0.5415789484977722, "learning_rate": 0.00018856593546131648, "loss": 0.8721, "step": 4240 }, { "epoch": 1.1930860033726813, "grad_norm": 0.6934672594070435, "learning_rate": 0.0001885203381284267, "loss": 0.85, "step": 4245 }, { "epoch": 1.1944912872400224, "grad_norm": 0.7670535445213318, "learning_rate": 0.00018847465559430614, "loss": 0.8853, "step": 4250 }, { "epoch": 1.1958965711073637, "grad_norm": 0.5853431224822998, "learning_rate": 0.00018842888790292442, "loss": 0.8842, "step": 4255 }, { "epoch": 1.197301854974705, "grad_norm": 0.5499680638313293, "learning_rate": 0.00018838303509833323, "loss": 0.8941, "step": 4260 }, { "epoch": 1.198707138842046, "grad_norm": 0.5514938235282898, "learning_rate": 0.00018833709722466607, "loss": 0.8641, "step": 4265 }, { "epoch": 1.2001124227093873, "grad_norm": 0.560611367225647, "learning_rate": 0.0001882910743261384, "loss": 0.872, "step": 4270 }, { "epoch": 1.2015177065767284, "grad_norm": 0.601483166217804, "learning_rate": 0.00018824496644704737, "loss": 0.8838, "step": 4275 }, { "epoch": 1.2029229904440697, "grad_norm": 0.46491849422454834, "learning_rate": 0.00018819877363177213, "loss": 0.8916, "step": 4280 }, { "epoch": 1.204328274311411, "grad_norm": 0.588778018951416, "learning_rate": 0.00018815249592477338, "loss": 0.864, "step": 4285 }, { "epoch": 1.205733558178752, "grad_norm": 0.5212571620941162, "learning_rate": 0.0001881061333705937, "loss": 0.8707, "step": 4290 }, { "epoch": 1.2071388420460933, "grad_norm": 0.9074152112007141, "learning_rate": 0.00018805968601385724, "loss": 0.8632, "step": 4295 }, { "epoch": 1.2085441259134346, "grad_norm": 0.5140102505683899, "learning_rate": 0.0001880131538992698, "loss": 0.8635, "step": 4300 }, { "epoch": 1.2099494097807757, "grad_norm": 0.7265368103981018, "learning_rate": 0.0001879665370716187, "loss": 0.8657, "step": 4305 }, { "epoch": 1.211354693648117, "grad_norm": 0.5487323999404907, "learning_rate": 0.00018791983557577292, "loss": 0.8694, "step": 4310 }, { "epoch": 1.212759977515458, "grad_norm": 0.5688353180885315, "learning_rate": 0.00018787304945668283, "loss": 0.8744, "step": 4315 }, { "epoch": 1.2141652613827993, "grad_norm": 0.6270791888237, "learning_rate": 0.00018782617875938028, "loss": 0.8752, "step": 4320 }, { "epoch": 1.2155705452501406, "grad_norm": 0.5649317502975464, "learning_rate": 0.00018777922352897854, "loss": 0.8689, "step": 4325 }, { "epoch": 1.2169758291174817, "grad_norm": 0.4823704957962036, "learning_rate": 0.00018773218381067225, "loss": 0.8728, "step": 4330 }, { "epoch": 1.218381112984823, "grad_norm": 0.5369855761528015, "learning_rate": 0.00018768505964973731, "loss": 0.8642, "step": 4335 }, { "epoch": 1.219786396852164, "grad_norm": 0.684241533279419, "learning_rate": 0.000187637851091531, "loss": 0.8687, "step": 4340 }, { "epoch": 1.2211916807195053, "grad_norm": 0.664493978023529, "learning_rate": 0.0001875905581814917, "loss": 0.8733, "step": 4345 }, { "epoch": 1.2225969645868466, "grad_norm": 0.6049327254295349, "learning_rate": 0.00018754318096513917, "loss": 0.8762, "step": 4350 }, { "epoch": 1.2240022484541877, "grad_norm": 0.6041503548622131, "learning_rate": 0.00018749571948807405, "loss": 0.8768, "step": 4355 }, { "epoch": 1.225407532321529, "grad_norm": 0.528701901435852, "learning_rate": 0.00018744817379597834, "loss": 0.8695, "step": 4360 }, { "epoch": 1.22681281618887, "grad_norm": 0.8402326703071594, "learning_rate": 0.00018740054393461493, "loss": 0.8739, "step": 4365 }, { "epoch": 1.2282181000562113, "grad_norm": 0.5691919922828674, "learning_rate": 0.00018735282994982778, "loss": 0.8692, "step": 4370 }, { "epoch": 1.2296233839235526, "grad_norm": 0.4437329173088074, "learning_rate": 0.00018730503188754187, "loss": 0.8728, "step": 4375 }, { "epoch": 1.2310286677908937, "grad_norm": 0.778617262840271, "learning_rate": 0.000187257149793763, "loss": 0.8746, "step": 4380 }, { "epoch": 1.232433951658235, "grad_norm": 0.5112653970718384, "learning_rate": 0.00018720918371457792, "loss": 0.869, "step": 4385 }, { "epoch": 1.233839235525576, "grad_norm": 0.5120835900306702, "learning_rate": 0.00018716113369615425, "loss": 0.8802, "step": 4390 }, { "epoch": 1.2352445193929174, "grad_norm": 0.6164501309394836, "learning_rate": 0.00018711299978474023, "loss": 0.877, "step": 4395 }, { "epoch": 1.2366498032602586, "grad_norm": 0.7080835103988647, "learning_rate": 0.0001870647820266651, "loss": 0.8633, "step": 4400 }, { "epoch": 1.2380550871275997, "grad_norm": 0.5996346473693848, "learning_rate": 0.00018701648046833862, "loss": 0.8751, "step": 4405 }, { "epoch": 1.239460370994941, "grad_norm": 0.5956994891166687, "learning_rate": 0.00018696809515625126, "loss": 0.944, "step": 4410 }, { "epoch": 1.240865654862282, "grad_norm": 0.6986256837844849, "learning_rate": 0.0001869196261369741, "loss": 0.868, "step": 4415 }, { "epoch": 1.2422709387296234, "grad_norm": 0.7284836173057556, "learning_rate": 0.0001868710734571588, "loss": 0.8701, "step": 4420 }, { "epoch": 1.2436762225969646, "grad_norm": 0.5841744542121887, "learning_rate": 0.00018682243716353753, "loss": 0.8731, "step": 4425 }, { "epoch": 1.2450815064643057, "grad_norm": 0.5694195032119751, "learning_rate": 0.00018677371730292297, "loss": 0.8674, "step": 4430 }, { "epoch": 1.246486790331647, "grad_norm": 0.6393140554428101, "learning_rate": 0.00018672491392220816, "loss": 0.8758, "step": 4435 }, { "epoch": 1.2478920741989883, "grad_norm": 1.008575201034546, "learning_rate": 0.00018667602706836663, "loss": 0.8647, "step": 4440 }, { "epoch": 1.2492973580663294, "grad_norm": 0.8124025464057922, "learning_rate": 0.00018662705678845217, "loss": 0.887, "step": 4445 }, { "epoch": 1.2507026419336706, "grad_norm": 0.8950420618057251, "learning_rate": 0.0001865780031295989, "loss": 0.8758, "step": 4450 }, { "epoch": 1.252107925801012, "grad_norm": 0.533831000328064, "learning_rate": 0.0001865288661390212, "loss": 0.8801, "step": 4455 }, { "epoch": 1.253513209668353, "grad_norm": 0.5837879776954651, "learning_rate": 0.00018647964586401367, "loss": 0.8838, "step": 4460 }, { "epoch": 1.2549184935356943, "grad_norm": 0.5391103029251099, "learning_rate": 0.00018643034235195103, "loss": 0.8636, "step": 4465 }, { "epoch": 1.2563237774030354, "grad_norm": 0.7856371402740479, "learning_rate": 0.0001863809556502881, "loss": 0.857, "step": 4470 }, { "epoch": 1.2577290612703766, "grad_norm": 0.6522362232208252, "learning_rate": 0.00018633148580655986, "loss": 0.8684, "step": 4475 }, { "epoch": 1.259134345137718, "grad_norm": 0.5931859612464905, "learning_rate": 0.00018628193286838123, "loss": 0.9331, "step": 4480 }, { "epoch": 1.260539629005059, "grad_norm": 0.8700698614120483, "learning_rate": 0.00018623229688344715, "loss": 0.8859, "step": 4485 }, { "epoch": 1.2619449128724003, "grad_norm": 1.0133111476898193, "learning_rate": 0.0001861825778995325, "loss": 0.878, "step": 4490 }, { "epoch": 1.2633501967397414, "grad_norm": 0.7472015619277954, "learning_rate": 0.00018613277596449197, "loss": 0.866, "step": 4495 }, { "epoch": 1.2647554806070826, "grad_norm": 0.7217004895210266, "learning_rate": 0.00018608289112626025, "loss": 0.8696, "step": 4500 }, { "epoch": 1.266160764474424, "grad_norm": 0.5697894096374512, "learning_rate": 0.00018603292343285163, "loss": 0.8879, "step": 4505 }, { "epoch": 1.267566048341765, "grad_norm": 0.5211483836174011, "learning_rate": 0.00018598287293236028, "loss": 0.8811, "step": 4510 }, { "epoch": 1.2689713322091063, "grad_norm": 0.5396711230278015, "learning_rate": 0.00018593273967296004, "loss": 0.8725, "step": 4515 }, { "epoch": 1.2703766160764474, "grad_norm": 0.46194639801979065, "learning_rate": 0.00018588252370290443, "loss": 0.8657, "step": 4520 }, { "epoch": 1.2717818999437887, "grad_norm": 0.6557855606079102, "learning_rate": 0.00018583222507052649, "loss": 0.8573, "step": 4525 }, { "epoch": 1.27318718381113, "grad_norm": 0.5345513820648193, "learning_rate": 0.00018578184382423893, "loss": 0.8807, "step": 4530 }, { "epoch": 1.274592467678471, "grad_norm": 0.5782117247581482, "learning_rate": 0.0001857313800125339, "loss": 0.8643, "step": 4535 }, { "epoch": 1.2759977515458123, "grad_norm": 0.7175543904304504, "learning_rate": 0.0001856808336839831, "loss": 0.8682, "step": 4540 }, { "epoch": 1.2774030354131534, "grad_norm": 0.518791675567627, "learning_rate": 0.00018563020488723752, "loss": 0.9229, "step": 4545 }, { "epoch": 1.2788083192804947, "grad_norm": 0.7908794283866882, "learning_rate": 0.0001855794936710277, "loss": 0.869, "step": 4550 }, { "epoch": 1.280213603147836, "grad_norm": 1.0072758197784424, "learning_rate": 0.00018552870008416335, "loss": 0.8675, "step": 4555 }, { "epoch": 1.281618887015177, "grad_norm": 0.505922794342041, "learning_rate": 0.00018547782417553354, "loss": 0.8683, "step": 4560 }, { "epoch": 1.2830241708825183, "grad_norm": 0.8561394810676575, "learning_rate": 0.00018542686599410662, "loss": 0.8777, "step": 4565 }, { "epoch": 1.2844294547498594, "grad_norm": 0.6131189465522766, "learning_rate": 0.00018537582558892998, "loss": 0.8396, "step": 4570 }, { "epoch": 1.2858347386172007, "grad_norm": 0.5617784857749939, "learning_rate": 0.00018532470300913035, "loss": 0.8692, "step": 4575 }, { "epoch": 1.287240022484542, "grad_norm": 0.5197677612304688, "learning_rate": 0.00018527349830391336, "loss": 0.8841, "step": 4580 }, { "epoch": 1.288645306351883, "grad_norm": 0.572418212890625, "learning_rate": 0.00018522221152256378, "loss": 0.8794, "step": 4585 }, { "epoch": 1.2900505902192243, "grad_norm": 0.5657436847686768, "learning_rate": 0.00018517084271444544, "loss": 0.9167, "step": 4590 }, { "epoch": 1.2914558740865654, "grad_norm": 0.8176900744438171, "learning_rate": 0.00018511939192900097, "loss": 0.8716, "step": 4595 }, { "epoch": 1.2928611579539067, "grad_norm": 0.7152751088142395, "learning_rate": 0.0001850678592157521, "loss": 0.8831, "step": 4600 }, { "epoch": 1.294266441821248, "grad_norm": 0.7368189096450806, "learning_rate": 0.00018501624462429918, "loss": 0.8747, "step": 4605 }, { "epoch": 1.295671725688589, "grad_norm": 0.6734061241149902, "learning_rate": 0.00018496454820432154, "loss": 0.8758, "step": 4610 }, { "epoch": 1.2970770095559303, "grad_norm": 1.670920968055725, "learning_rate": 0.00018491277000557722, "loss": 0.9219, "step": 4615 }, { "epoch": 1.2984822934232714, "grad_norm": 0.8343753814697266, "learning_rate": 0.00018486091007790297, "loss": 0.8635, "step": 4620 }, { "epoch": 1.2998875772906127, "grad_norm": 0.5270551443099976, "learning_rate": 0.00018480896847121426, "loss": 0.8667, "step": 4625 }, { "epoch": 1.301292861157954, "grad_norm": 0.9477784633636475, "learning_rate": 0.00018475694523550505, "loss": 0.8679, "step": 4630 }, { "epoch": 1.3026981450252952, "grad_norm": 0.4685782194137573, "learning_rate": 0.00018470484042084796, "loss": 0.8784, "step": 4635 }, { "epoch": 1.3041034288926363, "grad_norm": 0.7531285881996155, "learning_rate": 0.00018465265407739413, "loss": 0.8775, "step": 4640 }, { "epoch": 1.3055087127599776, "grad_norm": 0.6527368426322937, "learning_rate": 0.00018460038625537313, "loss": 0.9371, "step": 4645 }, { "epoch": 1.3069139966273187, "grad_norm": 0.5861037373542786, "learning_rate": 0.000184548037005093, "loss": 0.8715, "step": 4650 }, { "epoch": 1.30831928049466, "grad_norm": 0.6117671728134155, "learning_rate": 0.00018449560637694013, "loss": 0.8816, "step": 4655 }, { "epoch": 1.3097245643620012, "grad_norm": 0.5061159729957581, "learning_rate": 0.00018444309442137923, "loss": 0.8773, "step": 4660 }, { "epoch": 1.3111298482293423, "grad_norm": 0.4746789038181305, "learning_rate": 0.00018439050118895334, "loss": 0.8711, "step": 4665 }, { "epoch": 1.3125351320966836, "grad_norm": 0.5956834554672241, "learning_rate": 0.00018433782673028362, "loss": 0.8697, "step": 4670 }, { "epoch": 1.3139404159640247, "grad_norm": 0.5160255432128906, "learning_rate": 0.0001842850710960695, "loss": 0.8804, "step": 4675 }, { "epoch": 1.315345699831366, "grad_norm": 0.5058934688568115, "learning_rate": 0.00018423223433708857, "loss": 0.8785, "step": 4680 }, { "epoch": 1.3167509836987072, "grad_norm": 0.735233724117279, "learning_rate": 0.00018417931650419639, "loss": 0.8582, "step": 4685 }, { "epoch": 1.3181562675660483, "grad_norm": 0.6757941246032715, "learning_rate": 0.00018412631764832662, "loss": 0.8829, "step": 4690 }, { "epoch": 1.3195615514333896, "grad_norm": 0.808224081993103, "learning_rate": 0.00018407323782049093, "loss": 0.8721, "step": 4695 }, { "epoch": 1.3209668353007307, "grad_norm": 0.69460529088974, "learning_rate": 0.0001840200770717789, "loss": 0.8762, "step": 4700 }, { "epoch": 1.322372119168072, "grad_norm": 0.576110303401947, "learning_rate": 0.00018396683545335798, "loss": 0.8686, "step": 4705 }, { "epoch": 1.3237774030354132, "grad_norm": 0.69572913646698, "learning_rate": 0.00018391351301647344, "loss": 0.8768, "step": 4710 }, { "epoch": 1.3251826869027543, "grad_norm": 0.6554480195045471, "learning_rate": 0.00018386010981244843, "loss": 0.8863, "step": 4715 }, { "epoch": 1.3265879707700956, "grad_norm": 0.6453247666358948, "learning_rate": 0.00018380662589268377, "loss": 0.8602, "step": 4720 }, { "epoch": 1.3279932546374367, "grad_norm": 0.6823776364326477, "learning_rate": 0.00018375306130865793, "loss": 0.8771, "step": 4725 }, { "epoch": 1.329398538504778, "grad_norm": 0.8525605797767639, "learning_rate": 0.00018369941611192712, "loss": 0.8788, "step": 4730 }, { "epoch": 1.3308038223721192, "grad_norm": 0.9090725183486938, "learning_rate": 0.00018364569035412502, "loss": 0.9202, "step": 4735 }, { "epoch": 1.3322091062394603, "grad_norm": 0.6051588654518127, "learning_rate": 0.000183591884086963, "loss": 0.8619, "step": 4740 }, { "epoch": 1.3336143901068016, "grad_norm": 0.4923397898674011, "learning_rate": 0.00018353799736222975, "loss": 0.8691, "step": 4745 }, { "epoch": 1.3350196739741427, "grad_norm": 0.5829062461853027, "learning_rate": 0.0001834840302317916, "loss": 0.8534, "step": 4750 }, { "epoch": 1.336424957841484, "grad_norm": 0.5169609785079956, "learning_rate": 0.00018342998274759208, "loss": 0.8565, "step": 4755 }, { "epoch": 1.3378302417088253, "grad_norm": 0.5367736220359802, "learning_rate": 0.00018337585496165215, "loss": 0.9078, "step": 4760 }, { "epoch": 1.3392355255761663, "grad_norm": 0.5715290904045105, "learning_rate": 0.00018332164692607008, "loss": 0.8706, "step": 4765 }, { "epoch": 1.3406408094435076, "grad_norm": 0.5462102293968201, "learning_rate": 0.0001832673586930213, "loss": 0.8705, "step": 4770 }, { "epoch": 1.3420460933108487, "grad_norm": 0.5522609353065491, "learning_rate": 0.00018321299031475854, "loss": 0.867, "step": 4775 }, { "epoch": 1.34345137717819, "grad_norm": 0.5512838363647461, "learning_rate": 0.00018315854184361156, "loss": 0.8865, "step": 4780 }, { "epoch": 1.3448566610455313, "grad_norm": 0.6370877623558044, "learning_rate": 0.00018310401333198733, "loss": 0.8747, "step": 4785 }, { "epoch": 1.3462619449128723, "grad_norm": 0.47698330879211426, "learning_rate": 0.00018304940483236974, "loss": 0.8586, "step": 4790 }, { "epoch": 1.3476672287802136, "grad_norm": 0.49109163880348206, "learning_rate": 0.00018299471639731977, "loss": 0.8797, "step": 4795 }, { "epoch": 1.3490725126475547, "grad_norm": 0.5076658129692078, "learning_rate": 0.00018293994807947522, "loss": 0.8777, "step": 4800 }, { "epoch": 1.350477796514896, "grad_norm": 0.5216749310493469, "learning_rate": 0.00018288509993155086, "loss": 0.8673, "step": 4805 }, { "epoch": 1.3518830803822373, "grad_norm": 0.5092537999153137, "learning_rate": 0.00018283017200633833, "loss": 0.8536, "step": 4810 }, { "epoch": 1.3532883642495785, "grad_norm": 0.6097123622894287, "learning_rate": 0.000182775164356706, "loss": 0.8653, "step": 4815 }, { "epoch": 1.3546936481169196, "grad_norm": 0.6337655186653137, "learning_rate": 0.00018272007703559894, "loss": 0.8663, "step": 4820 }, { "epoch": 1.356098931984261, "grad_norm": 0.6192864775657654, "learning_rate": 0.000182664910096039, "loss": 0.8686, "step": 4825 }, { "epoch": 1.357504215851602, "grad_norm": 0.46798455715179443, "learning_rate": 0.0001826096635911246, "loss": 0.8611, "step": 4830 }, { "epoch": 1.3589094997189433, "grad_norm": 0.5440805554389954, "learning_rate": 0.00018255433757403071, "loss": 0.8682, "step": 4835 }, { "epoch": 1.3603147835862845, "grad_norm": 0.5112454295158386, "learning_rate": 0.00018249893209800892, "loss": 0.886, "step": 4840 }, { "epoch": 1.3617200674536256, "grad_norm": 1.0984801054000854, "learning_rate": 0.00018244344721638726, "loss": 0.8734, "step": 4845 }, { "epoch": 1.363125351320967, "grad_norm": 0.5598686933517456, "learning_rate": 0.00018238788298257014, "loss": 0.8729, "step": 4850 }, { "epoch": 1.364530635188308, "grad_norm": 0.7339655160903931, "learning_rate": 0.00018233223945003844, "loss": 0.8681, "step": 4855 }, { "epoch": 1.3659359190556493, "grad_norm": 0.6859006881713867, "learning_rate": 0.0001822765166723493, "loss": 0.8687, "step": 4860 }, { "epoch": 1.3673412029229906, "grad_norm": 0.5756944417953491, "learning_rate": 0.0001822207147031361, "loss": 0.8754, "step": 4865 }, { "epoch": 1.3687464867903316, "grad_norm": 0.45417851209640503, "learning_rate": 0.00018216483359610855, "loss": 0.8683, "step": 4870 }, { "epoch": 1.370151770657673, "grad_norm": 0.45869648456573486, "learning_rate": 0.00018210887340505244, "loss": 0.883, "step": 4875 }, { "epoch": 1.371557054525014, "grad_norm": 0.5835374593734741, "learning_rate": 0.00018205283418382972, "loss": 0.8745, "step": 4880 }, { "epoch": 1.3729623383923553, "grad_norm": 0.5375783443450928, "learning_rate": 0.00018199671598637842, "loss": 0.8668, "step": 4885 }, { "epoch": 1.3743676222596966, "grad_norm": 0.5502672791481018, "learning_rate": 0.00018194051886671252, "loss": 0.8679, "step": 4890 }, { "epoch": 1.3757729061270376, "grad_norm": 0.9193586111068726, "learning_rate": 0.00018188424287892202, "loss": 0.8848, "step": 4895 }, { "epoch": 1.377178189994379, "grad_norm": 0.5418441891670227, "learning_rate": 0.00018182788807717285, "loss": 0.8614, "step": 4900 }, { "epoch": 1.37858347386172, "grad_norm": 0.6423871517181396, "learning_rate": 0.0001817714545157067, "loss": 0.9258, "step": 4905 }, { "epoch": 1.3799887577290613, "grad_norm": 0.9485771059989929, "learning_rate": 0.0001817149422488412, "loss": 0.8672, "step": 4910 }, { "epoch": 1.3813940415964026, "grad_norm": 0.49273064732551575, "learning_rate": 0.00018165835133096962, "loss": 0.861, "step": 4915 }, { "epoch": 1.3827993254637436, "grad_norm": 0.7470026016235352, "learning_rate": 0.00018160168181656099, "loss": 0.8763, "step": 4920 }, { "epoch": 1.384204609331085, "grad_norm": 0.5558050870895386, "learning_rate": 0.00018154493376015997, "loss": 0.8754, "step": 4925 }, { "epoch": 1.385609893198426, "grad_norm": 0.7394638657569885, "learning_rate": 0.00018148810721638686, "loss": 0.8764, "step": 4930 }, { "epoch": 1.3870151770657673, "grad_norm": 0.6907084584236145, "learning_rate": 0.0001814312022399374, "loss": 0.886, "step": 4935 }, { "epoch": 1.3884204609331086, "grad_norm": 0.6553033590316772, "learning_rate": 0.00018137421888558296, "loss": 0.8617, "step": 4940 }, { "epoch": 1.3898257448004496, "grad_norm": 0.5968340635299683, "learning_rate": 0.00018131715720817024, "loss": 0.8673, "step": 4945 }, { "epoch": 1.391231028667791, "grad_norm": 0.5546401739120483, "learning_rate": 0.00018126001726262135, "loss": 0.8698, "step": 4950 }, { "epoch": 1.392636312535132, "grad_norm": 0.4818323254585266, "learning_rate": 0.00018120279910393384, "loss": 0.853, "step": 4955 }, { "epoch": 1.3940415964024733, "grad_norm": 0.7312915921211243, "learning_rate": 0.0001811455027871803, "loss": 0.8815, "step": 4960 }, { "epoch": 1.3954468802698146, "grad_norm": 0.6379952430725098, "learning_rate": 0.00018108812836750885, "loss": 0.8611, "step": 4965 }, { "epoch": 1.3968521641371556, "grad_norm": 0.547435998916626, "learning_rate": 0.00018103067590014254, "loss": 0.8794, "step": 4970 }, { "epoch": 1.398257448004497, "grad_norm": 0.6002300381660461, "learning_rate": 0.00018097314544037967, "loss": 0.8679, "step": 4975 }, { "epoch": 1.399662731871838, "grad_norm": 0.4890361726284027, "learning_rate": 0.00018091553704359354, "loss": 0.927, "step": 4980 }, { "epoch": 1.4010680157391793, "grad_norm": 0.7528916597366333, "learning_rate": 0.0001808578507652325, "loss": 0.869, "step": 4985 }, { "epoch": 1.4024732996065206, "grad_norm": 0.5764560103416443, "learning_rate": 0.00018080008666081988, "loss": 0.8734, "step": 4990 }, { "epoch": 1.4038785834738619, "grad_norm": 0.5067351460456848, "learning_rate": 0.00018074224478595392, "loss": 0.8641, "step": 4995 }, { "epoch": 1.405283867341203, "grad_norm": 0.617499053478241, "learning_rate": 0.0001806843251963076, "loss": 0.8697, "step": 5000 }, { "epoch": 1.4066891512085442, "grad_norm": 0.7035298347473145, "learning_rate": 0.00018062632794762888, "loss": 0.8745, "step": 5005 }, { "epoch": 1.4080944350758853, "grad_norm": 0.6594152450561523, "learning_rate": 0.0001805682530957403, "loss": 0.8636, "step": 5010 }, { "epoch": 1.4094997189432266, "grad_norm": 0.8443005681037903, "learning_rate": 0.0001805101006965393, "loss": 0.8699, "step": 5015 }, { "epoch": 1.4109050028105679, "grad_norm": 0.643632173538208, "learning_rate": 0.0001804518708059977, "loss": 0.8853, "step": 5020 }, { "epoch": 1.412310286677909, "grad_norm": 0.5515636801719666, "learning_rate": 0.00018039356348016202, "loss": 0.874, "step": 5025 }, { "epoch": 1.4137155705452502, "grad_norm": 0.5284005999565125, "learning_rate": 0.00018033517877515345, "loss": 0.8645, "step": 5030 }, { "epoch": 1.4151208544125913, "grad_norm": 0.652372419834137, "learning_rate": 0.00018027671674716747, "loss": 0.8675, "step": 5035 }, { "epoch": 1.4165261382799326, "grad_norm": 0.8631367087364197, "learning_rate": 0.00018021817745247402, "loss": 0.8673, "step": 5040 }, { "epoch": 1.4179314221472739, "grad_norm": 1.9220738410949707, "learning_rate": 0.0001801595609474175, "loss": 0.8711, "step": 5045 }, { "epoch": 1.419336706014615, "grad_norm": 0.6992930173873901, "learning_rate": 0.00018010086728841653, "loss": 0.8634, "step": 5050 }, { "epoch": 1.4207419898819562, "grad_norm": 0.5778381824493408, "learning_rate": 0.00018004209653196403, "loss": 0.8788, "step": 5055 }, { "epoch": 1.4221472737492973, "grad_norm": 0.5597308874130249, "learning_rate": 0.00017998324873462712, "loss": 0.8531, "step": 5060 }, { "epoch": 1.4235525576166386, "grad_norm": 0.5867209434509277, "learning_rate": 0.0001799243239530471, "loss": 0.8763, "step": 5065 }, { "epoch": 1.4249578414839799, "grad_norm": 0.9052205681800842, "learning_rate": 0.0001798653222439393, "loss": 0.8581, "step": 5070 }, { "epoch": 1.426363125351321, "grad_norm": 0.7517098784446716, "learning_rate": 0.00017980624366409318, "loss": 0.8549, "step": 5075 }, { "epoch": 1.4277684092186622, "grad_norm": 0.5015307664871216, "learning_rate": 0.0001797470882703721, "loss": 0.8756, "step": 5080 }, { "epoch": 1.4291736930860033, "grad_norm": 0.5503989458084106, "learning_rate": 0.00017968785611971344, "loss": 0.9157, "step": 5085 }, { "epoch": 1.4305789769533446, "grad_norm": 0.7186044454574585, "learning_rate": 0.00017962854726912838, "loss": 0.8578, "step": 5090 }, { "epoch": 1.4319842608206859, "grad_norm": 0.6403028964996338, "learning_rate": 0.00017956916177570197, "loss": 0.8621, "step": 5095 }, { "epoch": 1.433389544688027, "grad_norm": 0.7702693343162537, "learning_rate": 0.00017950969969659302, "loss": 0.8796, "step": 5100 }, { "epoch": 1.4347948285553682, "grad_norm": 0.5178045034408569, "learning_rate": 0.00017945016108903406, "loss": 0.8437, "step": 5105 }, { "epoch": 1.4362001124227093, "grad_norm": 0.5101531147956848, "learning_rate": 0.00017939054601033124, "loss": 0.874, "step": 5110 }, { "epoch": 1.4376053962900506, "grad_norm": 0.6295761466026306, "learning_rate": 0.00017933085451786443, "loss": 0.8793, "step": 5115 }, { "epoch": 1.4390106801573919, "grad_norm": 0.6453625559806824, "learning_rate": 0.00017927108666908686, "loss": 0.9023, "step": 5120 }, { "epoch": 1.440415964024733, "grad_norm": 0.6669296622276306, "learning_rate": 0.00017921124252152546, "loss": 0.8664, "step": 5125 }, { "epoch": 1.4418212478920742, "grad_norm": 0.555692732334137, "learning_rate": 0.0001791513221327804, "loss": 0.9177, "step": 5130 }, { "epoch": 1.4432265317594153, "grad_norm": 0.9209009408950806, "learning_rate": 0.00017909132556052538, "loss": 0.8641, "step": 5135 }, { "epoch": 1.4446318156267566, "grad_norm": 0.5858955979347229, "learning_rate": 0.00017903125286250737, "loss": 0.9361, "step": 5140 }, { "epoch": 1.4460370994940979, "grad_norm": 0.7377965450286865, "learning_rate": 0.00017897110409654661, "loss": 0.8703, "step": 5145 }, { "epoch": 1.447442383361439, "grad_norm": 0.6079499125480652, "learning_rate": 0.0001789108793205366, "loss": 0.8642, "step": 5150 }, { "epoch": 1.4488476672287802, "grad_norm": 0.8482330441474915, "learning_rate": 0.0001788505785924439, "loss": 0.86, "step": 5155 }, { "epoch": 1.4502529510961213, "grad_norm": 0.8403465151786804, "learning_rate": 0.0001787902019703083, "loss": 0.8705, "step": 5160 }, { "epoch": 1.4516582349634626, "grad_norm": 0.8771752715110779, "learning_rate": 0.0001787297495122425, "loss": 0.854, "step": 5165 }, { "epoch": 1.4530635188308039, "grad_norm": 1.1639344692230225, "learning_rate": 0.00017866922127643232, "loss": 0.9359, "step": 5170 }, { "epoch": 1.4544688026981452, "grad_norm": 0.7973338961601257, "learning_rate": 0.00017860861732113648, "loss": 0.8632, "step": 5175 }, { "epoch": 1.4558740865654862, "grad_norm": 0.6689393520355225, "learning_rate": 0.0001785479377046865, "loss": 0.8706, "step": 5180 }, { "epoch": 1.4572793704328275, "grad_norm": 1.694438099861145, "learning_rate": 0.00017848718248548686, "loss": 0.8738, "step": 5185 }, { "epoch": 1.4586846543001686, "grad_norm": 0.636578381061554, "learning_rate": 0.0001784263517220147, "loss": 0.8705, "step": 5190 }, { "epoch": 1.4600899381675099, "grad_norm": 0.9136772751808167, "learning_rate": 0.0001783654454728199, "loss": 0.8528, "step": 5195 }, { "epoch": 1.4614952220348512, "grad_norm": 0.623679518699646, "learning_rate": 0.00017830446379652504, "loss": 0.8694, "step": 5200 }, { "epoch": 1.4629005059021922, "grad_norm": 0.7054007649421692, "learning_rate": 0.0001782434067518252, "loss": 0.869, "step": 5205 }, { "epoch": 1.4643057897695335, "grad_norm": 1.3509278297424316, "learning_rate": 0.00017818227439748814, "loss": 0.8816, "step": 5210 }, { "epoch": 1.4657110736368746, "grad_norm": 0.787777841091156, "learning_rate": 0.00017812106679235395, "loss": 0.8716, "step": 5215 }, { "epoch": 1.4671163575042159, "grad_norm": 0.6147462129592896, "learning_rate": 0.0001780597839953353, "loss": 0.8798, "step": 5220 }, { "epoch": 1.4685216413715572, "grad_norm": 0.6231019496917725, "learning_rate": 0.00017799842606541714, "loss": 0.8933, "step": 5225 }, { "epoch": 1.4699269252388982, "grad_norm": 0.669661819934845, "learning_rate": 0.0001779369930616568, "loss": 0.8615, "step": 5230 }, { "epoch": 1.4713322091062395, "grad_norm": 0.5519552230834961, "learning_rate": 0.00017787548504318373, "loss": 0.8696, "step": 5235 }, { "epoch": 1.4727374929735806, "grad_norm": 0.5218339562416077, "learning_rate": 0.00017781390206919975, "loss": 0.8734, "step": 5240 }, { "epoch": 1.4741427768409219, "grad_norm": 0.5840299725532532, "learning_rate": 0.0001777522441989787, "loss": 0.8815, "step": 5245 }, { "epoch": 1.4755480607082632, "grad_norm": 1.1502022743225098, "learning_rate": 0.00017769051149186664, "loss": 0.8753, "step": 5250 }, { "epoch": 1.4769533445756042, "grad_norm": 0.6379995942115784, "learning_rate": 0.00017762870400728152, "loss": 0.8736, "step": 5255 }, { "epoch": 1.4783586284429455, "grad_norm": 0.5611156821250916, "learning_rate": 0.00017756682180471338, "loss": 0.8603, "step": 5260 }, { "epoch": 1.4797639123102866, "grad_norm": 0.6152333617210388, "learning_rate": 0.00017750486494372408, "loss": 0.8793, "step": 5265 }, { "epoch": 1.4811691961776279, "grad_norm": 0.6901036500930786, "learning_rate": 0.0001774428334839474, "loss": 0.8594, "step": 5270 }, { "epoch": 1.4825744800449692, "grad_norm": 0.8560009598731995, "learning_rate": 0.0001773807274850889, "loss": 0.8688, "step": 5275 }, { "epoch": 1.4839797639123102, "grad_norm": 0.5628454685211182, "learning_rate": 0.00017731854700692588, "loss": 0.8718, "step": 5280 }, { "epoch": 1.4853850477796515, "grad_norm": 0.46983468532562256, "learning_rate": 0.00017725629210930732, "loss": 0.8741, "step": 5285 }, { "epoch": 1.4867903316469926, "grad_norm": 0.592480480670929, "learning_rate": 0.0001771939628521539, "loss": 0.8663, "step": 5290 }, { "epoch": 1.4881956155143339, "grad_norm": 0.5054726600646973, "learning_rate": 0.00017713155929545776, "loss": 0.8711, "step": 5295 }, { "epoch": 1.4896008993816752, "grad_norm": 0.5680946111679077, "learning_rate": 0.00017706908149928266, "loss": 0.864, "step": 5300 }, { "epoch": 1.4910061832490162, "grad_norm": 0.670420229434967, "learning_rate": 0.00017700652952376374, "loss": 0.8774, "step": 5305 }, { "epoch": 1.4924114671163575, "grad_norm": 0.5870426893234253, "learning_rate": 0.00017694390342910753, "loss": 0.8723, "step": 5310 }, { "epoch": 1.4938167509836986, "grad_norm": 0.7351912260055542, "learning_rate": 0.00017688120327559196, "loss": 0.8652, "step": 5315 }, { "epoch": 1.4952220348510399, "grad_norm": 0.5224320888519287, "learning_rate": 0.00017681842912356624, "loss": 0.8831, "step": 5320 }, { "epoch": 1.4966273187183812, "grad_norm": 0.5571185946464539, "learning_rate": 0.00017675558103345075, "loss": 0.8747, "step": 5325 }, { "epoch": 1.4980326025857222, "grad_norm": 1.3304951190948486, "learning_rate": 0.00017669265906573704, "loss": 0.9177, "step": 5330 }, { "epoch": 1.4994378864530635, "grad_norm": 0.48752954602241516, "learning_rate": 0.00017662966328098784, "loss": 0.8519, "step": 5335 }, { "epoch": 1.5008431703204046, "grad_norm": 0.6318584084510803, "learning_rate": 0.0001765665937398369, "loss": 0.8763, "step": 5340 }, { "epoch": 1.5022484541877459, "grad_norm": 0.5569983124732971, "learning_rate": 0.00017650345050298886, "loss": 0.8781, "step": 5345 }, { "epoch": 1.5036537380550872, "grad_norm": 0.5715246796607971, "learning_rate": 0.00017644023363121947, "loss": 0.8548, "step": 5350 }, { "epoch": 1.5050590219224285, "grad_norm": 0.5527710914611816, "learning_rate": 0.00017637694318537513, "loss": 0.8537, "step": 5355 }, { "epoch": 1.5064643057897695, "grad_norm": 0.5467240810394287, "learning_rate": 0.00017631357922637332, "loss": 0.8709, "step": 5360 }, { "epoch": 1.5078695896571106, "grad_norm": 0.49403300881385803, "learning_rate": 0.0001762501418152021, "loss": 0.8675, "step": 5365 }, { "epoch": 1.5092748735244519, "grad_norm": 0.5923786163330078, "learning_rate": 0.00017618663101292022, "loss": 0.8837, "step": 5370 }, { "epoch": 1.5106801573917932, "grad_norm": 0.5225454568862915, "learning_rate": 0.00017612304688065713, "loss": 0.8664, "step": 5375 }, { "epoch": 1.5120854412591345, "grad_norm": 0.589855432510376, "learning_rate": 0.00017605938947961292, "loss": 0.8517, "step": 5380 }, { "epoch": 1.5134907251264755, "grad_norm": 0.5037773847579956, "learning_rate": 0.00017599565887105803, "loss": 0.88, "step": 5385 }, { "epoch": 1.5148960089938166, "grad_norm": 0.4747573435306549, "learning_rate": 0.00017593185511633356, "loss": 0.9105, "step": 5390 }, { "epoch": 1.516301292861158, "grad_norm": 0.4647715985774994, "learning_rate": 0.00017586797827685082, "loss": 0.8597, "step": 5395 }, { "epoch": 1.5177065767284992, "grad_norm": 0.5436646938323975, "learning_rate": 0.0001758040284140916, "loss": 0.8565, "step": 5400 }, { "epoch": 1.5191118605958405, "grad_norm": 0.7642892003059387, "learning_rate": 0.00017574000558960798, "loss": 0.8635, "step": 5405 }, { "epoch": 1.5205171444631815, "grad_norm": 1.8821611404418945, "learning_rate": 0.0001756759098650222, "loss": 0.9264, "step": 5410 }, { "epoch": 1.5219224283305226, "grad_norm": 0.5186439752578735, "learning_rate": 0.00017561174130202662, "loss": 0.8599, "step": 5415 }, { "epoch": 1.523327712197864, "grad_norm": 0.5758541226387024, "learning_rate": 0.00017554749996238388, "loss": 0.8523, "step": 5420 }, { "epoch": 1.5247329960652052, "grad_norm": 0.6760127544403076, "learning_rate": 0.00017548318590792646, "loss": 0.9273, "step": 5425 }, { "epoch": 1.5261382799325465, "grad_norm": 0.4689038097858429, "learning_rate": 0.00017541879920055702, "loss": 0.8731, "step": 5430 }, { "epoch": 1.5275435637998875, "grad_norm": 0.5448866486549377, "learning_rate": 0.00017535433990224802, "loss": 0.8888, "step": 5435 }, { "epoch": 1.5289488476672288, "grad_norm": 0.5650317668914795, "learning_rate": 0.00017528980807504185, "loss": 0.8654, "step": 5440 }, { "epoch": 1.53035413153457, "grad_norm": 0.5792402625083923, "learning_rate": 0.00017522520378105064, "loss": 0.858, "step": 5445 }, { "epoch": 1.5317594154019112, "grad_norm": 0.5568968057632446, "learning_rate": 0.00017516052708245637, "loss": 0.8609, "step": 5450 }, { "epoch": 1.5331646992692525, "grad_norm": 0.7661116123199463, "learning_rate": 0.00017509577804151064, "loss": 0.8754, "step": 5455 }, { "epoch": 1.5345699831365935, "grad_norm": 0.5771020650863647, "learning_rate": 0.0001750309567205347, "loss": 0.8595, "step": 5460 }, { "epoch": 1.5359752670039348, "grad_norm": 0.8583388328552246, "learning_rate": 0.0001749660631819194, "loss": 0.8652, "step": 5465 }, { "epoch": 1.537380550871276, "grad_norm": 0.6303179264068604, "learning_rate": 0.00017490109748812498, "loss": 0.8611, "step": 5470 }, { "epoch": 1.5387858347386172, "grad_norm": 1.0614551305770874, "learning_rate": 0.00017483605970168128, "loss": 0.8662, "step": 5475 }, { "epoch": 1.5401911186059585, "grad_norm": 0.7186009287834167, "learning_rate": 0.0001747709498851875, "loss": 0.8712, "step": 5480 }, { "epoch": 1.5415964024732998, "grad_norm": 1.02847158908844, "learning_rate": 0.0001747057681013121, "loss": 0.8588, "step": 5485 }, { "epoch": 1.5430016863406408, "grad_norm": 0.6516429781913757, "learning_rate": 0.00017464051441279282, "loss": 0.8482, "step": 5490 }, { "epoch": 1.544406970207982, "grad_norm": 0.4523126184940338, "learning_rate": 0.00017457518888243666, "loss": 0.8673, "step": 5495 }, { "epoch": 1.5458122540753232, "grad_norm": 0.6133017539978027, "learning_rate": 0.0001745097915731197, "loss": 0.8715, "step": 5500 }, { "epoch": 1.5472175379426645, "grad_norm": 0.9278969168663025, "learning_rate": 0.00017444432254778725, "loss": 0.8513, "step": 5505 }, { "epoch": 1.5486228218100058, "grad_norm": 0.831170380115509, "learning_rate": 0.00017437878186945348, "loss": 0.8677, "step": 5510 }, { "epoch": 1.5500281056773468, "grad_norm": 0.588657557964325, "learning_rate": 0.00017431316960120157, "loss": 0.8634, "step": 5515 }, { "epoch": 1.551433389544688, "grad_norm": 0.5699366927146912, "learning_rate": 0.00017424748580618365, "loss": 0.8804, "step": 5520 }, { "epoch": 1.5528386734120292, "grad_norm": 0.7426570653915405, "learning_rate": 0.0001741817305476207, "loss": 0.9172, "step": 5525 }, { "epoch": 1.5542439572793705, "grad_norm": 0.5099422335624695, "learning_rate": 0.00017411590388880242, "loss": 0.8437, "step": 5530 }, { "epoch": 1.5556492411467118, "grad_norm": 0.5340549349784851, "learning_rate": 0.00017405000589308732, "loss": 0.8754, "step": 5535 }, { "epoch": 1.5570545250140528, "grad_norm": 0.5274845957756042, "learning_rate": 0.00017398403662390244, "loss": 0.8667, "step": 5540 }, { "epoch": 1.558459808881394, "grad_norm": 0.45988380908966064, "learning_rate": 0.0001739179961447436, "loss": 0.8749, "step": 5545 }, { "epoch": 1.5598650927487352, "grad_norm": 0.7324682474136353, "learning_rate": 0.000173851884519175, "loss": 0.8513, "step": 5550 }, { "epoch": 1.5612703766160765, "grad_norm": 0.5798319578170776, "learning_rate": 0.00017378570181082943, "loss": 0.8748, "step": 5555 }, { "epoch": 1.5626756604834178, "grad_norm": 0.47172462940216064, "learning_rate": 0.000173719448083408, "loss": 0.8588, "step": 5560 }, { "epoch": 1.5640809443507588, "grad_norm": 0.567803144454956, "learning_rate": 0.00017365312340068023, "loss": 0.8879, "step": 5565 }, { "epoch": 1.5654862282181, "grad_norm": 0.48772311210632324, "learning_rate": 0.00017358672782648397, "loss": 0.8682, "step": 5570 }, { "epoch": 1.5668915120854412, "grad_norm": 0.47581541538238525, "learning_rate": 0.00017352026142472524, "loss": 0.8811, "step": 5575 }, { "epoch": 1.5682967959527825, "grad_norm": 0.4929133653640747, "learning_rate": 0.00017345372425937825, "loss": 0.8517, "step": 5580 }, { "epoch": 1.5697020798201238, "grad_norm": 0.5461881160736084, "learning_rate": 0.0001733871163944853, "loss": 0.8556, "step": 5585 }, { "epoch": 1.5711073636874648, "grad_norm": 0.6379693746566772, "learning_rate": 0.00017332043789415684, "loss": 0.8469, "step": 5590 }, { "epoch": 1.572512647554806, "grad_norm": 0.5068411231040955, "learning_rate": 0.00017325368882257117, "loss": 0.8714, "step": 5595 }, { "epoch": 1.5739179314221472, "grad_norm": 0.5322855710983276, "learning_rate": 0.00017318686924397463, "loss": 0.8757, "step": 5600 }, { "epoch": 1.5753232152894885, "grad_norm": 0.5769434571266174, "learning_rate": 0.00017311997922268133, "loss": 0.8715, "step": 5605 }, { "epoch": 1.5767284991568298, "grad_norm": 0.6665701270103455, "learning_rate": 0.0001730530188230732, "loss": 0.8703, "step": 5610 }, { "epoch": 1.5781337830241708, "grad_norm": 0.8959527015686035, "learning_rate": 0.00017298598810959996, "loss": 0.8805, "step": 5615 }, { "epoch": 1.5795390668915121, "grad_norm": 0.7809743285179138, "learning_rate": 0.000172918887146779, "loss": 0.8761, "step": 5620 }, { "epoch": 1.5809443507588532, "grad_norm": 0.888780951499939, "learning_rate": 0.0001728517159991953, "loss": 0.8699, "step": 5625 }, { "epoch": 1.5823496346261945, "grad_norm": 0.6694765686988831, "learning_rate": 0.00017278447473150133, "loss": 0.8659, "step": 5630 }, { "epoch": 1.5837549184935358, "grad_norm": 0.5062331557273865, "learning_rate": 0.00017271716340841722, "loss": 0.8545, "step": 5635 }, { "epoch": 1.5851602023608768, "grad_norm": 0.6682859659194946, "learning_rate": 0.00017264978209473035, "loss": 0.8799, "step": 5640 }, { "epoch": 1.5865654862282181, "grad_norm": 0.4775022268295288, "learning_rate": 0.0001725823308552956, "loss": 0.869, "step": 5645 }, { "epoch": 1.5879707700955592, "grad_norm": 0.5014980435371399, "learning_rate": 0.00017251480975503502, "loss": 0.8747, "step": 5650 }, { "epoch": 1.5893760539629005, "grad_norm": 0.5155623555183411, "learning_rate": 0.00017244721885893802, "loss": 0.8792, "step": 5655 }, { "epoch": 1.5907813378302418, "grad_norm": 0.6804929375648499, "learning_rate": 0.00017237955823206117, "loss": 0.8792, "step": 5660 }, { "epoch": 1.592186621697583, "grad_norm": 0.47904977202415466, "learning_rate": 0.0001723118279395281, "loss": 0.8696, "step": 5665 }, { "epoch": 1.5935919055649241, "grad_norm": 0.6762366890907288, "learning_rate": 0.00017224402804652956, "loss": 0.851, "step": 5670 }, { "epoch": 1.5949971894322652, "grad_norm": 0.5059134364128113, "learning_rate": 0.00017217615861832327, "loss": 0.8803, "step": 5675 }, { "epoch": 1.5964024732996065, "grad_norm": 0.6604952216148376, "learning_rate": 0.00017210821972023376, "loss": 0.8655, "step": 5680 }, { "epoch": 1.5978077571669478, "grad_norm": 0.6762186288833618, "learning_rate": 0.00017204021141765266, "loss": 0.8874, "step": 5685 }, { "epoch": 1.599213041034289, "grad_norm": 0.6892690658569336, "learning_rate": 0.00017197213377603818, "loss": 0.8747, "step": 5690 }, { "epoch": 1.6006183249016301, "grad_norm": 0.507866621017456, "learning_rate": 0.0001719039868609154, "loss": 0.8735, "step": 5695 }, { "epoch": 1.6020236087689712, "grad_norm": 0.6932044625282288, "learning_rate": 0.00017183577073787607, "loss": 0.88, "step": 5700 }, { "epoch": 1.6034288926363125, "grad_norm": 0.820203423500061, "learning_rate": 0.00017176748547257845, "loss": 0.8689, "step": 5705 }, { "epoch": 1.6048341765036538, "grad_norm": 0.7642927169799805, "learning_rate": 0.00017169913113074747, "loss": 0.874, "step": 5710 }, { "epoch": 1.606239460370995, "grad_norm": 0.5460090041160583, "learning_rate": 0.00017163070777817448, "loss": 0.8566, "step": 5715 }, { "epoch": 1.6076447442383361, "grad_norm": 0.597949743270874, "learning_rate": 0.00017156221548071728, "loss": 0.9107, "step": 5720 }, { "epoch": 1.6090500281056772, "grad_norm": 0.733345627784729, "learning_rate": 0.00017149365430430002, "loss": 0.8565, "step": 5725 }, { "epoch": 1.6104553119730185, "grad_norm": 0.5471769571304321, "learning_rate": 0.00017142502431491313, "loss": 0.8591, "step": 5730 }, { "epoch": 1.6118605958403598, "grad_norm": 0.5078122615814209, "learning_rate": 0.0001713563255786133, "loss": 0.9085, "step": 5735 }, { "epoch": 1.613265879707701, "grad_norm": 0.7095496654510498, "learning_rate": 0.00017128755816152338, "loss": 0.8743, "step": 5740 }, { "epoch": 1.6146711635750421, "grad_norm": 0.7899284362792969, "learning_rate": 0.0001712187221298323, "loss": 0.881, "step": 5745 }, { "epoch": 1.6160764474423832, "grad_norm": 0.5304770469665527, "learning_rate": 0.0001711498175497951, "loss": 0.8609, "step": 5750 }, { "epoch": 1.6174817313097245, "grad_norm": 0.4922604560852051, "learning_rate": 0.00017108084448773272, "loss": 0.9403, "step": 5755 }, { "epoch": 1.6188870151770658, "grad_norm": 0.6193071007728577, "learning_rate": 0.00017101180301003205, "loss": 0.8504, "step": 5760 }, { "epoch": 1.620292299044407, "grad_norm": 0.49560025334358215, "learning_rate": 0.00017094269318314588, "loss": 0.8619, "step": 5765 }, { "epoch": 1.6216975829117481, "grad_norm": 0.5417090654373169, "learning_rate": 0.00017087351507359263, "loss": 0.9363, "step": 5770 }, { "epoch": 1.6231028667790892, "grad_norm": 0.48260176181793213, "learning_rate": 0.00017080426874795665, "loss": 0.8597, "step": 5775 }, { "epoch": 1.6245081506464305, "grad_norm": 0.5174697041511536, "learning_rate": 0.0001707349542728878, "loss": 0.8719, "step": 5780 }, { "epoch": 1.6259134345137718, "grad_norm": 0.48467588424682617, "learning_rate": 0.0001706655717151016, "loss": 0.8736, "step": 5785 }, { "epoch": 1.627318718381113, "grad_norm": 0.4969995319843292, "learning_rate": 0.00017059612114137904, "loss": 0.8659, "step": 5790 }, { "epoch": 1.6287240022484542, "grad_norm": 0.8451137542724609, "learning_rate": 0.00017052660261856662, "loss": 0.8671, "step": 5795 }, { "epoch": 1.6301292861157954, "grad_norm": 0.6291013956069946, "learning_rate": 0.0001704570162135763, "loss": 0.868, "step": 5800 }, { "epoch": 1.6315345699831365, "grad_norm": 0.6090657711029053, "learning_rate": 0.00017038736199338525, "loss": 0.8794, "step": 5805 }, { "epoch": 1.6329398538504778, "grad_norm": 0.4956682026386261, "learning_rate": 0.000170317640025036, "loss": 0.8652, "step": 5810 }, { "epoch": 1.634345137717819, "grad_norm": 0.5881335139274597, "learning_rate": 0.00017024785037563625, "loss": 0.8664, "step": 5815 }, { "epoch": 1.6357504215851602, "grad_norm": 0.6143118143081665, "learning_rate": 0.00017017799311235885, "loss": 0.8806, "step": 5820 }, { "epoch": 1.6371557054525014, "grad_norm": 0.5305077433586121, "learning_rate": 0.00017010806830244178, "loss": 0.8679, "step": 5825 }, { "epoch": 1.6385609893198425, "grad_norm": 0.47793033719062805, "learning_rate": 0.00017003807601318792, "loss": 0.8736, "step": 5830 }, { "epoch": 1.6399662731871838, "grad_norm": 0.8925052881240845, "learning_rate": 0.00016996801631196526, "loss": 0.8488, "step": 5835 }, { "epoch": 1.641371557054525, "grad_norm": 1.0212877988815308, "learning_rate": 0.0001698978892662065, "loss": 0.8608, "step": 5840 }, { "epoch": 1.6427768409218664, "grad_norm": 0.8234551548957825, "learning_rate": 0.00016982769494340932, "loss": 0.8797, "step": 5845 }, { "epoch": 1.6441821247892074, "grad_norm": 0.49760153889656067, "learning_rate": 0.00016975743341113598, "loss": 0.8597, "step": 5850 }, { "epoch": 1.6455874086565485, "grad_norm": 0.575788676738739, "learning_rate": 0.00016968710473701362, "loss": 0.9101, "step": 5855 }, { "epoch": 1.6469926925238898, "grad_norm": 0.5459645390510559, "learning_rate": 0.00016961670898873386, "loss": 0.8635, "step": 5860 }, { "epoch": 1.648397976391231, "grad_norm": 0.5154797434806824, "learning_rate": 0.00016954624623405292, "loss": 0.8677, "step": 5865 }, { "epoch": 1.6498032602585724, "grad_norm": 0.481139600276947, "learning_rate": 0.00016947571654079156, "loss": 0.8608, "step": 5870 }, { "epoch": 1.6512085441259134, "grad_norm": 0.5141423940658569, "learning_rate": 0.0001694051199768349, "loss": 0.8565, "step": 5875 }, { "epoch": 1.6526138279932545, "grad_norm": 0.5673717260360718, "learning_rate": 0.00016933445661013248, "loss": 0.8603, "step": 5880 }, { "epoch": 1.6540191118605958, "grad_norm": 0.6446055769920349, "learning_rate": 0.00016926372650869813, "loss": 0.9708, "step": 5885 }, { "epoch": 1.655424395727937, "grad_norm": 0.6382657289505005, "learning_rate": 0.00016919292974060986, "loss": 0.8699, "step": 5890 }, { "epoch": 1.6568296795952784, "grad_norm": 0.5263247489929199, "learning_rate": 0.0001691220663740099, "loss": 0.8638, "step": 5895 }, { "epoch": 1.6582349634626194, "grad_norm": 0.5247864127159119, "learning_rate": 0.00016905113647710452, "loss": 0.8787, "step": 5900 }, { "epoch": 1.6596402473299605, "grad_norm": 0.5854396224021912, "learning_rate": 0.00016898014011816414, "loss": 0.8849, "step": 5905 }, { "epoch": 1.6610455311973018, "grad_norm": 0.46041086316108704, "learning_rate": 0.00016890907736552308, "loss": 0.8677, "step": 5910 }, { "epoch": 1.662450815064643, "grad_norm": 0.509297788143158, "learning_rate": 0.00016883794828757953, "loss": 0.8579, "step": 5915 }, { "epoch": 1.6638560989319844, "grad_norm": 0.5629145503044128, "learning_rate": 0.00016876675295279553, "loss": 0.8699, "step": 5920 }, { "epoch": 1.6652613827993255, "grad_norm": 0.6250473856925964, "learning_rate": 0.000168695491429697, "loss": 0.8679, "step": 5925 }, { "epoch": 1.6666666666666665, "grad_norm": 0.5854396224021912, "learning_rate": 0.0001686241637868734, "loss": 0.8662, "step": 5930 }, { "epoch": 1.6680719505340078, "grad_norm": 0.5250497460365295, "learning_rate": 0.0001685527700929779, "loss": 0.8886, "step": 5935 }, { "epoch": 1.669477234401349, "grad_norm": 0.552189826965332, "learning_rate": 0.00016848131041672735, "loss": 0.8784, "step": 5940 }, { "epoch": 1.6708825182686904, "grad_norm": 0.612978458404541, "learning_rate": 0.00016840978482690196, "loss": 0.8735, "step": 5945 }, { "epoch": 1.6722878021360315, "grad_norm": 0.7062554955482483, "learning_rate": 0.0001683381933923454, "loss": 0.9183, "step": 5950 }, { "epoch": 1.6736930860033725, "grad_norm": 0.5159368515014648, "learning_rate": 0.00016826653618196485, "loss": 0.8606, "step": 5955 }, { "epoch": 1.6750983698707138, "grad_norm": 0.5322017669677734, "learning_rate": 0.00016819481326473063, "loss": 0.8482, "step": 5960 }, { "epoch": 1.676503653738055, "grad_norm": 0.5021723508834839, "learning_rate": 0.0001681230247096764, "loss": 0.8741, "step": 5965 }, { "epoch": 1.6779089376053964, "grad_norm": 0.5162078738212585, "learning_rate": 0.00016805117058589893, "loss": 0.8628, "step": 5970 }, { "epoch": 1.6793142214727375, "grad_norm": 0.6757156252861023, "learning_rate": 0.00016797925096255823, "loss": 0.8788, "step": 5975 }, { "epoch": 1.6807195053400787, "grad_norm": 0.5981365442276001, "learning_rate": 0.0001679072659088772, "loss": 0.8685, "step": 5980 }, { "epoch": 1.6821247892074198, "grad_norm": 0.4823263883590698, "learning_rate": 0.0001678352154941418, "loss": 0.8558, "step": 5985 }, { "epoch": 1.683530073074761, "grad_norm": 0.4609000086784363, "learning_rate": 0.00016776309978770092, "loss": 0.8645, "step": 5990 }, { "epoch": 1.6849353569421024, "grad_norm": 0.6788878440856934, "learning_rate": 0.00016769091885896617, "loss": 0.8625, "step": 5995 }, { "epoch": 1.6863406408094435, "grad_norm": 0.5988131761550903, "learning_rate": 0.0001676186727774121, "loss": 0.8695, "step": 6000 }, { "epoch": 1.6877459246767847, "grad_norm": 0.6434552669525146, "learning_rate": 0.00016754636161257587, "loss": 0.8588, "step": 6005 }, { "epoch": 1.6891512085441258, "grad_norm": 0.48518407344818115, "learning_rate": 0.0001674739854340573, "loss": 0.8488, "step": 6010 }, { "epoch": 1.690556492411467, "grad_norm": 0.515152096748352, "learning_rate": 0.0001674015443115188, "loss": 0.8626, "step": 6015 }, { "epoch": 1.6919617762788084, "grad_norm": 0.50271075963974, "learning_rate": 0.00016732903831468532, "loss": 0.8681, "step": 6020 }, { "epoch": 1.6933670601461497, "grad_norm": 0.7227884531021118, "learning_rate": 0.00016725646751334416, "loss": 0.8623, "step": 6025 }, { "epoch": 1.6947723440134908, "grad_norm": 0.5185177326202393, "learning_rate": 0.00016718383197734506, "loss": 0.9226, "step": 6030 }, { "epoch": 1.6961776278808318, "grad_norm": 0.46443167328834534, "learning_rate": 0.00016711113177660008, "loss": 0.8738, "step": 6035 }, { "epoch": 1.697582911748173, "grad_norm": 0.7292987108230591, "learning_rate": 0.00016703836698108346, "loss": 0.8619, "step": 6040 }, { "epoch": 1.6989881956155144, "grad_norm": 0.7463960647583008, "learning_rate": 0.00016696553766083167, "loss": 0.8702, "step": 6045 }, { "epoch": 1.7003934794828557, "grad_norm": 1.0327949523925781, "learning_rate": 0.0001668926438859433, "loss": 0.8756, "step": 6050 }, { "epoch": 1.7017987633501968, "grad_norm": 0.5928092002868652, "learning_rate": 0.00016681968572657886, "loss": 0.8703, "step": 6055 }, { "epoch": 1.7032040472175378, "grad_norm": 0.7332513332366943, "learning_rate": 0.00016674666325296097, "loss": 0.8628, "step": 6060 }, { "epoch": 1.704609331084879, "grad_norm": 0.6123602390289307, "learning_rate": 0.00016667357653537407, "loss": 0.8604, "step": 6065 }, { "epoch": 1.7060146149522204, "grad_norm": 0.6254216432571411, "learning_rate": 0.00016660042564416448, "loss": 0.8634, "step": 6070 }, { "epoch": 1.7074198988195617, "grad_norm": 0.6215693354606628, "learning_rate": 0.00016652721064974027, "loss": 0.8586, "step": 6075 }, { "epoch": 1.7088251826869028, "grad_norm": 0.5529701113700867, "learning_rate": 0.00016645393162257122, "loss": 0.8586, "step": 6080 }, { "epoch": 1.7102304665542438, "grad_norm": 0.6067641377449036, "learning_rate": 0.00016638058863318865, "loss": 0.8579, "step": 6085 }, { "epoch": 1.7116357504215851, "grad_norm": 0.5234971046447754, "learning_rate": 0.00016630718175218565, "loss": 0.8686, "step": 6090 }, { "epoch": 1.7130410342889264, "grad_norm": 0.5197697281837463, "learning_rate": 0.00016623371105021654, "loss": 0.8561, "step": 6095 }, { "epoch": 1.7144463181562677, "grad_norm": 0.5112480521202087, "learning_rate": 0.00016616017659799732, "loss": 0.8715, "step": 6100 }, { "epoch": 1.7158516020236088, "grad_norm": 0.5215193629264832, "learning_rate": 0.00016608657846630518, "loss": 0.8542, "step": 6105 }, { "epoch": 1.7172568858909498, "grad_norm": 0.4877747893333435, "learning_rate": 0.00016601291672597865, "loss": 0.8755, "step": 6110 }, { "epoch": 1.7186621697582911, "grad_norm": 0.6159438490867615, "learning_rate": 0.00016593919144791754, "loss": 0.8752, "step": 6115 }, { "epoch": 1.7200674536256324, "grad_norm": 0.48233506083488464, "learning_rate": 0.00016586540270308275, "loss": 0.8641, "step": 6120 }, { "epoch": 1.7214727374929737, "grad_norm": 0.5225429534912109, "learning_rate": 0.00016579155056249627, "loss": 0.8409, "step": 6125 }, { "epoch": 1.7228780213603148, "grad_norm": 0.5753469467163086, "learning_rate": 0.00016571763509724109, "loss": 0.8601, "step": 6130 }, { "epoch": 1.7242833052276558, "grad_norm": 0.685015082359314, "learning_rate": 0.00016564365637846125, "loss": 0.8675, "step": 6135 }, { "epoch": 1.7256885890949971, "grad_norm": 0.5576612949371338, "learning_rate": 0.0001655696144773616, "loss": 0.8848, "step": 6140 }, { "epoch": 1.7270938729623384, "grad_norm": 0.5092169046401978, "learning_rate": 0.0001654955094652078, "loss": 0.8648, "step": 6145 }, { "epoch": 1.7284991568296797, "grad_norm": 0.4944714903831482, "learning_rate": 0.00016542134141332623, "loss": 0.8733, "step": 6150 }, { "epoch": 1.7299044406970208, "grad_norm": 0.47435131669044495, "learning_rate": 0.000165347110393104, "loss": 0.8629, "step": 6155 }, { "epoch": 1.731309724564362, "grad_norm": 0.6226024627685547, "learning_rate": 0.00016527281647598881, "loss": 0.8616, "step": 6160 }, { "epoch": 1.7327150084317031, "grad_norm": 0.5335372090339661, "learning_rate": 0.00016519845973348888, "loss": 0.8588, "step": 6165 }, { "epoch": 1.7341202922990444, "grad_norm": 0.4744356870651245, "learning_rate": 0.00016512404023717294, "loss": 0.8632, "step": 6170 }, { "epoch": 1.7355255761663857, "grad_norm": 0.5560280084609985, "learning_rate": 0.0001650495580586701, "loss": 0.8601, "step": 6175 }, { "epoch": 1.7369308600337268, "grad_norm": 0.6050639152526855, "learning_rate": 0.00016497501326966974, "loss": 0.8506, "step": 6180 }, { "epoch": 1.738336143901068, "grad_norm": 0.5097574591636658, "learning_rate": 0.00016490040594192165, "loss": 0.8942, "step": 6185 }, { "epoch": 1.7397414277684091, "grad_norm": 1.0571644306182861, "learning_rate": 0.00016482573614723563, "loss": 0.8579, "step": 6190 }, { "epoch": 1.7411467116357504, "grad_norm": 0.5297767519950867, "learning_rate": 0.00016475100395748178, "loss": 0.8646, "step": 6195 }, { "epoch": 1.7425519955030917, "grad_norm": 0.6666551232337952, "learning_rate": 0.00016467620944459014, "loss": 0.8746, "step": 6200 }, { "epoch": 1.743957279370433, "grad_norm": 0.726667582988739, "learning_rate": 0.00016460135268055076, "loss": 0.8708, "step": 6205 }, { "epoch": 1.745362563237774, "grad_norm": 0.5243112444877625, "learning_rate": 0.00016452643373741365, "loss": 0.8806, "step": 6210 }, { "epoch": 1.7467678471051151, "grad_norm": 0.5628378391265869, "learning_rate": 0.0001644514526872886, "loss": 0.8617, "step": 6215 }, { "epoch": 1.7481731309724564, "grad_norm": 0.49396875500679016, "learning_rate": 0.00016437640960234525, "loss": 0.8651, "step": 6220 }, { "epoch": 1.7495784148397977, "grad_norm": 0.7454749941825867, "learning_rate": 0.00016430130455481287, "loss": 0.8682, "step": 6225 }, { "epoch": 1.750983698707139, "grad_norm": 0.5046769976615906, "learning_rate": 0.00016422613761698046, "loss": 0.8449, "step": 6230 }, { "epoch": 1.75238898257448, "grad_norm": 0.5620967745780945, "learning_rate": 0.00016415090886119649, "loss": 0.9277, "step": 6235 }, { "epoch": 1.7537942664418211, "grad_norm": 0.5239649415016174, "learning_rate": 0.00016407561835986902, "loss": 0.9241, "step": 6240 }, { "epoch": 1.7551995503091624, "grad_norm": 0.5544788837432861, "learning_rate": 0.00016400026618546552, "loss": 0.8676, "step": 6245 }, { "epoch": 1.7566048341765037, "grad_norm": 0.5142987370491028, "learning_rate": 0.00016392485241051272, "loss": 0.8683, "step": 6250 }, { "epoch": 1.758010118043845, "grad_norm": 0.5113844275474548, "learning_rate": 0.00016384937710759681, "loss": 0.8682, "step": 6255 }, { "epoch": 1.759415401911186, "grad_norm": 0.6806747317314148, "learning_rate": 0.0001637738403493631, "loss": 0.8582, "step": 6260 }, { "epoch": 1.7608206857785271, "grad_norm": 0.688289225101471, "learning_rate": 0.00016369824220851604, "loss": 0.9636, "step": 6265 }, { "epoch": 1.7622259696458684, "grad_norm": 0.817633330821991, "learning_rate": 0.00016362258275781917, "loss": 0.8833, "step": 6270 }, { "epoch": 1.7636312535132097, "grad_norm": 0.7489690184593201, "learning_rate": 0.00016354686207009511, "loss": 0.872, "step": 6275 }, { "epoch": 1.765036537380551, "grad_norm": 0.5069727301597595, "learning_rate": 0.0001634710802182253, "loss": 0.8629, "step": 6280 }, { "epoch": 1.766441821247892, "grad_norm": 0.48243850469589233, "learning_rate": 0.00016339523727515023, "loss": 0.8687, "step": 6285 }, { "epoch": 1.7678471051152331, "grad_norm": 0.5298514366149902, "learning_rate": 0.00016331933331386898, "loss": 0.855, "step": 6290 }, { "epoch": 1.7692523889825744, "grad_norm": 0.6091026067733765, "learning_rate": 0.00016324336840743947, "loss": 0.8624, "step": 6295 }, { "epoch": 1.7706576728499157, "grad_norm": 0.7348006367683411, "learning_rate": 0.00016316734262897834, "loss": 0.8552, "step": 6300 }, { "epoch": 1.772062956717257, "grad_norm": 0.5105456709861755, "learning_rate": 0.00016309125605166064, "loss": 0.871, "step": 6305 }, { "epoch": 1.773468240584598, "grad_norm": 0.5031980872154236, "learning_rate": 0.00016301510874872015, "loss": 0.8582, "step": 6310 }, { "epoch": 1.7748735244519391, "grad_norm": 0.7760841250419617, "learning_rate": 0.00016293890079344892, "loss": 0.8663, "step": 6315 }, { "epoch": 1.7762788083192804, "grad_norm": 0.6499359607696533, "learning_rate": 0.00016286263225919755, "loss": 0.8555, "step": 6320 }, { "epoch": 1.7776840921866217, "grad_norm": 0.4855884611606598, "learning_rate": 0.0001627863032193748, "loss": 0.8681, "step": 6325 }, { "epoch": 1.779089376053963, "grad_norm": 0.4679391384124756, "learning_rate": 0.00016270991374744766, "loss": 0.8635, "step": 6330 }, { "epoch": 1.780494659921304, "grad_norm": 0.4512339234352112, "learning_rate": 0.00016263346391694143, "loss": 0.8475, "step": 6335 }, { "epoch": 1.7818999437886454, "grad_norm": 0.7599973082542419, "learning_rate": 0.0001625569538014394, "loss": 0.8658, "step": 6340 }, { "epoch": 1.7833052276559864, "grad_norm": 0.4672722816467285, "learning_rate": 0.0001624803834745829, "loss": 0.8744, "step": 6345 }, { "epoch": 1.7847105115233277, "grad_norm": 0.6698594689369202, "learning_rate": 0.00016240375301007122, "loss": 0.8644, "step": 6350 }, { "epoch": 1.786115795390669, "grad_norm": 0.803933322429657, "learning_rate": 0.00016232706248166156, "loss": 0.8648, "step": 6355 }, { "epoch": 1.78752107925801, "grad_norm": 0.4994564950466156, "learning_rate": 0.00016225031196316885, "loss": 0.8635, "step": 6360 }, { "epoch": 1.7889263631253514, "grad_norm": 0.6590884327888489, "learning_rate": 0.00016217350152846586, "loss": 0.863, "step": 6365 }, { "epoch": 1.7903316469926924, "grad_norm": 0.5152565240859985, "learning_rate": 0.00016209663125148297, "loss": 0.8706, "step": 6370 }, { "epoch": 1.7917369308600337, "grad_norm": 0.607562780380249, "learning_rate": 0.00016201970120620818, "loss": 0.8638, "step": 6375 }, { "epoch": 1.793142214727375, "grad_norm": 0.5181170701980591, "learning_rate": 0.000161942711466687, "loss": 0.8551, "step": 6380 }, { "epoch": 1.7945474985947163, "grad_norm": 0.8388688564300537, "learning_rate": 0.00016186566210702244, "loss": 0.8834, "step": 6385 }, { "epoch": 1.7959527824620574, "grad_norm": 0.8339769244194031, "learning_rate": 0.0001617885532013748, "loss": 0.8562, "step": 6390 }, { "epoch": 1.7973580663293984, "grad_norm": 0.5623959302902222, "learning_rate": 0.0001617113848239618, "loss": 0.8492, "step": 6395 }, { "epoch": 1.7987633501967397, "grad_norm": 0.5428623557090759, "learning_rate": 0.00016163415704905835, "loss": 0.8539, "step": 6400 }, { "epoch": 1.800168634064081, "grad_norm": 0.5430036783218384, "learning_rate": 0.00016155686995099653, "loss": 0.8651, "step": 6405 }, { "epoch": 1.8015739179314223, "grad_norm": 0.6421025991439819, "learning_rate": 0.00016147952360416552, "loss": 0.8671, "step": 6410 }, { "epoch": 1.8029792017987634, "grad_norm": 0.44885459542274475, "learning_rate": 0.00016140211808301155, "loss": 0.8575, "step": 6415 }, { "epoch": 1.8043844856661044, "grad_norm": 0.4630917012691498, "learning_rate": 0.00016132465346203775, "loss": 0.8611, "step": 6420 }, { "epoch": 1.8057897695334457, "grad_norm": 0.473455011844635, "learning_rate": 0.00016124712981580426, "loss": 0.879, "step": 6425 }, { "epoch": 1.807195053400787, "grad_norm": 0.5540944933891296, "learning_rate": 0.00016116954721892785, "loss": 0.8766, "step": 6430 }, { "epoch": 1.8086003372681283, "grad_norm": 0.7242889404296875, "learning_rate": 0.00016109190574608215, "loss": 0.8432, "step": 6435 }, { "epoch": 1.8100056211354694, "grad_norm": 0.5389221906661987, "learning_rate": 0.00016101420547199745, "loss": 0.8511, "step": 6440 }, { "epoch": 1.8114109050028104, "grad_norm": 0.48298758268356323, "learning_rate": 0.0001609364464714606, "loss": 0.8775, "step": 6445 }, { "epoch": 1.8128161888701517, "grad_norm": 0.595059335231781, "learning_rate": 0.00016085862881931508, "loss": 0.8528, "step": 6450 }, { "epoch": 1.814221472737493, "grad_norm": 0.47923505306243896, "learning_rate": 0.0001607807525904606, "loss": 0.8674, "step": 6455 }, { "epoch": 1.8156267566048343, "grad_norm": 0.7948163151741028, "learning_rate": 0.00016070281785985347, "loss": 0.8582, "step": 6460 }, { "epoch": 1.8170320404721754, "grad_norm": 0.676969051361084, "learning_rate": 0.0001606248247025062, "loss": 0.877, "step": 6465 }, { "epoch": 1.8184373243395164, "grad_norm": 0.6349411010742188, "learning_rate": 0.00016054677319348758, "loss": 0.8593, "step": 6470 }, { "epoch": 1.8198426082068577, "grad_norm": 0.4888991415500641, "learning_rate": 0.00016046866340792252, "loss": 0.8573, "step": 6475 }, { "epoch": 1.821247892074199, "grad_norm": 0.5025482773780823, "learning_rate": 0.00016039049542099207, "loss": 0.8644, "step": 6480 }, { "epoch": 1.8226531759415403, "grad_norm": 0.47073066234588623, "learning_rate": 0.00016031226930793326, "loss": 0.9275, "step": 6485 }, { "epoch": 1.8240584598088814, "grad_norm": 0.6219227910041809, "learning_rate": 0.0001602339851440391, "loss": 0.8498, "step": 6490 }, { "epoch": 1.8254637436762224, "grad_norm": 0.6504101157188416, "learning_rate": 0.00016015564300465843, "loss": 0.8494, "step": 6495 }, { "epoch": 1.8268690275435637, "grad_norm": 0.887282133102417, "learning_rate": 0.0001600772429651959, "loss": 0.857, "step": 6500 }, { "epoch": 1.828274311410905, "grad_norm": 0.9653581380844116, "learning_rate": 0.00015999878510111195, "loss": 0.858, "step": 6505 }, { "epoch": 1.8296795952782463, "grad_norm": 0.6541657447814941, "learning_rate": 0.00015992026948792267, "loss": 0.8747, "step": 6510 }, { "epoch": 1.8310848791455874, "grad_norm": 0.5337585210800171, "learning_rate": 0.00015984169620119959, "loss": 0.8528, "step": 6515 }, { "epoch": 1.8324901630129287, "grad_norm": 0.6224717497825623, "learning_rate": 0.00015976306531656994, "loss": 0.8653, "step": 6520 }, { "epoch": 1.8338954468802697, "grad_norm": 0.6390025019645691, "learning_rate": 0.00015968437690971627, "loss": 0.8612, "step": 6525 }, { "epoch": 1.835300730747611, "grad_norm": 0.6096919775009155, "learning_rate": 0.00015960563105637653, "loss": 0.8634, "step": 6530 }, { "epoch": 1.8367060146149523, "grad_norm": 0.8214162588119507, "learning_rate": 0.00015952682783234402, "loss": 0.8735, "step": 6535 }, { "epoch": 1.8381112984822934, "grad_norm": 0.7625783681869507, "learning_rate": 0.00015944796731346713, "loss": 0.8654, "step": 6540 }, { "epoch": 1.8395165823496347, "grad_norm": 0.5845365524291992, "learning_rate": 0.00015936904957564955, "loss": 0.8705, "step": 6545 }, { "epoch": 1.8409218662169757, "grad_norm": 0.5661296844482422, "learning_rate": 0.00015929007469484986, "loss": 0.851, "step": 6550 }, { "epoch": 1.842327150084317, "grad_norm": 0.5553953647613525, "learning_rate": 0.00015921104274708184, "loss": 0.8592, "step": 6555 }, { "epoch": 1.8437324339516583, "grad_norm": 0.46224939823150635, "learning_rate": 0.00015913195380841402, "loss": 0.8698, "step": 6560 }, { "epoch": 1.8451377178189996, "grad_norm": 0.5660232305526733, "learning_rate": 0.00015905280795496999, "loss": 0.8666, "step": 6565 }, { "epoch": 1.8465430016863407, "grad_norm": 0.5240206718444824, "learning_rate": 0.00015897360526292783, "loss": 0.8637, "step": 6570 }, { "epoch": 1.8479482855536817, "grad_norm": 0.4983304738998413, "learning_rate": 0.0001588943458085206, "loss": 0.8826, "step": 6575 }, { "epoch": 1.849353569421023, "grad_norm": 0.6366061568260193, "learning_rate": 0.00015881502966803588, "loss": 0.8531, "step": 6580 }, { "epoch": 1.8507588532883643, "grad_norm": 0.5946831703186035, "learning_rate": 0.0001587356569178158, "loss": 0.8601, "step": 6585 }, { "epoch": 1.8521641371557056, "grad_norm": 0.45793646574020386, "learning_rate": 0.000158656227634257, "loss": 0.8618, "step": 6590 }, { "epoch": 1.8535694210230467, "grad_norm": 0.5167106986045837, "learning_rate": 0.00015857674189381053, "loss": 0.8604, "step": 6595 }, { "epoch": 1.8549747048903877, "grad_norm": 0.5827748775482178, "learning_rate": 0.00015849719977298178, "loss": 0.8646, "step": 6600 }, { "epoch": 1.856379988757729, "grad_norm": 0.8314809799194336, "learning_rate": 0.00015841760134833042, "loss": 0.8668, "step": 6605 }, { "epoch": 1.8577852726250703, "grad_norm": 0.8763378858566284, "learning_rate": 0.00015833794669647025, "loss": 0.9796, "step": 6610 }, { "epoch": 1.8591905564924116, "grad_norm": 0.7995222210884094, "learning_rate": 0.0001582582358940693, "loss": 0.8516, "step": 6615 }, { "epoch": 1.8605958403597527, "grad_norm": 0.5486027598381042, "learning_rate": 0.00015817846901784952, "loss": 0.8541, "step": 6620 }, { "epoch": 1.8620011242270937, "grad_norm": 0.484632670879364, "learning_rate": 0.00015809864614458694, "loss": 0.8738, "step": 6625 }, { "epoch": 1.863406408094435, "grad_norm": 0.8042235374450684, "learning_rate": 0.00015801876735111142, "loss": 0.9362, "step": 6630 }, { "epoch": 1.8648116919617763, "grad_norm": 0.7505983710289001, "learning_rate": 0.0001579388327143067, "loss": 0.8639, "step": 6635 }, { "epoch": 1.8662169758291176, "grad_norm": 0.48971813917160034, "learning_rate": 0.00015785884231111016, "loss": 0.8397, "step": 6640 }, { "epoch": 1.8676222596964587, "grad_norm": 0.5034501552581787, "learning_rate": 0.00015777879621851302, "loss": 0.8685, "step": 6645 }, { "epoch": 1.8690275435637997, "grad_norm": 0.5677081346511841, "learning_rate": 0.00015769869451355995, "loss": 0.883, "step": 6650 }, { "epoch": 1.870432827431141, "grad_norm": 0.5018690824508667, "learning_rate": 0.00015761853727334918, "loss": 0.8672, "step": 6655 }, { "epoch": 1.8718381112984823, "grad_norm": 0.45845654606819153, "learning_rate": 0.0001575383245750325, "loss": 0.8566, "step": 6660 }, { "epoch": 1.8732433951658236, "grad_norm": 0.7619211673736572, "learning_rate": 0.00015745805649581497, "loss": 0.8681, "step": 6665 }, { "epoch": 1.8746486790331647, "grad_norm": 0.45849132537841797, "learning_rate": 0.000157377733112955, "loss": 0.8668, "step": 6670 }, { "epoch": 1.8760539629005057, "grad_norm": 0.45092353224754333, "learning_rate": 0.0001572973545037641, "loss": 0.873, "step": 6675 }, { "epoch": 1.877459246767847, "grad_norm": 0.5022733807563782, "learning_rate": 0.0001572169207456072, "loss": 0.8598, "step": 6680 }, { "epoch": 1.8788645306351883, "grad_norm": 0.670704185962677, "learning_rate": 0.00015713643191590213, "loss": 0.8547, "step": 6685 }, { "epoch": 1.8802698145025296, "grad_norm": 0.5759883522987366, "learning_rate": 0.00015705588809211967, "loss": 0.8592, "step": 6690 }, { "epoch": 1.8816750983698707, "grad_norm": 0.7777016162872314, "learning_rate": 0.00015697528935178372, "loss": 0.8674, "step": 6695 }, { "epoch": 1.883080382237212, "grad_norm": 0.5708914995193481, "learning_rate": 0.00015689463577247086, "loss": 0.8668, "step": 6700 }, { "epoch": 1.884485666104553, "grad_norm": 0.7172780632972717, "learning_rate": 0.00015681392743181058, "loss": 0.8627, "step": 6705 }, { "epoch": 1.8858909499718943, "grad_norm": 0.8145684003829956, "learning_rate": 0.00015673316440748499, "loss": 0.8605, "step": 6710 }, { "epoch": 1.8872962338392356, "grad_norm": 0.5138697624206543, "learning_rate": 0.0001566523467772289, "loss": 0.8655, "step": 6715 }, { "epoch": 1.8887015177065767, "grad_norm": 0.5821599364280701, "learning_rate": 0.00015657147461882963, "loss": 0.8761, "step": 6720 }, { "epoch": 1.890106801573918, "grad_norm": 0.5124241709709167, "learning_rate": 0.00015649054801012704, "loss": 0.8593, "step": 6725 }, { "epoch": 1.891512085441259, "grad_norm": 0.47895899415016174, "learning_rate": 0.00015640956702901336, "loss": 0.8492, "step": 6730 }, { "epoch": 1.8929173693086003, "grad_norm": 0.5329840779304504, "learning_rate": 0.00015632853175343305, "loss": 0.87, "step": 6735 }, { "epoch": 1.8943226531759416, "grad_norm": 0.5166814923286438, "learning_rate": 0.00015624744226138307, "loss": 0.8625, "step": 6740 }, { "epoch": 1.895727937043283, "grad_norm": 0.5631088614463806, "learning_rate": 0.00015616629863091235, "loss": 0.8552, "step": 6745 }, { "epoch": 1.897133220910624, "grad_norm": 3.2275400161743164, "learning_rate": 0.00015608510094012202, "loss": 0.9232, "step": 6750 }, { "epoch": 1.898538504777965, "grad_norm": 0.4875563085079193, "learning_rate": 0.00015600384926716524, "loss": 0.8574, "step": 6755 }, { "epoch": 1.8999437886453063, "grad_norm": 0.7080219388008118, "learning_rate": 0.00015592254369024714, "loss": 0.8426, "step": 6760 }, { "epoch": 1.9013490725126476, "grad_norm": 0.44725337624549866, "learning_rate": 0.00015584118428762467, "loss": 0.8487, "step": 6765 }, { "epoch": 1.902754356379989, "grad_norm": 0.4571017622947693, "learning_rate": 0.0001557597711376066, "loss": 0.8535, "step": 6770 }, { "epoch": 1.90415964024733, "grad_norm": 0.4571828246116638, "learning_rate": 0.00015567830431855353, "loss": 0.8665, "step": 6775 }, { "epoch": 1.905564924114671, "grad_norm": 0.49334484338760376, "learning_rate": 0.00015559678390887762, "loss": 0.8681, "step": 6780 }, { "epoch": 1.9069702079820123, "grad_norm": 0.5663411617279053, "learning_rate": 0.00015551520998704262, "loss": 0.8603, "step": 6785 }, { "epoch": 1.9083754918493536, "grad_norm": 0.4837425947189331, "learning_rate": 0.00015543358263156383, "loss": 0.8656, "step": 6790 }, { "epoch": 1.909780775716695, "grad_norm": 0.5288939476013184, "learning_rate": 0.00015535190192100793, "loss": 0.8806, "step": 6795 }, { "epoch": 1.911186059584036, "grad_norm": 0.4623594284057617, "learning_rate": 0.00015527016793399301, "loss": 0.8595, "step": 6800 }, { "epoch": 1.912591343451377, "grad_norm": 0.5188004374504089, "learning_rate": 0.00015518838074918834, "loss": 0.8453, "step": 6805 }, { "epoch": 1.9139966273187183, "grad_norm": 0.6675241589546204, "learning_rate": 0.00015510654044531452, "loss": 0.844, "step": 6810 }, { "epoch": 1.9154019111860596, "grad_norm": 0.590613603591919, "learning_rate": 0.00015502464710114323, "loss": 0.8849, "step": 6815 }, { "epoch": 1.916807195053401, "grad_norm": 0.4679904580116272, "learning_rate": 0.00015494270079549716, "loss": 0.9126, "step": 6820 }, { "epoch": 1.918212478920742, "grad_norm": 0.5950139760971069, "learning_rate": 0.00015486070160724994, "loss": 0.8569, "step": 6825 }, { "epoch": 1.919617762788083, "grad_norm": 0.5042142271995544, "learning_rate": 0.00015477864961532632, "loss": 0.8616, "step": 6830 }, { "epoch": 1.9210230466554243, "grad_norm": 0.5188416242599487, "learning_rate": 0.00015469654489870158, "loss": 0.8647, "step": 6835 }, { "epoch": 1.9224283305227656, "grad_norm": 0.5152612924575806, "learning_rate": 0.00015461438753640194, "loss": 0.8778, "step": 6840 }, { "epoch": 1.923833614390107, "grad_norm": 0.4704989492893219, "learning_rate": 0.00015453217760750426, "loss": 0.8673, "step": 6845 }, { "epoch": 1.925238898257448, "grad_norm": 0.6396019458770752, "learning_rate": 0.00015444991519113587, "loss": 0.8623, "step": 6850 }, { "epoch": 1.926644182124789, "grad_norm": 0.49166232347488403, "learning_rate": 0.00015436760036647483, "loss": 0.8774, "step": 6855 }, { "epoch": 1.9280494659921303, "grad_norm": 0.49419841170310974, "learning_rate": 0.00015428523321274953, "loss": 0.8567, "step": 6860 }, { "epoch": 1.9294547498594716, "grad_norm": 0.5003138184547424, "learning_rate": 0.00015420281380923868, "loss": 0.8531, "step": 6865 }, { "epoch": 1.930860033726813, "grad_norm": 0.5504321455955505, "learning_rate": 0.00015412034223527137, "loss": 0.8519, "step": 6870 }, { "epoch": 1.932265317594154, "grad_norm": 0.6040800213813782, "learning_rate": 0.00015403781857022684, "loss": 0.8731, "step": 6875 }, { "epoch": 1.9336706014614953, "grad_norm": 0.43906983733177185, "learning_rate": 0.00015395524289353452, "loss": 0.8671, "step": 6880 }, { "epoch": 1.9350758853288363, "grad_norm": 0.4836946427822113, "learning_rate": 0.00015387261528467384, "loss": 0.8584, "step": 6885 }, { "epoch": 1.9364811691961776, "grad_norm": 0.6498724222183228, "learning_rate": 0.00015378993582317428, "loss": 0.8575, "step": 6890 }, { "epoch": 1.937886453063519, "grad_norm": 0.5695022344589233, "learning_rate": 0.00015370720458861525, "loss": 0.8753, "step": 6895 }, { "epoch": 1.93929173693086, "grad_norm": 0.6768786907196045, "learning_rate": 0.00015362442166062587, "loss": 0.8501, "step": 6900 }, { "epoch": 1.9406970207982013, "grad_norm": 0.4813404977321625, "learning_rate": 0.0001535415871188851, "loss": 0.8499, "step": 6905 }, { "epoch": 1.9421023046655423, "grad_norm": 0.5628063082695007, "learning_rate": 0.00015345870104312154, "loss": 0.8837, "step": 6910 }, { "epoch": 1.9435075885328836, "grad_norm": 0.5575454831123352, "learning_rate": 0.0001533757635131135, "loss": 0.8557, "step": 6915 }, { "epoch": 1.944912872400225, "grad_norm": 0.5431692004203796, "learning_rate": 0.00015329277460868868, "loss": 0.8626, "step": 6920 }, { "epoch": 1.9463181562675662, "grad_norm": 0.5444419384002686, "learning_rate": 0.00015320973440972427, "loss": 0.8512, "step": 6925 }, { "epoch": 1.9477234401349073, "grad_norm": 0.6308559775352478, "learning_rate": 0.00015312664299614684, "loss": 0.8664, "step": 6930 }, { "epoch": 1.9491287240022483, "grad_norm": 0.5455043315887451, "learning_rate": 0.00015304350044793229, "loss": 0.8664, "step": 6935 }, { "epoch": 1.9505340078695896, "grad_norm": 0.609303891658783, "learning_rate": 0.0001529603068451057, "loss": 0.8577, "step": 6940 }, { "epoch": 1.951939291736931, "grad_norm": 0.5833176374435425, "learning_rate": 0.00015287706226774125, "loss": 0.9236, "step": 6945 }, { "epoch": 1.9533445756042722, "grad_norm": 0.8206747174263, "learning_rate": 0.00015279376679596228, "loss": 0.8897, "step": 6950 }, { "epoch": 1.9547498594716133, "grad_norm": 0.6312025189399719, "learning_rate": 0.00015271042050994104, "loss": 0.8609, "step": 6955 }, { "epoch": 1.9561551433389543, "grad_norm": 0.5285316109657288, "learning_rate": 0.0001526270234898987, "loss": 0.8649, "step": 6960 }, { "epoch": 1.9575604272062956, "grad_norm": 0.5742127299308777, "learning_rate": 0.0001525435758161053, "loss": 0.8497, "step": 6965 }, { "epoch": 1.958965711073637, "grad_norm": 0.5842658877372742, "learning_rate": 0.00015246007756887958, "loss": 0.8666, "step": 6970 }, { "epoch": 1.9603709949409782, "grad_norm": 0.4612257182598114, "learning_rate": 0.00015237652882858898, "loss": 0.8673, "step": 6975 }, { "epoch": 1.9617762788083193, "grad_norm": 0.5715500116348267, "learning_rate": 0.00015229292967564959, "loss": 0.8635, "step": 6980 }, { "epoch": 1.9631815626756604, "grad_norm": 0.5319962501525879, "learning_rate": 0.00015220928019052594, "loss": 0.8713, "step": 6985 }, { "epoch": 1.9645868465430016, "grad_norm": 0.5012961030006409, "learning_rate": 0.00015212558045373106, "loss": 0.8628, "step": 6990 }, { "epoch": 1.965992130410343, "grad_norm": 0.5424646139144897, "learning_rate": 0.00015204183054582632, "loss": 0.8636, "step": 6995 }, { "epoch": 1.9673974142776842, "grad_norm": 0.4767484664916992, "learning_rate": 0.0001519580305474214, "loss": 0.8557, "step": 7000 }, { "epoch": 1.9688026981450253, "grad_norm": 0.5038319826126099, "learning_rate": 0.00015187418053917416, "loss": 0.8672, "step": 7005 }, { "epoch": 1.9702079820123664, "grad_norm": 0.5484253764152527, "learning_rate": 0.00015179028060179062, "loss": 0.8588, "step": 7010 }, { "epoch": 1.9716132658797076, "grad_norm": 0.6408220529556274, "learning_rate": 0.00015170633081602487, "loss": 0.8636, "step": 7015 }, { "epoch": 1.973018549747049, "grad_norm": 0.6039834022521973, "learning_rate": 0.00015162233126267898, "loss": 0.8569, "step": 7020 }, { "epoch": 1.9744238336143902, "grad_norm": 0.6858595609664917, "learning_rate": 0.0001515382820226029, "loss": 0.8581, "step": 7025 }, { "epoch": 1.9758291174817313, "grad_norm": 0.5240831971168518, "learning_rate": 0.00015145418317669438, "loss": 0.8515, "step": 7030 }, { "epoch": 1.9772344013490724, "grad_norm": 0.5512087345123291, "learning_rate": 0.0001513700348058989, "loss": 0.8528, "step": 7035 }, { "epoch": 1.9786396852164136, "grad_norm": 0.5006879568099976, "learning_rate": 0.00015128583699120977, "loss": 0.8623, "step": 7040 }, { "epoch": 1.980044969083755, "grad_norm": 0.49987828731536865, "learning_rate": 0.0001512015898136677, "loss": 0.8451, "step": 7045 }, { "epoch": 1.9814502529510962, "grad_norm": 0.5027675628662109, "learning_rate": 0.00015111729335436097, "loss": 0.8563, "step": 7050 }, { "epoch": 1.9828555368184373, "grad_norm": 0.4932931959629059, "learning_rate": 0.00015103294769442535, "loss": 0.8731, "step": 7055 }, { "epoch": 1.9842608206857786, "grad_norm": 0.4976560175418854, "learning_rate": 0.00015094855291504391, "loss": 0.871, "step": 7060 }, { "epoch": 1.9856661045531196, "grad_norm": 0.4505634605884552, "learning_rate": 0.00015086410909744702, "loss": 0.8559, "step": 7065 }, { "epoch": 1.987071388420461, "grad_norm": 0.5050502419471741, "learning_rate": 0.0001507796163229122, "loss": 0.8527, "step": 7070 }, { "epoch": 1.9884766722878022, "grad_norm": 0.8051122426986694, "learning_rate": 0.00015069507467276418, "loss": 0.8406, "step": 7075 }, { "epoch": 1.9898819561551433, "grad_norm": 0.6766129732131958, "learning_rate": 0.00015061048422837468, "loss": 0.8581, "step": 7080 }, { "epoch": 1.9912872400224846, "grad_norm": 0.5113483667373657, "learning_rate": 0.00015052584507116234, "loss": 0.8441, "step": 7085 }, { "epoch": 1.9926925238898257, "grad_norm": 0.5204007029533386, "learning_rate": 0.0001504411572825928, "loss": 0.8424, "step": 7090 }, { "epoch": 1.994097807757167, "grad_norm": 0.7898670434951782, "learning_rate": 0.00015035642094417842, "loss": 0.8468, "step": 7095 }, { "epoch": 1.9955030916245082, "grad_norm": 0.5176621675491333, "learning_rate": 0.0001502716361374783, "loss": 0.8494, "step": 7100 }, { "epoch": 1.9969083754918495, "grad_norm": 0.6323757171630859, "learning_rate": 0.00015018680294409822, "loss": 0.8556, "step": 7105 }, { "epoch": 1.9983136593591906, "grad_norm": 1.1276917457580566, "learning_rate": 0.0001501019214456905, "loss": 0.8694, "step": 7110 }, { "epoch": 1.9997189432265317, "grad_norm": 0.4633052349090576, "learning_rate": 0.000150016991723954, "loss": 0.8691, "step": 7115 }, { "epoch": 2.0, "eval_loss": 0.8701068758964539, "eval_runtime": 638.3523, "eval_samples_per_second": 7.045, "eval_steps_per_second": 0.587, "step": 7116 }, { "epoch": 2.001124227093873, "grad_norm": 0.5140894651412964, "learning_rate": 0.00014993201386063394, "loss": 0.8231, "step": 7120 }, { "epoch": 2.0025295109612142, "grad_norm": 0.4917040169239044, "learning_rate": 0.0001498469879375219, "loss": 0.8273, "step": 7125 }, { "epoch": 2.0039347948285555, "grad_norm": 0.5956901907920837, "learning_rate": 0.00014976191403645578, "loss": 0.8191, "step": 7130 }, { "epoch": 2.0053400786958964, "grad_norm": 0.46787455677986145, "learning_rate": 0.0001496767922393195, "loss": 0.8201, "step": 7135 }, { "epoch": 2.0067453625632377, "grad_norm": 0.4993847608566284, "learning_rate": 0.00014959162262804328, "loss": 0.8115, "step": 7140 }, { "epoch": 2.008150646430579, "grad_norm": 0.4391626715660095, "learning_rate": 0.00014950640528460317, "loss": 0.8185, "step": 7145 }, { "epoch": 2.0095559302979202, "grad_norm": 0.5829885005950928, "learning_rate": 0.00014942114029102132, "loss": 0.8289, "step": 7150 }, { "epoch": 2.0109612141652615, "grad_norm": 0.5417194962501526, "learning_rate": 0.00014933582772936569, "loss": 0.8132, "step": 7155 }, { "epoch": 2.0123664980326024, "grad_norm": 0.4647194445133209, "learning_rate": 0.00014925046768174988, "loss": 0.8201, "step": 7160 }, { "epoch": 2.0137717818999437, "grad_norm": 0.6761397123336792, "learning_rate": 0.00014916506023033348, "loss": 0.8059, "step": 7165 }, { "epoch": 2.015177065767285, "grad_norm": 0.5787038207054138, "learning_rate": 0.0001490796054573215, "loss": 0.8342, "step": 7170 }, { "epoch": 2.0165823496346262, "grad_norm": 0.4988895356655121, "learning_rate": 0.0001489941034449645, "loss": 0.8208, "step": 7175 }, { "epoch": 2.0179876335019675, "grad_norm": 0.4594341516494751, "learning_rate": 0.00014890855427555864, "loss": 0.8325, "step": 7180 }, { "epoch": 2.019392917369309, "grad_norm": 0.48250508308410645, "learning_rate": 0.00014882295803144536, "loss": 0.835, "step": 7185 }, { "epoch": 2.0207982012366497, "grad_norm": 0.5917899012565613, "learning_rate": 0.0001487373147950114, "loss": 0.8172, "step": 7190 }, { "epoch": 2.022203485103991, "grad_norm": 0.4929775297641754, "learning_rate": 0.00014865162464868878, "loss": 0.8457, "step": 7195 }, { "epoch": 2.0236087689713322, "grad_norm": 0.6094657182693481, "learning_rate": 0.0001485658876749547, "loss": 0.8277, "step": 7200 }, { "epoch": 2.0250140528386735, "grad_norm": 0.5238663554191589, "learning_rate": 0.00014848010395633135, "loss": 0.8271, "step": 7205 }, { "epoch": 2.026419336706015, "grad_norm": 0.6498709321022034, "learning_rate": 0.00014839427357538597, "loss": 0.8383, "step": 7210 }, { "epoch": 2.0278246205733557, "grad_norm": 0.8006538152694702, "learning_rate": 0.00014830839661473069, "loss": 0.8162, "step": 7215 }, { "epoch": 2.029229904440697, "grad_norm": 0.5685740113258362, "learning_rate": 0.00014822247315702245, "loss": 0.834, "step": 7220 }, { "epoch": 2.0306351883080382, "grad_norm": 0.5004500150680542, "learning_rate": 0.00014813650328496301, "loss": 0.8248, "step": 7225 }, { "epoch": 2.0320404721753795, "grad_norm": 0.8819432854652405, "learning_rate": 0.0001480504870812988, "loss": 0.8773, "step": 7230 }, { "epoch": 2.033445756042721, "grad_norm": 0.6650183200836182, "learning_rate": 0.0001479644246288207, "loss": 0.8298, "step": 7235 }, { "epoch": 2.0348510399100617, "grad_norm": 0.5287449359893799, "learning_rate": 0.0001478783160103643, "loss": 0.8332, "step": 7240 }, { "epoch": 2.036256323777403, "grad_norm": 0.5206481218338013, "learning_rate": 0.00014779216130880951, "loss": 0.82, "step": 7245 }, { "epoch": 2.0376616076447442, "grad_norm": 0.5504814982414246, "learning_rate": 0.00014770596060708065, "loss": 0.8312, "step": 7250 }, { "epoch": 2.0390668915120855, "grad_norm": 0.4870205223560333, "learning_rate": 0.00014761971398814626, "loss": 0.8228, "step": 7255 }, { "epoch": 2.040472175379427, "grad_norm": 0.5187703371047974, "learning_rate": 0.00014753342153501912, "loss": 0.8155, "step": 7260 }, { "epoch": 2.0418774592467677, "grad_norm": 0.6017442941665649, "learning_rate": 0.00014744708333075607, "loss": 0.8356, "step": 7265 }, { "epoch": 2.043282743114109, "grad_norm": 0.48812979459762573, "learning_rate": 0.0001473606994584581, "loss": 0.8223, "step": 7270 }, { "epoch": 2.0446880269814502, "grad_norm": 0.5613028407096863, "learning_rate": 0.0001472742700012701, "loss": 0.8292, "step": 7275 }, { "epoch": 2.0460933108487915, "grad_norm": 0.48770609498023987, "learning_rate": 0.00014718779504238068, "loss": 0.8303, "step": 7280 }, { "epoch": 2.047498594716133, "grad_norm": 0.6781899929046631, "learning_rate": 0.00014710127466502254, "loss": 0.8322, "step": 7285 }, { "epoch": 2.0489038785834737, "grad_norm": 0.7580632567405701, "learning_rate": 0.00014701470895247181, "loss": 0.8407, "step": 7290 }, { "epoch": 2.050309162450815, "grad_norm": 0.5731956362724304, "learning_rate": 0.00014692809798804847, "loss": 0.8423, "step": 7295 }, { "epoch": 2.0517144463181562, "grad_norm": 0.48143020272254944, "learning_rate": 0.00014684144185511596, "loss": 0.8206, "step": 7300 }, { "epoch": 2.0531197301854975, "grad_norm": 0.589863121509552, "learning_rate": 0.00014675474063708118, "loss": 0.8379, "step": 7305 }, { "epoch": 2.054525014052839, "grad_norm": 0.5739490985870361, "learning_rate": 0.00014666799441739444, "loss": 0.8297, "step": 7310 }, { "epoch": 2.0559302979201797, "grad_norm": 0.6172598600387573, "learning_rate": 0.00014658120327954937, "loss": 0.88, "step": 7315 }, { "epoch": 2.057335581787521, "grad_norm": 0.725379228591919, "learning_rate": 0.0001464943673070829, "loss": 0.8385, "step": 7320 }, { "epoch": 2.0587408656548623, "grad_norm": 0.7602442502975464, "learning_rate": 0.000146407486583575, "loss": 0.836, "step": 7325 }, { "epoch": 2.0601461495222035, "grad_norm": 0.6438000798225403, "learning_rate": 0.0001463205611926488, "loss": 0.8323, "step": 7330 }, { "epoch": 2.061551433389545, "grad_norm": 0.5296928286552429, "learning_rate": 0.00014623359121797034, "loss": 0.8391, "step": 7335 }, { "epoch": 2.0629567172568857, "grad_norm": 0.6987408399581909, "learning_rate": 0.00014614657674324864, "loss": 0.8278, "step": 7340 }, { "epoch": 2.064362001124227, "grad_norm": 0.5474652647972107, "learning_rate": 0.00014605951785223552, "loss": 0.84, "step": 7345 }, { "epoch": 2.0657672849915683, "grad_norm": 0.573154866695404, "learning_rate": 0.00014597241462872558, "loss": 0.8188, "step": 7350 }, { "epoch": 2.0671725688589095, "grad_norm": 0.6933932900428772, "learning_rate": 0.00014588526715655608, "loss": 0.8346, "step": 7355 }, { "epoch": 2.068577852726251, "grad_norm": 0.5106696486473083, "learning_rate": 0.00014579807551960683, "loss": 0.8275, "step": 7360 }, { "epoch": 2.069983136593592, "grad_norm": 0.48654860258102417, "learning_rate": 0.0001457108398018002, "loss": 0.8261, "step": 7365 }, { "epoch": 2.071388420460933, "grad_norm": 0.5113856196403503, "learning_rate": 0.00014562356008710094, "loss": 0.8258, "step": 7370 }, { "epoch": 2.0727937043282743, "grad_norm": 0.5285763144493103, "learning_rate": 0.00014553623645951623, "loss": 0.8463, "step": 7375 }, { "epoch": 2.0741989881956155, "grad_norm": 0.6509875059127808, "learning_rate": 0.00014544886900309537, "loss": 0.8306, "step": 7380 }, { "epoch": 2.075604272062957, "grad_norm": 0.5188117623329163, "learning_rate": 0.00014536145780193007, "loss": 0.8144, "step": 7385 }, { "epoch": 2.077009555930298, "grad_norm": 0.5361855626106262, "learning_rate": 0.0001452740029401539, "loss": 0.8233, "step": 7390 }, { "epoch": 2.078414839797639, "grad_norm": 0.4979911148548126, "learning_rate": 0.0001451865045019426, "loss": 0.8183, "step": 7395 }, { "epoch": 2.0798201236649803, "grad_norm": 0.5529736280441284, "learning_rate": 0.00014509896257151384, "loss": 0.8261, "step": 7400 }, { "epoch": 2.0812254075323215, "grad_norm": 0.6499199867248535, "learning_rate": 0.00014501137723312707, "loss": 0.8303, "step": 7405 }, { "epoch": 2.082630691399663, "grad_norm": 0.4609116017818451, "learning_rate": 0.00014492374857108365, "loss": 0.8321, "step": 7410 }, { "epoch": 2.084035975267004, "grad_norm": 0.4897962808609009, "learning_rate": 0.00014483607666972652, "loss": 0.8317, "step": 7415 }, { "epoch": 2.085441259134345, "grad_norm": 0.4982929825782776, "learning_rate": 0.0001447483616134403, "loss": 0.8786, "step": 7420 }, { "epoch": 2.0868465430016863, "grad_norm": 0.5008282661437988, "learning_rate": 0.00014466060348665116, "loss": 0.8353, "step": 7425 }, { "epoch": 2.0882518268690275, "grad_norm": 0.5852164626121521, "learning_rate": 0.00014457280237382665, "loss": 0.834, "step": 7430 }, { "epoch": 2.089657110736369, "grad_norm": 0.6624243855476379, "learning_rate": 0.00014448495835947577, "loss": 0.8287, "step": 7435 }, { "epoch": 2.09106239460371, "grad_norm": 0.583459734916687, "learning_rate": 0.0001443970715281488, "loss": 0.8281, "step": 7440 }, { "epoch": 2.092467678471051, "grad_norm": 0.5138252973556519, "learning_rate": 0.00014430914196443716, "loss": 0.8379, "step": 7445 }, { "epoch": 2.0938729623383923, "grad_norm": 0.5127986669540405, "learning_rate": 0.00014422116975297352, "loss": 0.8355, "step": 7450 }, { "epoch": 2.0952782462057336, "grad_norm": 0.4994426369667053, "learning_rate": 0.00014413315497843152, "loss": 0.8117, "step": 7455 }, { "epoch": 2.096683530073075, "grad_norm": 0.5604568719863892, "learning_rate": 0.00014404509772552579, "loss": 0.8552, "step": 7460 }, { "epoch": 2.098088813940416, "grad_norm": 0.5112202763557434, "learning_rate": 0.00014395699807901181, "loss": 0.8389, "step": 7465 }, { "epoch": 2.099494097807757, "grad_norm": 0.564951479434967, "learning_rate": 0.0001438688561236859, "loss": 0.8199, "step": 7470 }, { "epoch": 2.1008993816750983, "grad_norm": 0.5867136120796204, "learning_rate": 0.00014378067194438513, "loss": 0.8436, "step": 7475 }, { "epoch": 2.1023046655424396, "grad_norm": 0.4982098340988159, "learning_rate": 0.00014369244562598715, "loss": 0.8185, "step": 7480 }, { "epoch": 2.103709949409781, "grad_norm": 0.5959761738777161, "learning_rate": 0.00014360417725341017, "loss": 0.8435, "step": 7485 }, { "epoch": 2.105115233277122, "grad_norm": 0.5060386061668396, "learning_rate": 0.00014351586691161298, "loss": 0.8206, "step": 7490 }, { "epoch": 2.106520517144463, "grad_norm": 0.5006709694862366, "learning_rate": 0.0001434275146855946, "loss": 0.8228, "step": 7495 }, { "epoch": 2.1079258010118043, "grad_norm": 0.5913635492324829, "learning_rate": 0.0001433391206603945, "loss": 0.8263, "step": 7500 }, { "epoch": 2.1093310848791456, "grad_norm": 0.6296173334121704, "learning_rate": 0.00014325068492109235, "loss": 0.8259, "step": 7505 }, { "epoch": 2.110736368746487, "grad_norm": 0.5836814045906067, "learning_rate": 0.0001431622075528079, "loss": 0.8324, "step": 7510 }, { "epoch": 2.112141652613828, "grad_norm": 0.577974259853363, "learning_rate": 0.00014307368864070104, "loss": 0.842, "step": 7515 }, { "epoch": 2.1135469364811694, "grad_norm": 0.501258134841919, "learning_rate": 0.00014298512826997164, "loss": 0.8301, "step": 7520 }, { "epoch": 2.1149522203485103, "grad_norm": 0.5843653678894043, "learning_rate": 0.0001428965265258595, "loss": 0.8339, "step": 7525 }, { "epoch": 2.1163575042158516, "grad_norm": 0.584141194820404, "learning_rate": 0.00014280788349364413, "loss": 0.8257, "step": 7530 }, { "epoch": 2.117762788083193, "grad_norm": 0.6166936159133911, "learning_rate": 0.00014271919925864493, "loss": 0.8396, "step": 7535 }, { "epoch": 2.119168071950534, "grad_norm": 0.5409911870956421, "learning_rate": 0.0001426304739062208, "loss": 0.8152, "step": 7540 }, { "epoch": 2.120573355817875, "grad_norm": 0.48173296451568604, "learning_rate": 0.00014254170752177035, "loss": 0.8239, "step": 7545 }, { "epoch": 2.1219786396852163, "grad_norm": 0.5484293103218079, "learning_rate": 0.00014245290019073166, "loss": 0.8364, "step": 7550 }, { "epoch": 2.1233839235525576, "grad_norm": 0.6085915565490723, "learning_rate": 0.00014236405199858208, "loss": 0.8374, "step": 7555 }, { "epoch": 2.124789207419899, "grad_norm": 0.49279549717903137, "learning_rate": 0.00014227516303083856, "loss": 0.8442, "step": 7560 }, { "epoch": 2.12619449128724, "grad_norm": 0.4717954695224762, "learning_rate": 0.000142186233373057, "loss": 0.8289, "step": 7565 }, { "epoch": 2.1275997751545814, "grad_norm": 0.4982221722602844, "learning_rate": 0.0001420972631108327, "loss": 0.8382, "step": 7570 }, { "epoch": 2.1290050590219223, "grad_norm": 0.472332626581192, "learning_rate": 0.00014200825232979985, "loss": 0.8262, "step": 7575 }, { "epoch": 2.1304103428892636, "grad_norm": 0.6342632174491882, "learning_rate": 0.00014191920111563183, "loss": 0.8333, "step": 7580 }, { "epoch": 2.131815626756605, "grad_norm": 0.5856114029884338, "learning_rate": 0.0001418301095540408, "loss": 0.8315, "step": 7585 }, { "epoch": 2.133220910623946, "grad_norm": 0.505497932434082, "learning_rate": 0.00014174097773077778, "loss": 0.827, "step": 7590 }, { "epoch": 2.1346261944912874, "grad_norm": 0.6035593748092651, "learning_rate": 0.00014165180573163255, "loss": 0.8263, "step": 7595 }, { "epoch": 2.1360314783586283, "grad_norm": 0.5006420612335205, "learning_rate": 0.0001415625936424336, "loss": 0.8215, "step": 7600 }, { "epoch": 2.1374367622259696, "grad_norm": 0.5282971858978271, "learning_rate": 0.000141473341549048, "loss": 0.842, "step": 7605 }, { "epoch": 2.138842046093311, "grad_norm": 0.6301651000976562, "learning_rate": 0.00014138404953738124, "loss": 0.8265, "step": 7610 }, { "epoch": 2.140247329960652, "grad_norm": 0.5510733127593994, "learning_rate": 0.0001412947176933773, "loss": 0.8365, "step": 7615 }, { "epoch": 2.1416526138279934, "grad_norm": 0.5520345568656921, "learning_rate": 0.0001412053461030185, "loss": 0.8347, "step": 7620 }, { "epoch": 2.1430578976953343, "grad_norm": 0.5838735103607178, "learning_rate": 0.00014111593485232541, "loss": 0.8367, "step": 7625 }, { "epoch": 2.1444631815626756, "grad_norm": 0.5474026799201965, "learning_rate": 0.00014102648402735678, "loss": 0.8313, "step": 7630 }, { "epoch": 2.145868465430017, "grad_norm": 0.5024116635322571, "learning_rate": 0.0001409369937142094, "loss": 0.8667, "step": 7635 }, { "epoch": 2.147273749297358, "grad_norm": 0.996107280254364, "learning_rate": 0.00014084746399901818, "loss": 0.8323, "step": 7640 }, { "epoch": 2.1486790331646994, "grad_norm": 0.6177522540092468, "learning_rate": 0.00014075789496795576, "loss": 0.8362, "step": 7645 }, { "epoch": 2.1500843170320403, "grad_norm": 0.500321090221405, "learning_rate": 0.00014066828670723287, "loss": 0.824, "step": 7650 }, { "epoch": 2.1514896008993816, "grad_norm": 0.49639928340911865, "learning_rate": 0.0001405786393030978, "loss": 0.8277, "step": 7655 }, { "epoch": 2.152894884766723, "grad_norm": 0.5836513042449951, "learning_rate": 0.00014048895284183657, "loss": 0.8489, "step": 7660 }, { "epoch": 2.154300168634064, "grad_norm": 0.5478304624557495, "learning_rate": 0.0001403992274097729, "loss": 0.8407, "step": 7665 }, { "epoch": 2.1557054525014054, "grad_norm": 0.5731642842292786, "learning_rate": 0.00014030946309326784, "loss": 0.8414, "step": 7670 }, { "epoch": 2.1571107363687463, "grad_norm": 0.5691959261894226, "learning_rate": 0.00014021965997871994, "loss": 0.8802, "step": 7675 }, { "epoch": 2.1585160202360876, "grad_norm": 0.683435320854187, "learning_rate": 0.00014012981815256524, "loss": 0.829, "step": 7680 }, { "epoch": 2.159921304103429, "grad_norm": 0.5545250773429871, "learning_rate": 0.00014003993770127674, "loss": 0.8352, "step": 7685 }, { "epoch": 2.16132658797077, "grad_norm": 0.5581771731376648, "learning_rate": 0.00013995001871136494, "loss": 0.8342, "step": 7690 }, { "epoch": 2.1627318718381114, "grad_norm": 0.5403844714164734, "learning_rate": 0.00013986006126937716, "loss": 0.8255, "step": 7695 }, { "epoch": 2.1641371557054523, "grad_norm": 0.6160198450088501, "learning_rate": 0.0001397700654618979, "loss": 0.8862, "step": 7700 }, { "epoch": 2.1655424395727936, "grad_norm": 0.5250822901725769, "learning_rate": 0.00013968003137554855, "loss": 0.8255, "step": 7705 }, { "epoch": 2.166947723440135, "grad_norm": 0.49927449226379395, "learning_rate": 0.00013958995909698734, "loss": 0.8179, "step": 7710 }, { "epoch": 2.168353007307476, "grad_norm": 0.5784510374069214, "learning_rate": 0.0001394998487129092, "loss": 0.8234, "step": 7715 }, { "epoch": 2.1697582911748174, "grad_norm": 0.6177597641944885, "learning_rate": 0.0001394097003100458, "loss": 0.8204, "step": 7720 }, { "epoch": 2.1711635750421587, "grad_norm": 0.5223365426063538, "learning_rate": 0.00013931951397516543, "loss": 0.8366, "step": 7725 }, { "epoch": 2.1725688589094996, "grad_norm": 0.5713496208190918, "learning_rate": 0.0001392292897950728, "loss": 0.8324, "step": 7730 }, { "epoch": 2.173974142776841, "grad_norm": 0.4760042726993561, "learning_rate": 0.00013913902785660915, "loss": 0.8312, "step": 7735 }, { "epoch": 2.175379426644182, "grad_norm": 0.5172589421272278, "learning_rate": 0.00013904872824665196, "loss": 0.8131, "step": 7740 }, { "epoch": 2.1767847105115234, "grad_norm": 0.4978179633617401, "learning_rate": 0.00013895839105211504, "loss": 0.8284, "step": 7745 }, { "epoch": 2.1781899943788647, "grad_norm": 0.5030422806739807, "learning_rate": 0.00013886801635994836, "loss": 0.8335, "step": 7750 }, { "epoch": 2.1795952782462056, "grad_norm": 0.48498082160949707, "learning_rate": 0.00013877760425713795, "loss": 0.8212, "step": 7755 }, { "epoch": 2.181000562113547, "grad_norm": 0.5106037855148315, "learning_rate": 0.00013868715483070592, "loss": 0.8278, "step": 7760 }, { "epoch": 2.182405845980888, "grad_norm": 0.6524946689605713, "learning_rate": 0.0001385966681677102, "loss": 0.83, "step": 7765 }, { "epoch": 2.1838111298482294, "grad_norm": 0.5415175557136536, "learning_rate": 0.00013850614435524465, "loss": 0.8262, "step": 7770 }, { "epoch": 2.1852164137155707, "grad_norm": 0.5459450483322144, "learning_rate": 0.00013841558348043885, "loss": 0.8351, "step": 7775 }, { "epoch": 2.1866216975829116, "grad_norm": 0.7794672846794128, "learning_rate": 0.00013832498563045803, "loss": 0.8317, "step": 7780 }, { "epoch": 2.188026981450253, "grad_norm": 0.6935010552406311, "learning_rate": 0.0001382343508925031, "loss": 0.8486, "step": 7785 }, { "epoch": 2.189432265317594, "grad_norm": 0.7011950612068176, "learning_rate": 0.00013814367935381037, "loss": 0.8469, "step": 7790 }, { "epoch": 2.1908375491849355, "grad_norm": 0.49912673234939575, "learning_rate": 0.0001380529711016516, "loss": 0.8299, "step": 7795 }, { "epoch": 2.1922428330522767, "grad_norm": 0.6039685606956482, "learning_rate": 0.00013796222622333387, "loss": 0.8326, "step": 7800 }, { "epoch": 2.1936481169196176, "grad_norm": 0.48364749550819397, "learning_rate": 0.00013787144480619963, "loss": 0.8317, "step": 7805 }, { "epoch": 2.195053400786959, "grad_norm": 0.5642433762550354, "learning_rate": 0.00013778062693762632, "loss": 0.8197, "step": 7810 }, { "epoch": 2.1964586846543, "grad_norm": 0.5070891976356506, "learning_rate": 0.00013768977270502665, "loss": 0.8285, "step": 7815 }, { "epoch": 2.1978639685216415, "grad_norm": 0.6507462859153748, "learning_rate": 0.00013759888219584814, "loss": 0.8448, "step": 7820 }, { "epoch": 2.1992692523889827, "grad_norm": 0.5021531581878662, "learning_rate": 0.00013750795549757335, "loss": 0.8283, "step": 7825 }, { "epoch": 2.2006745362563236, "grad_norm": 0.5254793167114258, "learning_rate": 0.0001374169926977197, "loss": 0.8204, "step": 7830 }, { "epoch": 2.202079820123665, "grad_norm": 0.6183409690856934, "learning_rate": 0.0001373259938838392, "loss": 0.8293, "step": 7835 }, { "epoch": 2.203485103991006, "grad_norm": 0.5064858198165894, "learning_rate": 0.00013723495914351873, "loss": 0.848, "step": 7840 }, { "epoch": 2.2048903878583475, "grad_norm": 0.493991881608963, "learning_rate": 0.00013714388856437957, "loss": 0.8439, "step": 7845 }, { "epoch": 2.2062956717256887, "grad_norm": 0.685554027557373, "learning_rate": 0.0001370527822340776, "loss": 0.8522, "step": 7850 }, { "epoch": 2.2077009555930296, "grad_norm": 0.899387538433075, "learning_rate": 0.00013696164024030305, "loss": 0.8252, "step": 7855 }, { "epoch": 2.209106239460371, "grad_norm": 0.4570039212703705, "learning_rate": 0.00013687046267078055, "loss": 0.8402, "step": 7860 }, { "epoch": 2.210511523327712, "grad_norm": 0.47615373134613037, "learning_rate": 0.00013677924961326892, "loss": 0.8196, "step": 7865 }, { "epoch": 2.2119168071950535, "grad_norm": 0.5366038680076599, "learning_rate": 0.00013668800115556112, "loss": 0.8256, "step": 7870 }, { "epoch": 2.2133220910623947, "grad_norm": 0.49585074186325073, "learning_rate": 0.00013659671738548422, "loss": 0.83, "step": 7875 }, { "epoch": 2.214727374929736, "grad_norm": 0.6336633563041687, "learning_rate": 0.00013650539839089927, "loss": 0.8379, "step": 7880 }, { "epoch": 2.216132658797077, "grad_norm": 0.5373387932777405, "learning_rate": 0.00013641404425970122, "loss": 0.8177, "step": 7885 }, { "epoch": 2.217537942664418, "grad_norm": 0.6974838376045227, "learning_rate": 0.00013632265507981884, "loss": 0.832, "step": 7890 }, { "epoch": 2.2189432265317595, "grad_norm": 0.5168099403381348, "learning_rate": 0.00013623123093921464, "loss": 0.84, "step": 7895 }, { "epoch": 2.2203485103991007, "grad_norm": 0.519497275352478, "learning_rate": 0.00013613977192588475, "loss": 0.8298, "step": 7900 }, { "epoch": 2.2217537942664416, "grad_norm": 0.582837700843811, "learning_rate": 0.00013604827812785893, "loss": 0.8315, "step": 7905 }, { "epoch": 2.223159078133783, "grad_norm": 0.5562664866447449, "learning_rate": 0.00013595674963320036, "loss": 0.8313, "step": 7910 }, { "epoch": 2.224564362001124, "grad_norm": 0.4701671898365021, "learning_rate": 0.0001358651865300056, "loss": 0.834, "step": 7915 }, { "epoch": 2.2259696458684655, "grad_norm": 0.511581540107727, "learning_rate": 0.0001357735889064046, "loss": 0.8407, "step": 7920 }, { "epoch": 2.2273749297358068, "grad_norm": 0.4707708954811096, "learning_rate": 0.0001356819568505605, "loss": 0.8304, "step": 7925 }, { "epoch": 2.228780213603148, "grad_norm": 0.6949105858802795, "learning_rate": 0.0001355902904506695, "loss": 0.825, "step": 7930 }, { "epoch": 2.230185497470489, "grad_norm": 0.6631535291671753, "learning_rate": 0.00013549858979496103, "loss": 0.8237, "step": 7935 }, { "epoch": 2.23159078133783, "grad_norm": 0.6554431915283203, "learning_rate": 0.0001354068549716973, "loss": 0.8445, "step": 7940 }, { "epoch": 2.2329960652051715, "grad_norm": 0.7375605702400208, "learning_rate": 0.0001353150860691736, "loss": 0.8313, "step": 7945 }, { "epoch": 2.2344013490725128, "grad_norm": 0.6232590079307556, "learning_rate": 0.0001352232831757178, "loss": 0.8802, "step": 7950 }, { "epoch": 2.235806632939854, "grad_norm": 0.517485499382019, "learning_rate": 0.0001351314463796907, "loss": 0.8358, "step": 7955 }, { "epoch": 2.237211916807195, "grad_norm": 0.5201471447944641, "learning_rate": 0.0001350395757694856, "loss": 0.8418, "step": 7960 }, { "epoch": 2.238617200674536, "grad_norm": 0.6151214838027954, "learning_rate": 0.00013494767143352839, "loss": 0.8203, "step": 7965 }, { "epoch": 2.2400224845418775, "grad_norm": 0.650915801525116, "learning_rate": 0.00013485573346027737, "loss": 0.8394, "step": 7970 }, { "epoch": 2.2414277684092188, "grad_norm": 0.6588686108589172, "learning_rate": 0.00013476376193822333, "loss": 0.8874, "step": 7975 }, { "epoch": 2.24283305227656, "grad_norm": 0.5741196274757385, "learning_rate": 0.00013467175695588924, "loss": 0.8298, "step": 7980 }, { "epoch": 2.244238336143901, "grad_norm": 0.7275395393371582, "learning_rate": 0.00013457971860183034, "loss": 0.8242, "step": 7985 }, { "epoch": 2.245643620011242, "grad_norm": 0.5300001502037048, "learning_rate": 0.00013448764696463394, "loss": 0.8333, "step": 7990 }, { "epoch": 2.2470489038785835, "grad_norm": 0.7253971099853516, "learning_rate": 0.00013439554213291944, "loss": 0.8215, "step": 7995 }, { "epoch": 2.2484541877459248, "grad_norm": 0.8704935312271118, "learning_rate": 0.00013430340419533812, "loss": 0.8268, "step": 8000 }, { "epoch": 2.249859471613266, "grad_norm": 0.5288625955581665, "learning_rate": 0.0001342112332405732, "loss": 0.8351, "step": 8005 }, { "epoch": 2.251264755480607, "grad_norm": 0.6613404154777527, "learning_rate": 0.00013411902935733967, "loss": 0.8311, "step": 8010 }, { "epoch": 2.252670039347948, "grad_norm": 0.5405227541923523, "learning_rate": 0.00013402679263438416, "loss": 0.825, "step": 8015 }, { "epoch": 2.2540753232152895, "grad_norm": 0.6060507893562317, "learning_rate": 0.00013393452316048493, "loss": 0.8174, "step": 8020 }, { "epoch": 2.2554806070826308, "grad_norm": 0.7184122204780579, "learning_rate": 0.00013384222102445174, "loss": 0.8349, "step": 8025 }, { "epoch": 2.256885890949972, "grad_norm": 0.6472933292388916, "learning_rate": 0.0001337498863151259, "loss": 0.8354, "step": 8030 }, { "epoch": 2.2582911748173133, "grad_norm": 0.5747955441474915, "learning_rate": 0.00013365751912137997, "loss": 0.8286, "step": 8035 }, { "epoch": 2.259696458684654, "grad_norm": 0.5253928899765015, "learning_rate": 0.00013356511953211772, "loss": 0.8343, "step": 8040 }, { "epoch": 2.2611017425519955, "grad_norm": 0.6021010279655457, "learning_rate": 0.00013347268763627427, "loss": 0.8414, "step": 8045 }, { "epoch": 2.2625070264193368, "grad_norm": 0.5077487826347351, "learning_rate": 0.0001333802235228157, "loss": 0.849, "step": 8050 }, { "epoch": 2.263912310286678, "grad_norm": 0.6178883910179138, "learning_rate": 0.00013328772728073913, "loss": 0.8167, "step": 8055 }, { "epoch": 2.265317594154019, "grad_norm": 0.5458176732063293, "learning_rate": 0.00013319519899907266, "loss": 0.8406, "step": 8060 }, { "epoch": 2.26672287802136, "grad_norm": 0.5361412763595581, "learning_rate": 0.00013310263876687516, "loss": 0.8466, "step": 8065 }, { "epoch": 2.2681281618887015, "grad_norm": 0.5330038666725159, "learning_rate": 0.0001330100466732363, "loss": 0.8307, "step": 8070 }, { "epoch": 2.2695334457560428, "grad_norm": 0.4969392418861389, "learning_rate": 0.00013291742280727633, "loss": 0.8324, "step": 8075 }, { "epoch": 2.270938729623384, "grad_norm": 0.5226914286613464, "learning_rate": 0.00013282476725814618, "loss": 0.8258, "step": 8080 }, { "epoch": 2.2723440134907253, "grad_norm": 0.6179579496383667, "learning_rate": 0.00013273208011502729, "loss": 0.8349, "step": 8085 }, { "epoch": 2.273749297358066, "grad_norm": 0.5634477138519287, "learning_rate": 0.0001326393614671314, "loss": 0.844, "step": 8090 }, { "epoch": 2.2751545812254075, "grad_norm": 0.6256124973297119, "learning_rate": 0.00013254661140370063, "loss": 0.8242, "step": 8095 }, { "epoch": 2.2765598650927488, "grad_norm": 0.7047878503799438, "learning_rate": 0.00013245383001400738, "loss": 0.8308, "step": 8100 }, { "epoch": 2.27796514896009, "grad_norm": 0.6778944134712219, "learning_rate": 0.00013236101738735415, "loss": 0.8253, "step": 8105 }, { "epoch": 2.279370432827431, "grad_norm": 0.5102590918540955, "learning_rate": 0.0001322681736130735, "loss": 0.8302, "step": 8110 }, { "epoch": 2.280775716694772, "grad_norm": 0.6197868585586548, "learning_rate": 0.000132175298780528, "loss": 0.8562, "step": 8115 }, { "epoch": 2.2821810005621135, "grad_norm": 0.500946044921875, "learning_rate": 0.0001320823929791101, "loss": 0.831, "step": 8120 }, { "epoch": 2.2835862844294548, "grad_norm": 0.9998027086257935, "learning_rate": 0.00013198945629824206, "loss": 0.8666, "step": 8125 }, { "epoch": 2.284991568296796, "grad_norm": 0.4757716953754425, "learning_rate": 0.00013189648882737587, "loss": 0.8212, "step": 8130 }, { "epoch": 2.2863968521641373, "grad_norm": 0.5170575380325317, "learning_rate": 0.0001318034906559931, "loss": 0.8198, "step": 8135 }, { "epoch": 2.287802136031478, "grad_norm": 0.5440870523452759, "learning_rate": 0.000131710461873605, "loss": 0.8358, "step": 8140 }, { "epoch": 2.2892074198988195, "grad_norm": 0.5350499749183655, "learning_rate": 0.00013161740256975213, "loss": 0.8351, "step": 8145 }, { "epoch": 2.2906127037661608, "grad_norm": 0.563395082950592, "learning_rate": 0.00013152431283400455, "loss": 0.8113, "step": 8150 }, { "epoch": 2.292017987633502, "grad_norm": 0.528664231300354, "learning_rate": 0.00013143119275596147, "loss": 0.8172, "step": 8155 }, { "epoch": 2.2934232715008434, "grad_norm": 0.5736309885978699, "learning_rate": 0.00013133804242525149, "loss": 0.8333, "step": 8160 }, { "epoch": 2.294828555368184, "grad_norm": 0.5612114071846008, "learning_rate": 0.00013124486193153215, "loss": 0.8242, "step": 8165 }, { "epoch": 2.2962338392355255, "grad_norm": 0.5501384735107422, "learning_rate": 0.00013115165136449018, "loss": 0.8321, "step": 8170 }, { "epoch": 2.2976391231028668, "grad_norm": 0.4476981461048126, "learning_rate": 0.00013105841081384112, "loss": 0.8179, "step": 8175 }, { "epoch": 2.299044406970208, "grad_norm": 0.5093850493431091, "learning_rate": 0.00013096514036932942, "loss": 0.8278, "step": 8180 }, { "epoch": 2.3004496908375494, "grad_norm": 0.5036539435386658, "learning_rate": 0.00013087184012072834, "loss": 0.8367, "step": 8185 }, { "epoch": 2.30185497470489, "grad_norm": 0.48888805508613586, "learning_rate": 0.00013077851015783981, "loss": 0.8652, "step": 8190 }, { "epoch": 2.3032602585722315, "grad_norm": 0.5186904668807983, "learning_rate": 0.00013068515057049432, "loss": 0.8339, "step": 8195 }, { "epoch": 2.304665542439573, "grad_norm": 0.524110734462738, "learning_rate": 0.0001305917614485509, "loss": 0.8292, "step": 8200 }, { "epoch": 2.306070826306914, "grad_norm": 0.6958850622177124, "learning_rate": 0.00013049834288189703, "loss": 0.8248, "step": 8205 }, { "epoch": 2.3074761101742554, "grad_norm": 0.6316078305244446, "learning_rate": 0.00013040489496044848, "loss": 0.8177, "step": 8210 }, { "epoch": 2.308881394041596, "grad_norm": 0.590225100517273, "learning_rate": 0.0001303114177741493, "loss": 0.8442, "step": 8215 }, { "epoch": 2.3102866779089375, "grad_norm": 0.6090141534805298, "learning_rate": 0.00013021791141297175, "loss": 0.8388, "step": 8220 }, { "epoch": 2.311691961776279, "grad_norm": 0.4995424449443817, "learning_rate": 0.0001301243759669161, "loss": 0.8403, "step": 8225 }, { "epoch": 2.31309724564362, "grad_norm": 0.5057527422904968, "learning_rate": 0.00013003081152601062, "loss": 0.8797, "step": 8230 }, { "epoch": 2.3145025295109614, "grad_norm": 0.5079846382141113, "learning_rate": 0.00012993721818031157, "loss": 0.823, "step": 8235 }, { "epoch": 2.3159078133783026, "grad_norm": 0.510495126247406, "learning_rate": 0.00012984359601990293, "loss": 0.8365, "step": 8240 }, { "epoch": 2.3173130972456435, "grad_norm": 0.5368478298187256, "learning_rate": 0.0001297499451348965, "loss": 0.8303, "step": 8245 }, { "epoch": 2.318718381112985, "grad_norm": 0.7168470621109009, "learning_rate": 0.00012965626561543163, "loss": 0.8357, "step": 8250 }, { "epoch": 2.320123664980326, "grad_norm": 0.5443083047866821, "learning_rate": 0.00012956255755167532, "loss": 0.8393, "step": 8255 }, { "epoch": 2.3215289488476674, "grad_norm": 0.5011507272720337, "learning_rate": 0.000129468821033822, "loss": 0.8418, "step": 8260 }, { "epoch": 2.322934232715008, "grad_norm": 0.4719626307487488, "learning_rate": 0.00012937505615209354, "loss": 0.841, "step": 8265 }, { "epoch": 2.3243395165823495, "grad_norm": 0.5777681469917297, "learning_rate": 0.00012928126299673902, "loss": 0.8114, "step": 8270 }, { "epoch": 2.325744800449691, "grad_norm": 0.573501467704773, "learning_rate": 0.00012918744165803478, "loss": 0.8154, "step": 8275 }, { "epoch": 2.327150084317032, "grad_norm": 0.5786929130554199, "learning_rate": 0.0001290935922262843, "loss": 0.8295, "step": 8280 }, { "epoch": 2.3285553681843734, "grad_norm": 0.4540880620479584, "learning_rate": 0.0001289997147918181, "loss": 0.8205, "step": 8285 }, { "epoch": 2.3299606520517147, "grad_norm": 0.6141250133514404, "learning_rate": 0.00012890580944499363, "loss": 0.8146, "step": 8290 }, { "epoch": 2.3313659359190555, "grad_norm": 0.49410197138786316, "learning_rate": 0.0001288118762761952, "loss": 0.8392, "step": 8295 }, { "epoch": 2.332771219786397, "grad_norm": 0.568728506565094, "learning_rate": 0.00012871791537583398, "loss": 0.8272, "step": 8300 }, { "epoch": 2.334176503653738, "grad_norm": 0.5576150417327881, "learning_rate": 0.00012862392683434765, "loss": 0.833, "step": 8305 }, { "epoch": 2.3355817875210794, "grad_norm": 0.5507808327674866, "learning_rate": 0.0001285299107422007, "loss": 0.8801, "step": 8310 }, { "epoch": 2.33698707138842, "grad_norm": 0.4993041455745697, "learning_rate": 0.00012843586718988407, "loss": 0.835, "step": 8315 }, { "epoch": 2.3383923552557615, "grad_norm": 0.6031385064125061, "learning_rate": 0.00012834179626791504, "loss": 0.8278, "step": 8320 }, { "epoch": 2.339797639123103, "grad_norm": 0.5530450344085693, "learning_rate": 0.00012824769806683736, "loss": 0.8371, "step": 8325 }, { "epoch": 2.341202922990444, "grad_norm": 0.5847424268722534, "learning_rate": 0.0001281535726772209, "loss": 0.8263, "step": 8330 }, { "epoch": 2.3426082068577854, "grad_norm": 0.6169049739837646, "learning_rate": 0.00012805942018966185, "loss": 0.858, "step": 8335 }, { "epoch": 2.3440134907251267, "grad_norm": 0.5686418414115906, "learning_rate": 0.00012796524069478242, "loss": 0.819, "step": 8340 }, { "epoch": 2.3454187745924675, "grad_norm": 0.4670698940753937, "learning_rate": 0.00012787103428323074, "loss": 0.8771, "step": 8345 }, { "epoch": 2.346824058459809, "grad_norm": 0.6976507306098938, "learning_rate": 0.00012777680104568098, "loss": 0.8355, "step": 8350 }, { "epoch": 2.34822934232715, "grad_norm": 1.8245209455490112, "learning_rate": 0.000127682541072833, "loss": 0.884, "step": 8355 }, { "epoch": 2.3496346261944914, "grad_norm": 0.469552218914032, "learning_rate": 0.00012758825445541248, "loss": 0.8433, "step": 8360 }, { "epoch": 2.3510399100618327, "grad_norm": 0.5654560327529907, "learning_rate": 0.00012749394128417073, "loss": 0.8521, "step": 8365 }, { "epoch": 2.3524451939291735, "grad_norm": 0.5556405186653137, "learning_rate": 0.00012739960164988463, "loss": 0.8344, "step": 8370 }, { "epoch": 2.353850477796515, "grad_norm": 0.47403526306152344, "learning_rate": 0.00012730523564335645, "loss": 0.8248, "step": 8375 }, { "epoch": 2.355255761663856, "grad_norm": 0.6413717269897461, "learning_rate": 0.000127210843355414, "loss": 0.8386, "step": 8380 }, { "epoch": 2.3566610455311974, "grad_norm": 0.5786089301109314, "learning_rate": 0.0001271164248769102, "loss": 0.8139, "step": 8385 }, { "epoch": 2.3580663293985387, "grad_norm": 0.813075602054596, "learning_rate": 0.00012702198029872325, "loss": 0.8247, "step": 8390 }, { "epoch": 2.35947161326588, "grad_norm": 0.5730018019676208, "learning_rate": 0.0001269275097117566, "loss": 0.8141, "step": 8395 }, { "epoch": 2.360876897133221, "grad_norm": 0.6490610837936401, "learning_rate": 0.0001268330132069385, "loss": 0.8265, "step": 8400 }, { "epoch": 2.362282181000562, "grad_norm": 0.6267738938331604, "learning_rate": 0.00012673849087522238, "loss": 0.8281, "step": 8405 }, { "epoch": 2.3636874648679034, "grad_norm": 0.49770984053611755, "learning_rate": 0.0001266439428075863, "loss": 0.8287, "step": 8410 }, { "epoch": 2.3650927487352447, "grad_norm": 0.49454283714294434, "learning_rate": 0.00012654936909503334, "loss": 0.8916, "step": 8415 }, { "epoch": 2.3664980326025855, "grad_norm": 0.5877101421356201, "learning_rate": 0.00012645476982859103, "loss": 0.8296, "step": 8420 }, { "epoch": 2.367903316469927, "grad_norm": 0.49286267161369324, "learning_rate": 0.00012636014509931164, "loss": 0.8411, "step": 8425 }, { "epoch": 2.369308600337268, "grad_norm": 0.5770230889320374, "learning_rate": 0.00012626549499827192, "loss": 0.8398, "step": 8430 }, { "epoch": 2.3707138842046094, "grad_norm": 0.7743233442306519, "learning_rate": 0.00012617081961657293, "loss": 0.8281, "step": 8435 }, { "epoch": 2.3721191680719507, "grad_norm": 2.1253602504730225, "learning_rate": 0.0001260761190453403, "loss": 0.8958, "step": 8440 }, { "epoch": 2.373524451939292, "grad_norm": 0.5004022121429443, "learning_rate": 0.00012598139337572362, "loss": 0.8387, "step": 8445 }, { "epoch": 2.374929735806633, "grad_norm": 0.5295646786689758, "learning_rate": 0.00012588664269889694, "loss": 0.8281, "step": 8450 }, { "epoch": 2.376335019673974, "grad_norm": 0.5158722400665283, "learning_rate": 0.00012579186710605811, "loss": 0.8308, "step": 8455 }, { "epoch": 2.3777403035413154, "grad_norm": 0.6024075746536255, "learning_rate": 0.00012569706668842906, "loss": 0.8334, "step": 8460 }, { "epoch": 2.3791455874086567, "grad_norm": 0.5735334753990173, "learning_rate": 0.00012560224153725566, "loss": 0.8347, "step": 8465 }, { "epoch": 2.3805508712759975, "grad_norm": 0.641499400138855, "learning_rate": 0.00012550739174380758, "loss": 0.845, "step": 8470 }, { "epoch": 2.381956155143339, "grad_norm": 0.49807512760162354, "learning_rate": 0.00012541251739937813, "loss": 0.8269, "step": 8475 }, { "epoch": 2.38336143901068, "grad_norm": 0.48801207542419434, "learning_rate": 0.00012531761859528435, "loss": 0.8376, "step": 8480 }, { "epoch": 2.3847667228780214, "grad_norm": 0.6413435339927673, "learning_rate": 0.00012522269542286673, "loss": 0.8223, "step": 8485 }, { "epoch": 2.3861720067453627, "grad_norm": 0.49994173645973206, "learning_rate": 0.0001251277479734893, "loss": 0.8438, "step": 8490 }, { "epoch": 2.387577290612704, "grad_norm": 0.5298976302146912, "learning_rate": 0.00012503277633853937, "loss": 0.8348, "step": 8495 }, { "epoch": 2.388982574480045, "grad_norm": 0.5225727558135986, "learning_rate": 0.00012493778060942762, "loss": 0.8215, "step": 8500 }, { "epoch": 2.390387858347386, "grad_norm": 0.760673999786377, "learning_rate": 0.0001248427608775878, "loss": 0.8427, "step": 8505 }, { "epoch": 2.3917931422147274, "grad_norm": 0.8451589345932007, "learning_rate": 0.00012474771723447695, "loss": 0.8329, "step": 8510 }, { "epoch": 2.3931984260820687, "grad_norm": 0.5589327812194824, "learning_rate": 0.0001246526497715749, "loss": 0.8293, "step": 8515 }, { "epoch": 2.39460370994941, "grad_norm": 0.5081980228424072, "learning_rate": 0.0001245575585803846, "loss": 0.8299, "step": 8520 }, { "epoch": 2.396008993816751, "grad_norm": 0.46641793847084045, "learning_rate": 0.00012446244375243173, "loss": 0.8253, "step": 8525 }, { "epoch": 2.397414277684092, "grad_norm": 0.4882664680480957, "learning_rate": 0.00012436730537926472, "loss": 0.862, "step": 8530 }, { "epoch": 2.3988195615514334, "grad_norm": 0.5426138043403625, "learning_rate": 0.00012427214355245468, "loss": 0.8386, "step": 8535 }, { "epoch": 2.4002248454187747, "grad_norm": 0.5505515933036804, "learning_rate": 0.00012417695836359536, "loss": 0.8249, "step": 8540 }, { "epoch": 2.401630129286116, "grad_norm": 0.5572278499603271, "learning_rate": 0.00012408174990430291, "loss": 0.8337, "step": 8545 }, { "epoch": 2.403035413153457, "grad_norm": 0.531121551990509, "learning_rate": 0.00012398651826621591, "loss": 0.8207, "step": 8550 }, { "epoch": 2.404440697020798, "grad_norm": 0.5037003755569458, "learning_rate": 0.00012389126354099528, "loss": 0.8337, "step": 8555 }, { "epoch": 2.4058459808881394, "grad_norm": 0.4792149066925049, "learning_rate": 0.00012379598582032409, "loss": 0.8284, "step": 8560 }, { "epoch": 2.4072512647554807, "grad_norm": 0.714592456817627, "learning_rate": 0.00012370068519590755, "loss": 0.833, "step": 8565 }, { "epoch": 2.408656548622822, "grad_norm": 0.53053879737854, "learning_rate": 0.00012360536175947307, "loss": 0.8248, "step": 8570 }, { "epoch": 2.410061832490163, "grad_norm": 0.49217337369918823, "learning_rate": 0.00012351001560276974, "loss": 0.8436, "step": 8575 }, { "epoch": 2.411467116357504, "grad_norm": 0.505234956741333, "learning_rate": 0.0001234146468175688, "loss": 0.828, "step": 8580 }, { "epoch": 2.4128724002248454, "grad_norm": 0.5867990851402283, "learning_rate": 0.00012331925549566303, "loss": 0.8234, "step": 8585 }, { "epoch": 2.4142776840921867, "grad_norm": 0.49216267466545105, "learning_rate": 0.0001232238417288671, "loss": 0.8372, "step": 8590 }, { "epoch": 2.415682967959528, "grad_norm": 0.48326772451400757, "learning_rate": 0.00012312840560901715, "loss": 0.8278, "step": 8595 }, { "epoch": 2.4170882518268693, "grad_norm": 0.5994096398353577, "learning_rate": 0.00012303294722797094, "loss": 0.8541, "step": 8600 }, { "epoch": 2.41849353569421, "grad_norm": 0.7651786208152771, "learning_rate": 0.00012293746667760757, "loss": 0.843, "step": 8605 }, { "epoch": 2.4198988195615514, "grad_norm": 0.5067122578620911, "learning_rate": 0.00012284196404982746, "loss": 0.8182, "step": 8610 }, { "epoch": 2.4213041034288927, "grad_norm": 0.6482433676719666, "learning_rate": 0.00012274643943655238, "loss": 0.8396, "step": 8615 }, { "epoch": 2.422709387296234, "grad_norm": 0.5103670358657837, "learning_rate": 0.00012265089292972517, "loss": 0.8315, "step": 8620 }, { "epoch": 2.424114671163575, "grad_norm": 0.4920249283313751, "learning_rate": 0.00012255532462130984, "loss": 0.8371, "step": 8625 }, { "epoch": 2.425519955030916, "grad_norm": 0.5247959494590759, "learning_rate": 0.00012245973460329123, "loss": 0.8413, "step": 8630 }, { "epoch": 2.4269252388982574, "grad_norm": 0.5151497721672058, "learning_rate": 0.0001223641229676753, "loss": 0.8212, "step": 8635 }, { "epoch": 2.4283305227655987, "grad_norm": 0.4917905032634735, "learning_rate": 0.00012226848980648856, "loss": 0.8403, "step": 8640 }, { "epoch": 2.42973580663294, "grad_norm": 0.46560433506965637, "learning_rate": 0.00012217283521177844, "loss": 0.8267, "step": 8645 }, { "epoch": 2.4311410905002813, "grad_norm": 0.47513896226882935, "learning_rate": 0.00012207715927561288, "loss": 0.83, "step": 8650 }, { "epoch": 2.432546374367622, "grad_norm": 0.7692919373512268, "learning_rate": 0.00012198146209008046, "loss": 0.814, "step": 8655 }, { "epoch": 2.4339516582349634, "grad_norm": 0.5252566337585449, "learning_rate": 0.00012188574374729014, "loss": 0.8461, "step": 8660 }, { "epoch": 2.4353569421023047, "grad_norm": 0.5163432359695435, "learning_rate": 0.0001217900043393712, "loss": 0.8415, "step": 8665 }, { "epoch": 2.436762225969646, "grad_norm": 0.4713330566883087, "learning_rate": 0.00012169424395847335, "loss": 0.8203, "step": 8670 }, { "epoch": 2.438167509836987, "grad_norm": 0.5539212226867676, "learning_rate": 0.00012159846269676638, "loss": 0.8269, "step": 8675 }, { "epoch": 2.439572793704328, "grad_norm": 0.5295586585998535, "learning_rate": 0.00012150266064644013, "loss": 0.8373, "step": 8680 }, { "epoch": 2.4409780775716694, "grad_norm": 0.6764742136001587, "learning_rate": 0.0001214068378997046, "loss": 0.8362, "step": 8685 }, { "epoch": 2.4423833614390107, "grad_norm": 0.5514596700668335, "learning_rate": 0.00012131099454878952, "loss": 0.819, "step": 8690 }, { "epoch": 2.443788645306352, "grad_norm": 0.5967371463775635, "learning_rate": 0.00012121513068594463, "loss": 0.8281, "step": 8695 }, { "epoch": 2.4451939291736933, "grad_norm": 0.5824768543243408, "learning_rate": 0.0001211192464034393, "loss": 0.8334, "step": 8700 }, { "epoch": 2.446599213041034, "grad_norm": 0.5651732087135315, "learning_rate": 0.00012102334179356265, "loss": 0.8314, "step": 8705 }, { "epoch": 2.4480044969083754, "grad_norm": 0.4605449438095093, "learning_rate": 0.00012092741694862324, "loss": 0.8284, "step": 8710 }, { "epoch": 2.4494097807757167, "grad_norm": 0.6510748267173767, "learning_rate": 0.00012083147196094917, "loss": 0.8355, "step": 8715 }, { "epoch": 2.450815064643058, "grad_norm": 0.5636357069015503, "learning_rate": 0.000120735506922888, "loss": 0.8297, "step": 8720 }, { "epoch": 2.4522203485103993, "grad_norm": 0.5459914207458496, "learning_rate": 0.00012063952192680643, "loss": 0.8466, "step": 8725 }, { "epoch": 2.45362563237774, "grad_norm": 0.5210451483726501, "learning_rate": 0.00012054351706509055, "loss": 0.8223, "step": 8730 }, { "epoch": 2.4550309162450814, "grad_norm": 0.4928796589374542, "learning_rate": 0.00012044749243014539, "loss": 0.8372, "step": 8735 }, { "epoch": 2.4564362001124227, "grad_norm": 0.47832977771759033, "learning_rate": 0.00012035144811439516, "loss": 0.8335, "step": 8740 }, { "epoch": 2.457841483979764, "grad_norm": 0.46908292174339294, "learning_rate": 0.00012025538421028293, "loss": 0.835, "step": 8745 }, { "epoch": 2.4592467678471053, "grad_norm": 0.4670230448246002, "learning_rate": 0.00012015930081027065, "loss": 0.8326, "step": 8750 }, { "epoch": 2.4606520517144466, "grad_norm": 0.47103649377822876, "learning_rate": 0.00012006319800683905, "loss": 0.8179, "step": 8755 }, { "epoch": 2.4620573355817874, "grad_norm": 0.46885553002357483, "learning_rate": 0.0001199670758924875, "loss": 0.8179, "step": 8760 }, { "epoch": 2.4634626194491287, "grad_norm": 0.5258293151855469, "learning_rate": 0.00011987093455973397, "loss": 0.8278, "step": 8765 }, { "epoch": 2.46486790331647, "grad_norm": 0.47522857785224915, "learning_rate": 0.00011977477410111492, "loss": 0.8351, "step": 8770 }, { "epoch": 2.4662731871838113, "grad_norm": 0.8046366572380066, "learning_rate": 0.00011967859460918531, "loss": 0.8654, "step": 8775 }, { "epoch": 2.467678471051152, "grad_norm": 0.7628329396247864, "learning_rate": 0.00011958239617651825, "loss": 0.8776, "step": 8780 }, { "epoch": 2.4690837549184934, "grad_norm": 0.6719951629638672, "learning_rate": 0.00011948617889570522, "loss": 0.831, "step": 8785 }, { "epoch": 2.4704890387858347, "grad_norm": 0.4928390383720398, "learning_rate": 0.00011938994285935577, "loss": 0.8218, "step": 8790 }, { "epoch": 2.471894322653176, "grad_norm": 0.6428622603416443, "learning_rate": 0.00011929368816009752, "loss": 0.8246, "step": 8795 }, { "epoch": 2.4732996065205173, "grad_norm": 0.49304255843162537, "learning_rate": 0.00011919741489057612, "loss": 0.84, "step": 8800 }, { "epoch": 2.4747048903878586, "grad_norm": 0.4877597987651825, "learning_rate": 0.00011910112314345494, "loss": 0.8731, "step": 8805 }, { "epoch": 2.4761101742551994, "grad_norm": 0.5115801692008972, "learning_rate": 0.00011900481301141531, "loss": 0.8181, "step": 8810 }, { "epoch": 2.4775154581225407, "grad_norm": 0.5835259556770325, "learning_rate": 0.00011890848458715611, "loss": 0.8265, "step": 8815 }, { "epoch": 2.478920741989882, "grad_norm": 0.5181130170822144, "learning_rate": 0.00011881213796339392, "loss": 0.8356, "step": 8820 }, { "epoch": 2.4803260258572233, "grad_norm": 0.7785965800285339, "learning_rate": 0.00011871577323286285, "loss": 0.8406, "step": 8825 }, { "epoch": 2.481731309724564, "grad_norm": 0.5177497863769531, "learning_rate": 0.0001186193904883143, "loss": 0.8282, "step": 8830 }, { "epoch": 2.4831365935919054, "grad_norm": 0.5036835074424744, "learning_rate": 0.00011852298982251718, "loss": 0.8379, "step": 8835 }, { "epoch": 2.4845418774592467, "grad_norm": 0.5061383247375488, "learning_rate": 0.00011842657132825752, "loss": 0.8269, "step": 8840 }, { "epoch": 2.485947161326588, "grad_norm": 0.543797492980957, "learning_rate": 0.00011833013509833859, "loss": 0.8327, "step": 8845 }, { "epoch": 2.4873524451939293, "grad_norm": 0.48197653889656067, "learning_rate": 0.0001182336812255807, "loss": 0.8245, "step": 8850 }, { "epoch": 2.4887577290612706, "grad_norm": 0.7762760519981384, "learning_rate": 0.00011813720980282115, "loss": 0.8118, "step": 8855 }, { "epoch": 2.4901630129286114, "grad_norm": 0.5974884033203125, "learning_rate": 0.00011804072092291414, "loss": 0.8398, "step": 8860 }, { "epoch": 2.4915682967959527, "grad_norm": 0.5197319984436035, "learning_rate": 0.0001179442146787306, "loss": 0.8331, "step": 8865 }, { "epoch": 2.492973580663294, "grad_norm": 0.5598850846290588, "learning_rate": 0.00011784769116315825, "loss": 0.8247, "step": 8870 }, { "epoch": 2.4943788645306353, "grad_norm": 0.5612096786499023, "learning_rate": 0.00011775115046910148, "loss": 0.8329, "step": 8875 }, { "epoch": 2.4957841483979766, "grad_norm": 0.48368990421295166, "learning_rate": 0.00011765459268948111, "loss": 0.8363, "step": 8880 }, { "epoch": 2.4971894322653174, "grad_norm": 0.4993809163570404, "learning_rate": 0.00011755801791723442, "loss": 0.8422, "step": 8885 }, { "epoch": 2.4985947161326587, "grad_norm": 0.6153314113616943, "learning_rate": 0.00011746142624531509, "loss": 0.8329, "step": 8890 }, { "epoch": 2.5, "grad_norm": 0.6654748320579529, "learning_rate": 0.00011736481776669306, "loss": 0.8187, "step": 8895 }, { "epoch": 2.5014052838673413, "grad_norm": 0.9411234855651855, "learning_rate": 0.0001172681925743544, "loss": 0.841, "step": 8900 }, { "epoch": 2.5028105677346826, "grad_norm": 0.5395568013191223, "learning_rate": 0.00011717155076130133, "loss": 0.8348, "step": 8905 }, { "epoch": 2.504215851602024, "grad_norm": 0.6058316826820374, "learning_rate": 0.00011707489242055203, "loss": 0.8215, "step": 8910 }, { "epoch": 2.5056211354693647, "grad_norm": 0.5065362453460693, "learning_rate": 0.0001169782176451406, "loss": 0.8298, "step": 8915 }, { "epoch": 2.507026419336706, "grad_norm": 0.5494495034217834, "learning_rate": 0.00011688152652811692, "loss": 0.8375, "step": 8920 }, { "epoch": 2.5084317032040473, "grad_norm": 0.648129940032959, "learning_rate": 0.00011678481916254669, "loss": 0.8225, "step": 8925 }, { "epoch": 2.5098369870713886, "grad_norm": 0.5400100946426392, "learning_rate": 0.00011668809564151117, "loss": 0.8286, "step": 8930 }, { "epoch": 2.5112422709387294, "grad_norm": 0.5004889369010925, "learning_rate": 0.00011659135605810716, "loss": 0.8301, "step": 8935 }, { "epoch": 2.5126475548060707, "grad_norm": 0.4907006621360779, "learning_rate": 0.00011649460050544698, "loss": 0.8297, "step": 8940 }, { "epoch": 2.514052838673412, "grad_norm": 0.4957001507282257, "learning_rate": 0.00011639782907665828, "loss": 0.822, "step": 8945 }, { "epoch": 2.5154581225407533, "grad_norm": 0.555509090423584, "learning_rate": 0.00011630104186488405, "loss": 0.8272, "step": 8950 }, { "epoch": 2.5168634064080946, "grad_norm": 0.4936651587486267, "learning_rate": 0.00011620423896328234, "loss": 0.8339, "step": 8955 }, { "epoch": 2.518268690275436, "grad_norm": 0.515689492225647, "learning_rate": 0.00011610742046502648, "loss": 0.8274, "step": 8960 }, { "epoch": 2.5196739741427767, "grad_norm": 0.48193639516830444, "learning_rate": 0.00011601058646330463, "loss": 0.8298, "step": 8965 }, { "epoch": 2.521079258010118, "grad_norm": 0.5371465682983398, "learning_rate": 0.00011591373705132, "loss": 0.8789, "step": 8970 }, { "epoch": 2.5224845418774593, "grad_norm": 0.5975221395492554, "learning_rate": 0.00011581687232229062, "loss": 0.8154, "step": 8975 }, { "epoch": 2.5238898257448006, "grad_norm": 0.5806734561920166, "learning_rate": 0.0001157199923694492, "loss": 0.838, "step": 8980 }, { "epoch": 2.5252951096121414, "grad_norm": 0.5201011300086975, "learning_rate": 0.00011562309728604314, "loss": 0.8465, "step": 8985 }, { "epoch": 2.5267003934794827, "grad_norm": 0.45833903551101685, "learning_rate": 0.00011552618716533441, "loss": 0.8104, "step": 8990 }, { "epoch": 2.528105677346824, "grad_norm": 0.48683154582977295, "learning_rate": 0.00011542926210059944, "loss": 0.8418, "step": 8995 }, { "epoch": 2.5295109612141653, "grad_norm": 0.49923381209373474, "learning_rate": 0.00011533232218512904, "loss": 0.8247, "step": 9000 }, { "epoch": 2.5309162450815066, "grad_norm": 0.4724583625793457, "learning_rate": 0.00011523536751222836, "loss": 0.8475, "step": 9005 }, { "epoch": 2.532321528948848, "grad_norm": 0.4973272681236267, "learning_rate": 0.00011513839817521668, "loss": 0.8373, "step": 9010 }, { "epoch": 2.5337268128161887, "grad_norm": 0.5687633752822876, "learning_rate": 0.00011504141426742742, "loss": 0.8362, "step": 9015 }, { "epoch": 2.53513209668353, "grad_norm": 0.5235709547996521, "learning_rate": 0.00011494441588220808, "loss": 0.8448, "step": 9020 }, { "epoch": 2.5365373805508713, "grad_norm": 0.5844621062278748, "learning_rate": 0.00011484740311292002, "loss": 0.8388, "step": 9025 }, { "epoch": 2.5379426644182126, "grad_norm": 0.5028629302978516, "learning_rate": 0.0001147503760529385, "loss": 0.8249, "step": 9030 }, { "epoch": 2.5393479482855534, "grad_norm": 0.7003517746925354, "learning_rate": 0.00011465333479565248, "loss": 0.8491, "step": 9035 }, { "epoch": 2.5407532321528947, "grad_norm": 0.5840672254562378, "learning_rate": 0.00011455627943446461, "loss": 0.8363, "step": 9040 }, { "epoch": 2.542158516020236, "grad_norm": 0.6555740237236023, "learning_rate": 0.00011445921006279115, "loss": 0.8065, "step": 9045 }, { "epoch": 2.5435637998875773, "grad_norm": 0.5312934517860413, "learning_rate": 0.00011436212677406178, "loss": 0.8258, "step": 9050 }, { "epoch": 2.5449690837549186, "grad_norm": 0.49611037969589233, "learning_rate": 0.00011426502966171966, "loss": 0.8276, "step": 9055 }, { "epoch": 2.54637436762226, "grad_norm": 0.4570031464099884, "learning_rate": 0.00011416791881922115, "loss": 0.8228, "step": 9060 }, { "epoch": 2.547779651489601, "grad_norm": 0.5568541884422302, "learning_rate": 0.00011407079434003591, "loss": 0.8441, "step": 9065 }, { "epoch": 2.549184935356942, "grad_norm": 0.5997769832611084, "learning_rate": 0.00011397365631764669, "loss": 0.8447, "step": 9070 }, { "epoch": 2.5505902192242833, "grad_norm": 0.5078825354576111, "learning_rate": 0.00011387650484554928, "loss": 0.8321, "step": 9075 }, { "epoch": 2.5519955030916246, "grad_norm": 0.6241423487663269, "learning_rate": 0.00011377934001725243, "loss": 0.8353, "step": 9080 }, { "epoch": 2.5534007869589654, "grad_norm": 0.5224640369415283, "learning_rate": 0.00011368216192627773, "loss": 0.8359, "step": 9085 }, { "epoch": 2.5548060708263067, "grad_norm": 0.5141652822494507, "learning_rate": 0.00011358497066615951, "loss": 0.8559, "step": 9090 }, { "epoch": 2.556211354693648, "grad_norm": 0.4821389615535736, "learning_rate": 0.00011348776633044483, "loss": 0.8179, "step": 9095 }, { "epoch": 2.5576166385609893, "grad_norm": 0.5298710465431213, "learning_rate": 0.00011339054901269328, "loss": 0.8222, "step": 9100 }, { "epoch": 2.5590219224283306, "grad_norm": 0.5908434391021729, "learning_rate": 0.000113293318806477, "loss": 0.8311, "step": 9105 }, { "epoch": 2.560427206295672, "grad_norm": 0.6493870615959167, "learning_rate": 0.00011319607580538055, "loss": 0.8314, "step": 9110 }, { "epoch": 2.561832490163013, "grad_norm": 0.6728103756904602, "learning_rate": 0.00011309882010300068, "loss": 0.8411, "step": 9115 }, { "epoch": 2.563237774030354, "grad_norm": 0.8559635877609253, "learning_rate": 0.00011300155179294647, "loss": 0.828, "step": 9120 }, { "epoch": 2.5646430578976953, "grad_norm": 0.5684670209884644, "learning_rate": 0.00011290427096883914, "loss": 0.8408, "step": 9125 }, { "epoch": 2.5660483417650366, "grad_norm": 0.5541810393333435, "learning_rate": 0.0001128069777243119, "loss": 0.8161, "step": 9130 }, { "epoch": 2.567453625632378, "grad_norm": 0.7451156973838806, "learning_rate": 0.00011270967215300998, "loss": 0.8366, "step": 9135 }, { "epoch": 2.5688589094997187, "grad_norm": 0.5444964170455933, "learning_rate": 0.00011261235434859041, "loss": 0.8494, "step": 9140 }, { "epoch": 2.57026419336706, "grad_norm": 0.6006889343261719, "learning_rate": 0.00011251502440472206, "loss": 0.8353, "step": 9145 }, { "epoch": 2.5716694772344013, "grad_norm": 0.5542729496955872, "learning_rate": 0.00011241768241508537, "loss": 0.8183, "step": 9150 }, { "epoch": 2.5730747611017426, "grad_norm": 0.5589856505393982, "learning_rate": 0.00011232032847337252, "loss": 0.822, "step": 9155 }, { "epoch": 2.574480044969084, "grad_norm": 0.8560423254966736, "learning_rate": 0.00011222296267328711, "loss": 0.8365, "step": 9160 }, { "epoch": 2.575885328836425, "grad_norm": 0.5394550561904907, "learning_rate": 0.00011212558510854416, "loss": 0.8757, "step": 9165 }, { "epoch": 2.577290612703766, "grad_norm": 0.4776645600795746, "learning_rate": 0.00011202819587287001, "loss": 0.8282, "step": 9170 }, { "epoch": 2.5786958965711073, "grad_norm": 0.5669585466384888, "learning_rate": 0.00011193079506000226, "loss": 0.8186, "step": 9175 }, { "epoch": 2.5801011804384486, "grad_norm": 0.5108312964439392, "learning_rate": 0.00011183338276368964, "loss": 0.8433, "step": 9180 }, { "epoch": 2.58150646430579, "grad_norm": 0.5904492735862732, "learning_rate": 0.00011173595907769193, "loss": 0.8166, "step": 9185 }, { "epoch": 2.5829117481731307, "grad_norm": 0.8534032702445984, "learning_rate": 0.00011163852409577988, "loss": 0.8372, "step": 9190 }, { "epoch": 2.584317032040472, "grad_norm": 0.6443466544151306, "learning_rate": 0.0001115410779117351, "loss": 0.8292, "step": 9195 }, { "epoch": 2.5857223159078133, "grad_norm": 0.5002859234809875, "learning_rate": 0.00011144362061934996, "loss": 0.8433, "step": 9200 }, { "epoch": 2.5871275997751546, "grad_norm": 0.463712215423584, "learning_rate": 0.0001113461523124276, "loss": 0.8074, "step": 9205 }, { "epoch": 2.588532883642496, "grad_norm": 0.6767189502716064, "learning_rate": 0.00011124867308478167, "loss": 0.8422, "step": 9210 }, { "epoch": 2.589938167509837, "grad_norm": 0.6467483043670654, "learning_rate": 0.00011115118303023641, "loss": 0.8301, "step": 9215 }, { "epoch": 2.591343451377178, "grad_norm": 0.730329692363739, "learning_rate": 0.00011105368224262642, "loss": 0.8292, "step": 9220 }, { "epoch": 2.5927487352445193, "grad_norm": 0.6260877251625061, "learning_rate": 0.00011095617081579663, "loss": 0.8136, "step": 9225 }, { "epoch": 2.5941540191118606, "grad_norm": 0.5238600373268127, "learning_rate": 0.0001108586488436023, "loss": 0.8227, "step": 9230 }, { "epoch": 2.595559302979202, "grad_norm": 0.5659443736076355, "learning_rate": 0.00011076111641990874, "loss": 0.8446, "step": 9235 }, { "epoch": 2.5969645868465427, "grad_norm": 0.5455038547515869, "learning_rate": 0.00011066357363859135, "loss": 0.83, "step": 9240 }, { "epoch": 2.598369870713884, "grad_norm": 0.5853575468063354, "learning_rate": 0.00011056602059353549, "loss": 0.8279, "step": 9245 }, { "epoch": 2.5997751545812253, "grad_norm": 0.5086763501167297, "learning_rate": 0.00011046845737863643, "loss": 0.8341, "step": 9250 }, { "epoch": 2.6011804384485666, "grad_norm": 0.511115550994873, "learning_rate": 0.00011037088408779921, "loss": 0.8431, "step": 9255 }, { "epoch": 2.602585722315908, "grad_norm": 0.46000874042510986, "learning_rate": 0.00011027330081493858, "loss": 0.8333, "step": 9260 }, { "epoch": 2.603991006183249, "grad_norm": 0.5504966974258423, "learning_rate": 0.0001101757076539789, "loss": 0.8325, "step": 9265 }, { "epoch": 2.6053962900505905, "grad_norm": 0.4799230992794037, "learning_rate": 0.00011007810469885398, "loss": 0.811, "step": 9270 }, { "epoch": 2.6068015739179313, "grad_norm": 0.49556994438171387, "learning_rate": 0.00010998049204350714, "loss": 0.8431, "step": 9275 }, { "epoch": 2.6082068577852726, "grad_norm": 0.4762093722820282, "learning_rate": 0.00010988286978189099, "loss": 0.8237, "step": 9280 }, { "epoch": 2.609612141652614, "grad_norm": 0.5235040187835693, "learning_rate": 0.00010978523800796747, "loss": 0.8327, "step": 9285 }, { "epoch": 2.611017425519955, "grad_norm": 0.4617610573768616, "learning_rate": 0.00010968759681570755, "loss": 0.8218, "step": 9290 }, { "epoch": 2.612422709387296, "grad_norm": 0.5892154574394226, "learning_rate": 0.00010958994629909134, "loss": 0.8135, "step": 9295 }, { "epoch": 2.6138279932546373, "grad_norm": 1.155057668685913, "learning_rate": 0.00010949228655210788, "loss": 0.8801, "step": 9300 }, { "epoch": 2.6152332771219786, "grad_norm": 0.4898678958415985, "learning_rate": 0.00010939461766875519, "loss": 0.8188, "step": 9305 }, { "epoch": 2.61663856098932, "grad_norm": 0.8992647528648376, "learning_rate": 0.00010929693974303995, "loss": 0.8842, "step": 9310 }, { "epoch": 2.618043844856661, "grad_norm": 0.5119013786315918, "learning_rate": 0.00010919925286897765, "loss": 0.8279, "step": 9315 }, { "epoch": 2.6194491287240025, "grad_norm": 0.6281744241714478, "learning_rate": 0.00010910155714059235, "loss": 0.8224, "step": 9320 }, { "epoch": 2.6208544125913433, "grad_norm": 0.500538170337677, "learning_rate": 0.00010900385265191661, "loss": 0.8267, "step": 9325 }, { "epoch": 2.6222596964586846, "grad_norm": 0.5560206174850464, "learning_rate": 0.00010890613949699146, "loss": 0.8277, "step": 9330 }, { "epoch": 2.623664980326026, "grad_norm": 0.5452876687049866, "learning_rate": 0.0001088084177698663, "loss": 0.8385, "step": 9335 }, { "epoch": 2.625070264193367, "grad_norm": 0.5131661891937256, "learning_rate": 0.00010871068756459867, "loss": 0.8291, "step": 9340 }, { "epoch": 2.626475548060708, "grad_norm": 0.5180448889732361, "learning_rate": 0.0001086129489752544, "loss": 0.809, "step": 9345 }, { "epoch": 2.6278808319280493, "grad_norm": 0.4731772243976593, "learning_rate": 0.00010851520209590728, "loss": 0.8358, "step": 9350 }, { "epoch": 2.6292861157953906, "grad_norm": 0.4596666991710663, "learning_rate": 0.00010841744702063916, "loss": 0.8344, "step": 9355 }, { "epoch": 2.630691399662732, "grad_norm": 0.5088164806365967, "learning_rate": 0.00010831968384353974, "loss": 0.8811, "step": 9360 }, { "epoch": 2.632096683530073, "grad_norm": 0.5293578505516052, "learning_rate": 0.00010822191265870656, "loss": 0.8207, "step": 9365 }, { "epoch": 2.6335019673974145, "grad_norm": 0.5973764657974243, "learning_rate": 0.0001081241335602448, "loss": 0.8295, "step": 9370 }, { "epoch": 2.6349072512647553, "grad_norm": 0.48312100768089294, "learning_rate": 0.00010802634664226723, "loss": 0.8355, "step": 9375 }, { "epoch": 2.6363125351320966, "grad_norm": 0.8425654768943787, "learning_rate": 0.00010792855199889431, "loss": 0.8285, "step": 9380 }, { "epoch": 2.637717818999438, "grad_norm": 5.292347431182861, "learning_rate": 0.00010783074972425378, "loss": 0.8939, "step": 9385 }, { "epoch": 2.639123102866779, "grad_norm": 0.9256778359413147, "learning_rate": 0.00010773293991248079, "loss": 0.8325, "step": 9390 }, { "epoch": 2.64052838673412, "grad_norm": 0.5325646996498108, "learning_rate": 0.00010763512265771772, "loss": 0.8237, "step": 9395 }, { "epoch": 2.6419336706014613, "grad_norm": 0.5501137971878052, "learning_rate": 0.00010753729805411412, "loss": 0.8207, "step": 9400 }, { "epoch": 2.6433389544688026, "grad_norm": 0.6308431625366211, "learning_rate": 0.00010743946619582664, "loss": 0.8215, "step": 9405 }, { "epoch": 2.644744238336144, "grad_norm": 0.48350989818573, "learning_rate": 0.0001073416271770189, "loss": 0.8325, "step": 9410 }, { "epoch": 2.646149522203485, "grad_norm": 0.5262377262115479, "learning_rate": 0.0001072437810918614, "loss": 0.8252, "step": 9415 }, { "epoch": 2.6475548060708265, "grad_norm": 0.5404006838798523, "learning_rate": 0.00010714592803453139, "loss": 0.8764, "step": 9420 }, { "epoch": 2.648960089938168, "grad_norm": 0.4993666410446167, "learning_rate": 0.00010704806809921292, "loss": 0.8264, "step": 9425 }, { "epoch": 2.6503653738055086, "grad_norm": 0.5050360560417175, "learning_rate": 0.00010695020138009666, "loss": 0.8199, "step": 9430 }, { "epoch": 2.65177065767285, "grad_norm": 0.9669815301895142, "learning_rate": 0.00010685232797137976, "loss": 0.8458, "step": 9435 }, { "epoch": 2.653175941540191, "grad_norm": 1.6229816675186157, "learning_rate": 0.0001067544479672658, "loss": 0.8799, "step": 9440 }, { "epoch": 2.654581225407532, "grad_norm": 0.4925650954246521, "learning_rate": 0.00010665656146196475, "loss": 0.8317, "step": 9445 }, { "epoch": 2.6559865092748733, "grad_norm": 0.4756428599357605, "learning_rate": 0.00010655866854969278, "loss": 0.8261, "step": 9450 }, { "epoch": 2.6573917931422146, "grad_norm": 0.4711908996105194, "learning_rate": 0.00010646076932467232, "loss": 0.8339, "step": 9455 }, { "epoch": 2.658797077009556, "grad_norm": 0.49394097924232483, "learning_rate": 0.00010636286388113184, "loss": 0.8682, "step": 9460 }, { "epoch": 2.660202360876897, "grad_norm": 0.5166155695915222, "learning_rate": 0.00010626495231330568, "loss": 0.8346, "step": 9465 }, { "epoch": 2.6616076447442385, "grad_norm": 0.49899595975875854, "learning_rate": 0.0001061670347154343, "loss": 0.8214, "step": 9470 }, { "epoch": 2.66301292861158, "grad_norm": 0.477092444896698, "learning_rate": 0.00010606911118176372, "loss": 0.8332, "step": 9475 }, { "epoch": 2.6644182124789206, "grad_norm": 0.4610581696033478, "learning_rate": 0.00010597118180654584, "loss": 0.8328, "step": 9480 }, { "epoch": 2.665823496346262, "grad_norm": 0.5816370844841003, "learning_rate": 0.00010587324668403815, "loss": 0.8287, "step": 9485 }, { "epoch": 2.667228780213603, "grad_norm": 0.5491670966148376, "learning_rate": 0.00010577530590850362, "loss": 0.8295, "step": 9490 }, { "epoch": 2.6686340640809445, "grad_norm": 0.6526244282722473, "learning_rate": 0.00010567735957421072, "loss": 0.8422, "step": 9495 }, { "epoch": 2.6700393479482853, "grad_norm": 0.48390865325927734, "learning_rate": 0.00010557940777543323, "loss": 0.8362, "step": 9500 }, { "epoch": 2.6714446318156266, "grad_norm": 0.7073639631271362, "learning_rate": 0.0001054814506064502, "loss": 0.8279, "step": 9505 }, { "epoch": 2.672849915682968, "grad_norm": 0.5461817383766174, "learning_rate": 0.00010538348816154586, "loss": 0.811, "step": 9510 }, { "epoch": 2.674255199550309, "grad_norm": 0.7969082593917847, "learning_rate": 0.00010528552053500955, "loss": 0.8468, "step": 9515 }, { "epoch": 2.6756604834176505, "grad_norm": 0.5298776030540466, "learning_rate": 0.00010518754782113551, "loss": 0.8451, "step": 9520 }, { "epoch": 2.677065767284992, "grad_norm": 0.5770809054374695, "learning_rate": 0.00010508957011422292, "loss": 0.8535, "step": 9525 }, { "epoch": 2.6784710511523326, "grad_norm": 0.5948202610015869, "learning_rate": 0.0001049915875085758, "loss": 0.8389, "step": 9530 }, { "epoch": 2.679876335019674, "grad_norm": 0.6123007535934448, "learning_rate": 0.00010489360009850285, "loss": 0.8212, "step": 9535 }, { "epoch": 2.681281618887015, "grad_norm": 0.49327778816223145, "learning_rate": 0.0001047956079783174, "loss": 0.8323, "step": 9540 }, { "epoch": 2.6826869027543565, "grad_norm": 0.5005403161048889, "learning_rate": 0.00010469761124233731, "loss": 0.8243, "step": 9545 }, { "epoch": 2.6840921866216974, "grad_norm": 0.48121288418769836, "learning_rate": 0.00010459960998488489, "loss": 0.8327, "step": 9550 }, { "epoch": 2.6854974704890386, "grad_norm": 0.4431888163089752, "learning_rate": 0.0001045016043002868, "loss": 0.8167, "step": 9555 }, { "epoch": 2.68690275435638, "grad_norm": 0.5393797159194946, "learning_rate": 0.00010440359428287394, "loss": 0.8376, "step": 9560 }, { "epoch": 2.688308038223721, "grad_norm": 0.6236591339111328, "learning_rate": 0.00010430558002698145, "loss": 0.8155, "step": 9565 }, { "epoch": 2.6897133220910625, "grad_norm": 0.6783631443977356, "learning_rate": 0.00010420756162694847, "loss": 0.8437, "step": 9570 }, { "epoch": 2.691118605958404, "grad_norm": 0.6380444765090942, "learning_rate": 0.00010410953917711814, "loss": 0.8262, "step": 9575 }, { "epoch": 2.6925238898257446, "grad_norm": 0.5137963891029358, "learning_rate": 0.00010401151277183754, "loss": 0.8265, "step": 9580 }, { "epoch": 2.693929173693086, "grad_norm": 0.49073556065559387, "learning_rate": 0.00010391348250545754, "loss": 0.8279, "step": 9585 }, { "epoch": 2.695334457560427, "grad_norm": 0.5200812220573425, "learning_rate": 0.00010381544847233271, "loss": 0.8291, "step": 9590 }, { "epoch": 2.6967397414277685, "grad_norm": 0.47082236409187317, "learning_rate": 0.00010371741076682124, "loss": 0.8221, "step": 9595 }, { "epoch": 2.6981450252951094, "grad_norm": 0.47038206458091736, "learning_rate": 0.0001036193694832849, "loss": 0.8269, "step": 9600 }, { "epoch": 2.6995503091624506, "grad_norm": 0.5449730753898621, "learning_rate": 0.00010352132471608882, "loss": 0.8582, "step": 9605 }, { "epoch": 2.700955593029792, "grad_norm": 0.5254364013671875, "learning_rate": 0.00010342327655960162, "loss": 0.8257, "step": 9610 }, { "epoch": 2.7023608768971332, "grad_norm": 1.2105820178985596, "learning_rate": 0.00010332522510819504, "loss": 0.8759, "step": 9615 }, { "epoch": 2.7037661607644745, "grad_norm": 0.5029426217079163, "learning_rate": 0.00010322717045624411, "loss": 0.8339, "step": 9620 }, { "epoch": 2.705171444631816, "grad_norm": 0.5147092938423157, "learning_rate": 0.00010312911269812677, "loss": 0.8317, "step": 9625 }, { "epoch": 2.706576728499157, "grad_norm": 0.5242138504981995, "learning_rate": 0.00010303105192822418, "loss": 0.8138, "step": 9630 }, { "epoch": 2.707982012366498, "grad_norm": 0.49158915877342224, "learning_rate": 0.00010293298824092022, "loss": 0.8231, "step": 9635 }, { "epoch": 2.7093872962338392, "grad_norm": 0.5118553042411804, "learning_rate": 0.00010283492173060163, "loss": 0.8274, "step": 9640 }, { "epoch": 2.7107925801011805, "grad_norm": 0.4782494008541107, "learning_rate": 0.00010273685249165791, "loss": 0.8295, "step": 9645 }, { "epoch": 2.712197863968522, "grad_norm": 0.5195757150650024, "learning_rate": 0.00010263878061848115, "loss": 0.8161, "step": 9650 }, { "epoch": 2.7136031478358627, "grad_norm": 0.6431840658187866, "learning_rate": 0.00010254070620546594, "loss": 0.8534, "step": 9655 }, { "epoch": 2.715008431703204, "grad_norm": 0.5024916529655457, "learning_rate": 0.00010244262934700937, "loss": 0.828, "step": 9660 }, { "epoch": 2.7164137155705452, "grad_norm": 0.7182828187942505, "learning_rate": 0.0001023445501375109, "loss": 0.8114, "step": 9665 }, { "epoch": 2.7178189994378865, "grad_norm": 0.4836368262767792, "learning_rate": 0.00010224646867137217, "loss": 0.8295, "step": 9670 }, { "epoch": 2.719224283305228, "grad_norm": 0.5435917377471924, "learning_rate": 0.00010214838504299704, "loss": 0.8316, "step": 9675 }, { "epoch": 2.720629567172569, "grad_norm": 0.4549933671951294, "learning_rate": 0.0001020502993467915, "loss": 0.8427, "step": 9680 }, { "epoch": 2.72203485103991, "grad_norm": 0.4845207631587982, "learning_rate": 0.0001019522116771634, "loss": 0.8404, "step": 9685 }, { "epoch": 2.7234401349072512, "grad_norm": 0.4945378303527832, "learning_rate": 0.00010185412212852267, "loss": 0.834, "step": 9690 }, { "epoch": 2.7248454187745925, "grad_norm": 0.5254123210906982, "learning_rate": 0.00010175603079528088, "loss": 0.8175, "step": 9695 }, { "epoch": 2.726250702641934, "grad_norm": 0.47113460302352905, "learning_rate": 0.00010165793777185144, "loss": 0.8417, "step": 9700 }, { "epoch": 2.7276559865092747, "grad_norm": 0.5494838356971741, "learning_rate": 0.00010155984315264928, "loss": 0.83, "step": 9705 }, { "epoch": 2.729061270376616, "grad_norm": 0.5023055672645569, "learning_rate": 0.00010146174703209093, "loss": 0.8283, "step": 9710 }, { "epoch": 2.7304665542439572, "grad_norm": 0.5280230045318604, "learning_rate": 0.0001013636495045944, "loss": 0.8179, "step": 9715 }, { "epoch": 2.7318718381112985, "grad_norm": 0.5023404359817505, "learning_rate": 0.00010126555066457895, "loss": 0.8158, "step": 9720 }, { "epoch": 2.73327712197864, "grad_norm": 0.6119568943977356, "learning_rate": 0.00010116745060646522, "loss": 0.8223, "step": 9725 }, { "epoch": 2.734682405845981, "grad_norm": 0.5735021829605103, "learning_rate": 0.00010106934942467492, "loss": 0.8365, "step": 9730 }, { "epoch": 2.736087689713322, "grad_norm": 0.7572410106658936, "learning_rate": 0.00010097124721363087, "loss": 0.8502, "step": 9735 }, { "epoch": 2.7374929735806632, "grad_norm": 0.5611612200737, "learning_rate": 0.00010087314406775699, "loss": 0.8203, "step": 9740 }, { "epoch": 2.7388982574480045, "grad_norm": 0.5417007207870483, "learning_rate": 0.00010077504008147791, "loss": 0.8276, "step": 9745 }, { "epoch": 2.740303541315346, "grad_norm": 0.5647456645965576, "learning_rate": 0.00010067693534921923, "loss": 0.8128, "step": 9750 }, { "epoch": 2.7417088251826867, "grad_norm": 0.5229110717773438, "learning_rate": 0.00010057882996540712, "loss": 0.8161, "step": 9755 }, { "epoch": 2.743114109050028, "grad_norm": 0.642594575881958, "learning_rate": 0.00010048072402446855, "loss": 0.8396, "step": 9760 }, { "epoch": 2.7445193929173692, "grad_norm": 0.641390860080719, "learning_rate": 0.00010038261762083092, "loss": 0.8289, "step": 9765 }, { "epoch": 2.7459246767847105, "grad_norm": 0.641538143157959, "learning_rate": 0.00010028451084892207, "loss": 0.8648, "step": 9770 }, { "epoch": 2.747329960652052, "grad_norm": 0.6418523192405701, "learning_rate": 0.00010018640380317024, "loss": 0.8176, "step": 9775 }, { "epoch": 2.748735244519393, "grad_norm": 0.5464898943901062, "learning_rate": 0.00010008829657800388, "loss": 0.8263, "step": 9780 }, { "epoch": 2.7501405283867344, "grad_norm": 0.5861994028091431, "learning_rate": 9.999018926785165e-05, "loss": 0.8119, "step": 9785 }, { "epoch": 2.7515458122540752, "grad_norm": 0.526378333568573, "learning_rate": 9.989208196714231e-05, "loss": 0.826, "step": 9790 }, { "epoch": 2.7529510961214165, "grad_norm": 0.46394699811935425, "learning_rate": 9.979397477030455e-05, "loss": 0.8186, "step": 9795 }, { "epoch": 2.754356379988758, "grad_norm": 0.6000794172286987, "learning_rate": 9.969586777176703e-05, "loss": 0.8306, "step": 9800 }, { "epoch": 2.7557616638560987, "grad_norm": 0.6603537797927856, "learning_rate": 9.959776106595817e-05, "loss": 0.8328, "step": 9805 }, { "epoch": 2.75716694772344, "grad_norm": 0.5899302363395691, "learning_rate": 9.949965474730614e-05, "loss": 0.8117, "step": 9810 }, { "epoch": 2.7585722315907812, "grad_norm": 0.5155988335609436, "learning_rate": 9.940154891023868e-05, "loss": 0.8241, "step": 9815 }, { "epoch": 2.7599775154581225, "grad_norm": 0.5121949315071106, "learning_rate": 9.930344364918315e-05, "loss": 0.8168, "step": 9820 }, { "epoch": 2.761382799325464, "grad_norm": 0.5585014820098877, "learning_rate": 9.920533905856632e-05, "loss": 0.827, "step": 9825 }, { "epoch": 2.762788083192805, "grad_norm": 0.5437676906585693, "learning_rate": 9.910723523281425e-05, "loss": 0.835, "step": 9830 }, { "epoch": 2.7641933670601464, "grad_norm": 0.4653185307979584, "learning_rate": 9.900913226635234e-05, "loss": 0.8105, "step": 9835 }, { "epoch": 2.7655986509274872, "grad_norm": 0.6214808821678162, "learning_rate": 9.891103025360519e-05, "loss": 0.8285, "step": 9840 }, { "epoch": 2.7670039347948285, "grad_norm": 0.5066220760345459, "learning_rate": 9.881292928899644e-05, "loss": 0.8395, "step": 9845 }, { "epoch": 2.76840921866217, "grad_norm": 0.4878048896789551, "learning_rate": 9.871482946694864e-05, "loss": 0.8348, "step": 9850 }, { "epoch": 2.769814502529511, "grad_norm": 0.4976538121700287, "learning_rate": 9.861673088188337e-05, "loss": 0.8206, "step": 9855 }, { "epoch": 2.771219786396852, "grad_norm": 0.47465085983276367, "learning_rate": 9.851863362822097e-05, "loss": 0.8196, "step": 9860 }, { "epoch": 2.7726250702641932, "grad_norm": 0.5739564895629883, "learning_rate": 9.842053780038046e-05, "loss": 0.8219, "step": 9865 }, { "epoch": 2.7740303541315345, "grad_norm": 0.5575153827667236, "learning_rate": 9.832244349277957e-05, "loss": 0.8424, "step": 9870 }, { "epoch": 2.775435637998876, "grad_norm": 0.4867574870586395, "learning_rate": 9.822435079983448e-05, "loss": 0.8182, "step": 9875 }, { "epoch": 2.776840921866217, "grad_norm": 0.6961890459060669, "learning_rate": 9.812625981595993e-05, "loss": 0.8261, "step": 9880 }, { "epoch": 2.7782462057335584, "grad_norm": 0.5481508374214172, "learning_rate": 9.802817063556882e-05, "loss": 0.8243, "step": 9885 }, { "epoch": 2.7796514896008993, "grad_norm": 0.5110167264938354, "learning_rate": 9.793008335307252e-05, "loss": 0.8308, "step": 9890 }, { "epoch": 2.7810567734682405, "grad_norm": 0.524135947227478, "learning_rate": 9.783199806288052e-05, "loss": 0.8249, "step": 9895 }, { "epoch": 2.782462057335582, "grad_norm": 0.5294224619865417, "learning_rate": 9.773391485940025e-05, "loss": 0.8272, "step": 9900 }, { "epoch": 2.783867341202923, "grad_norm": 0.5189291834831238, "learning_rate": 9.763583383703732e-05, "loss": 0.8342, "step": 9905 }, { "epoch": 2.785272625070264, "grad_norm": 0.5350440144538879, "learning_rate": 9.753775509019515e-05, "loss": 0.8275, "step": 9910 }, { "epoch": 2.7866779089376053, "grad_norm": 0.5991389751434326, "learning_rate": 9.7439678713275e-05, "loss": 0.8254, "step": 9915 }, { "epoch": 2.7880831928049465, "grad_norm": 0.5112239122390747, "learning_rate": 9.734160480067578e-05, "loss": 0.8376, "step": 9920 }, { "epoch": 2.789488476672288, "grad_norm": 0.5403845906257629, "learning_rate": 9.724353344679412e-05, "loss": 0.8288, "step": 9925 }, { "epoch": 2.790893760539629, "grad_norm": 0.5947050452232361, "learning_rate": 9.714546474602415e-05, "loss": 0.8397, "step": 9930 }, { "epoch": 2.7922990444069704, "grad_norm": 0.6373836994171143, "learning_rate": 9.704739879275742e-05, "loss": 0.8318, "step": 9935 }, { "epoch": 2.7937043282743113, "grad_norm": 0.5135063529014587, "learning_rate": 9.694933568138287e-05, "loss": 0.9294, "step": 9940 }, { "epoch": 2.7951096121416525, "grad_norm": 0.5365307331085205, "learning_rate": 9.68512755062867e-05, "loss": 0.8305, "step": 9945 }, { "epoch": 2.796514896008994, "grad_norm": 0.6203733086585999, "learning_rate": 9.675321836185231e-05, "loss": 0.8291, "step": 9950 }, { "epoch": 2.797920179876335, "grad_norm": 0.7198903560638428, "learning_rate": 9.665516434246005e-05, "loss": 0.8435, "step": 9955 }, { "epoch": 2.799325463743676, "grad_norm": 0.5159870982170105, "learning_rate": 9.655711354248745e-05, "loss": 0.8361, "step": 9960 }, { "epoch": 2.8007307476110173, "grad_norm": 0.47406426072120667, "learning_rate": 9.645906605630885e-05, "loss": 0.8475, "step": 9965 }, { "epoch": 2.8021360314783585, "grad_norm": 0.5789318680763245, "learning_rate": 9.636102197829536e-05, "loss": 0.824, "step": 9970 }, { "epoch": 2.8035413153457, "grad_norm": 0.5253451466560364, "learning_rate": 9.626298140281488e-05, "loss": 0.822, "step": 9975 }, { "epoch": 2.804946599213041, "grad_norm": 0.48433390259742737, "learning_rate": 9.61649444242319e-05, "loss": 0.8304, "step": 9980 }, { "epoch": 2.8063518830803824, "grad_norm": 0.4688815176486969, "learning_rate": 9.60669111369075e-05, "loss": 0.8241, "step": 9985 }, { "epoch": 2.8077571669477237, "grad_norm": 0.4556552767753601, "learning_rate": 9.596888163519912e-05, "loss": 0.8197, "step": 9990 }, { "epoch": 2.8091624508150645, "grad_norm": 0.6112373471260071, "learning_rate": 9.587085601346062e-05, "loss": 0.8722, "step": 9995 }, { "epoch": 2.810567734682406, "grad_norm": 0.4816291630268097, "learning_rate": 9.577283436604216e-05, "loss": 0.8149, "step": 10000 }, { "epoch": 2.811973018549747, "grad_norm": 0.49455633759498596, "learning_rate": 9.567481678728994e-05, "loss": 0.8923, "step": 10005 }, { "epoch": 2.8133783024170884, "grad_norm": 0.6647067070007324, "learning_rate": 9.557680337154635e-05, "loss": 0.8188, "step": 10010 }, { "epoch": 2.8147835862844293, "grad_norm": 0.4788486659526825, "learning_rate": 9.547879421314976e-05, "loss": 0.8133, "step": 10015 }, { "epoch": 2.8161888701517706, "grad_norm": 0.529371976852417, "learning_rate": 9.538078940643449e-05, "loss": 0.8359, "step": 10020 }, { "epoch": 2.817594154019112, "grad_norm": 0.4806664288043976, "learning_rate": 9.528278904573054e-05, "loss": 0.8163, "step": 10025 }, { "epoch": 2.818999437886453, "grad_norm": 0.600096583366394, "learning_rate": 9.518479322536372e-05, "loss": 0.832, "step": 10030 }, { "epoch": 2.8204047217537944, "grad_norm": 0.5037655830383301, "learning_rate": 9.508680203965549e-05, "loss": 0.8428, "step": 10035 }, { "epoch": 2.8218100056211357, "grad_norm": 0.4676769971847534, "learning_rate": 9.498881558292279e-05, "loss": 0.8422, "step": 10040 }, { "epoch": 2.8232152894884766, "grad_norm": 0.5020796060562134, "learning_rate": 9.489083394947802e-05, "loss": 0.8177, "step": 10045 }, { "epoch": 2.824620573355818, "grad_norm": 0.5521809458732605, "learning_rate": 9.479285723362897e-05, "loss": 0.8494, "step": 10050 }, { "epoch": 2.826025857223159, "grad_norm": 0.8967176079750061, "learning_rate": 9.469488552967872e-05, "loss": 0.831, "step": 10055 }, { "epoch": 2.8274311410905004, "grad_norm": 0.6544962525367737, "learning_rate": 9.45969189319254e-05, "loss": 0.8201, "step": 10060 }, { "epoch": 2.8288364249578413, "grad_norm": 0.6380069255828857, "learning_rate": 9.449895753466231e-05, "loss": 0.818, "step": 10065 }, { "epoch": 2.8302417088251826, "grad_norm": 0.5384354591369629, "learning_rate": 9.440100143217786e-05, "loss": 0.8233, "step": 10070 }, { "epoch": 2.831646992692524, "grad_norm": 0.47108352184295654, "learning_rate": 9.430305071875513e-05, "loss": 0.8183, "step": 10075 }, { "epoch": 2.833052276559865, "grad_norm": 0.5076127052307129, "learning_rate": 9.420510548867216e-05, "loss": 0.8208, "step": 10080 }, { "epoch": 2.8344575604272064, "grad_norm": 0.5295957326889038, "learning_rate": 9.410716583620168e-05, "loss": 0.821, "step": 10085 }, { "epoch": 2.8358628442945477, "grad_norm": 0.5895918607711792, "learning_rate": 9.40092318556111e-05, "loss": 0.8119, "step": 10090 }, { "epoch": 2.8372681281618886, "grad_norm": 0.498048335313797, "learning_rate": 9.391130364116227e-05, "loss": 0.8261, "step": 10095 }, { "epoch": 2.83867341202923, "grad_norm": 0.4717422127723694, "learning_rate": 9.381338128711155e-05, "loss": 0.8282, "step": 10100 }, { "epoch": 2.840078695896571, "grad_norm": 0.5129121541976929, "learning_rate": 9.371546488770973e-05, "loss": 0.8409, "step": 10105 }, { "epoch": 2.8414839797639124, "grad_norm": 0.5050500631332397, "learning_rate": 9.361755453720166e-05, "loss": 0.8281, "step": 10110 }, { "epoch": 2.8428892636312533, "grad_norm": 0.45085790753364563, "learning_rate": 9.351965032982657e-05, "loss": 0.819, "step": 10115 }, { "epoch": 2.8442945474985946, "grad_norm": 0.5280387997627258, "learning_rate": 9.342175235981773e-05, "loss": 0.8242, "step": 10120 }, { "epoch": 2.845699831365936, "grad_norm": 0.5300459861755371, "learning_rate": 9.33238607214024e-05, "loss": 0.8261, "step": 10125 }, { "epoch": 2.847105115233277, "grad_norm": 0.4671616852283478, "learning_rate": 9.322597550880167e-05, "loss": 0.8289, "step": 10130 }, { "epoch": 2.8485103991006184, "grad_norm": 0.5089605450630188, "learning_rate": 9.312809681623051e-05, "loss": 0.8258, "step": 10135 }, { "epoch": 2.8499156829679597, "grad_norm": 0.5198221802711487, "learning_rate": 9.303022473789763e-05, "loss": 0.8245, "step": 10140 }, { "epoch": 2.851320966835301, "grad_norm": 0.49434977769851685, "learning_rate": 9.293235936800539e-05, "loss": 0.8719, "step": 10145 }, { "epoch": 2.852726250702642, "grad_norm": 0.5880835652351379, "learning_rate": 9.283450080074958e-05, "loss": 0.8309, "step": 10150 }, { "epoch": 2.854131534569983, "grad_norm": 0.5376434922218323, "learning_rate": 9.273664913031957e-05, "loss": 0.8724, "step": 10155 }, { "epoch": 2.8555368184373244, "grad_norm": 0.47303545475006104, "learning_rate": 9.263880445089803e-05, "loss": 0.8168, "step": 10160 }, { "epoch": 2.8569421023046653, "grad_norm": 0.5622953176498413, "learning_rate": 9.254096685666091e-05, "loss": 0.8086, "step": 10165 }, { "epoch": 2.8583473861720066, "grad_norm": 0.5584001541137695, "learning_rate": 9.244313644177733e-05, "loss": 0.8264, "step": 10170 }, { "epoch": 2.859752670039348, "grad_norm": 0.5387102365493774, "learning_rate": 9.234531330040954e-05, "loss": 0.82, "step": 10175 }, { "epoch": 2.861157953906689, "grad_norm": 0.508496880531311, "learning_rate": 9.22474975267128e-05, "loss": 0.8267, "step": 10180 }, { "epoch": 2.8625632377740304, "grad_norm": 0.5421685576438904, "learning_rate": 9.214968921483512e-05, "loss": 0.8235, "step": 10185 }, { "epoch": 2.8639685216413717, "grad_norm": 0.5567712783813477, "learning_rate": 9.205188845891752e-05, "loss": 0.8799, "step": 10190 }, { "epoch": 2.865373805508713, "grad_norm": 0.5794306993484497, "learning_rate": 9.19540953530937e-05, "loss": 0.8315, "step": 10195 }, { "epoch": 2.866779089376054, "grad_norm": 0.5016175508499146, "learning_rate": 9.185630999148993e-05, "loss": 0.8304, "step": 10200 }, { "epoch": 2.868184373243395, "grad_norm": 0.4961920976638794, "learning_rate": 9.175853246822505e-05, "loss": 0.8354, "step": 10205 }, { "epoch": 2.8695896571107364, "grad_norm": 0.5245194435119629, "learning_rate": 9.166076287741044e-05, "loss": 0.8231, "step": 10210 }, { "epoch": 2.8709949409780777, "grad_norm": 0.4738057255744934, "learning_rate": 9.156300131314975e-05, "loss": 0.8353, "step": 10215 }, { "epoch": 2.8724002248454186, "grad_norm": 0.605202853679657, "learning_rate": 9.146524786953889e-05, "loss": 0.8662, "step": 10220 }, { "epoch": 2.87380550871276, "grad_norm": 0.5499880909919739, "learning_rate": 9.136750264066606e-05, "loss": 0.8283, "step": 10225 }, { "epoch": 2.875210792580101, "grad_norm": 0.5103596448898315, "learning_rate": 9.12697657206115e-05, "loss": 0.8298, "step": 10230 }, { "epoch": 2.8766160764474424, "grad_norm": 0.5948740243911743, "learning_rate": 9.117203720344735e-05, "loss": 0.8234, "step": 10235 }, { "epoch": 2.8780213603147837, "grad_norm": 0.5047176480293274, "learning_rate": 9.107431718323782e-05, "loss": 0.8308, "step": 10240 }, { "epoch": 2.879426644182125, "grad_norm": 0.5018946528434753, "learning_rate": 9.097660575403888e-05, "loss": 0.826, "step": 10245 }, { "epoch": 2.880831928049466, "grad_norm": 0.482831209897995, "learning_rate": 9.087890300989823e-05, "loss": 0.8181, "step": 10250 }, { "epoch": 2.882237211916807, "grad_norm": 0.4997115135192871, "learning_rate": 9.078120904485518e-05, "loss": 0.8149, "step": 10255 }, { "epoch": 2.8836424957841484, "grad_norm": 0.6287440657615662, "learning_rate": 9.068352395294063e-05, "loss": 0.8261, "step": 10260 }, { "epoch": 2.8850477796514897, "grad_norm": 0.6290640234947205, "learning_rate": 9.058584782817697e-05, "loss": 0.828, "step": 10265 }, { "epoch": 2.8864530635188306, "grad_norm": 0.5529760122299194, "learning_rate": 9.048818076457783e-05, "loss": 0.8261, "step": 10270 }, { "epoch": 2.887858347386172, "grad_norm": 0.5642558932304382, "learning_rate": 9.039052285614828e-05, "loss": 0.8132, "step": 10275 }, { "epoch": 2.889263631253513, "grad_norm": 0.5524906516075134, "learning_rate": 9.029287419688447e-05, "loss": 0.8321, "step": 10280 }, { "epoch": 2.8906689151208544, "grad_norm": 0.5699685215950012, "learning_rate": 9.019523488077374e-05, "loss": 0.8184, "step": 10285 }, { "epoch": 2.8920741989881957, "grad_norm": 0.4990377724170685, "learning_rate": 9.009760500179428e-05, "loss": 0.8376, "step": 10290 }, { "epoch": 2.893479482855537, "grad_norm": 0.5495244860649109, "learning_rate": 8.999998465391533e-05, "loss": 0.8281, "step": 10295 }, { "epoch": 2.894884766722878, "grad_norm": 0.6051164865493774, "learning_rate": 8.990237393109702e-05, "loss": 0.8264, "step": 10300 }, { "epoch": 2.896290050590219, "grad_norm": 0.514365553855896, "learning_rate": 8.980477292728997e-05, "loss": 0.8352, "step": 10305 }, { "epoch": 2.8976953344575604, "grad_norm": 0.6341189742088318, "learning_rate": 8.970718173643566e-05, "loss": 0.833, "step": 10310 }, { "epoch": 2.8991006183249017, "grad_norm": 0.5225892066955566, "learning_rate": 8.960960045246607e-05, "loss": 0.8237, "step": 10315 }, { "epoch": 2.9005059021922426, "grad_norm": 0.4649873971939087, "learning_rate": 8.951202916930363e-05, "loss": 0.8346, "step": 10320 }, { "epoch": 2.901911186059584, "grad_norm": 0.6664191484451294, "learning_rate": 8.941446798086112e-05, "loss": 0.8207, "step": 10325 }, { "epoch": 2.903316469926925, "grad_norm": 0.4937977194786072, "learning_rate": 8.931691698104165e-05, "loss": 0.8227, "step": 10330 }, { "epoch": 2.9047217537942664, "grad_norm": 0.5461153388023376, "learning_rate": 8.921937626373852e-05, "loss": 0.8236, "step": 10335 }, { "epoch": 2.9061270376616077, "grad_norm": 0.4974842071533203, "learning_rate": 8.912184592283509e-05, "loss": 0.8239, "step": 10340 }, { "epoch": 2.907532321528949, "grad_norm": 0.6387110948562622, "learning_rate": 8.902432605220472e-05, "loss": 0.8279, "step": 10345 }, { "epoch": 2.9089376053962903, "grad_norm": 0.5886310338973999, "learning_rate": 8.892681674571081e-05, "loss": 0.8218, "step": 10350 }, { "epoch": 2.910342889263631, "grad_norm": 0.6844249963760376, "learning_rate": 8.882931809720653e-05, "loss": 0.864, "step": 10355 }, { "epoch": 2.9117481731309725, "grad_norm": 0.6174393892288208, "learning_rate": 8.873183020053469e-05, "loss": 0.8297, "step": 10360 }, { "epoch": 2.9131534569983137, "grad_norm": 0.4688795208930969, "learning_rate": 8.863435314952787e-05, "loss": 0.8364, "step": 10365 }, { "epoch": 2.914558740865655, "grad_norm": 0.5638646483421326, "learning_rate": 8.853688703800821e-05, "loss": 0.8313, "step": 10370 }, { "epoch": 2.915964024732996, "grad_norm": 0.4737720191478729, "learning_rate": 8.843943195978723e-05, "loss": 0.8266, "step": 10375 }, { "epoch": 2.917369308600337, "grad_norm": 0.4690057337284088, "learning_rate": 8.834198800866593e-05, "loss": 0.8173, "step": 10380 }, { "epoch": 2.9187745924676785, "grad_norm": 0.5100188255310059, "learning_rate": 8.824455527843457e-05, "loss": 0.825, "step": 10385 }, { "epoch": 2.9201798763350197, "grad_norm": 0.5377050042152405, "learning_rate": 8.814713386287256e-05, "loss": 0.8473, "step": 10390 }, { "epoch": 2.921585160202361, "grad_norm": 0.5518834590911865, "learning_rate": 8.804972385574849e-05, "loss": 0.8107, "step": 10395 }, { "epoch": 2.9229904440697023, "grad_norm": 0.4859972596168518, "learning_rate": 8.795232535081991e-05, "loss": 0.8423, "step": 10400 }, { "epoch": 2.924395727937043, "grad_norm": 0.46924281120300293, "learning_rate": 8.785493844183339e-05, "loss": 0.8339, "step": 10405 }, { "epoch": 2.9258010118043845, "grad_norm": 0.4706718623638153, "learning_rate": 8.77575632225242e-05, "loss": 0.8273, "step": 10410 }, { "epoch": 2.9272062956717257, "grad_norm": 0.5642886161804199, "learning_rate": 8.766019978661643e-05, "loss": 0.8043, "step": 10415 }, { "epoch": 2.928611579539067, "grad_norm": 0.5513885617256165, "learning_rate": 8.75628482278229e-05, "loss": 0.8301, "step": 10420 }, { "epoch": 2.930016863406408, "grad_norm": 0.7229651808738708, "learning_rate": 8.74655086398449e-05, "loss": 0.8206, "step": 10425 }, { "epoch": 2.931422147273749, "grad_norm": 0.5825774073600769, "learning_rate": 8.736818111637222e-05, "loss": 0.8292, "step": 10430 }, { "epoch": 2.9328274311410905, "grad_norm": 0.5014123320579529, "learning_rate": 8.727086575108304e-05, "loss": 0.8367, "step": 10435 }, { "epoch": 2.9342327150084317, "grad_norm": 0.508975625038147, "learning_rate": 8.717356263764389e-05, "loss": 0.8179, "step": 10440 }, { "epoch": 2.935637998875773, "grad_norm": 0.5300000309944153, "learning_rate": 8.70762718697094e-05, "loss": 0.8632, "step": 10445 }, { "epoch": 2.9370432827431143, "grad_norm": 0.5742501020431519, "learning_rate": 8.69789935409224e-05, "loss": 0.838, "step": 10450 }, { "epoch": 2.938448566610455, "grad_norm": 0.5139035582542419, "learning_rate": 8.688172774491377e-05, "loss": 0.8241, "step": 10455 }, { "epoch": 2.9398538504777965, "grad_norm": 0.4528915286064148, "learning_rate": 8.678447457530226e-05, "loss": 0.8363, "step": 10460 }, { "epoch": 2.9412591343451377, "grad_norm": 0.5604382157325745, "learning_rate": 8.668723412569446e-05, "loss": 0.8262, "step": 10465 }, { "epoch": 2.942664418212479, "grad_norm": 0.5670209527015686, "learning_rate": 8.659000648968476e-05, "loss": 0.832, "step": 10470 }, { "epoch": 2.94406970207982, "grad_norm": 0.504100501537323, "learning_rate": 8.649279176085524e-05, "loss": 0.8415, "step": 10475 }, { "epoch": 2.945474985947161, "grad_norm": 0.4927058815956116, "learning_rate": 8.639559003277548e-05, "loss": 0.8201, "step": 10480 }, { "epoch": 2.9468802698145025, "grad_norm": 0.5819506049156189, "learning_rate": 8.62984013990026e-05, "loss": 0.833, "step": 10485 }, { "epoch": 2.9482855536818438, "grad_norm": 0.512156069278717, "learning_rate": 8.62012259530811e-05, "loss": 0.8363, "step": 10490 }, { "epoch": 2.949690837549185, "grad_norm": 0.5958979725837708, "learning_rate": 8.610406378854284e-05, "loss": 0.8338, "step": 10495 }, { "epoch": 2.9510961214165263, "grad_norm": 0.46423158049583435, "learning_rate": 8.600691499890678e-05, "loss": 0.8148, "step": 10500 }, { "epoch": 2.952501405283867, "grad_norm": 0.48579469323158264, "learning_rate": 8.590977967767909e-05, "loss": 0.8345, "step": 10505 }, { "epoch": 2.9539066891512085, "grad_norm": 0.5756452083587646, "learning_rate": 8.581265791835303e-05, "loss": 0.8228, "step": 10510 }, { "epoch": 2.9553119730185498, "grad_norm": 0.4622665047645569, "learning_rate": 8.571554981440864e-05, "loss": 0.8306, "step": 10515 }, { "epoch": 2.956717256885891, "grad_norm": 0.473634272813797, "learning_rate": 8.561845545931297e-05, "loss": 0.8174, "step": 10520 }, { "epoch": 2.958122540753232, "grad_norm": 0.5003515481948853, "learning_rate": 8.552137494651975e-05, "loss": 0.8762, "step": 10525 }, { "epoch": 2.959527824620573, "grad_norm": 0.4785330891609192, "learning_rate": 8.542430836946949e-05, "loss": 0.8322, "step": 10530 }, { "epoch": 2.9609331084879145, "grad_norm": 0.4684329628944397, "learning_rate": 8.532725582158912e-05, "loss": 0.8216, "step": 10535 }, { "epoch": 2.9623383923552558, "grad_norm": 0.6075077056884766, "learning_rate": 8.523021739629221e-05, "loss": 0.8183, "step": 10540 }, { "epoch": 2.963743676222597, "grad_norm": 0.4996055066585541, "learning_rate": 8.513319318697868e-05, "loss": 0.831, "step": 10545 }, { "epoch": 2.9651489600899383, "grad_norm": 0.48926323652267456, "learning_rate": 8.50361832870348e-05, "loss": 0.8103, "step": 10550 }, { "epoch": 2.9665542439572796, "grad_norm": 0.5758883953094482, "learning_rate": 8.493918778983301e-05, "loss": 0.819, "step": 10555 }, { "epoch": 2.9679595278246205, "grad_norm": 0.5499513745307922, "learning_rate": 8.484220678873192e-05, "loss": 0.8314, "step": 10560 }, { "epoch": 2.9693648116919618, "grad_norm": 0.49780401587486267, "learning_rate": 8.474524037707625e-05, "loss": 0.8338, "step": 10565 }, { "epoch": 2.970770095559303, "grad_norm": 0.5025187134742737, "learning_rate": 8.464828864819651e-05, "loss": 0.8234, "step": 10570 }, { "epoch": 2.9721753794266443, "grad_norm": 0.45492085814476013, "learning_rate": 8.455135169540923e-05, "loss": 0.8225, "step": 10575 }, { "epoch": 2.973580663293985, "grad_norm": 0.4675244987010956, "learning_rate": 8.44544296120167e-05, "loss": 0.8211, "step": 10580 }, { "epoch": 2.9749859471613265, "grad_norm": 0.4467267394065857, "learning_rate": 8.435752249130689e-05, "loss": 0.8309, "step": 10585 }, { "epoch": 2.9763912310286678, "grad_norm": 0.46692943572998047, "learning_rate": 8.426063042655326e-05, "loss": 0.818, "step": 10590 }, { "epoch": 2.977796514896009, "grad_norm": 0.4968641698360443, "learning_rate": 8.41637535110149e-05, "loss": 0.8167, "step": 10595 }, { "epoch": 2.9792017987633503, "grad_norm": 0.5355315804481506, "learning_rate": 8.406689183793632e-05, "loss": 0.8164, "step": 10600 }, { "epoch": 2.9806070826306916, "grad_norm": 0.5333823561668396, "learning_rate": 8.39700455005473e-05, "loss": 0.8276, "step": 10605 }, { "epoch": 2.9820123664980325, "grad_norm": 0.5763578414916992, "learning_rate": 8.387321459206287e-05, "loss": 0.8282, "step": 10610 }, { "epoch": 2.9834176503653738, "grad_norm": 0.4867122769355774, "learning_rate": 8.377639920568323e-05, "loss": 0.8334, "step": 10615 }, { "epoch": 2.984822934232715, "grad_norm": 0.5764617323875427, "learning_rate": 8.367959943459366e-05, "loss": 0.8135, "step": 10620 }, { "epoch": 2.9862282181000563, "grad_norm": 0.6305127143859863, "learning_rate": 8.358281537196435e-05, "loss": 0.8757, "step": 10625 }, { "epoch": 2.987633501967397, "grad_norm": 0.5238044261932373, "learning_rate": 8.34860471109504e-05, "loss": 0.8323, "step": 10630 }, { "epoch": 2.9890387858347385, "grad_norm": 0.6222472786903381, "learning_rate": 8.338929474469177e-05, "loss": 0.8144, "step": 10635 }, { "epoch": 2.9904440697020798, "grad_norm": 0.6870152950286865, "learning_rate": 8.329255836631297e-05, "loss": 0.8293, "step": 10640 }, { "epoch": 2.991849353569421, "grad_norm": 0.6090087890625, "learning_rate": 8.319583806892324e-05, "loss": 0.8272, "step": 10645 }, { "epoch": 2.9932546374367623, "grad_norm": 0.5994482636451721, "learning_rate": 8.30991339456163e-05, "loss": 0.8229, "step": 10650 }, { "epoch": 2.9946599213041036, "grad_norm": 0.7766991853713989, "learning_rate": 8.300244608947034e-05, "loss": 0.8365, "step": 10655 }, { "epoch": 2.9960652051714445, "grad_norm": 0.4686606824398041, "learning_rate": 8.290577459354785e-05, "loss": 0.8189, "step": 10660 }, { "epoch": 2.9974704890387858, "grad_norm": 0.5815786123275757, "learning_rate": 8.280911955089556e-05, "loss": 0.8192, "step": 10665 }, { "epoch": 2.998875772906127, "grad_norm": 0.46798279881477356, "learning_rate": 8.271248105454444e-05, "loss": 0.8141, "step": 10670 }, { "epoch": 3.0, "eval_loss": 0.8574034571647644, "eval_runtime": 642.7609, "eval_samples_per_second": 6.996, "eval_steps_per_second": 0.583, "step": 10674 }, { "epoch": 3.0002810567734683, "grad_norm": 0.5984673500061035, "learning_rate": 8.261585919750945e-05, "loss": 0.8314, "step": 10675 }, { "epoch": 3.0016863406408096, "grad_norm": 0.4934682250022888, "learning_rate": 8.251925407278958e-05, "loss": 0.7858, "step": 10680 }, { "epoch": 3.0030916245081505, "grad_norm": 0.5943470001220703, "learning_rate": 8.242266577336769e-05, "loss": 0.7909, "step": 10685 }, { "epoch": 3.0044969083754918, "grad_norm": 0.491205632686615, "learning_rate": 8.232609439221053e-05, "loss": 0.7954, "step": 10690 }, { "epoch": 3.005902192242833, "grad_norm": 0.538317859172821, "learning_rate": 8.222954002226839e-05, "loss": 0.7859, "step": 10695 }, { "epoch": 3.0073074761101743, "grad_norm": 0.5916352868080139, "learning_rate": 8.213300275647535e-05, "loss": 0.7962, "step": 10700 }, { "epoch": 3.0087127599775156, "grad_norm": 0.5188941359519958, "learning_rate": 8.2036482687749e-05, "loss": 0.7802, "step": 10705 }, { "epoch": 3.0101180438448565, "grad_norm": 0.579635500907898, "learning_rate": 8.193997990899027e-05, "loss": 0.7976, "step": 10710 }, { "epoch": 3.0115233277121978, "grad_norm": 0.5322796702384949, "learning_rate": 8.184349451308358e-05, "loss": 0.7759, "step": 10715 }, { "epoch": 3.012928611579539, "grad_norm": 0.6273241639137268, "learning_rate": 8.174702659289656e-05, "loss": 0.7876, "step": 10720 }, { "epoch": 3.0143338954468804, "grad_norm": 0.5047946572303772, "learning_rate": 8.165057624128004e-05, "loss": 0.8025, "step": 10725 }, { "epoch": 3.0157391793142216, "grad_norm": 0.5199359655380249, "learning_rate": 8.155414355106787e-05, "loss": 0.7995, "step": 10730 }, { "epoch": 3.0171444631815625, "grad_norm": 0.4892846643924713, "learning_rate": 8.145772861507701e-05, "loss": 0.7916, "step": 10735 }, { "epoch": 3.0185497470489038, "grad_norm": 0.5490760207176208, "learning_rate": 8.13613315261073e-05, "loss": 0.7872, "step": 10740 }, { "epoch": 3.019955030916245, "grad_norm": 0.6799941062927246, "learning_rate": 8.126495237694128e-05, "loss": 0.7925, "step": 10745 }, { "epoch": 3.0213603147835864, "grad_norm": 0.5038847327232361, "learning_rate": 8.11685912603444e-05, "loss": 0.8293, "step": 10750 }, { "epoch": 3.0227655986509276, "grad_norm": 0.5515377521514893, "learning_rate": 8.107224826906462e-05, "loss": 0.7846, "step": 10755 }, { "epoch": 3.0241708825182685, "grad_norm": 0.5621849894523621, "learning_rate": 8.097592349583261e-05, "loss": 0.7921, "step": 10760 }, { "epoch": 3.0255761663856098, "grad_norm": 0.5750982165336609, "learning_rate": 8.08796170333613e-05, "loss": 0.781, "step": 10765 }, { "epoch": 3.026981450252951, "grad_norm": 0.5427119135856628, "learning_rate": 8.078332897434617e-05, "loss": 0.7927, "step": 10770 }, { "epoch": 3.0283867341202924, "grad_norm": 0.5602450966835022, "learning_rate": 8.068705941146488e-05, "loss": 0.8031, "step": 10775 }, { "epoch": 3.0297920179876336, "grad_norm": 0.5285422205924988, "learning_rate": 8.059080843737732e-05, "loss": 0.8123, "step": 10780 }, { "epoch": 3.0311973018549745, "grad_norm": 0.5058332085609436, "learning_rate": 8.049457614472552e-05, "loss": 0.7873, "step": 10785 }, { "epoch": 3.032602585722316, "grad_norm": 0.5054517984390259, "learning_rate": 8.039836262613348e-05, "loss": 0.7725, "step": 10790 }, { "epoch": 3.034007869589657, "grad_norm": 0.5090791583061218, "learning_rate": 8.030216797420721e-05, "loss": 0.7995, "step": 10795 }, { "epoch": 3.0354131534569984, "grad_norm": 0.5371220707893372, "learning_rate": 8.020599228153441e-05, "loss": 0.7901, "step": 10800 }, { "epoch": 3.0368184373243396, "grad_norm": 0.4719543159008026, "learning_rate": 8.010983564068466e-05, "loss": 0.7903, "step": 10805 }, { "epoch": 3.038223721191681, "grad_norm": 0.5546330213546753, "learning_rate": 8.001369814420924e-05, "loss": 0.7912, "step": 10810 }, { "epoch": 3.039629005059022, "grad_norm": 0.4600013494491577, "learning_rate": 7.991757988464081e-05, "loss": 0.7809, "step": 10815 }, { "epoch": 3.041034288926363, "grad_norm": 0.5031879544258118, "learning_rate": 7.982148095449371e-05, "loss": 0.7994, "step": 10820 }, { "epoch": 3.0424395727937044, "grad_norm": 0.4986834228038788, "learning_rate": 7.972540144626359e-05, "loss": 0.7846, "step": 10825 }, { "epoch": 3.0438448566610457, "grad_norm": 0.5494052767753601, "learning_rate": 7.962934145242741e-05, "loss": 0.7892, "step": 10830 }, { "epoch": 3.045250140528387, "grad_norm": 0.5115951299667358, "learning_rate": 7.953330106544334e-05, "loss": 0.7771, "step": 10835 }, { "epoch": 3.046655424395728, "grad_norm": 0.5679638385772705, "learning_rate": 7.943728037775071e-05, "loss": 0.7988, "step": 10840 }, { "epoch": 3.048060708263069, "grad_norm": 0.50021892786026, "learning_rate": 7.934127948176992e-05, "loss": 0.7859, "step": 10845 }, { "epoch": 3.0494659921304104, "grad_norm": 0.5187581777572632, "learning_rate": 7.924529846990215e-05, "loss": 0.7894, "step": 10850 }, { "epoch": 3.0508712759977517, "grad_norm": 0.4814312160015106, "learning_rate": 7.914933743452964e-05, "loss": 0.7976, "step": 10855 }, { "epoch": 3.052276559865093, "grad_norm": 0.49869638681411743, "learning_rate": 7.905339646801534e-05, "loss": 0.8099, "step": 10860 }, { "epoch": 3.053681843732434, "grad_norm": 0.5544595718383789, "learning_rate": 7.895747566270288e-05, "loss": 0.7911, "step": 10865 }, { "epoch": 3.055087127599775, "grad_norm": 0.4815824627876282, "learning_rate": 7.886157511091641e-05, "loss": 0.7901, "step": 10870 }, { "epoch": 3.0564924114671164, "grad_norm": 0.632993221282959, "learning_rate": 7.876569490496068e-05, "loss": 0.78, "step": 10875 }, { "epoch": 3.0578976953344577, "grad_norm": 0.5329408049583435, "learning_rate": 7.866983513712084e-05, "loss": 0.8005, "step": 10880 }, { "epoch": 3.059302979201799, "grad_norm": 0.5657827258110046, "learning_rate": 7.857399589966233e-05, "loss": 0.7902, "step": 10885 }, { "epoch": 3.06070826306914, "grad_norm": 0.47787678241729736, "learning_rate": 7.847817728483088e-05, "loss": 0.7819, "step": 10890 }, { "epoch": 3.062113546936481, "grad_norm": 0.5120819807052612, "learning_rate": 7.838237938485233e-05, "loss": 0.7855, "step": 10895 }, { "epoch": 3.0635188308038224, "grad_norm": 0.5875343084335327, "learning_rate": 7.828660229193262e-05, "loss": 0.7848, "step": 10900 }, { "epoch": 3.0649241146711637, "grad_norm": 0.5025877952575684, "learning_rate": 7.819084609825761e-05, "loss": 0.7896, "step": 10905 }, { "epoch": 3.066329398538505, "grad_norm": 0.5116874575614929, "learning_rate": 7.809511089599309e-05, "loss": 0.7804, "step": 10910 }, { "epoch": 3.067734682405846, "grad_norm": 0.4917939305305481, "learning_rate": 7.799939677728469e-05, "loss": 0.7897, "step": 10915 }, { "epoch": 3.069139966273187, "grad_norm": 0.5206174254417419, "learning_rate": 7.790370383425756e-05, "loss": 0.7964, "step": 10920 }, { "epoch": 3.0705452501405284, "grad_norm": 0.6208391785621643, "learning_rate": 7.780803215901666e-05, "loss": 0.7778, "step": 10925 }, { "epoch": 3.0719505340078697, "grad_norm": 0.6187405586242676, "learning_rate": 7.771238184364643e-05, "loss": 0.8014, "step": 10930 }, { "epoch": 3.073355817875211, "grad_norm": 0.5639635920524597, "learning_rate": 7.761675298021075e-05, "loss": 0.7861, "step": 10935 }, { "epoch": 3.074761101742552, "grad_norm": 0.5515173673629761, "learning_rate": 7.752114566075279e-05, "loss": 0.7903, "step": 10940 }, { "epoch": 3.076166385609893, "grad_norm": 0.6317364573478699, "learning_rate": 7.742555997729504e-05, "loss": 0.8106, "step": 10945 }, { "epoch": 3.0775716694772344, "grad_norm": 0.49566203355789185, "learning_rate": 7.732999602183919e-05, "loss": 0.8006, "step": 10950 }, { "epoch": 3.0789769533445757, "grad_norm": 0.5118474364280701, "learning_rate": 7.7234453886366e-05, "loss": 0.7936, "step": 10955 }, { "epoch": 3.080382237211917, "grad_norm": 0.5737119913101196, "learning_rate": 7.713893366283515e-05, "loss": 0.7785, "step": 10960 }, { "epoch": 3.081787521079258, "grad_norm": 0.5119444727897644, "learning_rate": 7.704343544318532e-05, "loss": 0.7899, "step": 10965 }, { "epoch": 3.083192804946599, "grad_norm": 0.6276198625564575, "learning_rate": 7.694795931933403e-05, "loss": 0.7935, "step": 10970 }, { "epoch": 3.0845980888139404, "grad_norm": 0.5036231279373169, "learning_rate": 7.685250538317741e-05, "loss": 0.8082, "step": 10975 }, { "epoch": 3.0860033726812817, "grad_norm": 0.49849963188171387, "learning_rate": 7.675707372659035e-05, "loss": 0.7793, "step": 10980 }, { "epoch": 3.087408656548623, "grad_norm": 0.5414336323738098, "learning_rate": 7.666166444142621e-05, "loss": 0.7803, "step": 10985 }, { "epoch": 3.088813940415964, "grad_norm": 0.49839186668395996, "learning_rate": 7.6566277619517e-05, "loss": 0.8002, "step": 10990 }, { "epoch": 3.090219224283305, "grad_norm": 0.48829421401023865, "learning_rate": 7.647091335267281e-05, "loss": 0.7972, "step": 10995 }, { "epoch": 3.0916245081506464, "grad_norm": 0.5062868595123291, "learning_rate": 7.637557173268227e-05, "loss": 0.794, "step": 11000 }, { "epoch": 3.0930297920179877, "grad_norm": 0.4741092622280121, "learning_rate": 7.628025285131212e-05, "loss": 0.7878, "step": 11005 }, { "epoch": 3.094435075885329, "grad_norm": 0.5686512589454651, "learning_rate": 7.618495680030718e-05, "loss": 0.7966, "step": 11010 }, { "epoch": 3.0958403597526702, "grad_norm": 0.5308302044868469, "learning_rate": 7.608968367139038e-05, "loss": 0.7893, "step": 11015 }, { "epoch": 3.097245643620011, "grad_norm": 0.5594052672386169, "learning_rate": 7.599443355626251e-05, "loss": 0.7939, "step": 11020 }, { "epoch": 3.0986509274873524, "grad_norm": 0.4811815917491913, "learning_rate": 7.58992065466023e-05, "loss": 0.7954, "step": 11025 }, { "epoch": 3.1000562113546937, "grad_norm": 0.566451370716095, "learning_rate": 7.580400273406611e-05, "loss": 0.7763, "step": 11030 }, { "epoch": 3.101461495222035, "grad_norm": 0.6021385192871094, "learning_rate": 7.570882221028805e-05, "loss": 0.7923, "step": 11035 }, { "epoch": 3.1028667790893762, "grad_norm": 0.5467907190322876, "learning_rate": 7.56136650668799e-05, "loss": 0.7989, "step": 11040 }, { "epoch": 3.104272062956717, "grad_norm": 0.5167945623397827, "learning_rate": 7.551853139543074e-05, "loss": 0.8013, "step": 11045 }, { "epoch": 3.1056773468240584, "grad_norm": 0.523460865020752, "learning_rate": 7.54234212875072e-05, "loss": 0.8119, "step": 11050 }, { "epoch": 3.1070826306913997, "grad_norm": 0.47800132632255554, "learning_rate": 7.532833483465322e-05, "loss": 0.7958, "step": 11055 }, { "epoch": 3.108487914558741, "grad_norm": 0.4831900894641876, "learning_rate": 7.523327212838993e-05, "loss": 0.782, "step": 11060 }, { "epoch": 3.1098931984260823, "grad_norm": 0.4977155327796936, "learning_rate": 7.51382332602156e-05, "loss": 0.7959, "step": 11065 }, { "epoch": 3.111298482293423, "grad_norm": 0.5085614323616028, "learning_rate": 7.504321832160559e-05, "loss": 0.7913, "step": 11070 }, { "epoch": 3.1127037661607644, "grad_norm": 0.5714178681373596, "learning_rate": 7.494822740401226e-05, "loss": 0.7831, "step": 11075 }, { "epoch": 3.1141090500281057, "grad_norm": 0.5226171016693115, "learning_rate": 7.485326059886471e-05, "loss": 0.7732, "step": 11080 }, { "epoch": 3.115514333895447, "grad_norm": 0.6232420802116394, "learning_rate": 7.475831799756897e-05, "loss": 0.8024, "step": 11085 }, { "epoch": 3.1169196177627883, "grad_norm": 0.5652778744697571, "learning_rate": 7.466339969150776e-05, "loss": 0.7916, "step": 11090 }, { "epoch": 3.118324901630129, "grad_norm": 0.4653576612472534, "learning_rate": 7.456850577204037e-05, "loss": 0.7967, "step": 11095 }, { "epoch": 3.1197301854974704, "grad_norm": 0.5993885397911072, "learning_rate": 7.447363633050259e-05, "loss": 0.7994, "step": 11100 }, { "epoch": 3.1211354693648117, "grad_norm": 0.6875457763671875, "learning_rate": 7.437879145820674e-05, "loss": 0.7762, "step": 11105 }, { "epoch": 3.122540753232153, "grad_norm": 0.6032678484916687, "learning_rate": 7.428397124644142e-05, "loss": 0.8443, "step": 11110 }, { "epoch": 3.1239460370994943, "grad_norm": 0.5008665919303894, "learning_rate": 7.418917578647151e-05, "loss": 0.8003, "step": 11115 }, { "epoch": 3.125351320966835, "grad_norm": 0.5989721417427063, "learning_rate": 7.40944051695381e-05, "loss": 0.7874, "step": 11120 }, { "epoch": 3.1267566048341764, "grad_norm": 0.6037493348121643, "learning_rate": 7.399965948685832e-05, "loss": 0.7908, "step": 11125 }, { "epoch": 3.1281618887015177, "grad_norm": 0.5136246681213379, "learning_rate": 7.390493882962534e-05, "loss": 0.7868, "step": 11130 }, { "epoch": 3.129567172568859, "grad_norm": 0.5260531902313232, "learning_rate": 7.38102432890082e-05, "loss": 0.7865, "step": 11135 }, { "epoch": 3.1309724564362003, "grad_norm": 0.6083421111106873, "learning_rate": 7.371557295615184e-05, "loss": 0.8002, "step": 11140 }, { "epoch": 3.132377740303541, "grad_norm": 0.5094545483589172, "learning_rate": 7.362092792217687e-05, "loss": 0.7901, "step": 11145 }, { "epoch": 3.1337830241708824, "grad_norm": 0.6604807376861572, "learning_rate": 7.352630827817954e-05, "loss": 0.7975, "step": 11150 }, { "epoch": 3.1351883080382237, "grad_norm": 0.5617685914039612, "learning_rate": 7.343171411523169e-05, "loss": 0.8134, "step": 11155 }, { "epoch": 3.136593591905565, "grad_norm": 0.5299571752548218, "learning_rate": 7.333714552438067e-05, "loss": 0.7944, "step": 11160 }, { "epoch": 3.1379988757729063, "grad_norm": 0.5054970383644104, "learning_rate": 7.324260259664918e-05, "loss": 0.7887, "step": 11165 }, { "epoch": 3.1394041596402475, "grad_norm": 0.4990473985671997, "learning_rate": 7.314808542303518e-05, "loss": 0.7882, "step": 11170 }, { "epoch": 3.1408094435075884, "grad_norm": 0.6480944752693176, "learning_rate": 7.305359409451191e-05, "loss": 0.8355, "step": 11175 }, { "epoch": 3.1422147273749297, "grad_norm": 0.49085888266563416, "learning_rate": 7.295912870202773e-05, "loss": 0.7944, "step": 11180 }, { "epoch": 3.143620011242271, "grad_norm": 0.5148762464523315, "learning_rate": 7.286468933650597e-05, "loss": 0.7887, "step": 11185 }, { "epoch": 3.1450252951096123, "grad_norm": 0.5787303447723389, "learning_rate": 7.277027608884497e-05, "loss": 0.7783, "step": 11190 }, { "epoch": 3.1464305789769536, "grad_norm": 0.4866994023323059, "learning_rate": 7.267588904991792e-05, "loss": 0.7936, "step": 11195 }, { "epoch": 3.1478358628442944, "grad_norm": 0.6382277607917786, "learning_rate": 7.25815283105728e-05, "loss": 0.781, "step": 11200 }, { "epoch": 3.1492411467116357, "grad_norm": 0.5190382599830627, "learning_rate": 7.248719396163217e-05, "loss": 0.8067, "step": 11205 }, { "epoch": 3.150646430578977, "grad_norm": 0.5271970629692078, "learning_rate": 7.239288609389334e-05, "loss": 0.8089, "step": 11210 }, { "epoch": 3.1520517144463183, "grad_norm": 0.6586330533027649, "learning_rate": 7.229860479812806e-05, "loss": 0.7886, "step": 11215 }, { "epoch": 3.1534569983136596, "grad_norm": 0.5243818163871765, "learning_rate": 7.220435016508249e-05, "loss": 0.8002, "step": 11220 }, { "epoch": 3.1548622821810004, "grad_norm": 0.5264410376548767, "learning_rate": 7.211012228547715e-05, "loss": 0.7902, "step": 11225 }, { "epoch": 3.1562675660483417, "grad_norm": 0.6128091216087341, "learning_rate": 7.201592125000683e-05, "loss": 0.7887, "step": 11230 }, { "epoch": 3.157672849915683, "grad_norm": 0.5602926015853882, "learning_rate": 7.192174714934043e-05, "loss": 0.8016, "step": 11235 }, { "epoch": 3.1590781337830243, "grad_norm": 0.4874815046787262, "learning_rate": 7.182760007412097e-05, "loss": 0.7918, "step": 11240 }, { "epoch": 3.1604834176503656, "grad_norm": 0.5565770268440247, "learning_rate": 7.173348011496542e-05, "loss": 0.8176, "step": 11245 }, { "epoch": 3.1618887015177064, "grad_norm": 0.5076163411140442, "learning_rate": 7.163938736246472e-05, "loss": 0.8053, "step": 11250 }, { "epoch": 3.1632939853850477, "grad_norm": 0.5580770373344421, "learning_rate": 7.15453219071835e-05, "loss": 0.7897, "step": 11255 }, { "epoch": 3.164699269252389, "grad_norm": 0.468325138092041, "learning_rate": 7.145128383966022e-05, "loss": 0.7865, "step": 11260 }, { "epoch": 3.1661045531197303, "grad_norm": 0.5661777257919312, "learning_rate": 7.135727325040698e-05, "loss": 0.7794, "step": 11265 }, { "epoch": 3.1675098369870716, "grad_norm": 0.47927501797676086, "learning_rate": 7.126329022990943e-05, "loss": 0.8006, "step": 11270 }, { "epoch": 3.1689151208544124, "grad_norm": 0.5229851007461548, "learning_rate": 7.116933486862656e-05, "loss": 0.7988, "step": 11275 }, { "epoch": 3.1703204047217537, "grad_norm": 0.5205390453338623, "learning_rate": 7.107540725699089e-05, "loss": 0.7854, "step": 11280 }, { "epoch": 3.171725688589095, "grad_norm": 0.5371643304824829, "learning_rate": 7.098150748540819e-05, "loss": 0.7936, "step": 11285 }, { "epoch": 3.1731309724564363, "grad_norm": 0.473400741815567, "learning_rate": 7.08876356442574e-05, "loss": 0.7815, "step": 11290 }, { "epoch": 3.1745362563237776, "grad_norm": 0.4691773056983948, "learning_rate": 7.079379182389059e-05, "loss": 0.7762, "step": 11295 }, { "epoch": 3.1759415401911184, "grad_norm": 0.486338347196579, "learning_rate": 7.069997611463288e-05, "loss": 0.7904, "step": 11300 }, { "epoch": 3.1773468240584597, "grad_norm": 0.5320138931274414, "learning_rate": 7.060618860678237e-05, "loss": 0.7984, "step": 11305 }, { "epoch": 3.178752107925801, "grad_norm": 0.48783358931541443, "learning_rate": 7.051242939060983e-05, "loss": 0.875, "step": 11310 }, { "epoch": 3.1801573917931423, "grad_norm": 0.519739031791687, "learning_rate": 7.041869855635904e-05, "loss": 0.781, "step": 11315 }, { "epoch": 3.1815626756604836, "grad_norm": 0.5107063055038452, "learning_rate": 7.03249961942464e-05, "loss": 0.7877, "step": 11320 }, { "epoch": 3.182967959527825, "grad_norm": 0.5117635726928711, "learning_rate": 7.023132239446074e-05, "loss": 0.793, "step": 11325 }, { "epoch": 3.1843732433951657, "grad_norm": 0.5889191031455994, "learning_rate": 7.013767724716356e-05, "loss": 0.7894, "step": 11330 }, { "epoch": 3.185778527262507, "grad_norm": 0.5487093329429626, "learning_rate": 7.004406084248878e-05, "loss": 0.7881, "step": 11335 }, { "epoch": 3.1871838111298483, "grad_norm": 0.4754599928855896, "learning_rate": 6.995047327054262e-05, "loss": 0.7883, "step": 11340 }, { "epoch": 3.1885890949971896, "grad_norm": 0.5368080139160156, "learning_rate": 6.985691462140352e-05, "loss": 0.7889, "step": 11345 }, { "epoch": 3.1899943788645304, "grad_norm": 0.5230497717857361, "learning_rate": 6.97633849851221e-05, "loss": 0.7838, "step": 11350 }, { "epoch": 3.1913996627318717, "grad_norm": 0.4732072651386261, "learning_rate": 6.966988445172112e-05, "loss": 0.8018, "step": 11355 }, { "epoch": 3.192804946599213, "grad_norm": 0.7186647653579712, "learning_rate": 6.957641311119517e-05, "loss": 0.8392, "step": 11360 }, { "epoch": 3.1942102304665543, "grad_norm": 0.5402554273605347, "learning_rate": 6.948297105351091e-05, "loss": 0.7886, "step": 11365 }, { "epoch": 3.1956155143338956, "grad_norm": 0.550915539264679, "learning_rate": 6.938955836860677e-05, "loss": 0.7775, "step": 11370 }, { "epoch": 3.197020798201237, "grad_norm": 0.499353289604187, "learning_rate": 6.929617514639288e-05, "loss": 0.7933, "step": 11375 }, { "epoch": 3.1984260820685777, "grad_norm": 0.495159775018692, "learning_rate": 6.920282147675095e-05, "loss": 0.789, "step": 11380 }, { "epoch": 3.199831365935919, "grad_norm": 0.5102311372756958, "learning_rate": 6.910949744953438e-05, "loss": 0.7843, "step": 11385 }, { "epoch": 3.2012366498032603, "grad_norm": 0.5126308798789978, "learning_rate": 6.901620315456793e-05, "loss": 0.7959, "step": 11390 }, { "epoch": 3.2026419336706016, "grad_norm": 0.6104722023010254, "learning_rate": 6.892293868164785e-05, "loss": 0.8012, "step": 11395 }, { "epoch": 3.204047217537943, "grad_norm": 0.49474766850471497, "learning_rate": 6.882970412054158e-05, "loss": 0.8101, "step": 11400 }, { "epoch": 3.2054525014052837, "grad_norm": 0.520660400390625, "learning_rate": 6.873649956098778e-05, "loss": 0.8117, "step": 11405 }, { "epoch": 3.206857785272625, "grad_norm": 0.6017019748687744, "learning_rate": 6.864332509269633e-05, "loss": 0.8399, "step": 11410 }, { "epoch": 3.2082630691399663, "grad_norm": 0.4822956621646881, "learning_rate": 6.855018080534804e-05, "loss": 0.7875, "step": 11415 }, { "epoch": 3.2096683530073076, "grad_norm": 0.537238359451294, "learning_rate": 6.845706678859473e-05, "loss": 0.797, "step": 11420 }, { "epoch": 3.211073636874649, "grad_norm": 0.5192193388938904, "learning_rate": 6.836398313205905e-05, "loss": 0.8137, "step": 11425 }, { "epoch": 3.2124789207419897, "grad_norm": 0.5809474587440491, "learning_rate": 6.82709299253345e-05, "loss": 0.7843, "step": 11430 }, { "epoch": 3.213884204609331, "grad_norm": 0.5197656750679016, "learning_rate": 6.817790725798515e-05, "loss": 0.7871, "step": 11435 }, { "epoch": 3.2152894884766723, "grad_norm": 0.6199994683265686, "learning_rate": 6.808491521954576e-05, "loss": 0.8056, "step": 11440 }, { "epoch": 3.2166947723440136, "grad_norm": 0.5630050897598267, "learning_rate": 6.799195389952162e-05, "loss": 0.7921, "step": 11445 }, { "epoch": 3.218100056211355, "grad_norm": 0.5317882895469666, "learning_rate": 6.78990233873884e-05, "loss": 0.7821, "step": 11450 }, { "epoch": 3.2195053400786957, "grad_norm": 0.4947078227996826, "learning_rate": 6.780612377259215e-05, "loss": 0.7922, "step": 11455 }, { "epoch": 3.220910623946037, "grad_norm": 0.5854290127754211, "learning_rate": 6.771325514454917e-05, "loss": 0.7796, "step": 11460 }, { "epoch": 3.2223159078133783, "grad_norm": 0.5860560536384583, "learning_rate": 6.762041759264596e-05, "loss": 0.7837, "step": 11465 }, { "epoch": 3.2237211916807196, "grad_norm": 0.5405226349830627, "learning_rate": 6.752761120623907e-05, "loss": 0.7883, "step": 11470 }, { "epoch": 3.225126475548061, "grad_norm": 0.6428171396255493, "learning_rate": 6.743483607465508e-05, "loss": 0.7898, "step": 11475 }, { "epoch": 3.2265317594154017, "grad_norm": 0.5037165284156799, "learning_rate": 6.734209228719051e-05, "loss": 0.7974, "step": 11480 }, { "epoch": 3.227937043282743, "grad_norm": 0.5832943320274353, "learning_rate": 6.72493799331116e-05, "loss": 0.8045, "step": 11485 }, { "epoch": 3.2293423271500843, "grad_norm": 0.5611792802810669, "learning_rate": 6.715669910165445e-05, "loss": 0.7885, "step": 11490 }, { "epoch": 3.2307476110174256, "grad_norm": 0.49044761061668396, "learning_rate": 6.70640498820248e-05, "loss": 0.7896, "step": 11495 }, { "epoch": 3.232152894884767, "grad_norm": 0.4808729887008667, "learning_rate": 6.6971432363398e-05, "loss": 0.7872, "step": 11500 }, { "epoch": 3.2335581787521077, "grad_norm": 0.49567148089408875, "learning_rate": 6.687884663491874e-05, "loss": 0.7877, "step": 11505 }, { "epoch": 3.234963462619449, "grad_norm": 0.5091308951377869, "learning_rate": 6.678629278570125e-05, "loss": 0.8029, "step": 11510 }, { "epoch": 3.2363687464867903, "grad_norm": 0.4903222322463989, "learning_rate": 6.669377090482903e-05, "loss": 0.799, "step": 11515 }, { "epoch": 3.2377740303541316, "grad_norm": 0.4882327616214752, "learning_rate": 6.660128108135481e-05, "loss": 0.7867, "step": 11520 }, { "epoch": 3.239179314221473, "grad_norm": 0.5147338509559631, "learning_rate": 6.650882340430048e-05, "loss": 0.8014, "step": 11525 }, { "epoch": 3.240584598088814, "grad_norm": 0.4741876721382141, "learning_rate": 6.641639796265696e-05, "loss": 0.7802, "step": 11530 }, { "epoch": 3.241989881956155, "grad_norm": 0.5736861228942871, "learning_rate": 6.632400484538422e-05, "loss": 0.8062, "step": 11535 }, { "epoch": 3.2433951658234963, "grad_norm": 0.5249696373939514, "learning_rate": 6.623164414141094e-05, "loss": 0.8, "step": 11540 }, { "epoch": 3.2448004496908376, "grad_norm": 0.5339483022689819, "learning_rate": 6.613931593963483e-05, "loss": 0.7891, "step": 11545 }, { "epoch": 3.246205733558179, "grad_norm": 0.6143501996994019, "learning_rate": 6.604702032892221e-05, "loss": 0.8513, "step": 11550 }, { "epoch": 3.2476110174255197, "grad_norm": 0.5576615929603577, "learning_rate": 6.595475739810792e-05, "loss": 0.7899, "step": 11555 }, { "epoch": 3.249016301292861, "grad_norm": 0.5930474996566772, "learning_rate": 6.586252723599553e-05, "loss": 0.797, "step": 11560 }, { "epoch": 3.2504215851602023, "grad_norm": 0.5222381949424744, "learning_rate": 6.577032993135699e-05, "loss": 0.7892, "step": 11565 }, { "epoch": 3.2518268690275436, "grad_norm": 0.5676156282424927, "learning_rate": 6.567816557293262e-05, "loss": 0.7971, "step": 11570 }, { "epoch": 3.253232152894885, "grad_norm": 0.5307049751281738, "learning_rate": 6.558603424943104e-05, "loss": 0.7918, "step": 11575 }, { "epoch": 3.254637436762226, "grad_norm": 0.5155093669891357, "learning_rate": 6.549393604952906e-05, "loss": 0.7893, "step": 11580 }, { "epoch": 3.256042720629567, "grad_norm": 0.4945572018623352, "learning_rate": 6.540187106187167e-05, "loss": 0.7838, "step": 11585 }, { "epoch": 3.2574480044969083, "grad_norm": 0.570228099822998, "learning_rate": 6.530983937507173e-05, "loss": 0.7912, "step": 11590 }, { "epoch": 3.2588532883642496, "grad_norm": 0.5457921028137207, "learning_rate": 6.521784107771027e-05, "loss": 0.7801, "step": 11595 }, { "epoch": 3.260258572231591, "grad_norm": 0.5242686867713928, "learning_rate": 6.512587625833602e-05, "loss": 0.7951, "step": 11600 }, { "epoch": 3.261663856098932, "grad_norm": 0.5158045291900635, "learning_rate": 6.503394500546558e-05, "loss": 0.7868, "step": 11605 }, { "epoch": 3.263069139966273, "grad_norm": 0.5344769358634949, "learning_rate": 6.494204740758314e-05, "loss": 0.8, "step": 11610 }, { "epoch": 3.2644744238336143, "grad_norm": 0.5959330797195435, "learning_rate": 6.48501835531406e-05, "loss": 0.7896, "step": 11615 }, { "epoch": 3.2658797077009556, "grad_norm": 0.518044114112854, "learning_rate": 6.475835353055735e-05, "loss": 0.7932, "step": 11620 }, { "epoch": 3.267284991568297, "grad_norm": 0.5310397148132324, "learning_rate": 6.466655742822017e-05, "loss": 0.7983, "step": 11625 }, { "epoch": 3.268690275435638, "grad_norm": 0.5325920581817627, "learning_rate": 6.457479533448323e-05, "loss": 0.7946, "step": 11630 }, { "epoch": 3.270095559302979, "grad_norm": 0.7155055999755859, "learning_rate": 6.4483067337668e-05, "loss": 0.7785, "step": 11635 }, { "epoch": 3.2715008431703203, "grad_norm": 0.5178682208061218, "learning_rate": 6.439137352606311e-05, "loss": 0.7872, "step": 11640 }, { "epoch": 3.2729061270376616, "grad_norm": 0.4876868426799774, "learning_rate": 6.42997139879242e-05, "loss": 0.8078, "step": 11645 }, { "epoch": 3.274311410905003, "grad_norm": 0.5107020735740662, "learning_rate": 6.420808881147406e-05, "loss": 0.8063, "step": 11650 }, { "epoch": 3.275716694772344, "grad_norm": 0.49932652711868286, "learning_rate": 6.411649808490236e-05, "loss": 0.7829, "step": 11655 }, { "epoch": 3.277121978639685, "grad_norm": 0.552949845790863, "learning_rate": 6.40249418963655e-05, "loss": 0.7929, "step": 11660 }, { "epoch": 3.2785272625070263, "grad_norm": 0.5141507983207703, "learning_rate": 6.393342033398681e-05, "loss": 0.8005, "step": 11665 }, { "epoch": 3.2799325463743676, "grad_norm": 0.541842520236969, "learning_rate": 6.384193348585617e-05, "loss": 0.7945, "step": 11670 }, { "epoch": 3.281337830241709, "grad_norm": 0.53905189037323, "learning_rate": 6.375048144003013e-05, "loss": 0.7838, "step": 11675 }, { "epoch": 3.28274311410905, "grad_norm": 0.49840521812438965, "learning_rate": 6.365906428453167e-05, "loss": 0.7822, "step": 11680 }, { "epoch": 3.2841483979763915, "grad_norm": 0.5327207446098328, "learning_rate": 6.356768210735023e-05, "loss": 0.7924, "step": 11685 }, { "epoch": 3.2855536818437323, "grad_norm": 0.507588803768158, "learning_rate": 6.347633499644158e-05, "loss": 0.8038, "step": 11690 }, { "epoch": 3.2869589657110736, "grad_norm": 0.5312201976776123, "learning_rate": 6.338502303972774e-05, "loss": 0.8128, "step": 11695 }, { "epoch": 3.288364249578415, "grad_norm": 0.5181981325149536, "learning_rate": 6.329374632509686e-05, "loss": 0.7969, "step": 11700 }, { "epoch": 3.289769533445756, "grad_norm": 0.6399694681167603, "learning_rate": 6.32025049404032e-05, "loss": 0.7881, "step": 11705 }, { "epoch": 3.291174817313097, "grad_norm": 0.6674182415008545, "learning_rate": 6.311129897346708e-05, "loss": 0.7811, "step": 11710 }, { "epoch": 3.2925801011804383, "grad_norm": 0.492032527923584, "learning_rate": 6.302012851207455e-05, "loss": 0.7961, "step": 11715 }, { "epoch": 3.2939853850477796, "grad_norm": 0.5350286364555359, "learning_rate": 6.292899364397764e-05, "loss": 0.7901, "step": 11720 }, { "epoch": 3.295390668915121, "grad_norm": 0.5172516107559204, "learning_rate": 6.283789445689414e-05, "loss": 0.7946, "step": 11725 }, { "epoch": 3.296795952782462, "grad_norm": 0.5363070964813232, "learning_rate": 6.274683103850734e-05, "loss": 0.7859, "step": 11730 }, { "epoch": 3.2982012366498035, "grad_norm": 0.5336095094680786, "learning_rate": 6.265580347646624e-05, "loss": 0.7982, "step": 11735 }, { "epoch": 3.2996065205171443, "grad_norm": 0.5102813839912415, "learning_rate": 6.256481185838528e-05, "loss": 0.7899, "step": 11740 }, { "epoch": 3.3010118043844856, "grad_norm": 0.5685641169548035, "learning_rate": 6.247385627184435e-05, "loss": 0.7871, "step": 11745 }, { "epoch": 3.302417088251827, "grad_norm": 0.534301221370697, "learning_rate": 6.238293680438854e-05, "loss": 0.7816, "step": 11750 }, { "epoch": 3.303822372119168, "grad_norm": 0.52571702003479, "learning_rate": 6.22920535435283e-05, "loss": 0.8023, "step": 11755 }, { "epoch": 3.305227655986509, "grad_norm": 0.5615999102592468, "learning_rate": 6.220120657673922e-05, "loss": 0.7981, "step": 11760 }, { "epoch": 3.3066329398538503, "grad_norm": 0.5294827818870544, "learning_rate": 6.211039599146184e-05, "loss": 0.8098, "step": 11765 }, { "epoch": 3.3080382237211916, "grad_norm": 0.5420514941215515, "learning_rate": 6.201962187510179e-05, "loss": 0.7976, "step": 11770 }, { "epoch": 3.309443507588533, "grad_norm": 0.505811333656311, "learning_rate": 6.192888431502962e-05, "loss": 0.8005, "step": 11775 }, { "epoch": 3.310848791455874, "grad_norm": 0.49639084935188293, "learning_rate": 6.183818339858064e-05, "loss": 0.8182, "step": 11780 }, { "epoch": 3.3122540753232155, "grad_norm": 0.7370562553405762, "learning_rate": 6.174751921305484e-05, "loss": 0.7969, "step": 11785 }, { "epoch": 3.3136593591905563, "grad_norm": 0.4888727366924286, "learning_rate": 6.165689184571694e-05, "loss": 0.808, "step": 11790 }, { "epoch": 3.3150646430578976, "grad_norm": 0.535828173160553, "learning_rate": 6.15663013837962e-05, "loss": 0.7836, "step": 11795 }, { "epoch": 3.316469926925239, "grad_norm": 0.5442777276039124, "learning_rate": 6.147574791448638e-05, "loss": 0.795, "step": 11800 }, { "epoch": 3.31787521079258, "grad_norm": 0.5069718360900879, "learning_rate": 6.138523152494557e-05, "loss": 0.8176, "step": 11805 }, { "epoch": 3.3192804946599215, "grad_norm": 0.5373730659484863, "learning_rate": 6.129475230229622e-05, "loss": 0.8136, "step": 11810 }, { "epoch": 3.3206857785272623, "grad_norm": 0.5106169581413269, "learning_rate": 6.120431033362503e-05, "loss": 0.7883, "step": 11815 }, { "epoch": 3.3220910623946036, "grad_norm": 0.5332974195480347, "learning_rate": 6.111390570598274e-05, "loss": 0.8001, "step": 11820 }, { "epoch": 3.323496346261945, "grad_norm": 0.5583652257919312, "learning_rate": 6.102353850638428e-05, "loss": 0.7907, "step": 11825 }, { "epoch": 3.324901630129286, "grad_norm": 0.56987065076828, "learning_rate": 6.0933208821808465e-05, "loss": 0.7943, "step": 11830 }, { "epoch": 3.3263069139966275, "grad_norm": 0.5036487579345703, "learning_rate": 6.084291673919806e-05, "loss": 0.7914, "step": 11835 }, { "epoch": 3.3277121978639688, "grad_norm": 0.5210623741149902, "learning_rate": 6.075266234545956e-05, "loss": 0.8094, "step": 11840 }, { "epoch": 3.3291174817313096, "grad_norm": 0.5213614106178284, "learning_rate": 6.066244572746327e-05, "loss": 0.7805, "step": 11845 }, { "epoch": 3.330522765598651, "grad_norm": 0.5487878918647766, "learning_rate": 6.057226697204308e-05, "loss": 0.7955, "step": 11850 }, { "epoch": 3.331928049465992, "grad_norm": 0.6446609497070312, "learning_rate": 6.048212616599645e-05, "loss": 0.7966, "step": 11855 }, { "epoch": 3.3333333333333335, "grad_norm": 0.5579555034637451, "learning_rate": 6.039202339608432e-05, "loss": 0.7979, "step": 11860 }, { "epoch": 3.3347386172006743, "grad_norm": 0.5008167028427124, "learning_rate": 6.030195874903104e-05, "loss": 0.7986, "step": 11865 }, { "epoch": 3.3361439010680156, "grad_norm": 0.6530143022537231, "learning_rate": 6.021193231152424e-05, "loss": 0.7906, "step": 11870 }, { "epoch": 3.337549184935357, "grad_norm": 0.5034095644950867, "learning_rate": 6.012194417021475e-05, "loss": 0.8082, "step": 11875 }, { "epoch": 3.338954468802698, "grad_norm": 0.7468091249465942, "learning_rate": 6.0031994411716594e-05, "loss": 0.8013, "step": 11880 }, { "epoch": 3.3403597526700395, "grad_norm": 0.5912469029426575, "learning_rate": 5.9942083122606864e-05, "loss": 0.8076, "step": 11885 }, { "epoch": 3.3417650365373808, "grad_norm": 0.5136885046958923, "learning_rate": 5.98522103894255e-05, "loss": 0.792, "step": 11890 }, { "epoch": 3.3431703204047216, "grad_norm": 0.5261921882629395, "learning_rate": 5.976237629867545e-05, "loss": 0.7974, "step": 11895 }, { "epoch": 3.344575604272063, "grad_norm": 0.5408470630645752, "learning_rate": 5.9672580936822465e-05, "loss": 0.7946, "step": 11900 }, { "epoch": 3.345980888139404, "grad_norm": 0.5385935306549072, "learning_rate": 5.9582824390295e-05, "loss": 0.7778, "step": 11905 }, { "epoch": 3.3473861720067455, "grad_norm": 0.5939579010009766, "learning_rate": 5.9493106745484096e-05, "loss": 0.8341, "step": 11910 }, { "epoch": 3.3487914558740863, "grad_norm": 0.5419361591339111, "learning_rate": 5.9403428088743416e-05, "loss": 0.8016, "step": 11915 }, { "epoch": 3.3501967397414276, "grad_norm": 0.6431915760040283, "learning_rate": 5.9313788506389115e-05, "loss": 0.7874, "step": 11920 }, { "epoch": 3.351602023608769, "grad_norm": 0.5178044438362122, "learning_rate": 5.922418808469963e-05, "loss": 0.7925, "step": 11925 }, { "epoch": 3.35300730747611, "grad_norm": 0.5566776394844055, "learning_rate": 5.9134626909915825e-05, "loss": 0.7794, "step": 11930 }, { "epoch": 3.3544125913434515, "grad_norm": 0.529966413974762, "learning_rate": 5.904510506824074e-05, "loss": 0.7994, "step": 11935 }, { "epoch": 3.3558178752107928, "grad_norm": 0.5139265656471252, "learning_rate": 5.895562264583958e-05, "loss": 0.8013, "step": 11940 }, { "epoch": 3.3572231590781336, "grad_norm": 0.5273216962814331, "learning_rate": 5.886617972883952e-05, "loss": 0.7953, "step": 11945 }, { "epoch": 3.358628442945475, "grad_norm": 0.5380910634994507, "learning_rate": 5.877677640332979e-05, "loss": 0.7986, "step": 11950 }, { "epoch": 3.360033726812816, "grad_norm": 0.4941082298755646, "learning_rate": 5.868741275536158e-05, "loss": 0.7855, "step": 11955 }, { "epoch": 3.3614390106801575, "grad_norm": 0.4764517545700073, "learning_rate": 5.859808887094771e-05, "loss": 0.7901, "step": 11960 }, { "epoch": 3.362844294547499, "grad_norm": 0.587144672870636, "learning_rate": 5.850880483606289e-05, "loss": 0.7967, "step": 11965 }, { "epoch": 3.3642495784148396, "grad_norm": 0.5278950929641724, "learning_rate": 5.8419560736643374e-05, "loss": 0.7887, "step": 11970 }, { "epoch": 3.365654862282181, "grad_norm": 0.5130148530006409, "learning_rate": 5.833035665858706e-05, "loss": 0.784, "step": 11975 }, { "epoch": 3.367060146149522, "grad_norm": 0.5496023893356323, "learning_rate": 5.8241192687753245e-05, "loss": 0.7911, "step": 11980 }, { "epoch": 3.3684654300168635, "grad_norm": 0.5216452479362488, "learning_rate": 5.8152068909962665e-05, "loss": 0.811, "step": 11985 }, { "epoch": 3.369870713884205, "grad_norm": 0.49990314245224, "learning_rate": 5.806298541099742e-05, "loss": 0.8043, "step": 11990 }, { "epoch": 3.3712759977515456, "grad_norm": 0.5206735134124756, "learning_rate": 5.797394227660068e-05, "loss": 0.8125, "step": 11995 }, { "epoch": 3.372681281618887, "grad_norm": 0.5224774479866028, "learning_rate": 5.788493959247694e-05, "loss": 0.7833, "step": 12000 }, { "epoch": 3.374086565486228, "grad_norm": 0.5243582725524902, "learning_rate": 5.779597744429166e-05, "loss": 0.8006, "step": 12005 }, { "epoch": 3.3754918493535695, "grad_norm": 0.5527240037918091, "learning_rate": 5.770705591767135e-05, "loss": 0.7957, "step": 12010 }, { "epoch": 3.376897133220911, "grad_norm": 0.5153176784515381, "learning_rate": 5.7618175098203355e-05, "loss": 0.78, "step": 12015 }, { "epoch": 3.3783024170882516, "grad_norm": 0.604854166507721, "learning_rate": 5.75293350714359e-05, "loss": 0.7887, "step": 12020 }, { "epoch": 3.379707700955593, "grad_norm": 0.6152995228767395, "learning_rate": 5.7440535922877926e-05, "loss": 0.8006, "step": 12025 }, { "epoch": 3.381112984822934, "grad_norm": 0.5055761337280273, "learning_rate": 5.7351777737998955e-05, "loss": 0.8001, "step": 12030 }, { "epoch": 3.3825182686902755, "grad_norm": 0.5889893770217896, "learning_rate": 5.72630606022292e-05, "loss": 0.7956, "step": 12035 }, { "epoch": 3.383923552557617, "grad_norm": 0.5556915402412415, "learning_rate": 5.7174384600959294e-05, "loss": 0.7849, "step": 12040 }, { "epoch": 3.385328836424958, "grad_norm": 0.7165861129760742, "learning_rate": 5.708574981954033e-05, "loss": 0.7997, "step": 12045 }, { "epoch": 3.386734120292299, "grad_norm": 0.6431218981742859, "learning_rate": 5.699715634328362e-05, "loss": 0.7927, "step": 12050 }, { "epoch": 3.38813940415964, "grad_norm": 0.5332899689674377, "learning_rate": 5.690860425746084e-05, "loss": 0.7962, "step": 12055 }, { "epoch": 3.3895446880269815, "grad_norm": 0.5417184233665466, "learning_rate": 5.682009364730376e-05, "loss": 0.7856, "step": 12060 }, { "epoch": 3.390949971894323, "grad_norm": 0.5324541330337524, "learning_rate": 5.6731624598004254e-05, "loss": 0.7881, "step": 12065 }, { "epoch": 3.3923552557616636, "grad_norm": 0.544225811958313, "learning_rate": 5.6643197194714184e-05, "loss": 0.8035, "step": 12070 }, { "epoch": 3.393760539629005, "grad_norm": 0.5472251176834106, "learning_rate": 5.655481152254534e-05, "loss": 0.834, "step": 12075 }, { "epoch": 3.395165823496346, "grad_norm": 0.5320506691932678, "learning_rate": 5.646646766656938e-05, "loss": 0.7933, "step": 12080 }, { "epoch": 3.3965711073636875, "grad_norm": 0.5282317996025085, "learning_rate": 5.6378165711817554e-05, "loss": 0.7776, "step": 12085 }, { "epoch": 3.397976391231029, "grad_norm": 0.5126026272773743, "learning_rate": 5.628990574328099e-05, "loss": 0.792, "step": 12090 }, { "epoch": 3.39938167509837, "grad_norm": 0.49469032883644104, "learning_rate": 5.620168784591031e-05, "loss": 0.789, "step": 12095 }, { "epoch": 3.400786958965711, "grad_norm": 0.5256794095039368, "learning_rate": 5.611351210461559e-05, "loss": 0.7976, "step": 12100 }, { "epoch": 3.402192242833052, "grad_norm": 0.5282547473907471, "learning_rate": 5.602537860426641e-05, "loss": 0.7873, "step": 12105 }, { "epoch": 3.4035975267003935, "grad_norm": 0.5821418166160583, "learning_rate": 5.5937287429691664e-05, "loss": 0.7942, "step": 12110 }, { "epoch": 3.405002810567735, "grad_norm": 0.5334835648536682, "learning_rate": 5.584923866567954e-05, "loss": 0.7937, "step": 12115 }, { "epoch": 3.4064080944350756, "grad_norm": 0.5103002786636353, "learning_rate": 5.576123239697735e-05, "loss": 0.7858, "step": 12120 }, { "epoch": 3.407813378302417, "grad_norm": 0.5318170189857483, "learning_rate": 5.5673268708291526e-05, "loss": 0.8589, "step": 12125 }, { "epoch": 3.409218662169758, "grad_norm": 0.5624529719352722, "learning_rate": 5.558534768428758e-05, "loss": 0.7899, "step": 12130 }, { "epoch": 3.4106239460370995, "grad_norm": 0.5814023613929749, "learning_rate": 5.54974694095898e-05, "loss": 0.794, "step": 12135 }, { "epoch": 3.412029229904441, "grad_norm": 0.5576233863830566, "learning_rate": 5.5409633968781495e-05, "loss": 0.7996, "step": 12140 }, { "epoch": 3.413434513771782, "grad_norm": 0.5190870761871338, "learning_rate": 5.532184144640464e-05, "loss": 0.7997, "step": 12145 }, { "epoch": 3.414839797639123, "grad_norm": 0.5696029663085938, "learning_rate": 5.523409192696003e-05, "loss": 0.7931, "step": 12150 }, { "epoch": 3.416245081506464, "grad_norm": 0.5543867349624634, "learning_rate": 5.514638549490686e-05, "loss": 0.7841, "step": 12155 }, { "epoch": 3.4176503653738055, "grad_norm": 0.5611448884010315, "learning_rate": 5.5058722234663026e-05, "loss": 0.8076, "step": 12160 }, { "epoch": 3.419055649241147, "grad_norm": 0.7647613286972046, "learning_rate": 5.497110223060482e-05, "loss": 0.8003, "step": 12165 }, { "epoch": 3.420460933108488, "grad_norm": 0.5532127022743225, "learning_rate": 5.488352556706689e-05, "loss": 0.7898, "step": 12170 }, { "epoch": 3.421866216975829, "grad_norm": 0.6410893797874451, "learning_rate": 5.4795992328342185e-05, "loss": 0.799, "step": 12175 }, { "epoch": 3.4232715008431702, "grad_norm": 0.48902472853660583, "learning_rate": 5.4708502598681835e-05, "loss": 0.7883, "step": 12180 }, { "epoch": 3.4246767847105115, "grad_norm": 0.495099276304245, "learning_rate": 5.462105646229515e-05, "loss": 0.7942, "step": 12185 }, { "epoch": 3.426082068577853, "grad_norm": 0.5260860919952393, "learning_rate": 5.453365400334937e-05, "loss": 0.7951, "step": 12190 }, { "epoch": 3.427487352445194, "grad_norm": 0.5531935691833496, "learning_rate": 5.444629530596979e-05, "loss": 0.7798, "step": 12195 }, { "epoch": 3.4288926363125354, "grad_norm": 0.5121034383773804, "learning_rate": 5.435898045423955e-05, "loss": 0.833, "step": 12200 }, { "epoch": 3.4302979201798762, "grad_norm": 0.5159273743629456, "learning_rate": 5.4271709532199646e-05, "loss": 0.7916, "step": 12205 }, { "epoch": 3.4317032040472175, "grad_norm": 0.6313301920890808, "learning_rate": 5.418448262384864e-05, "loss": 0.7915, "step": 12210 }, { "epoch": 3.433108487914559, "grad_norm": 0.5238364934921265, "learning_rate": 5.409729981314291e-05, "loss": 0.7942, "step": 12215 }, { "epoch": 3.4345137717819, "grad_norm": 0.5627838969230652, "learning_rate": 5.4010161183996264e-05, "loss": 0.7899, "step": 12220 }, { "epoch": 3.435919055649241, "grad_norm": 0.5431748032569885, "learning_rate": 5.3923066820280075e-05, "loss": 0.7829, "step": 12225 }, { "epoch": 3.4373243395165822, "grad_norm": 0.5447311401367188, "learning_rate": 5.383601680582304e-05, "loss": 0.793, "step": 12230 }, { "epoch": 3.4387296233839235, "grad_norm": 0.47594648599624634, "learning_rate": 5.374901122441121e-05, "loss": 0.7832, "step": 12235 }, { "epoch": 3.440134907251265, "grad_norm": 0.5034043788909912, "learning_rate": 5.366205015978787e-05, "loss": 0.7853, "step": 12240 }, { "epoch": 3.441540191118606, "grad_norm": 0.5102277398109436, "learning_rate": 5.35751336956534e-05, "loss": 0.7788, "step": 12245 }, { "epoch": 3.4429454749859474, "grad_norm": 0.5406153798103333, "learning_rate": 5.348826191566532e-05, "loss": 0.7859, "step": 12250 }, { "epoch": 3.4443507588532882, "grad_norm": 0.4994388222694397, "learning_rate": 5.340143490343813e-05, "loss": 0.7925, "step": 12255 }, { "epoch": 3.4457560427206295, "grad_norm": 0.6013630032539368, "learning_rate": 5.331465274254318e-05, "loss": 0.8095, "step": 12260 }, { "epoch": 3.447161326587971, "grad_norm": 0.5830052495002747, "learning_rate": 5.322791551650872e-05, "loss": 0.7978, "step": 12265 }, { "epoch": 3.448566610455312, "grad_norm": 0.5456408858299255, "learning_rate": 5.3141223308819696e-05, "loss": 0.7905, "step": 12270 }, { "epoch": 3.449971894322653, "grad_norm": 0.5100853443145752, "learning_rate": 5.3054576202917896e-05, "loss": 0.777, "step": 12275 }, { "epoch": 3.4513771781899942, "grad_norm": 0.5169028639793396, "learning_rate": 5.296797428220138e-05, "loss": 0.8264, "step": 12280 }, { "epoch": 3.4527824620573355, "grad_norm": 0.5377866625785828, "learning_rate": 5.288141763002499e-05, "loss": 0.787, "step": 12285 }, { "epoch": 3.454187745924677, "grad_norm": 0.5345072150230408, "learning_rate": 5.27949063296999e-05, "loss": 0.7897, "step": 12290 }, { "epoch": 3.455593029792018, "grad_norm": 0.515516459941864, "learning_rate": 5.270844046449357e-05, "loss": 0.7872, "step": 12295 }, { "epoch": 3.4569983136593594, "grad_norm": 0.5370394587516785, "learning_rate": 5.2622020117629824e-05, "loss": 0.8009, "step": 12300 }, { "epoch": 3.4584035975267002, "grad_norm": 0.5313602089881897, "learning_rate": 5.2535645372288655e-05, "loss": 0.7967, "step": 12305 }, { "epoch": 3.4598088813940415, "grad_norm": 0.4978809058666229, "learning_rate": 5.244931631160619e-05, "loss": 0.7815, "step": 12310 }, { "epoch": 3.461214165261383, "grad_norm": 0.5503342151641846, "learning_rate": 5.2363033018674444e-05, "loss": 0.7922, "step": 12315 }, { "epoch": 3.462619449128724, "grad_norm": 0.5449639558792114, "learning_rate": 5.227679557654151e-05, "loss": 0.8062, "step": 12320 }, { "epoch": 3.4640247329960654, "grad_norm": 0.5676690936088562, "learning_rate": 5.219060406821141e-05, "loss": 0.8049, "step": 12325 }, { "epoch": 3.4654300168634062, "grad_norm": 0.49038565158843994, "learning_rate": 5.210445857664377e-05, "loss": 0.7954, "step": 12330 }, { "epoch": 3.4668353007307475, "grad_norm": 0.582527756690979, "learning_rate": 5.2018359184754065e-05, "loss": 0.7865, "step": 12335 }, { "epoch": 3.468240584598089, "grad_norm": 0.6153197884559631, "learning_rate": 5.193230597541332e-05, "loss": 0.7932, "step": 12340 }, { "epoch": 3.46964586846543, "grad_norm": 0.5249049067497253, "learning_rate": 5.184629903144821e-05, "loss": 0.7931, "step": 12345 }, { "epoch": 3.4710511523327714, "grad_norm": 0.5095629096031189, "learning_rate": 5.176033843564072e-05, "loss": 0.7963, "step": 12350 }, { "epoch": 3.4724564362001122, "grad_norm": 0.4929661750793457, "learning_rate": 5.167442427072837e-05, "loss": 0.7887, "step": 12355 }, { "epoch": 3.4738617200674535, "grad_norm": 0.5074275732040405, "learning_rate": 5.1588556619403985e-05, "loss": 0.7862, "step": 12360 }, { "epoch": 3.475267003934795, "grad_norm": 0.5591232776641846, "learning_rate": 5.150273556431542e-05, "loss": 0.7957, "step": 12365 }, { "epoch": 3.476672287802136, "grad_norm": 0.575605571269989, "learning_rate": 5.141696118806598e-05, "loss": 0.794, "step": 12370 }, { "epoch": 3.4780775716694774, "grad_norm": 0.5372434258460999, "learning_rate": 5.1331233573213844e-05, "loss": 0.8056, "step": 12375 }, { "epoch": 3.4794828555368182, "grad_norm": 0.5141233801841736, "learning_rate": 5.1245552802272275e-05, "loss": 0.7907, "step": 12380 }, { "epoch": 3.4808881394041595, "grad_norm": 0.511116087436676, "learning_rate": 5.115991895770933e-05, "loss": 0.7909, "step": 12385 }, { "epoch": 3.482293423271501, "grad_norm": 0.4942467212677002, "learning_rate": 5.107433212194801e-05, "loss": 0.7964, "step": 12390 }, { "epoch": 3.483698707138842, "grad_norm": 0.5006674528121948, "learning_rate": 5.098879237736609e-05, "loss": 0.7924, "step": 12395 }, { "epoch": 3.4851039910061834, "grad_norm": 0.47983628511428833, "learning_rate": 5.090329980629587e-05, "loss": 0.7918, "step": 12400 }, { "epoch": 3.4865092748735247, "grad_norm": 0.5198928713798523, "learning_rate": 5.0817854491024384e-05, "loss": 0.837, "step": 12405 }, { "epoch": 3.4879145587408655, "grad_norm": 0.5449215769767761, "learning_rate": 5.073245651379316e-05, "loss": 0.7916, "step": 12410 }, { "epoch": 3.489319842608207, "grad_norm": 0.49125024676322937, "learning_rate": 5.0647105956798116e-05, "loss": 0.7824, "step": 12415 }, { "epoch": 3.490725126475548, "grad_norm": 0.5600565671920776, "learning_rate": 5.056180290218957e-05, "loss": 0.8073, "step": 12420 }, { "epoch": 3.4921304103428894, "grad_norm": 0.49226316809654236, "learning_rate": 5.047654743207209e-05, "loss": 0.7841, "step": 12425 }, { "epoch": 3.4935356942102302, "grad_norm": 0.5037206411361694, "learning_rate": 5.03913396285045e-05, "loss": 0.803, "step": 12430 }, { "epoch": 3.4949409780775715, "grad_norm": 0.5516871213912964, "learning_rate": 5.030617957349962e-05, "loss": 0.7919, "step": 12435 }, { "epoch": 3.496346261944913, "grad_norm": 0.5346094965934753, "learning_rate": 5.022106734902444e-05, "loss": 0.7907, "step": 12440 }, { "epoch": 3.497751545812254, "grad_norm": 0.49881866574287415, "learning_rate": 5.013600303699987e-05, "loss": 0.8021, "step": 12445 }, { "epoch": 3.4991568296795954, "grad_norm": 0.5102651715278625, "learning_rate": 5.005098671930073e-05, "loss": 0.7913, "step": 12450 }, { "epoch": 3.5005621135469367, "grad_norm": 0.5428999662399292, "learning_rate": 4.996601847775553e-05, "loss": 0.8005, "step": 12455 }, { "epoch": 3.5019673974142775, "grad_norm": 0.6149014830589294, "learning_rate": 4.988109839414664e-05, "loss": 0.797, "step": 12460 }, { "epoch": 3.503372681281619, "grad_norm": 0.4979318678379059, "learning_rate": 4.979622655021003e-05, "loss": 0.7881, "step": 12465 }, { "epoch": 3.50477796514896, "grad_norm": 0.5156809687614441, "learning_rate": 4.971140302763524e-05, "loss": 0.7809, "step": 12470 }, { "epoch": 3.5061832490163014, "grad_norm": 0.5653836131095886, "learning_rate": 4.9626627908065294e-05, "loss": 0.8515, "step": 12475 }, { "epoch": 3.5075885328836423, "grad_norm": 0.6203331351280212, "learning_rate": 4.9541901273096626e-05, "loss": 0.7845, "step": 12480 }, { "epoch": 3.5089938167509835, "grad_norm": 0.5102115273475647, "learning_rate": 4.9457223204279066e-05, "loss": 0.7831, "step": 12485 }, { "epoch": 3.510399100618325, "grad_norm": 0.5312532782554626, "learning_rate": 4.937259378311555e-05, "loss": 0.7937, "step": 12490 }, { "epoch": 3.511804384485666, "grad_norm": 0.5389629602432251, "learning_rate": 4.928801309106233e-05, "loss": 0.7994, "step": 12495 }, { "epoch": 3.5132096683530074, "grad_norm": 0.5493090748786926, "learning_rate": 4.920348120952874e-05, "loss": 0.787, "step": 12500 }, { "epoch": 3.5146149522203487, "grad_norm": 0.49603936076164246, "learning_rate": 4.9118998219877044e-05, "loss": 0.841, "step": 12505 }, { "epoch": 3.51602023608769, "grad_norm": 0.5134543180465698, "learning_rate": 4.9034564203422526e-05, "loss": 0.7841, "step": 12510 }, { "epoch": 3.517425519955031, "grad_norm": 0.508040726184845, "learning_rate": 4.8950179241433325e-05, "loss": 0.803, "step": 12515 }, { "epoch": 3.518830803822372, "grad_norm": 0.52872633934021, "learning_rate": 4.886584341513035e-05, "loss": 0.7982, "step": 12520 }, { "epoch": 3.5202360876897134, "grad_norm": 0.49528661370277405, "learning_rate": 4.878155680568721e-05, "loss": 0.7869, "step": 12525 }, { "epoch": 3.5216413715570543, "grad_norm": 0.5169901251792908, "learning_rate": 4.8697319494230176e-05, "loss": 0.7899, "step": 12530 }, { "epoch": 3.5230466554243955, "grad_norm": 0.553735077381134, "learning_rate": 4.8613131561838054e-05, "loss": 0.7899, "step": 12535 }, { "epoch": 3.524451939291737, "grad_norm": 0.5353341698646545, "learning_rate": 4.8528993089542065e-05, "loss": 0.7942, "step": 12540 }, { "epoch": 3.525857223159078, "grad_norm": 0.5440569519996643, "learning_rate": 4.84449041583259e-05, "loss": 0.797, "step": 12545 }, { "epoch": 3.5272625070264194, "grad_norm": 0.4897547960281372, "learning_rate": 4.836086484912553e-05, "loss": 0.808, "step": 12550 }, { "epoch": 3.5286677908937607, "grad_norm": 0.49370864033699036, "learning_rate": 4.827687524282923e-05, "loss": 0.7879, "step": 12555 }, { "epoch": 3.530073074761102, "grad_norm": 0.5558463931083679, "learning_rate": 4.81929354202773e-05, "loss": 0.7819, "step": 12560 }, { "epoch": 3.531478358628443, "grad_norm": 0.5049279928207397, "learning_rate": 4.8109045462262233e-05, "loss": 0.7829, "step": 12565 }, { "epoch": 3.532883642495784, "grad_norm": 0.5592412948608398, "learning_rate": 4.802520544952852e-05, "loss": 0.8016, "step": 12570 }, { "epoch": 3.5342889263631254, "grad_norm": 0.5813235640525818, "learning_rate": 4.794141546277253e-05, "loss": 0.7847, "step": 12575 }, { "epoch": 3.5356942102304667, "grad_norm": 0.5444972515106201, "learning_rate": 4.785767558264252e-05, "loss": 0.801, "step": 12580 }, { "epoch": 3.5370994940978076, "grad_norm": 0.550809383392334, "learning_rate": 4.777398588973852e-05, "loss": 0.7921, "step": 12585 }, { "epoch": 3.538504777965149, "grad_norm": 0.5555300116539001, "learning_rate": 4.7690346464612255e-05, "loss": 0.8072, "step": 12590 }, { "epoch": 3.53991006183249, "grad_norm": 0.5757794976234436, "learning_rate": 4.7606757387766996e-05, "loss": 0.7817, "step": 12595 }, { "epoch": 3.5413153456998314, "grad_norm": 0.5164437890052795, "learning_rate": 4.752321873965765e-05, "loss": 0.7962, "step": 12600 }, { "epoch": 3.5427206295671727, "grad_norm": 0.5786922574043274, "learning_rate": 4.743973060069058e-05, "loss": 0.7916, "step": 12605 }, { "epoch": 3.544125913434514, "grad_norm": 0.5284855961799622, "learning_rate": 4.735629305122343e-05, "loss": 0.7832, "step": 12610 }, { "epoch": 3.545531197301855, "grad_norm": 0.5422257781028748, "learning_rate": 4.7272906171565266e-05, "loss": 0.8164, "step": 12615 }, { "epoch": 3.546936481169196, "grad_norm": 0.5167434811592102, "learning_rate": 4.718957004197634e-05, "loss": 0.7981, "step": 12620 }, { "epoch": 3.5483417650365374, "grad_norm": 0.5607612133026123, "learning_rate": 4.710628474266805e-05, "loss": 0.7916, "step": 12625 }, { "epoch": 3.5497470489038787, "grad_norm": 0.5161816477775574, "learning_rate": 4.702305035380288e-05, "loss": 0.7861, "step": 12630 }, { "epoch": 3.5511523327712196, "grad_norm": 0.6233710646629333, "learning_rate": 4.693986695549432e-05, "loss": 0.7981, "step": 12635 }, { "epoch": 3.552557616638561, "grad_norm": 0.5273126363754272, "learning_rate": 4.6856734627806754e-05, "loss": 0.8011, "step": 12640 }, { "epoch": 3.553962900505902, "grad_norm": 0.5350555181503296, "learning_rate": 4.677365345075548e-05, "loss": 0.7916, "step": 12645 }, { "epoch": 3.5553681843732434, "grad_norm": 0.5276046991348267, "learning_rate": 4.669062350430644e-05, "loss": 0.7841, "step": 12650 }, { "epoch": 3.5567734682405847, "grad_norm": 0.6036867499351501, "learning_rate": 4.660764486837635e-05, "loss": 0.8028, "step": 12655 }, { "epoch": 3.558178752107926, "grad_norm": 0.5013184547424316, "learning_rate": 4.65247176228326e-05, "loss": 0.794, "step": 12660 }, { "epoch": 3.559584035975267, "grad_norm": 0.5366091728210449, "learning_rate": 4.644184184749295e-05, "loss": 0.8003, "step": 12665 }, { "epoch": 3.560989319842608, "grad_norm": 0.48761817812919617, "learning_rate": 4.6359017622125745e-05, "loss": 0.7976, "step": 12670 }, { "epoch": 3.5623946037099494, "grad_norm": 0.556034505367279, "learning_rate": 4.6276245026449694e-05, "loss": 0.7858, "step": 12675 }, { "epoch": 3.5637998875772907, "grad_norm": 0.5266842842102051, "learning_rate": 4.6193524140133784e-05, "loss": 0.8347, "step": 12680 }, { "epoch": 3.5652051714446316, "grad_norm": 0.5990260243415833, "learning_rate": 4.6110855042797264e-05, "loss": 0.7882, "step": 12685 }, { "epoch": 3.566610455311973, "grad_norm": 0.5775483250617981, "learning_rate": 4.602823781400951e-05, "loss": 0.7857, "step": 12690 }, { "epoch": 3.568015739179314, "grad_norm": 0.5805662870407104, "learning_rate": 4.594567253329002e-05, "loss": 0.7875, "step": 12695 }, { "epoch": 3.5694210230466554, "grad_norm": 0.5316618084907532, "learning_rate": 4.586315928010817e-05, "loss": 0.7848, "step": 12700 }, { "epoch": 3.5708263069139967, "grad_norm": 0.5092722177505493, "learning_rate": 4.5780698133883405e-05, "loss": 0.7953, "step": 12705 }, { "epoch": 3.572231590781338, "grad_norm": 0.48918163776397705, "learning_rate": 4.5698289173984944e-05, "loss": 0.7876, "step": 12710 }, { "epoch": 3.5736368746486793, "grad_norm": 0.5924161672592163, "learning_rate": 4.561593247973181e-05, "loss": 0.7996, "step": 12715 }, { "epoch": 3.57504215851602, "grad_norm": 0.6285293102264404, "learning_rate": 4.5533628130392656e-05, "loss": 0.7904, "step": 12720 }, { "epoch": 3.5764474423833614, "grad_norm": 0.6160907745361328, "learning_rate": 4.54513762051858e-05, "loss": 0.7892, "step": 12725 }, { "epoch": 3.5778527262507027, "grad_norm": 0.5645490288734436, "learning_rate": 4.536917678327912e-05, "loss": 0.8048, "step": 12730 }, { "epoch": 3.579258010118044, "grad_norm": 0.5955885052680969, "learning_rate": 4.5287029943789935e-05, "loss": 0.7905, "step": 12735 }, { "epoch": 3.580663293985385, "grad_norm": 0.5727905035018921, "learning_rate": 4.520493576578494e-05, "loss": 0.8024, "step": 12740 }, { "epoch": 3.582068577852726, "grad_norm": 0.592326819896698, "learning_rate": 4.512289432828015e-05, "loss": 0.7992, "step": 12745 }, { "epoch": 3.5834738617200674, "grad_norm": 0.5737560987472534, "learning_rate": 4.5040905710240885e-05, "loss": 0.792, "step": 12750 }, { "epoch": 3.5848791455874087, "grad_norm": 0.4912492632865906, "learning_rate": 4.495896999058149e-05, "loss": 0.7914, "step": 12755 }, { "epoch": 3.58628442945475, "grad_norm": 0.4853997230529785, "learning_rate": 4.48770872481655e-05, "loss": 0.7819, "step": 12760 }, { "epoch": 3.5876897133220913, "grad_norm": 0.5390718579292297, "learning_rate": 4.479525756180545e-05, "loss": 0.7925, "step": 12765 }, { "epoch": 3.589094997189432, "grad_norm": 0.5622910261154175, "learning_rate": 4.471348101026274e-05, "loss": 0.7948, "step": 12770 }, { "epoch": 3.5905002810567734, "grad_norm": 0.5547574758529663, "learning_rate": 4.463175767224769e-05, "loss": 0.7913, "step": 12775 }, { "epoch": 3.5919055649241147, "grad_norm": 0.5188785791397095, "learning_rate": 4.455008762641937e-05, "loss": 0.793, "step": 12780 }, { "epoch": 3.593310848791456, "grad_norm": 0.4869183599948883, "learning_rate": 4.4468470951385666e-05, "loss": 0.7952, "step": 12785 }, { "epoch": 3.594716132658797, "grad_norm": 0.5184937119483948, "learning_rate": 4.438690772570291e-05, "loss": 0.7909, "step": 12790 }, { "epoch": 3.596121416526138, "grad_norm": 0.5637416243553162, "learning_rate": 4.43053980278761e-05, "loss": 0.8009, "step": 12795 }, { "epoch": 3.5975267003934794, "grad_norm": 0.48426154255867004, "learning_rate": 4.4223941936358745e-05, "loss": 0.8062, "step": 12800 }, { "epoch": 3.5989319842608207, "grad_norm": 0.48236146569252014, "learning_rate": 4.414253952955262e-05, "loss": 0.7967, "step": 12805 }, { "epoch": 3.600337268128162, "grad_norm": 0.5122547745704651, "learning_rate": 4.406119088580796e-05, "loss": 0.801, "step": 12810 }, { "epoch": 3.6017425519955033, "grad_norm": 0.5660848617553711, "learning_rate": 4.397989608342319e-05, "loss": 0.7827, "step": 12815 }, { "epoch": 3.603147835862844, "grad_norm": 0.5280227065086365, "learning_rate": 4.389865520064499e-05, "loss": 0.7914, "step": 12820 }, { "epoch": 3.6045531197301854, "grad_norm": 0.4934808015823364, "learning_rate": 4.3817468315667954e-05, "loss": 0.7792, "step": 12825 }, { "epoch": 3.6059584035975267, "grad_norm": 0.52223801612854, "learning_rate": 4.373633550663495e-05, "loss": 0.7834, "step": 12830 }, { "epoch": 3.607363687464868, "grad_norm": 0.5264943838119507, "learning_rate": 4.365525685163668e-05, "loss": 0.7889, "step": 12835 }, { "epoch": 3.608768971332209, "grad_norm": 0.5082861185073853, "learning_rate": 4.357423242871164e-05, "loss": 0.7941, "step": 12840 }, { "epoch": 3.61017425519955, "grad_norm": 0.5219237804412842, "learning_rate": 4.349326231584624e-05, "loss": 0.7887, "step": 12845 }, { "epoch": 3.6115795390668914, "grad_norm": 0.48076823353767395, "learning_rate": 4.341234659097459e-05, "loss": 0.7818, "step": 12850 }, { "epoch": 3.6129848229342327, "grad_norm": 0.4861706793308258, "learning_rate": 4.333148533197849e-05, "loss": 0.7901, "step": 12855 }, { "epoch": 3.614390106801574, "grad_norm": 0.515626072883606, "learning_rate": 4.32506786166872e-05, "loss": 0.7808, "step": 12860 }, { "epoch": 3.6157953906689153, "grad_norm": 0.5344110131263733, "learning_rate": 4.316992652287758e-05, "loss": 0.7773, "step": 12865 }, { "epoch": 3.6172006745362566, "grad_norm": 0.5144559741020203, "learning_rate": 4.3089229128273924e-05, "loss": 0.8073, "step": 12870 }, { "epoch": 3.6186059584035974, "grad_norm": 0.5614593029022217, "learning_rate": 4.300858651054774e-05, "loss": 0.7971, "step": 12875 }, { "epoch": 3.6200112422709387, "grad_norm": 0.4780723750591278, "learning_rate": 4.2927998747318034e-05, "loss": 0.7915, "step": 12880 }, { "epoch": 3.62141652613828, "grad_norm": 0.533393919467926, "learning_rate": 4.284746591615084e-05, "loss": 0.7805, "step": 12885 }, { "epoch": 3.622821810005621, "grad_norm": 0.5291486382484436, "learning_rate": 4.276698809455944e-05, "loss": 0.7906, "step": 12890 }, { "epoch": 3.624227093872962, "grad_norm": 0.4856104552745819, "learning_rate": 4.2686565360004e-05, "loss": 0.7866, "step": 12895 }, { "epoch": 3.6256323777403034, "grad_norm": 0.5947971940040588, "learning_rate": 4.260619778989183e-05, "loss": 0.8074, "step": 12900 }, { "epoch": 3.6270376616076447, "grad_norm": 0.5176420211791992, "learning_rate": 4.252588546157713e-05, "loss": 0.7776, "step": 12905 }, { "epoch": 3.628442945474986, "grad_norm": 0.4930765926837921, "learning_rate": 4.244562845236079e-05, "loss": 0.7908, "step": 12910 }, { "epoch": 3.6298482293423273, "grad_norm": 0.5033612847328186, "learning_rate": 4.2365426839490583e-05, "loss": 0.7942, "step": 12915 }, { "epoch": 3.6312535132096686, "grad_norm": 0.5101487636566162, "learning_rate": 4.228528070016094e-05, "loss": 0.7964, "step": 12920 }, { "epoch": 3.6326587970770094, "grad_norm": 0.48270928859710693, "learning_rate": 4.220519011151289e-05, "loss": 0.784, "step": 12925 }, { "epoch": 3.6340640809443507, "grad_norm": 0.5077676177024841, "learning_rate": 4.2125155150633986e-05, "loss": 0.7954, "step": 12930 }, { "epoch": 3.635469364811692, "grad_norm": 0.5236606001853943, "learning_rate": 4.204517589455825e-05, "loss": 0.7917, "step": 12935 }, { "epoch": 3.6368746486790333, "grad_norm": 0.503719687461853, "learning_rate": 4.19652524202661e-05, "loss": 0.7852, "step": 12940 }, { "epoch": 3.638279932546374, "grad_norm": 0.5560110807418823, "learning_rate": 4.18853848046842e-05, "loss": 0.7773, "step": 12945 }, { "epoch": 3.6396852164137155, "grad_norm": 0.5376433730125427, "learning_rate": 4.1805573124685504e-05, "loss": 0.7888, "step": 12950 }, { "epoch": 3.6410905002810567, "grad_norm": 0.5354559421539307, "learning_rate": 4.1725817457089146e-05, "loss": 0.7928, "step": 12955 }, { "epoch": 3.642495784148398, "grad_norm": 0.5309024453163147, "learning_rate": 4.164611787866034e-05, "loss": 0.8369, "step": 12960 }, { "epoch": 3.6439010680157393, "grad_norm": 0.48679086565971375, "learning_rate": 4.156647446611023e-05, "loss": 0.7882, "step": 12965 }, { "epoch": 3.6453063518830806, "grad_norm": 0.5110394954681396, "learning_rate": 4.148688729609599e-05, "loss": 0.7856, "step": 12970 }, { "epoch": 3.6467116357504215, "grad_norm": 0.5115504264831543, "learning_rate": 4.1407356445220655e-05, "loss": 0.8106, "step": 12975 }, { "epoch": 3.6481169196177627, "grad_norm": 0.5382710695266724, "learning_rate": 4.132788199003302e-05, "loss": 0.7896, "step": 12980 }, { "epoch": 3.649522203485104, "grad_norm": 0.5351971983909607, "learning_rate": 4.124846400702761e-05, "loss": 0.7915, "step": 12985 }, { "epoch": 3.6509274873524453, "grad_norm": 0.5722103118896484, "learning_rate": 4.116910257264461e-05, "loss": 0.8012, "step": 12990 }, { "epoch": 3.652332771219786, "grad_norm": 0.4893995225429535, "learning_rate": 4.108979776326979e-05, "loss": 0.7898, "step": 12995 }, { "epoch": 3.6537380550871275, "grad_norm": 0.5160402059555054, "learning_rate": 4.101054965523432e-05, "loss": 0.7932, "step": 13000 }, { "epoch": 3.6551433389544687, "grad_norm": 0.4884108901023865, "learning_rate": 4.093135832481491e-05, "loss": 0.7948, "step": 13005 }, { "epoch": 3.65654862282181, "grad_norm": 0.48411718010902405, "learning_rate": 4.0852223848233604e-05, "loss": 0.7861, "step": 13010 }, { "epoch": 3.6579539066891513, "grad_norm": 0.4931657016277313, "learning_rate": 4.077314630165763e-05, "loss": 0.7853, "step": 13015 }, { "epoch": 3.6593591905564926, "grad_norm": 0.5150357484817505, "learning_rate": 4.069412576119953e-05, "loss": 0.7864, "step": 13020 }, { "epoch": 3.6607644744238335, "grad_norm": 0.530714213848114, "learning_rate": 4.061516230291693e-05, "loss": 0.7786, "step": 13025 }, { "epoch": 3.6621697582911747, "grad_norm": 0.5072176456451416, "learning_rate": 4.053625600281252e-05, "loss": 0.776, "step": 13030 }, { "epoch": 3.663575042158516, "grad_norm": 0.49845072627067566, "learning_rate": 4.0457406936833984e-05, "loss": 0.8124, "step": 13035 }, { "epoch": 3.6649803260258573, "grad_norm": 0.5175867080688477, "learning_rate": 4.0378615180873905e-05, "loss": 0.7966, "step": 13040 }, { "epoch": 3.666385609893198, "grad_norm": 0.49814727902412415, "learning_rate": 4.0299880810769694e-05, "loss": 0.7934, "step": 13045 }, { "epoch": 3.6677908937605395, "grad_norm": 0.5344975590705872, "learning_rate": 4.02212039023036e-05, "loss": 0.7882, "step": 13050 }, { "epoch": 3.6691961776278808, "grad_norm": 0.5497987866401672, "learning_rate": 4.014258453120242e-05, "loss": 0.7858, "step": 13055 }, { "epoch": 3.670601461495222, "grad_norm": 0.5829319953918457, "learning_rate": 4.006402277313771e-05, "loss": 0.7908, "step": 13060 }, { "epoch": 3.6720067453625633, "grad_norm": 0.4865242838859558, "learning_rate": 3.9985518703725545e-05, "loss": 0.7913, "step": 13065 }, { "epoch": 3.6734120292299046, "grad_norm": 0.53497314453125, "learning_rate": 3.990707239852638e-05, "loss": 0.7808, "step": 13070 }, { "epoch": 3.674817313097246, "grad_norm": 0.5179198980331421, "learning_rate": 3.9828683933045186e-05, "loss": 0.7808, "step": 13075 }, { "epoch": 3.6762225969645868, "grad_norm": 0.5384912490844727, "learning_rate": 3.975035338273121e-05, "loss": 0.8018, "step": 13080 }, { "epoch": 3.677627880831928, "grad_norm": 0.5635836124420166, "learning_rate": 3.9672080822977985e-05, "loss": 0.7842, "step": 13085 }, { "epoch": 3.6790331646992693, "grad_norm": 0.5859927535057068, "learning_rate": 3.9593866329123184e-05, "loss": 0.7991, "step": 13090 }, { "epoch": 3.6804384485666106, "grad_norm": 0.5475590229034424, "learning_rate": 3.951570997644862e-05, "loss": 0.8046, "step": 13095 }, { "epoch": 3.6818437324339515, "grad_norm": 0.5195204019546509, "learning_rate": 3.94376118401802e-05, "loss": 0.7896, "step": 13100 }, { "epoch": 3.6832490163012928, "grad_norm": 0.5426797270774841, "learning_rate": 3.9359571995487644e-05, "loss": 0.79, "step": 13105 }, { "epoch": 3.684654300168634, "grad_norm": 0.49161791801452637, "learning_rate": 3.928159051748469e-05, "loss": 0.7918, "step": 13110 }, { "epoch": 3.6860595840359753, "grad_norm": 0.5060259699821472, "learning_rate": 3.9203667481228876e-05, "loss": 0.7774, "step": 13115 }, { "epoch": 3.6874648679033166, "grad_norm": 0.4989676773548126, "learning_rate": 3.9125802961721536e-05, "loss": 0.7903, "step": 13120 }, { "epoch": 3.688870151770658, "grad_norm": 0.7031319737434387, "learning_rate": 3.904799703390752e-05, "loss": 0.793, "step": 13125 }, { "epoch": 3.6902754356379988, "grad_norm": 0.6419675946235657, "learning_rate": 3.897024977267546e-05, "loss": 0.7808, "step": 13130 }, { "epoch": 3.69168071950534, "grad_norm": 0.5509933829307556, "learning_rate": 3.8892561252857415e-05, "loss": 0.8033, "step": 13135 }, { "epoch": 3.6930860033726813, "grad_norm": 0.5243677496910095, "learning_rate": 3.881493154922898e-05, "loss": 0.7917, "step": 13140 }, { "epoch": 3.6944912872400226, "grad_norm": 0.5301499366760254, "learning_rate": 3.873736073650906e-05, "loss": 0.8439, "step": 13145 }, { "epoch": 3.6958965711073635, "grad_norm": 0.5213268995285034, "learning_rate": 3.865984888935996e-05, "loss": 0.7959, "step": 13150 }, { "epoch": 3.6973018549747048, "grad_norm": 0.49913641810417175, "learning_rate": 3.858239608238718e-05, "loss": 0.7911, "step": 13155 }, { "epoch": 3.698707138842046, "grad_norm": 0.5024813413619995, "learning_rate": 3.850500239013937e-05, "loss": 0.7901, "step": 13160 }, { "epoch": 3.7001124227093873, "grad_norm": 0.5601816773414612, "learning_rate": 3.842766788710832e-05, "loss": 0.793, "step": 13165 }, { "epoch": 3.7015177065767286, "grad_norm": 0.49501365423202515, "learning_rate": 3.8350392647728896e-05, "loss": 0.794, "step": 13170 }, { "epoch": 3.70292299044407, "grad_norm": 0.5512212514877319, "learning_rate": 3.827317674637878e-05, "loss": 0.7949, "step": 13175 }, { "epoch": 3.7043282743114108, "grad_norm": 0.5936423540115356, "learning_rate": 3.8196020257378686e-05, "loss": 0.7948, "step": 13180 }, { "epoch": 3.705733558178752, "grad_norm": 0.5287837386131287, "learning_rate": 3.811892325499208e-05, "loss": 0.7922, "step": 13185 }, { "epoch": 3.7071388420460933, "grad_norm": 0.6881371736526489, "learning_rate": 3.804188581342517e-05, "loss": 0.8122, "step": 13190 }, { "epoch": 3.7085441259134346, "grad_norm": 0.5230666995048523, "learning_rate": 3.796490800682687e-05, "loss": 0.8076, "step": 13195 }, { "epoch": 3.7099494097807755, "grad_norm": 0.5672796368598938, "learning_rate": 3.7887989909288646e-05, "loss": 0.7956, "step": 13200 }, { "epoch": 3.7113546936481168, "grad_norm": 0.48925039172172546, "learning_rate": 3.7811131594844574e-05, "loss": 0.841, "step": 13205 }, { "epoch": 3.712759977515458, "grad_norm": 0.5012375712394714, "learning_rate": 3.773433313747105e-05, "loss": 0.7958, "step": 13210 }, { "epoch": 3.7141652613827993, "grad_norm": 0.5377150177955627, "learning_rate": 3.765759461108698e-05, "loss": 0.8373, "step": 13215 }, { "epoch": 3.7155705452501406, "grad_norm": 0.5510297417640686, "learning_rate": 3.758091608955354e-05, "loss": 0.7929, "step": 13220 }, { "epoch": 3.716975829117482, "grad_norm": 0.4897007346153259, "learning_rate": 3.750429764667421e-05, "loss": 0.7859, "step": 13225 }, { "epoch": 3.718381112984823, "grad_norm": 0.5488338470458984, "learning_rate": 3.742773935619451e-05, "loss": 0.7972, "step": 13230 }, { "epoch": 3.719786396852164, "grad_norm": 0.5065768957138062, "learning_rate": 3.735124129180219e-05, "loss": 0.7969, "step": 13235 }, { "epoch": 3.7211916807195053, "grad_norm": 0.4968884289264679, "learning_rate": 3.7274803527126966e-05, "loss": 0.7883, "step": 13240 }, { "epoch": 3.7225969645868466, "grad_norm": 0.5004639625549316, "learning_rate": 3.719842613574056e-05, "loss": 0.7893, "step": 13245 }, { "epoch": 3.7240022484541875, "grad_norm": 0.6538251042366028, "learning_rate": 3.712210919115655e-05, "loss": 0.7924, "step": 13250 }, { "epoch": 3.7254075323215288, "grad_norm": 0.5038894414901733, "learning_rate": 3.7045852766830344e-05, "loss": 0.7805, "step": 13255 }, { "epoch": 3.72681281618887, "grad_norm": 0.5190169215202332, "learning_rate": 3.696965693615915e-05, "loss": 0.8538, "step": 13260 }, { "epoch": 3.7282181000562113, "grad_norm": 0.5098369717597961, "learning_rate": 3.68935217724817e-05, "loss": 0.7711, "step": 13265 }, { "epoch": 3.7296233839235526, "grad_norm": 0.5032815933227539, "learning_rate": 3.6817447349078506e-05, "loss": 0.8048, "step": 13270 }, { "epoch": 3.731028667790894, "grad_norm": 0.5347372889518738, "learning_rate": 3.674143373917157e-05, "loss": 0.7887, "step": 13275 }, { "epoch": 3.732433951658235, "grad_norm": 0.5623759031295776, "learning_rate": 3.6665481015924265e-05, "loss": 0.7976, "step": 13280 }, { "epoch": 3.733839235525576, "grad_norm": 0.5230100750923157, "learning_rate": 3.658958925244145e-05, "loss": 0.7769, "step": 13285 }, { "epoch": 3.7352445193929174, "grad_norm": 0.6419785618782043, "learning_rate": 3.651375852176935e-05, "loss": 0.7898, "step": 13290 }, { "epoch": 3.7366498032602586, "grad_norm": 0.5499880313873291, "learning_rate": 3.643798889689539e-05, "loss": 0.793, "step": 13295 }, { "epoch": 3.7380550871276, "grad_norm": 0.57011479139328, "learning_rate": 3.636228045074812e-05, "loss": 0.7857, "step": 13300 }, { "epoch": 3.7394603709949408, "grad_norm": 0.4934341013431549, "learning_rate": 3.62866332561973e-05, "loss": 0.8345, "step": 13305 }, { "epoch": 3.740865654862282, "grad_norm": 0.47908616065979004, "learning_rate": 3.621104738605373e-05, "loss": 0.786, "step": 13310 }, { "epoch": 3.7422709387296234, "grad_norm": 0.5038416385650635, "learning_rate": 3.61355229130691e-05, "loss": 0.7937, "step": 13315 }, { "epoch": 3.7436762225969646, "grad_norm": 0.5455382466316223, "learning_rate": 3.606005990993607e-05, "loss": 0.7889, "step": 13320 }, { "epoch": 3.745081506464306, "grad_norm": 0.48870712518692017, "learning_rate": 3.5984658449288155e-05, "loss": 0.7941, "step": 13325 }, { "epoch": 3.746486790331647, "grad_norm": 0.49450066685676575, "learning_rate": 3.590931860369963e-05, "loss": 0.7958, "step": 13330 }, { "epoch": 3.747892074198988, "grad_norm": 0.47441360354423523, "learning_rate": 3.5834040445685324e-05, "loss": 0.7849, "step": 13335 }, { "epoch": 3.7492973580663294, "grad_norm": 0.49870437383651733, "learning_rate": 3.575882404770093e-05, "loss": 0.7893, "step": 13340 }, { "epoch": 3.7507026419336706, "grad_norm": 0.4865383505821228, "learning_rate": 3.5683669482142565e-05, "loss": 0.7878, "step": 13345 }, { "epoch": 3.752107925801012, "grad_norm": 0.4772275388240814, "learning_rate": 3.5608576821346786e-05, "loss": 0.7967, "step": 13350 }, { "epoch": 3.753513209668353, "grad_norm": 0.5540780425071716, "learning_rate": 3.553354613759064e-05, "loss": 0.8323, "step": 13355 }, { "epoch": 3.754918493535694, "grad_norm": 0.5324468016624451, "learning_rate": 3.545857750309153e-05, "loss": 0.7924, "step": 13360 }, { "epoch": 3.7563237774030354, "grad_norm": 0.5483556985855103, "learning_rate": 3.538367099000711e-05, "loss": 0.7986, "step": 13365 }, { "epoch": 3.7577290612703766, "grad_norm": 0.5705327391624451, "learning_rate": 3.530882667043519e-05, "loss": 0.7925, "step": 13370 }, { "epoch": 3.759134345137718, "grad_norm": 0.5296326279640198, "learning_rate": 3.5234044616413816e-05, "loss": 0.7952, "step": 13375 }, { "epoch": 3.7605396290050592, "grad_norm": 0.5047909021377563, "learning_rate": 3.515932489992104e-05, "loss": 0.812, "step": 13380 }, { "epoch": 3.7619449128724, "grad_norm": 0.4710387885570526, "learning_rate": 3.508466759287494e-05, "loss": 0.7827, "step": 13385 }, { "epoch": 3.7633501967397414, "grad_norm": 0.5809809565544128, "learning_rate": 3.5010072767133504e-05, "loss": 0.7874, "step": 13390 }, { "epoch": 3.7647554806070826, "grad_norm": 0.5445647835731506, "learning_rate": 3.493554049449461e-05, "loss": 0.7853, "step": 13395 }, { "epoch": 3.766160764474424, "grad_norm": 0.586715042591095, "learning_rate": 3.4861070846695945e-05, "loss": 0.7984, "step": 13400 }, { "epoch": 3.767566048341765, "grad_norm": 0.5245351195335388, "learning_rate": 3.478666389541481e-05, "loss": 0.7974, "step": 13405 }, { "epoch": 3.768971332209106, "grad_norm": 0.5032010674476624, "learning_rate": 3.471231971226826e-05, "loss": 0.78, "step": 13410 }, { "epoch": 3.7703766160764474, "grad_norm": 0.5067157745361328, "learning_rate": 3.4638038368812965e-05, "loss": 0.7949, "step": 13415 }, { "epoch": 3.7717818999437887, "grad_norm": 0.5336160063743591, "learning_rate": 3.4563819936544985e-05, "loss": 0.7977, "step": 13420 }, { "epoch": 3.77318718381113, "grad_norm": 0.5501193404197693, "learning_rate": 3.448966448689992e-05, "loss": 0.7966, "step": 13425 }, { "epoch": 3.7745924676784712, "grad_norm": 0.5124372839927673, "learning_rate": 3.4415572091252756e-05, "loss": 0.7849, "step": 13430 }, { "epoch": 3.7759977515458125, "grad_norm": 0.5596308708190918, "learning_rate": 3.434154282091775e-05, "loss": 0.7827, "step": 13435 }, { "epoch": 3.7774030354131534, "grad_norm": 0.5546366572380066, "learning_rate": 3.42675767471484e-05, "loss": 0.7851, "step": 13440 }, { "epoch": 3.7788083192804947, "grad_norm": 0.5042093396186829, "learning_rate": 3.419367394113742e-05, "loss": 0.7815, "step": 13445 }, { "epoch": 3.780213603147836, "grad_norm": 0.49269187450408936, "learning_rate": 3.411983447401656e-05, "loss": 0.7876, "step": 13450 }, { "epoch": 3.781618887015177, "grad_norm": 0.4880962073802948, "learning_rate": 3.404605841685672e-05, "loss": 0.7881, "step": 13455 }, { "epoch": 3.783024170882518, "grad_norm": 0.5173097252845764, "learning_rate": 3.3972345840667606e-05, "loss": 0.7966, "step": 13460 }, { "epoch": 3.7844294547498594, "grad_norm": 0.5115389823913574, "learning_rate": 3.3898696816397944e-05, "loss": 0.7871, "step": 13465 }, { "epoch": 3.7858347386172007, "grad_norm": 0.49383237957954407, "learning_rate": 3.3825111414935284e-05, "loss": 0.8044, "step": 13470 }, { "epoch": 3.787240022484542, "grad_norm": 0.5096961259841919, "learning_rate": 3.3751589707105856e-05, "loss": 0.7931, "step": 13475 }, { "epoch": 3.7886453063518832, "grad_norm": 0.5193392038345337, "learning_rate": 3.367813176367467e-05, "loss": 0.8222, "step": 13480 }, { "epoch": 3.7900505902192245, "grad_norm": 0.6020642518997192, "learning_rate": 3.360473765534534e-05, "loss": 0.79, "step": 13485 }, { "epoch": 3.7914558740865654, "grad_norm": 0.679407000541687, "learning_rate": 3.353140745276002e-05, "loss": 0.7824, "step": 13490 }, { "epoch": 3.7928611579539067, "grad_norm": 0.5894041061401367, "learning_rate": 3.345814122649937e-05, "loss": 0.8103, "step": 13495 }, { "epoch": 3.794266441821248, "grad_norm": 0.4902842938899994, "learning_rate": 3.338493904708246e-05, "loss": 0.7874, "step": 13500 }, { "epoch": 3.7956717256885892, "grad_norm": 0.5133805871009827, "learning_rate": 3.3311800984966776e-05, "loss": 0.8053, "step": 13505 }, { "epoch": 3.79707700955593, "grad_norm": 0.5681965351104736, "learning_rate": 3.323872711054796e-05, "loss": 0.8309, "step": 13510 }, { "epoch": 3.7984822934232714, "grad_norm": 0.5490449666976929, "learning_rate": 3.316571749415998e-05, "loss": 0.7835, "step": 13515 }, { "epoch": 3.7998875772906127, "grad_norm": 0.5362129211425781, "learning_rate": 3.309277220607493e-05, "loss": 0.8493, "step": 13520 }, { "epoch": 3.801292861157954, "grad_norm": 0.5412881374359131, "learning_rate": 3.3019891316503016e-05, "loss": 0.7863, "step": 13525 }, { "epoch": 3.8026981450252952, "grad_norm": 0.5421592593193054, "learning_rate": 3.294707489559237e-05, "loss": 0.802, "step": 13530 }, { "epoch": 3.8041034288926365, "grad_norm": 0.5225282311439514, "learning_rate": 3.287432301342914e-05, "loss": 0.7921, "step": 13535 }, { "epoch": 3.8055087127599774, "grad_norm": 0.49669602513313293, "learning_rate": 3.2801635740037375e-05, "loss": 0.7848, "step": 13540 }, { "epoch": 3.8069139966273187, "grad_norm": 0.5054916143417358, "learning_rate": 3.2729013145378894e-05, "loss": 0.7869, "step": 13545 }, { "epoch": 3.80831928049466, "grad_norm": 0.5187205076217651, "learning_rate": 3.265645529935327e-05, "loss": 0.8285, "step": 13550 }, { "epoch": 3.8097245643620012, "grad_norm": 0.5249623656272888, "learning_rate": 3.2583962271797776e-05, "loss": 0.7834, "step": 13555 }, { "epoch": 3.811129848229342, "grad_norm": 0.5045213103294373, "learning_rate": 3.251153413248731e-05, "loss": 0.8339, "step": 13560 }, { "epoch": 3.8125351320966834, "grad_norm": 0.49637290835380554, "learning_rate": 3.243917095113422e-05, "loss": 0.7872, "step": 13565 }, { "epoch": 3.8139404159640247, "grad_norm": 0.49728259444236755, "learning_rate": 3.2366872797388434e-05, "loss": 0.787, "step": 13570 }, { "epoch": 3.815345699831366, "grad_norm": 0.5004485249519348, "learning_rate": 3.22946397408373e-05, "loss": 0.7981, "step": 13575 }, { "epoch": 3.8167509836987072, "grad_norm": 0.5126949548721313, "learning_rate": 3.2222471851005375e-05, "loss": 0.8253, "step": 13580 }, { "epoch": 3.8181562675660485, "grad_norm": 0.5166481137275696, "learning_rate": 3.2150369197354636e-05, "loss": 0.8044, "step": 13585 }, { "epoch": 3.8195615514333894, "grad_norm": 0.4981909394264221, "learning_rate": 3.2078331849284204e-05, "loss": 0.7918, "step": 13590 }, { "epoch": 3.8209668353007307, "grad_norm": 0.5490385890007019, "learning_rate": 3.200635987613038e-05, "loss": 0.7832, "step": 13595 }, { "epoch": 3.822372119168072, "grad_norm": 0.4970734119415283, "learning_rate": 3.1934453347166484e-05, "loss": 0.7895, "step": 13600 }, { "epoch": 3.8237774030354132, "grad_norm": 0.5258076190948486, "learning_rate": 3.1862612331602904e-05, "loss": 0.8058, "step": 13605 }, { "epoch": 3.825182686902754, "grad_norm": 0.6316525936126709, "learning_rate": 3.1790836898586976e-05, "loss": 0.7868, "step": 13610 }, { "epoch": 3.8265879707700954, "grad_norm": 0.5370373725891113, "learning_rate": 3.171912711720281e-05, "loss": 0.8033, "step": 13615 }, { "epoch": 3.8279932546374367, "grad_norm": 0.5009310841560364, "learning_rate": 3.164748305647144e-05, "loss": 0.7875, "step": 13620 }, { "epoch": 3.829398538504778, "grad_norm": 0.5032376646995544, "learning_rate": 3.1575904785350586e-05, "loss": 0.7787, "step": 13625 }, { "epoch": 3.8308038223721192, "grad_norm": 0.4967799484729767, "learning_rate": 3.1504392372734715e-05, "loss": 0.786, "step": 13630 }, { "epoch": 3.8322091062394605, "grad_norm": 0.5378280282020569, "learning_rate": 3.143294588745478e-05, "loss": 0.797, "step": 13635 }, { "epoch": 3.833614390106802, "grad_norm": 0.517133355140686, "learning_rate": 3.136156539827837e-05, "loss": 0.8284, "step": 13640 }, { "epoch": 3.8350196739741427, "grad_norm": 0.53566974401474, "learning_rate": 3.129025097390955e-05, "loss": 0.7862, "step": 13645 }, { "epoch": 3.836424957841484, "grad_norm": 0.5064863562583923, "learning_rate": 3.1219002682988774e-05, "loss": 0.79, "step": 13650 }, { "epoch": 3.8378302417088253, "grad_norm": 0.49409398436546326, "learning_rate": 3.114782059409284e-05, "loss": 0.8419, "step": 13655 }, { "epoch": 3.8392355255761665, "grad_norm": 0.551094651222229, "learning_rate": 3.107670477573484e-05, "loss": 0.7942, "step": 13660 }, { "epoch": 3.8406408094435074, "grad_norm": 0.4756724238395691, "learning_rate": 3.100565529636412e-05, "loss": 0.7884, "step": 13665 }, { "epoch": 3.8420460933108487, "grad_norm": 0.5304540395736694, "learning_rate": 3.093467222436605e-05, "loss": 0.8083, "step": 13670 }, { "epoch": 3.84345137717819, "grad_norm": 0.4981602728366852, "learning_rate": 3.0863755628062196e-05, "loss": 0.7893, "step": 13675 }, { "epoch": 3.8448566610455313, "grad_norm": 0.48895543813705444, "learning_rate": 3.079290557571014e-05, "loss": 0.7858, "step": 13680 }, { "epoch": 3.8462619449128725, "grad_norm": 0.5378122329711914, "learning_rate": 3.072212213550332e-05, "loss": 0.7804, "step": 13685 }, { "epoch": 3.847667228780214, "grad_norm": 0.5221678614616394, "learning_rate": 3.065140537557114e-05, "loss": 0.7889, "step": 13690 }, { "epoch": 3.8490725126475547, "grad_norm": 0.5159165859222412, "learning_rate": 3.05807553639788e-05, "loss": 0.7954, "step": 13695 }, { "epoch": 3.850477796514896, "grad_norm": 0.5093886852264404, "learning_rate": 3.0510172168727325e-05, "loss": 0.7932, "step": 13700 }, { "epoch": 3.8518830803822373, "grad_norm": 0.5088807344436646, "learning_rate": 3.043965585775329e-05, "loss": 0.7728, "step": 13705 }, { "epoch": 3.8532883642495785, "grad_norm": 0.5359891653060913, "learning_rate": 3.0369206498928993e-05, "loss": 0.7857, "step": 13710 }, { "epoch": 3.8546936481169194, "grad_norm": 0.5391300320625305, "learning_rate": 3.02988241600623e-05, "loss": 0.7837, "step": 13715 }, { "epoch": 3.8560989319842607, "grad_norm": 0.5096182227134705, "learning_rate": 3.0228508908896458e-05, "loss": 0.8227, "step": 13720 }, { "epoch": 3.857504215851602, "grad_norm": 0.4783703088760376, "learning_rate": 3.0158260813110263e-05, "loss": 0.7892, "step": 13725 }, { "epoch": 3.8589094997189433, "grad_norm": 0.535466194152832, "learning_rate": 3.0088079940317814e-05, "loss": 0.7936, "step": 13730 }, { "epoch": 3.8603147835862845, "grad_norm": 0.5733150243759155, "learning_rate": 3.0017966358068572e-05, "loss": 0.7844, "step": 13735 }, { "epoch": 3.861720067453626, "grad_norm": 0.5078558921813965, "learning_rate": 2.9947920133847106e-05, "loss": 0.7834, "step": 13740 }, { "epoch": 3.8631253513209667, "grad_norm": 0.5028983950614929, "learning_rate": 2.9877941335073233e-05, "loss": 0.7907, "step": 13745 }, { "epoch": 3.864530635188308, "grad_norm": 0.4952241778373718, "learning_rate": 2.9808030029101964e-05, "loss": 0.7852, "step": 13750 }, { "epoch": 3.8659359190556493, "grad_norm": 0.5080897808074951, "learning_rate": 2.9738186283223146e-05, "loss": 0.7885, "step": 13755 }, { "epoch": 3.8673412029229906, "grad_norm": 0.47834813594818115, "learning_rate": 2.966841016466174e-05, "loss": 0.805, "step": 13760 }, { "epoch": 3.8687464867903314, "grad_norm": 0.5171611905097961, "learning_rate": 2.9598701740577593e-05, "loss": 0.8313, "step": 13765 }, { "epoch": 3.8701517706576727, "grad_norm": 0.5075935125350952, "learning_rate": 2.95290610780654e-05, "loss": 0.7942, "step": 13770 }, { "epoch": 3.871557054525014, "grad_norm": 0.5939414501190186, "learning_rate": 2.9459488244154555e-05, "loss": 0.7886, "step": 13775 }, { "epoch": 3.8729623383923553, "grad_norm": 0.5912177562713623, "learning_rate": 2.9389983305809253e-05, "loss": 0.7975, "step": 13780 }, { "epoch": 3.8743676222596966, "grad_norm": 0.6902967095375061, "learning_rate": 2.9320546329928366e-05, "loss": 0.7856, "step": 13785 }, { "epoch": 3.875772906127038, "grad_norm": 0.5265038013458252, "learning_rate": 2.925117738334523e-05, "loss": 0.7851, "step": 13790 }, { "epoch": 3.877178189994379, "grad_norm": 0.5476034879684448, "learning_rate": 2.9181876532827766e-05, "loss": 0.7903, "step": 13795 }, { "epoch": 3.87858347386172, "grad_norm": 0.5170614719390869, "learning_rate": 2.9112643845078436e-05, "loss": 0.7874, "step": 13800 }, { "epoch": 3.8799887577290613, "grad_norm": 0.483660489320755, "learning_rate": 2.9043479386734007e-05, "loss": 0.787, "step": 13805 }, { "epoch": 3.8813940415964026, "grad_norm": 0.5684224963188171, "learning_rate": 2.897438322436554e-05, "loss": 0.7852, "step": 13810 }, { "epoch": 3.8827993254637434, "grad_norm": 0.509792685508728, "learning_rate": 2.8905355424478418e-05, "loss": 0.8273, "step": 13815 }, { "epoch": 3.8842046093310847, "grad_norm": 0.5174958109855652, "learning_rate": 2.883639605351226e-05, "loss": 0.7799, "step": 13820 }, { "epoch": 3.885609893198426, "grad_norm": 0.5141977071762085, "learning_rate": 2.8767505177840716e-05, "loss": 0.8001, "step": 13825 }, { "epoch": 3.8870151770657673, "grad_norm": 0.5077630281448364, "learning_rate": 2.8698682863771586e-05, "loss": 0.7866, "step": 13830 }, { "epoch": 3.8884204609331086, "grad_norm": 0.46954312920570374, "learning_rate": 2.8629929177546665e-05, "loss": 0.7822, "step": 13835 }, { "epoch": 3.88982574480045, "grad_norm": 0.5315179824829102, "learning_rate": 2.8561244185341706e-05, "loss": 0.7972, "step": 13840 }, { "epoch": 3.891231028667791, "grad_norm": 0.4870656132698059, "learning_rate": 2.8492627953266328e-05, "loss": 0.7788, "step": 13845 }, { "epoch": 3.892636312535132, "grad_norm": 0.5004348754882812, "learning_rate": 2.8424080547363952e-05, "loss": 0.7958, "step": 13850 }, { "epoch": 3.8940415964024733, "grad_norm": 0.5151705741882324, "learning_rate": 2.8355602033611774e-05, "loss": 0.7857, "step": 13855 }, { "epoch": 3.8954468802698146, "grad_norm": 0.505160391330719, "learning_rate": 2.828719247792071e-05, "loss": 0.7884, "step": 13860 }, { "epoch": 3.896852164137156, "grad_norm": 0.49165022373199463, "learning_rate": 2.8218851946135217e-05, "loss": 0.7811, "step": 13865 }, { "epoch": 3.8982574480044967, "grad_norm": 0.5374483466148376, "learning_rate": 2.8150580504033396e-05, "loss": 0.7831, "step": 13870 }, { "epoch": 3.899662731871838, "grad_norm": 0.5054756999015808, "learning_rate": 2.8082378217326842e-05, "loss": 0.7998, "step": 13875 }, { "epoch": 3.9010680157391793, "grad_norm": 0.5778508186340332, "learning_rate": 2.8014245151660524e-05, "loss": 0.7787, "step": 13880 }, { "epoch": 3.9024732996065206, "grad_norm": 0.5403375625610352, "learning_rate": 2.7946181372612858e-05, "loss": 0.7901, "step": 13885 }, { "epoch": 3.903878583473862, "grad_norm": 0.49616947770118713, "learning_rate": 2.7878186945695526e-05, "loss": 0.7848, "step": 13890 }, { "epoch": 3.905283867341203, "grad_norm": 0.4898039698600769, "learning_rate": 2.7810261936353487e-05, "loss": 0.7883, "step": 13895 }, { "epoch": 3.906689151208544, "grad_norm": 0.5719556212425232, "learning_rate": 2.7742406409964882e-05, "loss": 0.7943, "step": 13900 }, { "epoch": 3.9080944350758853, "grad_norm": 0.5483629107475281, "learning_rate": 2.767462043184096e-05, "loss": 0.7846, "step": 13905 }, { "epoch": 3.9094997189432266, "grad_norm": 0.5457851886749268, "learning_rate": 2.7606904067226046e-05, "loss": 0.7911, "step": 13910 }, { "epoch": 3.910905002810568, "grad_norm": 0.5014635324478149, "learning_rate": 2.7539257381297422e-05, "loss": 0.8347, "step": 13915 }, { "epoch": 3.9123102866779087, "grad_norm": 0.5336162447929382, "learning_rate": 2.7471680439165348e-05, "loss": 0.7902, "step": 13920 }, { "epoch": 3.91371557054525, "grad_norm": 0.5272439122200012, "learning_rate": 2.740417330587294e-05, "loss": 0.7884, "step": 13925 }, { "epoch": 3.9151208544125913, "grad_norm": 0.5460571050643921, "learning_rate": 2.733673604639615e-05, "loss": 0.7974, "step": 13930 }, { "epoch": 3.9165261382799326, "grad_norm": 0.5119529366493225, "learning_rate": 2.7269368725643594e-05, "loss": 0.7927, "step": 13935 }, { "epoch": 3.917931422147274, "grad_norm": 0.5041726231575012, "learning_rate": 2.7202071408456654e-05, "loss": 0.788, "step": 13940 }, { "epoch": 3.919336706014615, "grad_norm": 0.5189741849899292, "learning_rate": 2.713484415960931e-05, "loss": 0.7826, "step": 13945 }, { "epoch": 3.920741989881956, "grad_norm": 0.5067157745361328, "learning_rate": 2.7067687043808086e-05, "loss": 0.782, "step": 13950 }, { "epoch": 3.9221472737492973, "grad_norm": 0.50437992811203, "learning_rate": 2.7000600125692033e-05, "loss": 0.7965, "step": 13955 }, { "epoch": 3.9235525576166386, "grad_norm": 0.5085188746452332, "learning_rate": 2.69335834698326e-05, "loss": 0.7952, "step": 13960 }, { "epoch": 3.92495784148398, "grad_norm": 0.5374967455863953, "learning_rate": 2.6866637140733663e-05, "loss": 0.7938, "step": 13965 }, { "epoch": 3.9263631253513207, "grad_norm": 0.5251932740211487, "learning_rate": 2.679976120283131e-05, "loss": 0.7813, "step": 13970 }, { "epoch": 3.927768409218662, "grad_norm": 0.49533960223197937, "learning_rate": 2.6732955720493957e-05, "loss": 0.805, "step": 13975 }, { "epoch": 3.9291736930860033, "grad_norm": 0.4887608289718628, "learning_rate": 2.6666220758022233e-05, "loss": 0.7844, "step": 13980 }, { "epoch": 3.9305789769533446, "grad_norm": 0.4844372570514679, "learning_rate": 2.659955637964877e-05, "loss": 0.7866, "step": 13985 }, { "epoch": 3.931984260820686, "grad_norm": 0.4718281626701355, "learning_rate": 2.6532962649538384e-05, "loss": 0.7938, "step": 13990 }, { "epoch": 3.933389544688027, "grad_norm": 0.5602251291275024, "learning_rate": 2.6466439631787833e-05, "loss": 0.7871, "step": 13995 }, { "epoch": 3.9347948285553684, "grad_norm": 0.535500168800354, "learning_rate": 2.6399987390425816e-05, "loss": 0.7922, "step": 14000 }, { "epoch": 3.9362001124227093, "grad_norm": 0.5144920945167542, "learning_rate": 2.6333605989412947e-05, "loss": 0.7763, "step": 14005 }, { "epoch": 3.9376053962900506, "grad_norm": 0.5025665163993835, "learning_rate": 2.6267295492641607e-05, "loss": 0.7908, "step": 14010 }, { "epoch": 3.939010680157392, "grad_norm": 0.49400579929351807, "learning_rate": 2.6201055963935995e-05, "loss": 0.7964, "step": 14015 }, { "epoch": 3.940415964024733, "grad_norm": 0.5128166675567627, "learning_rate": 2.6134887467051894e-05, "loss": 0.8326, "step": 14020 }, { "epoch": 3.941821247892074, "grad_norm": 0.54892897605896, "learning_rate": 2.6068790065676806e-05, "loss": 0.7847, "step": 14025 }, { "epoch": 3.9432265317594153, "grad_norm": 0.5115255117416382, "learning_rate": 2.6002763823429808e-05, "loss": 0.7959, "step": 14030 }, { "epoch": 3.9446318156267566, "grad_norm": 0.49871450662612915, "learning_rate": 2.5936808803861468e-05, "loss": 0.7917, "step": 14035 }, { "epoch": 3.946037099494098, "grad_norm": 0.4911504089832306, "learning_rate": 2.5870925070453746e-05, "loss": 0.7823, "step": 14040 }, { "epoch": 3.947442383361439, "grad_norm": 0.5270660519599915, "learning_rate": 2.5805112686620046e-05, "loss": 0.7865, "step": 14045 }, { "epoch": 3.9488476672287804, "grad_norm": 0.501278817653656, "learning_rate": 2.5739371715705117e-05, "loss": 0.7967, "step": 14050 }, { "epoch": 3.9502529510961213, "grad_norm": 0.5706464648246765, "learning_rate": 2.567370222098494e-05, "loss": 0.7918, "step": 14055 }, { "epoch": 3.9516582349634626, "grad_norm": 0.5234654545783997, "learning_rate": 2.5608104265666687e-05, "loss": 0.797, "step": 14060 }, { "epoch": 3.953063518830804, "grad_norm": 0.555103600025177, "learning_rate": 2.554257791288871e-05, "loss": 0.8022, "step": 14065 }, { "epoch": 3.954468802698145, "grad_norm": 0.5400828719139099, "learning_rate": 2.5477123225720433e-05, "loss": 0.7918, "step": 14070 }, { "epoch": 3.955874086565486, "grad_norm": 0.5535014271736145, "learning_rate": 2.5411740267162254e-05, "loss": 0.7976, "step": 14075 }, { "epoch": 3.9572793704328273, "grad_norm": 0.49540334939956665, "learning_rate": 2.5346429100145584e-05, "loss": 0.798, "step": 14080 }, { "epoch": 3.9586846543001686, "grad_norm": 0.5534090995788574, "learning_rate": 2.5281189787532756e-05, "loss": 0.7798, "step": 14085 }, { "epoch": 3.96008993816751, "grad_norm": 0.503334105014801, "learning_rate": 2.5216022392116844e-05, "loss": 0.7877, "step": 14090 }, { "epoch": 3.961495222034851, "grad_norm": 0.5218794941902161, "learning_rate": 2.5150926976621803e-05, "loss": 0.7796, "step": 14095 }, { "epoch": 3.9629005059021924, "grad_norm": 0.488923579454422, "learning_rate": 2.5085903603702267e-05, "loss": 0.7821, "step": 14100 }, { "epoch": 3.9643057897695333, "grad_norm": 0.5786020159721375, "learning_rate": 2.5020952335943514e-05, "loss": 0.8033, "step": 14105 }, { "epoch": 3.9657110736368746, "grad_norm": 0.5032353401184082, "learning_rate": 2.4956073235861453e-05, "loss": 0.7795, "step": 14110 }, { "epoch": 3.967116357504216, "grad_norm": 0.504985511302948, "learning_rate": 2.489126636590251e-05, "loss": 0.8391, "step": 14115 }, { "epoch": 3.968521641371557, "grad_norm": 0.5181665420532227, "learning_rate": 2.482653178844363e-05, "loss": 0.7793, "step": 14120 }, { "epoch": 3.969926925238898, "grad_norm": 0.46679332852363586, "learning_rate": 2.4761869565792062e-05, "loss": 0.7851, "step": 14125 }, { "epoch": 3.9713322091062393, "grad_norm": 0.5379502177238464, "learning_rate": 2.4697279760185542e-05, "loss": 0.8018, "step": 14130 }, { "epoch": 3.9727374929735806, "grad_norm": 0.543921947479248, "learning_rate": 2.4632762433792046e-05, "loss": 0.7999, "step": 14135 }, { "epoch": 3.974142776840922, "grad_norm": 0.5470147728919983, "learning_rate": 2.4568317648709825e-05, "loss": 0.7912, "step": 14140 }, { "epoch": 3.975548060708263, "grad_norm": 0.5134099721908569, "learning_rate": 2.4503945466967225e-05, "loss": 0.7833, "step": 14145 }, { "epoch": 3.9769533445756045, "grad_norm": 0.557580828666687, "learning_rate": 2.4439645950522784e-05, "loss": 0.7919, "step": 14150 }, { "epoch": 3.9783586284429457, "grad_norm": 0.4935877025127411, "learning_rate": 2.437541916126509e-05, "loss": 0.7874, "step": 14155 }, { "epoch": 3.9797639123102866, "grad_norm": 0.5110975503921509, "learning_rate": 2.431126516101272e-05, "loss": 0.7974, "step": 14160 }, { "epoch": 3.981169196177628, "grad_norm": 0.4988139271736145, "learning_rate": 2.42471840115142e-05, "loss": 0.7843, "step": 14165 }, { "epoch": 3.982574480044969, "grad_norm": 0.5011380314826965, "learning_rate": 2.418317577444792e-05, "loss": 0.7945, "step": 14170 }, { "epoch": 3.98397976391231, "grad_norm": 0.51650071144104, "learning_rate": 2.4119240511422126e-05, "loss": 0.7769, "step": 14175 }, { "epoch": 3.9853850477796513, "grad_norm": 0.5402264595031738, "learning_rate": 2.405537828397475e-05, "loss": 0.8084, "step": 14180 }, { "epoch": 3.9867903316469926, "grad_norm": 0.6527597904205322, "learning_rate": 2.3991589153573513e-05, "loss": 0.79, "step": 14185 }, { "epoch": 3.988195615514334, "grad_norm": 0.6313776969909668, "learning_rate": 2.3927873181615768e-05, "loss": 0.8267, "step": 14190 }, { "epoch": 3.989600899381675, "grad_norm": 0.5356908440589905, "learning_rate": 2.386423042942837e-05, "loss": 0.7881, "step": 14195 }, { "epoch": 3.9910061832490165, "grad_norm": 0.5451526045799255, "learning_rate": 2.3800660958267795e-05, "loss": 0.7883, "step": 14200 }, { "epoch": 3.9924114671163577, "grad_norm": 0.5183944702148438, "learning_rate": 2.3737164829319915e-05, "loss": 0.7769, "step": 14205 }, { "epoch": 3.9938167509836986, "grad_norm": 0.5416154265403748, "learning_rate": 2.3673742103700135e-05, "loss": 0.7842, "step": 14210 }, { "epoch": 3.99522203485104, "grad_norm": 0.5259328484535217, "learning_rate": 2.361039284245302e-05, "loss": 0.7908, "step": 14215 }, { "epoch": 3.996627318718381, "grad_norm": 0.49881279468536377, "learning_rate": 2.3547117106552574e-05, "loss": 0.7924, "step": 14220 }, { "epoch": 3.9980326025857225, "grad_norm": 0.5812293291091919, "learning_rate": 2.3483914956901996e-05, "loss": 0.7961, "step": 14225 }, { "epoch": 3.9994378864530633, "grad_norm": 0.5475685000419617, "learning_rate": 2.34207864543336e-05, "loss": 0.838, "step": 14230 }, { "epoch": 4.0, "eval_loss": 0.8493756055831909, "eval_runtime": 627.4844, "eval_samples_per_second": 7.167, "eval_steps_per_second": 0.598, "step": 14232 }, { "epoch": 4.000843170320405, "grad_norm": 0.5091008543968201, "learning_rate": 2.3357731659608872e-05, "loss": 0.7801, "step": 14235 }, { "epoch": 4.002248454187746, "grad_norm": 0.5158987045288086, "learning_rate": 2.329475063341835e-05, "loss": 0.7712, "step": 14240 }, { "epoch": 4.003653738055087, "grad_norm": 0.6087315678596497, "learning_rate": 2.3231843436381594e-05, "loss": 0.7642, "step": 14245 }, { "epoch": 4.0050590219224285, "grad_norm": 0.5059085488319397, "learning_rate": 2.3169010129046966e-05, "loss": 0.7619, "step": 14250 }, { "epoch": 4.00646430578977, "grad_norm": 0.5402360558509827, "learning_rate": 2.3106250771891912e-05, "loss": 0.7601, "step": 14255 }, { "epoch": 4.007869589657111, "grad_norm": 0.5491556525230408, "learning_rate": 2.304356542532259e-05, "loss": 0.8093, "step": 14260 }, { "epoch": 4.009274873524452, "grad_norm": 0.5153757929801941, "learning_rate": 2.2980954149673893e-05, "loss": 0.7517, "step": 14265 }, { "epoch": 4.010680157391793, "grad_norm": 0.5505621433258057, "learning_rate": 2.291841700520947e-05, "loss": 0.7519, "step": 14270 }, { "epoch": 4.012085441259134, "grad_norm": 0.5181359052658081, "learning_rate": 2.285595405212162e-05, "loss": 0.7579, "step": 14275 }, { "epoch": 4.013490725126475, "grad_norm": 0.5227612257003784, "learning_rate": 2.2793565350531242e-05, "loss": 0.7562, "step": 14280 }, { "epoch": 4.014896008993817, "grad_norm": 0.5462778210639954, "learning_rate": 2.2731250960487705e-05, "loss": 0.76, "step": 14285 }, { "epoch": 4.016301292861158, "grad_norm": 0.5738673806190491, "learning_rate": 2.2669010941968905e-05, "loss": 0.7577, "step": 14290 }, { "epoch": 4.017706576728499, "grad_norm": 0.5354070663452148, "learning_rate": 2.2606845354881155e-05, "loss": 0.769, "step": 14295 }, { "epoch": 4.0191118605958405, "grad_norm": 0.5066848397254944, "learning_rate": 2.254475425905912e-05, "loss": 0.7566, "step": 14300 }, { "epoch": 4.020517144463182, "grad_norm": 0.5838959813117981, "learning_rate": 2.2482737714265756e-05, "loss": 0.7689, "step": 14305 }, { "epoch": 4.021922428330523, "grad_norm": 0.5021213293075562, "learning_rate": 2.2420795780192283e-05, "loss": 0.7606, "step": 14310 }, { "epoch": 4.023327712197864, "grad_norm": 0.5116584897041321, "learning_rate": 2.23589285164581e-05, "loss": 0.7675, "step": 14315 }, { "epoch": 4.024732996065205, "grad_norm": 0.5282134413719177, "learning_rate": 2.2297135982610707e-05, "loss": 0.758, "step": 14320 }, { "epoch": 4.026138279932546, "grad_norm": 0.5259456038475037, "learning_rate": 2.2235418238125705e-05, "loss": 0.7679, "step": 14325 }, { "epoch": 4.027543563799887, "grad_norm": 0.4868493974208832, "learning_rate": 2.2173775342406712e-05, "loss": 0.7529, "step": 14330 }, { "epoch": 4.028948847667229, "grad_norm": 0.6483232378959656, "learning_rate": 2.211220735478533e-05, "loss": 0.755, "step": 14335 }, { "epoch": 4.03035413153457, "grad_norm": 0.505325436592102, "learning_rate": 2.2050714334520972e-05, "loss": 0.7598, "step": 14340 }, { "epoch": 4.031759415401911, "grad_norm": 0.5150840282440186, "learning_rate": 2.198929634080098e-05, "loss": 0.7528, "step": 14345 }, { "epoch": 4.0331646992692525, "grad_norm": 0.5273594260215759, "learning_rate": 2.1927953432740444e-05, "loss": 0.7509, "step": 14350 }, { "epoch": 4.034569983136594, "grad_norm": 0.5245232582092285, "learning_rate": 2.1866685669382204e-05, "loss": 0.7609, "step": 14355 }, { "epoch": 4.035975267003935, "grad_norm": 0.5435845851898193, "learning_rate": 2.180549310969676e-05, "loss": 0.7657, "step": 14360 }, { "epoch": 4.037380550871276, "grad_norm": 0.578952431678772, "learning_rate": 2.174437581258224e-05, "loss": 0.7708, "step": 14365 }, { "epoch": 4.038785834738618, "grad_norm": 0.5136585235595703, "learning_rate": 2.168333383686434e-05, "loss": 0.7654, "step": 14370 }, { "epoch": 4.040191118605958, "grad_norm": 0.5169129967689514, "learning_rate": 2.162236724129618e-05, "loss": 0.7583, "step": 14375 }, { "epoch": 4.041596402473299, "grad_norm": 0.4992331564426422, "learning_rate": 2.156147608455843e-05, "loss": 0.7746, "step": 14380 }, { "epoch": 4.043001686340641, "grad_norm": 0.5378484129905701, "learning_rate": 2.150066042525912e-05, "loss": 0.765, "step": 14385 }, { "epoch": 4.044406970207982, "grad_norm": 0.5266295075416565, "learning_rate": 2.143992032193356e-05, "loss": 0.759, "step": 14390 }, { "epoch": 4.045812254075323, "grad_norm": 0.5418915152549744, "learning_rate": 2.137925583304439e-05, "loss": 0.7647, "step": 14395 }, { "epoch": 4.0472175379426645, "grad_norm": 0.5171228647232056, "learning_rate": 2.1318667016981465e-05, "loss": 0.7654, "step": 14400 }, { "epoch": 4.048622821810006, "grad_norm": 0.5772991180419922, "learning_rate": 2.1258153932061808e-05, "loss": 0.7505, "step": 14405 }, { "epoch": 4.050028105677347, "grad_norm": 0.5454348921775818, "learning_rate": 2.1197716636529518e-05, "loss": 0.7525, "step": 14410 }, { "epoch": 4.051433389544688, "grad_norm": 0.5749043226242065, "learning_rate": 2.1137355188555798e-05, "loss": 0.7605, "step": 14415 }, { "epoch": 4.05283867341203, "grad_norm": 0.6449677348136902, "learning_rate": 2.1077069646238824e-05, "loss": 0.7761, "step": 14420 }, { "epoch": 4.05424395727937, "grad_norm": 0.5155068039894104, "learning_rate": 2.101686006760365e-05, "loss": 0.7587, "step": 14425 }, { "epoch": 4.055649241146711, "grad_norm": 0.5196123719215393, "learning_rate": 2.095672651060231e-05, "loss": 0.7678, "step": 14430 }, { "epoch": 4.057054525014053, "grad_norm": 0.5773482322692871, "learning_rate": 2.0896669033113626e-05, "loss": 0.7604, "step": 14435 }, { "epoch": 4.058459808881394, "grad_norm": 0.5296126008033752, "learning_rate": 2.083668769294321e-05, "loss": 0.7573, "step": 14440 }, { "epoch": 4.059865092748735, "grad_norm": 0.563080906867981, "learning_rate": 2.0776782547823337e-05, "loss": 0.7622, "step": 14445 }, { "epoch": 4.0612703766160765, "grad_norm": 0.5182024240493774, "learning_rate": 2.0716953655413007e-05, "loss": 0.759, "step": 14450 }, { "epoch": 4.062675660483418, "grad_norm": 0.5270100235939026, "learning_rate": 2.0657201073297805e-05, "loss": 0.7609, "step": 14455 }, { "epoch": 4.064080944350759, "grad_norm": 0.5340908169746399, "learning_rate": 2.0597524858989857e-05, "loss": 0.7647, "step": 14460 }, { "epoch": 4.0654862282181, "grad_norm": 0.5522120594978333, "learning_rate": 2.0537925069927798e-05, "loss": 0.7605, "step": 14465 }, { "epoch": 4.066891512085442, "grad_norm": 0.5666811466217041, "learning_rate": 2.0478401763476694e-05, "loss": 0.7582, "step": 14470 }, { "epoch": 4.068296795952782, "grad_norm": 0.5185369253158569, "learning_rate": 2.041895499692804e-05, "loss": 0.7639, "step": 14475 }, { "epoch": 4.069702079820123, "grad_norm": 0.5280120968818665, "learning_rate": 2.0359584827499544e-05, "loss": 0.7737, "step": 14480 }, { "epoch": 4.071107363687465, "grad_norm": 0.5524877905845642, "learning_rate": 2.0300291312335317e-05, "loss": 0.8033, "step": 14485 }, { "epoch": 4.072512647554806, "grad_norm": 0.5730440616607666, "learning_rate": 2.0241074508505643e-05, "loss": 0.7678, "step": 14490 }, { "epoch": 4.073917931422147, "grad_norm": 0.5500258207321167, "learning_rate": 2.0181934473006935e-05, "loss": 0.7645, "step": 14495 }, { "epoch": 4.0753232152894885, "grad_norm": 0.4928179681301117, "learning_rate": 2.0122871262761754e-05, "loss": 0.7624, "step": 14500 }, { "epoch": 4.07672849915683, "grad_norm": 0.5159469842910767, "learning_rate": 2.0063884934618728e-05, "loss": 0.8052, "step": 14505 }, { "epoch": 4.078133783024171, "grad_norm": 0.5040550231933594, "learning_rate": 2.0004975545352457e-05, "loss": 0.7689, "step": 14510 }, { "epoch": 4.079539066891512, "grad_norm": 0.5132153034210205, "learning_rate": 1.99461431516635e-05, "loss": 0.755, "step": 14515 }, { "epoch": 4.080944350758854, "grad_norm": 0.49842506647109985, "learning_rate": 1.9887387810178317e-05, "loss": 0.7574, "step": 14520 }, { "epoch": 4.082349634626194, "grad_norm": 0.5168430209159851, "learning_rate": 1.98287095774492e-05, "loss": 0.7632, "step": 14525 }, { "epoch": 4.083754918493535, "grad_norm": 0.5256431102752686, "learning_rate": 1.9770108509954167e-05, "loss": 0.7644, "step": 14530 }, { "epoch": 4.085160202360877, "grad_norm": 0.5269330739974976, "learning_rate": 1.971158466409706e-05, "loss": 0.7577, "step": 14535 }, { "epoch": 4.086565486228218, "grad_norm": 0.615835964679718, "learning_rate": 1.9653138096207324e-05, "loss": 0.7582, "step": 14540 }, { "epoch": 4.087970770095559, "grad_norm": 0.590116560459137, "learning_rate": 1.959476886254009e-05, "loss": 0.759, "step": 14545 }, { "epoch": 4.0893760539629005, "grad_norm": 0.6698099374771118, "learning_rate": 1.9536477019275955e-05, "loss": 0.753, "step": 14550 }, { "epoch": 4.090781337830242, "grad_norm": 0.49944236874580383, "learning_rate": 1.9478262622521114e-05, "loss": 0.7655, "step": 14555 }, { "epoch": 4.092186621697583, "grad_norm": 0.5291188955307007, "learning_rate": 1.942012572830719e-05, "loss": 0.7573, "step": 14560 }, { "epoch": 4.093591905564924, "grad_norm": 0.5382302403450012, "learning_rate": 1.9362066392591205e-05, "loss": 0.7569, "step": 14565 }, { "epoch": 4.094997189432266, "grad_norm": 0.531326413154602, "learning_rate": 1.9304084671255542e-05, "loss": 0.7542, "step": 14570 }, { "epoch": 4.096402473299607, "grad_norm": 0.5258771777153015, "learning_rate": 1.9246180620107858e-05, "loss": 0.7661, "step": 14575 }, { "epoch": 4.097807757166947, "grad_norm": 0.5160670876502991, "learning_rate": 1.9188354294881115e-05, "loss": 0.7618, "step": 14580 }, { "epoch": 4.099213041034289, "grad_norm": 0.531150758266449, "learning_rate": 1.9130605751233355e-05, "loss": 0.8067, "step": 14585 }, { "epoch": 4.10061832490163, "grad_norm": 0.4959104657173157, "learning_rate": 1.9072935044747843e-05, "loss": 0.7663, "step": 14590 }, { "epoch": 4.102023608768971, "grad_norm": 0.5348832607269287, "learning_rate": 1.901534223093291e-05, "loss": 0.7668, "step": 14595 }, { "epoch": 4.1034288926363125, "grad_norm": 0.5030957460403442, "learning_rate": 1.895782736522187e-05, "loss": 0.7525, "step": 14600 }, { "epoch": 4.104834176503654, "grad_norm": 0.5333991646766663, "learning_rate": 1.8900390502973065e-05, "loss": 0.7675, "step": 14605 }, { "epoch": 4.106239460370995, "grad_norm": 0.5182572603225708, "learning_rate": 1.884303169946974e-05, "loss": 0.7722, "step": 14610 }, { "epoch": 4.107644744238336, "grad_norm": 0.6066407561302185, "learning_rate": 1.8785751009919994e-05, "loss": 0.7678, "step": 14615 }, { "epoch": 4.109050028105678, "grad_norm": 0.5684912800788879, "learning_rate": 1.8728548489456765e-05, "loss": 0.7594, "step": 14620 }, { "epoch": 4.110455311973019, "grad_norm": 0.5319727659225464, "learning_rate": 1.8671424193137733e-05, "loss": 0.7567, "step": 14625 }, { "epoch": 4.111860595840359, "grad_norm": 0.5175413489341736, "learning_rate": 1.8614378175945334e-05, "loss": 0.7609, "step": 14630 }, { "epoch": 4.113265879707701, "grad_norm": 0.5334261655807495, "learning_rate": 1.8557410492786554e-05, "loss": 0.7608, "step": 14635 }, { "epoch": 4.114671163575042, "grad_norm": 0.5544094443321228, "learning_rate": 1.8500521198493082e-05, "loss": 0.7675, "step": 14640 }, { "epoch": 4.116076447442383, "grad_norm": 0.5419462323188782, "learning_rate": 1.844371034782112e-05, "loss": 0.761, "step": 14645 }, { "epoch": 4.1174817313097245, "grad_norm": 0.5597703456878662, "learning_rate": 1.8386977995451394e-05, "loss": 0.759, "step": 14650 }, { "epoch": 4.118887015177066, "grad_norm": 0.5377607941627502, "learning_rate": 1.8330324195989023e-05, "loss": 0.7761, "step": 14655 }, { "epoch": 4.120292299044407, "grad_norm": 0.523078441619873, "learning_rate": 1.8273749003963547e-05, "loss": 0.7736, "step": 14660 }, { "epoch": 4.121697582911748, "grad_norm": 0.5491028428077698, "learning_rate": 1.821725247382886e-05, "loss": 0.7631, "step": 14665 }, { "epoch": 4.12310286677909, "grad_norm": 0.5564166307449341, "learning_rate": 1.8160834659963143e-05, "loss": 0.766, "step": 14670 }, { "epoch": 4.124508150646431, "grad_norm": 0.5371769070625305, "learning_rate": 1.810449561666877e-05, "loss": 0.7734, "step": 14675 }, { "epoch": 4.125913434513771, "grad_norm": 0.5102625489234924, "learning_rate": 1.8048235398172354e-05, "loss": 0.7551, "step": 14680 }, { "epoch": 4.127318718381113, "grad_norm": 0.5413872599601746, "learning_rate": 1.799205405862463e-05, "loss": 0.7703, "step": 14685 }, { "epoch": 4.128724002248454, "grad_norm": 0.7608216404914856, "learning_rate": 1.7935951652100347e-05, "loss": 0.7821, "step": 14690 }, { "epoch": 4.130129286115795, "grad_norm": 0.4896598160266876, "learning_rate": 1.7879928232598342e-05, "loss": 0.7622, "step": 14695 }, { "epoch": 4.1315345699831365, "grad_norm": 0.5583370923995972, "learning_rate": 1.7823983854041428e-05, "loss": 0.7617, "step": 14700 }, { "epoch": 4.132939853850478, "grad_norm": 0.5344766974449158, "learning_rate": 1.776811857027635e-05, "loss": 0.7666, "step": 14705 }, { "epoch": 4.134345137717819, "grad_norm": 0.5449461936950684, "learning_rate": 1.771233243507361e-05, "loss": 0.7723, "step": 14710 }, { "epoch": 4.13575042158516, "grad_norm": 0.5095350742340088, "learning_rate": 1.765662550212769e-05, "loss": 0.7694, "step": 14715 }, { "epoch": 4.137155705452502, "grad_norm": 0.566623330116272, "learning_rate": 1.7600997825056798e-05, "loss": 0.7864, "step": 14720 }, { "epoch": 4.138560989319843, "grad_norm": 0.520380973815918, "learning_rate": 1.7545449457402752e-05, "loss": 0.7662, "step": 14725 }, { "epoch": 4.139966273187184, "grad_norm": 0.5430235266685486, "learning_rate": 1.748998045263114e-05, "loss": 0.7603, "step": 14730 }, { "epoch": 4.141371557054525, "grad_norm": 0.5299351215362549, "learning_rate": 1.743459086413114e-05, "loss": 0.7691, "step": 14735 }, { "epoch": 4.142776840921866, "grad_norm": 0.5721263289451599, "learning_rate": 1.7379280745215498e-05, "loss": 0.783, "step": 14740 }, { "epoch": 4.144182124789207, "grad_norm": 0.5396542549133301, "learning_rate": 1.732405014912042e-05, "loss": 0.7685, "step": 14745 }, { "epoch": 4.1455874086565485, "grad_norm": 0.5272538065910339, "learning_rate": 1.7268899129005622e-05, "loss": 0.7782, "step": 14750 }, { "epoch": 4.14699269252389, "grad_norm": 0.5426777601242065, "learning_rate": 1.7213827737954248e-05, "loss": 0.7727, "step": 14755 }, { "epoch": 4.148397976391231, "grad_norm": 0.5160602927207947, "learning_rate": 1.715883602897268e-05, "loss": 0.7647, "step": 14760 }, { "epoch": 4.149803260258572, "grad_norm": 0.5610844492912292, "learning_rate": 1.710392405499077e-05, "loss": 0.7588, "step": 14765 }, { "epoch": 4.151208544125914, "grad_norm": 0.5141729116439819, "learning_rate": 1.7049091868861523e-05, "loss": 0.7676, "step": 14770 }, { "epoch": 4.152613827993255, "grad_norm": 0.5534763336181641, "learning_rate": 1.699433952336118e-05, "loss": 0.7607, "step": 14775 }, { "epoch": 4.154019111860596, "grad_norm": 0.5679239630699158, "learning_rate": 1.693966707118909e-05, "loss": 0.7701, "step": 14780 }, { "epoch": 4.155424395727937, "grad_norm": 0.5184338092803955, "learning_rate": 1.688507456496776e-05, "loss": 0.7668, "step": 14785 }, { "epoch": 4.156829679595278, "grad_norm": 0.6242445707321167, "learning_rate": 1.6830562057242738e-05, "loss": 0.7614, "step": 14790 }, { "epoch": 4.158234963462619, "grad_norm": 0.5235223174095154, "learning_rate": 1.6776129600482537e-05, "loss": 0.7554, "step": 14795 }, { "epoch": 4.1596402473299605, "grad_norm": 0.5166909098625183, "learning_rate": 1.672177724707865e-05, "loss": 0.753, "step": 14800 }, { "epoch": 4.161045531197302, "grad_norm": 0.5642342567443848, "learning_rate": 1.6667505049345476e-05, "loss": 0.7636, "step": 14805 }, { "epoch": 4.162450815064643, "grad_norm": 0.5586567521095276, "learning_rate": 1.661331305952025e-05, "loss": 0.767, "step": 14810 }, { "epoch": 4.163856098931984, "grad_norm": 0.5883127450942993, "learning_rate": 1.6559201329763006e-05, "loss": 0.7709, "step": 14815 }, { "epoch": 4.165261382799326, "grad_norm": 0.5193496942520142, "learning_rate": 1.6505169912156548e-05, "loss": 0.7653, "step": 14820 }, { "epoch": 4.166666666666667, "grad_norm": 0.5206704139709473, "learning_rate": 1.6451218858706374e-05, "loss": 0.7685, "step": 14825 }, { "epoch": 4.168071950534008, "grad_norm": 0.5586879253387451, "learning_rate": 1.639734822134057e-05, "loss": 0.757, "step": 14830 }, { "epoch": 4.169477234401349, "grad_norm": 0.5711727738380432, "learning_rate": 1.634355805190989e-05, "loss": 0.7513, "step": 14835 }, { "epoch": 4.17088251826869, "grad_norm": 0.5114162564277649, "learning_rate": 1.6289848402187648e-05, "loss": 0.7684, "step": 14840 }, { "epoch": 4.172287802136031, "grad_norm": 0.5075726509094238, "learning_rate": 1.6236219323869618e-05, "loss": 0.7433, "step": 14845 }, { "epoch": 4.1736930860033725, "grad_norm": 0.5122740864753723, "learning_rate": 1.6182670868574003e-05, "loss": 0.7636, "step": 14850 }, { "epoch": 4.175098369870714, "grad_norm": 0.5521731972694397, "learning_rate": 1.612920308784145e-05, "loss": 0.7657, "step": 14855 }, { "epoch": 4.176503653738055, "grad_norm": 0.5010851621627808, "learning_rate": 1.607581603313495e-05, "loss": 0.7655, "step": 14860 }, { "epoch": 4.177908937605396, "grad_norm": 0.5235019326210022, "learning_rate": 1.6022509755839797e-05, "loss": 0.7776, "step": 14865 }, { "epoch": 4.179314221472738, "grad_norm": 0.5572540760040283, "learning_rate": 1.596928430726351e-05, "loss": 0.7525, "step": 14870 }, { "epoch": 4.180719505340079, "grad_norm": 0.5532335638999939, "learning_rate": 1.5916139738635825e-05, "loss": 0.7541, "step": 14875 }, { "epoch": 4.18212478920742, "grad_norm": 0.5307499766349792, "learning_rate": 1.5863076101108675e-05, "loss": 0.7653, "step": 14880 }, { "epoch": 4.1835300730747615, "grad_norm": 0.5084945559501648, "learning_rate": 1.5810093445756002e-05, "loss": 0.76, "step": 14885 }, { "epoch": 4.184935356942102, "grad_norm": 0.5106390714645386, "learning_rate": 1.575719182357386e-05, "loss": 0.7607, "step": 14890 }, { "epoch": 4.186340640809443, "grad_norm": 0.5546446442604065, "learning_rate": 1.5704371285480345e-05, "loss": 0.7616, "step": 14895 }, { "epoch": 4.1877459246767845, "grad_norm": 0.5922245383262634, "learning_rate": 1.565163188231541e-05, "loss": 0.7512, "step": 14900 }, { "epoch": 4.189151208544126, "grad_norm": 0.5253076553344727, "learning_rate": 1.5598973664841e-05, "loss": 0.7547, "step": 14905 }, { "epoch": 4.190556492411467, "grad_norm": 0.5457403659820557, "learning_rate": 1.5546396683740892e-05, "loss": 0.7719, "step": 14910 }, { "epoch": 4.191961776278808, "grad_norm": 0.5217248201370239, "learning_rate": 1.549390098962067e-05, "loss": 0.7663, "step": 14915 }, { "epoch": 4.19336706014615, "grad_norm": 0.5782322883605957, "learning_rate": 1.5441486633007674e-05, "loss": 0.7631, "step": 14920 }, { "epoch": 4.194772344013491, "grad_norm": 0.557896614074707, "learning_rate": 1.5389153664350963e-05, "loss": 0.7501, "step": 14925 }, { "epoch": 4.196177627880832, "grad_norm": 0.5055593848228455, "learning_rate": 1.533690213402129e-05, "loss": 0.7582, "step": 14930 }, { "epoch": 4.1975829117481736, "grad_norm": 0.5190460085868835, "learning_rate": 1.528473209231095e-05, "loss": 0.7647, "step": 14935 }, { "epoch": 4.198988195615514, "grad_norm": 0.5824263095855713, "learning_rate": 1.5232643589433848e-05, "loss": 0.7549, "step": 14940 }, { "epoch": 4.200393479482855, "grad_norm": 0.535484790802002, "learning_rate": 1.5180636675525428e-05, "loss": 0.7728, "step": 14945 }, { "epoch": 4.2017987633501965, "grad_norm": 0.5152845978736877, "learning_rate": 1.5128711400642593e-05, "loss": 0.7504, "step": 14950 }, { "epoch": 4.203204047217538, "grad_norm": 0.5120229125022888, "learning_rate": 1.5076867814763629e-05, "loss": 0.7655, "step": 14955 }, { "epoch": 4.204609331084879, "grad_norm": 0.5439524054527283, "learning_rate": 1.5025105967788222e-05, "loss": 0.7572, "step": 14960 }, { "epoch": 4.20601461495222, "grad_norm": 0.5316490530967712, "learning_rate": 1.4973425909537409e-05, "loss": 0.7708, "step": 14965 }, { "epoch": 4.207419898819562, "grad_norm": 0.5262763500213623, "learning_rate": 1.4921827689753465e-05, "loss": 0.7703, "step": 14970 }, { "epoch": 4.208825182686903, "grad_norm": 0.5419159531593323, "learning_rate": 1.4870311358099932e-05, "loss": 0.7453, "step": 14975 }, { "epoch": 4.210230466554244, "grad_norm": 0.5065816640853882, "learning_rate": 1.4818876964161499e-05, "loss": 0.7686, "step": 14980 }, { "epoch": 4.2116357504215856, "grad_norm": 0.5062248706817627, "learning_rate": 1.4767524557444034e-05, "loss": 0.8098, "step": 14985 }, { "epoch": 4.213041034288926, "grad_norm": 0.5111182332038879, "learning_rate": 1.4716254187374422e-05, "loss": 0.7687, "step": 14990 }, { "epoch": 4.214446318156267, "grad_norm": 0.5519953966140747, "learning_rate": 1.4665065903300645e-05, "loss": 0.7524, "step": 14995 }, { "epoch": 4.2158516020236085, "grad_norm": 0.5973950028419495, "learning_rate": 1.4613959754491691e-05, "loss": 0.7635, "step": 15000 }, { "epoch": 4.21725688589095, "grad_norm": 0.558695912361145, "learning_rate": 1.456293579013741e-05, "loss": 0.7794, "step": 15005 }, { "epoch": 4.218662169758291, "grad_norm": 0.5113961696624756, "learning_rate": 1.4511994059348622e-05, "loss": 0.7664, "step": 15010 }, { "epoch": 4.220067453625632, "grad_norm": 0.517517626285553, "learning_rate": 1.446113461115698e-05, "loss": 0.7711, "step": 15015 }, { "epoch": 4.221472737492974, "grad_norm": 0.5033154487609863, "learning_rate": 1.4410357494514947e-05, "loss": 0.7702, "step": 15020 }, { "epoch": 4.222878021360315, "grad_norm": 0.5211993455886841, "learning_rate": 1.435966275829571e-05, "loss": 0.7544, "step": 15025 }, { "epoch": 4.224283305227656, "grad_norm": 0.5256550908088684, "learning_rate": 1.4309050451293205e-05, "loss": 0.769, "step": 15030 }, { "epoch": 4.225688589094998, "grad_norm": 0.5042311549186707, "learning_rate": 1.4258520622222004e-05, "loss": 0.7731, "step": 15035 }, { "epoch": 4.227093872962339, "grad_norm": 0.5198147892951965, "learning_rate": 1.4208073319717285e-05, "loss": 0.7614, "step": 15040 }, { "epoch": 4.228499156829679, "grad_norm": 0.5736011862754822, "learning_rate": 1.4157708592334818e-05, "loss": 0.7517, "step": 15045 }, { "epoch": 4.2299044406970205, "grad_norm": 0.5595988035202026, "learning_rate": 1.4107426488550868e-05, "loss": 0.7668, "step": 15050 }, { "epoch": 4.231309724564362, "grad_norm": 0.5281693935394287, "learning_rate": 1.4057227056762235e-05, "loss": 0.7587, "step": 15055 }, { "epoch": 4.232715008431703, "grad_norm": 0.4986502230167389, "learning_rate": 1.4007110345286056e-05, "loss": 0.767, "step": 15060 }, { "epoch": 4.234120292299044, "grad_norm": 0.5474176406860352, "learning_rate": 1.3957076402359903e-05, "loss": 0.7629, "step": 15065 }, { "epoch": 4.235525576166386, "grad_norm": 0.5422942042350769, "learning_rate": 1.3907125276141675e-05, "loss": 0.7607, "step": 15070 }, { "epoch": 4.236930860033727, "grad_norm": 0.5286979675292969, "learning_rate": 1.3857257014709579e-05, "loss": 0.7523, "step": 15075 }, { "epoch": 4.238336143901068, "grad_norm": 0.5423422455787659, "learning_rate": 1.3807471666062022e-05, "loss": 0.7587, "step": 15080 }, { "epoch": 4.23974142776841, "grad_norm": 0.5019691586494446, "learning_rate": 1.375776927811765e-05, "loss": 0.7666, "step": 15085 }, { "epoch": 4.24114671163575, "grad_norm": 0.6004632115364075, "learning_rate": 1.370814989871525e-05, "loss": 0.7947, "step": 15090 }, { "epoch": 4.242551995503091, "grad_norm": 0.5509012937545776, "learning_rate": 1.3658613575613654e-05, "loss": 0.7564, "step": 15095 }, { "epoch": 4.2439572793704325, "grad_norm": 0.5599709749221802, "learning_rate": 1.3609160356491835e-05, "loss": 0.7566, "step": 15100 }, { "epoch": 4.245362563237774, "grad_norm": 0.5406939387321472, "learning_rate": 1.3559790288948737e-05, "loss": 0.7611, "step": 15105 }, { "epoch": 4.246767847105115, "grad_norm": 0.532951831817627, "learning_rate": 1.3510503420503295e-05, "loss": 0.7601, "step": 15110 }, { "epoch": 4.248173130972456, "grad_norm": 0.5300197601318359, "learning_rate": 1.3461299798594296e-05, "loss": 0.7572, "step": 15115 }, { "epoch": 4.249578414839798, "grad_norm": 0.5183244943618774, "learning_rate": 1.3412179470580488e-05, "loss": 0.8164, "step": 15120 }, { "epoch": 4.250983698707139, "grad_norm": 0.5541335940361023, "learning_rate": 1.3363142483740398e-05, "loss": 0.7659, "step": 15125 }, { "epoch": 4.25238898257448, "grad_norm": 0.544763445854187, "learning_rate": 1.331418888527236e-05, "loss": 0.7673, "step": 15130 }, { "epoch": 4.253794266441822, "grad_norm": 0.566593587398529, "learning_rate": 1.326531872229444e-05, "loss": 0.7693, "step": 15135 }, { "epoch": 4.255199550309163, "grad_norm": 0.5759837031364441, "learning_rate": 1.3216532041844377e-05, "loss": 0.7653, "step": 15140 }, { "epoch": 4.256604834176503, "grad_norm": 0.5467374920845032, "learning_rate": 1.3167828890879607e-05, "loss": 0.7719, "step": 15145 }, { "epoch": 4.2580101180438445, "grad_norm": 0.5818522572517395, "learning_rate": 1.3119209316277092e-05, "loss": 0.7721, "step": 15150 }, { "epoch": 4.259415401911186, "grad_norm": 0.49862098693847656, "learning_rate": 1.3070673364833419e-05, "loss": 0.7689, "step": 15155 }, { "epoch": 4.260820685778527, "grad_norm": 0.5164232850074768, "learning_rate": 1.3022221083264685e-05, "loss": 0.7648, "step": 15160 }, { "epoch": 4.262225969645868, "grad_norm": 0.5314116477966309, "learning_rate": 1.2973852518206375e-05, "loss": 0.7673, "step": 15165 }, { "epoch": 4.26363125351321, "grad_norm": 0.521121084690094, "learning_rate": 1.292556771621347e-05, "loss": 0.7661, "step": 15170 }, { "epoch": 4.265036537380551, "grad_norm": 0.5553752183914185, "learning_rate": 1.2877366723760365e-05, "loss": 0.7547, "step": 15175 }, { "epoch": 4.266441821247892, "grad_norm": 0.5141292810440063, "learning_rate": 1.282924958724071e-05, "loss": 0.7715, "step": 15180 }, { "epoch": 4.267847105115234, "grad_norm": 0.554672360420227, "learning_rate": 1.278121635296744e-05, "loss": 0.7729, "step": 15185 }, { "epoch": 4.269252388982575, "grad_norm": 0.5302976965904236, "learning_rate": 1.2733267067172794e-05, "loss": 0.7554, "step": 15190 }, { "epoch": 4.270657672849916, "grad_norm": 0.5027261972427368, "learning_rate": 1.2685401776008188e-05, "loss": 0.767, "step": 15195 }, { "epoch": 4.2720629567172566, "grad_norm": 0.5278288722038269, "learning_rate": 1.2637620525544135e-05, "loss": 0.7617, "step": 15200 }, { "epoch": 4.273468240584598, "grad_norm": 0.5126884579658508, "learning_rate": 1.2589923361770339e-05, "loss": 0.7729, "step": 15205 }, { "epoch": 4.274873524451939, "grad_norm": 0.5113247036933899, "learning_rate": 1.254231033059554e-05, "loss": 0.7678, "step": 15210 }, { "epoch": 4.27627880831928, "grad_norm": 0.531362771987915, "learning_rate": 1.2494781477847517e-05, "loss": 0.813, "step": 15215 }, { "epoch": 4.277684092186622, "grad_norm": 0.5442110896110535, "learning_rate": 1.244733684927294e-05, "loss": 0.7555, "step": 15220 }, { "epoch": 4.279089376053963, "grad_norm": 0.5352432131767273, "learning_rate": 1.2399976490537557e-05, "loss": 0.7686, "step": 15225 }, { "epoch": 4.280494659921304, "grad_norm": 0.5032948851585388, "learning_rate": 1.2352700447225918e-05, "loss": 0.7678, "step": 15230 }, { "epoch": 4.281899943788646, "grad_norm": 0.5116592645645142, "learning_rate": 1.230550876484139e-05, "loss": 0.7748, "step": 15235 }, { "epoch": 4.283305227655987, "grad_norm": 0.5569871664047241, "learning_rate": 1.2258401488806214e-05, "loss": 0.7693, "step": 15240 }, { "epoch": 4.284710511523327, "grad_norm": 0.5144664645195007, "learning_rate": 1.2211378664461348e-05, "loss": 0.7605, "step": 15245 }, { "epoch": 4.286115795390669, "grad_norm": 0.5427828431129456, "learning_rate": 1.2164440337066496e-05, "loss": 0.7653, "step": 15250 }, { "epoch": 4.28752107925801, "grad_norm": 0.5294144749641418, "learning_rate": 1.2117586551799987e-05, "loss": 0.7809, "step": 15255 }, { "epoch": 4.288926363125351, "grad_norm": 0.5116956233978271, "learning_rate": 1.2070817353758812e-05, "loss": 0.7702, "step": 15260 }, { "epoch": 4.290331646992692, "grad_norm": 0.5081368088722229, "learning_rate": 1.2024132787958532e-05, "loss": 0.7511, "step": 15265 }, { "epoch": 4.291736930860034, "grad_norm": 0.5898987650871277, "learning_rate": 1.1977532899333265e-05, "loss": 0.7767, "step": 15270 }, { "epoch": 4.293142214727375, "grad_norm": 0.4957788586616516, "learning_rate": 1.1931017732735627e-05, "loss": 0.7618, "step": 15275 }, { "epoch": 4.294547498594716, "grad_norm": 0.5281771421432495, "learning_rate": 1.1884587332936658e-05, "loss": 0.7715, "step": 15280 }, { "epoch": 4.295952782462058, "grad_norm": 0.5147823095321655, "learning_rate": 1.1838241744625866e-05, "loss": 0.7568, "step": 15285 }, { "epoch": 4.297358066329399, "grad_norm": 0.5159749984741211, "learning_rate": 1.1791981012411047e-05, "loss": 0.7559, "step": 15290 }, { "epoch": 4.29876335019674, "grad_norm": 0.5492069125175476, "learning_rate": 1.174580518081838e-05, "loss": 0.7648, "step": 15295 }, { "epoch": 4.300168634064081, "grad_norm": 0.5118445754051208, "learning_rate": 1.169971429429234e-05, "loss": 0.7675, "step": 15300 }, { "epoch": 4.301573917931422, "grad_norm": 0.5533217787742615, "learning_rate": 1.1653708397195584e-05, "loss": 0.7551, "step": 15305 }, { "epoch": 4.302979201798763, "grad_norm": 0.5249292850494385, "learning_rate": 1.1607787533809012e-05, "loss": 0.7705, "step": 15310 }, { "epoch": 4.304384485666104, "grad_norm": 0.5463940501213074, "learning_rate": 1.1561951748331657e-05, "loss": 0.7654, "step": 15315 }, { "epoch": 4.305789769533446, "grad_norm": 0.5173946022987366, "learning_rate": 1.1516201084880685e-05, "loss": 0.7684, "step": 15320 }, { "epoch": 4.307195053400787, "grad_norm": 0.5160226225852966, "learning_rate": 1.1470535587491316e-05, "loss": 0.7517, "step": 15325 }, { "epoch": 4.308600337268128, "grad_norm": 0.5239101648330688, "learning_rate": 1.1424955300116802e-05, "loss": 0.764, "step": 15330 }, { "epoch": 4.31000562113547, "grad_norm": 0.49685534834861755, "learning_rate": 1.1379460266628395e-05, "loss": 0.7719, "step": 15335 }, { "epoch": 4.311410905002811, "grad_norm": 0.5540210604667664, "learning_rate": 1.1334050530815221e-05, "loss": 0.767, "step": 15340 }, { "epoch": 4.312816188870152, "grad_norm": 0.5216269493103027, "learning_rate": 1.1288726136384397e-05, "loss": 0.7611, "step": 15345 }, { "epoch": 4.314221472737493, "grad_norm": 0.5204557180404663, "learning_rate": 1.1243487126960862e-05, "loss": 0.7632, "step": 15350 }, { "epoch": 4.315626756604834, "grad_norm": 0.5199287533760071, "learning_rate": 1.119833354608738e-05, "loss": 0.7522, "step": 15355 }, { "epoch": 4.317032040472175, "grad_norm": 0.5106371641159058, "learning_rate": 1.1153265437224436e-05, "loss": 0.754, "step": 15360 }, { "epoch": 4.318437324339516, "grad_norm": 0.5255478620529175, "learning_rate": 1.1108282843750318e-05, "loss": 0.7619, "step": 15365 }, { "epoch": 4.319842608206858, "grad_norm": 0.540982186794281, "learning_rate": 1.106338580896098e-05, "loss": 0.7582, "step": 15370 }, { "epoch": 4.321247892074199, "grad_norm": 0.5933972597122192, "learning_rate": 1.1018574376070012e-05, "loss": 0.7546, "step": 15375 }, { "epoch": 4.32265317594154, "grad_norm": 0.5034326314926147, "learning_rate": 1.0973848588208635e-05, "loss": 0.7602, "step": 15380 }, { "epoch": 4.324058459808882, "grad_norm": 0.5004976391792297, "learning_rate": 1.0929208488425624e-05, "loss": 0.7574, "step": 15385 }, { "epoch": 4.325463743676223, "grad_norm": 0.5454297661781311, "learning_rate": 1.0884654119687287e-05, "loss": 0.7643, "step": 15390 }, { "epoch": 4.326869027543564, "grad_norm": 0.5362131595611572, "learning_rate": 1.0840185524877388e-05, "loss": 0.7601, "step": 15395 }, { "epoch": 4.328274311410905, "grad_norm": 0.5598523020744324, "learning_rate": 1.0795802746797157e-05, "loss": 0.7513, "step": 15400 }, { "epoch": 4.329679595278246, "grad_norm": 0.5776387453079224, "learning_rate": 1.0751505828165253e-05, "loss": 0.7659, "step": 15405 }, { "epoch": 4.331084879145587, "grad_norm": 0.5143082737922668, "learning_rate": 1.0707294811617607e-05, "loss": 0.7705, "step": 15410 }, { "epoch": 4.332490163012928, "grad_norm": 0.5051475763320923, "learning_rate": 1.0663169739707557e-05, "loss": 0.7609, "step": 15415 }, { "epoch": 4.33389544688027, "grad_norm": 0.5366913080215454, "learning_rate": 1.0619130654905695e-05, "loss": 0.7665, "step": 15420 }, { "epoch": 4.335300730747611, "grad_norm": 0.5204769968986511, "learning_rate": 1.0575177599599818e-05, "loss": 0.7678, "step": 15425 }, { "epoch": 4.336706014614952, "grad_norm": 0.5569886565208435, "learning_rate": 1.0531310616094958e-05, "loss": 0.7628, "step": 15430 }, { "epoch": 4.338111298482294, "grad_norm": 0.6373873949050903, "learning_rate": 1.048752974661329e-05, "loss": 0.7731, "step": 15435 }, { "epoch": 4.339516582349635, "grad_norm": 0.5396785140037537, "learning_rate": 1.0443835033294113e-05, "loss": 0.7491, "step": 15440 }, { "epoch": 4.340921866216976, "grad_norm": 0.5870918035507202, "learning_rate": 1.0400226518193756e-05, "loss": 0.768, "step": 15445 }, { "epoch": 4.3423271500843175, "grad_norm": 0.5297492742538452, "learning_rate": 1.0356704243285631e-05, "loss": 0.7622, "step": 15450 }, { "epoch": 4.343732433951658, "grad_norm": 0.5458266735076904, "learning_rate": 1.0313268250460118e-05, "loss": 0.7672, "step": 15455 }, { "epoch": 4.345137717818999, "grad_norm": 0.5890789031982422, "learning_rate": 1.0269918581524596e-05, "loss": 0.7483, "step": 15460 }, { "epoch": 4.3465430016863404, "grad_norm": 0.5684923529624939, "learning_rate": 1.0226655278203267e-05, "loss": 0.7606, "step": 15465 }, { "epoch": 4.347948285553682, "grad_norm": 0.533937931060791, "learning_rate": 1.0183478382137291e-05, "loss": 0.7585, "step": 15470 }, { "epoch": 4.349353569421023, "grad_norm": 0.5563515424728394, "learning_rate": 1.0140387934884609e-05, "loss": 0.7688, "step": 15475 }, { "epoch": 4.350758853288364, "grad_norm": 0.5250877737998962, "learning_rate": 1.0097383977919995e-05, "loss": 0.7556, "step": 15480 }, { "epoch": 4.352164137155706, "grad_norm": 0.5379632711410522, "learning_rate": 1.0054466552634934e-05, "loss": 0.773, "step": 15485 }, { "epoch": 4.353569421023047, "grad_norm": 0.5072575807571411, "learning_rate": 1.0011635700337662e-05, "loss": 0.7553, "step": 15490 }, { "epoch": 4.354974704890388, "grad_norm": 0.5212836861610413, "learning_rate": 9.968891462253083e-06, "loss": 0.76, "step": 15495 }, { "epoch": 4.3563799887577295, "grad_norm": 0.5077595114707947, "learning_rate": 9.926233879522683e-06, "loss": 0.762, "step": 15500 }, { "epoch": 4.35778527262507, "grad_norm": 0.5250112414360046, "learning_rate": 9.8836629932046e-06, "loss": 0.7732, "step": 15505 }, { "epoch": 4.359190556492411, "grad_norm": 0.49983668327331543, "learning_rate": 9.841178844273502e-06, "loss": 0.7607, "step": 15510 }, { "epoch": 4.3605958403597525, "grad_norm": 0.5391385555267334, "learning_rate": 9.79878147362061e-06, "loss": 0.7721, "step": 15515 }, { "epoch": 4.362001124227094, "grad_norm": 0.5197238326072693, "learning_rate": 9.756470922053529e-06, "loss": 0.7712, "step": 15520 }, { "epoch": 4.363406408094435, "grad_norm": 0.5797452330589294, "learning_rate": 9.714247230296391e-06, "loss": 0.7648, "step": 15525 }, { "epoch": 4.364811691961776, "grad_norm": 0.6094208359718323, "learning_rate": 9.672110438989701e-06, "loss": 0.7616, "step": 15530 }, { "epoch": 4.366216975829118, "grad_norm": 0.5721893310546875, "learning_rate": 9.630060588690292e-06, "loss": 0.7581, "step": 15535 }, { "epoch": 4.367622259696459, "grad_norm": 0.5357393026351929, "learning_rate": 9.588097719871347e-06, "loss": 0.7634, "step": 15540 }, { "epoch": 4.3690275435638, "grad_norm": 0.5096330642700195, "learning_rate": 9.546221872922322e-06, "loss": 0.7596, "step": 15545 }, { "epoch": 4.3704328274311415, "grad_norm": 0.5289176106452942, "learning_rate": 9.504433088148934e-06, "loss": 0.7496, "step": 15550 }, { "epoch": 4.371838111298482, "grad_norm": 0.5366396903991699, "learning_rate": 9.462731405773039e-06, "loss": 0.7589, "step": 15555 }, { "epoch": 4.373243395165823, "grad_norm": 0.5225094556808472, "learning_rate": 9.421116865932711e-06, "loss": 0.7554, "step": 15560 }, { "epoch": 4.3746486790331645, "grad_norm": 0.5416147112846375, "learning_rate": 9.379589508682152e-06, "loss": 0.7664, "step": 15565 }, { "epoch": 4.376053962900506, "grad_norm": 0.541254460811615, "learning_rate": 9.338149373991611e-06, "loss": 0.7587, "step": 15570 }, { "epoch": 4.377459246767847, "grad_norm": 0.522843599319458, "learning_rate": 9.296796501747406e-06, "loss": 0.8008, "step": 15575 }, { "epoch": 4.378864530635188, "grad_norm": 0.5551980137825012, "learning_rate": 9.255530931751866e-06, "loss": 0.771, "step": 15580 }, { "epoch": 4.38026981450253, "grad_norm": 0.5036432147026062, "learning_rate": 9.214352703723327e-06, "loss": 0.7634, "step": 15585 }, { "epoch": 4.381675098369871, "grad_norm": 0.49579012393951416, "learning_rate": 9.173261857295989e-06, "loss": 0.7678, "step": 15590 }, { "epoch": 4.383080382237212, "grad_norm": 0.5740176439285278, "learning_rate": 9.13225843201998e-06, "loss": 0.7705, "step": 15595 }, { "epoch": 4.3844856661045535, "grad_norm": 0.5432335138320923, "learning_rate": 9.091342467361308e-06, "loss": 0.7656, "step": 15600 }, { "epoch": 4.385890949971895, "grad_norm": 0.515156090259552, "learning_rate": 9.050514002701748e-06, "loss": 0.7561, "step": 15605 }, { "epoch": 4.387296233839235, "grad_norm": 0.5516624450683594, "learning_rate": 9.00977307733889e-06, "loss": 0.7573, "step": 15610 }, { "epoch": 4.3887015177065765, "grad_norm": 0.5266974568367004, "learning_rate": 8.969119730486075e-06, "loss": 0.7579, "step": 15615 }, { "epoch": 4.390106801573918, "grad_norm": 0.5234085321426392, "learning_rate": 8.928554001272337e-06, "loss": 0.7637, "step": 15620 }, { "epoch": 4.391512085441259, "grad_norm": 0.49571606516838074, "learning_rate": 8.888075928742357e-06, "loss": 0.7528, "step": 15625 }, { "epoch": 4.3929173693086, "grad_norm": 0.5543531775474548, "learning_rate": 8.847685551856455e-06, "loss": 0.7663, "step": 15630 }, { "epoch": 4.394322653175942, "grad_norm": 0.5438440442085266, "learning_rate": 8.807382909490603e-06, "loss": 0.7748, "step": 15635 }, { "epoch": 4.395727937043283, "grad_norm": 0.5622188448905945, "learning_rate": 8.767168040436235e-06, "loss": 0.7497, "step": 15640 }, { "epoch": 4.397133220910624, "grad_norm": 0.5074348449707031, "learning_rate": 8.72704098340037e-06, "loss": 0.7475, "step": 15645 }, { "epoch": 4.3985385047779655, "grad_norm": 0.5234601497650146, "learning_rate": 8.687001777005465e-06, "loss": 0.7619, "step": 15650 }, { "epoch": 4.399943788645306, "grad_norm": 0.5129987597465515, "learning_rate": 8.647050459789474e-06, "loss": 0.7652, "step": 15655 }, { "epoch": 4.401349072512647, "grad_norm": 0.5113297700881958, "learning_rate": 8.60718707020567e-06, "loss": 0.7547, "step": 15660 }, { "epoch": 4.4027543563799885, "grad_norm": 0.7454274892807007, "learning_rate": 8.56741164662278e-06, "loss": 0.764, "step": 15665 }, { "epoch": 4.40415964024733, "grad_norm": 0.5510764718055725, "learning_rate": 8.527724227324851e-06, "loss": 0.767, "step": 15670 }, { "epoch": 4.405564924114671, "grad_norm": 0.63843834400177, "learning_rate": 8.48812485051116e-06, "loss": 0.7677, "step": 15675 }, { "epoch": 4.406970207982012, "grad_norm": 0.5484469532966614, "learning_rate": 8.448613554296304e-06, "loss": 0.7667, "step": 15680 }, { "epoch": 4.408375491849354, "grad_norm": 0.6233168840408325, "learning_rate": 8.409190376710097e-06, "loss": 0.7638, "step": 15685 }, { "epoch": 4.409780775716695, "grad_norm": 0.6250582933425903, "learning_rate": 8.369855355697554e-06, "loss": 0.7635, "step": 15690 }, { "epoch": 4.411186059584036, "grad_norm": 0.5301230549812317, "learning_rate": 8.330608529118756e-06, "loss": 0.7624, "step": 15695 }, { "epoch": 4.4125913434513775, "grad_norm": 0.5399486422538757, "learning_rate": 8.291449934748985e-06, "loss": 0.7563, "step": 15700 }, { "epoch": 4.413996627318719, "grad_norm": 0.530448853969574, "learning_rate": 8.252379610278582e-06, "loss": 0.8029, "step": 15705 }, { "epoch": 4.415401911186059, "grad_norm": 0.5428824424743652, "learning_rate": 8.213397593312866e-06, "loss": 0.7827, "step": 15710 }, { "epoch": 4.4168071950534005, "grad_norm": 0.688864529132843, "learning_rate": 8.174503921372246e-06, "loss": 0.7641, "step": 15715 }, { "epoch": 4.418212478920742, "grad_norm": 0.5148605704307556, "learning_rate": 8.13569863189204e-06, "loss": 0.7754, "step": 15720 }, { "epoch": 4.419617762788083, "grad_norm": 0.509868323802948, "learning_rate": 8.096981762222534e-06, "loss": 0.7706, "step": 15725 }, { "epoch": 4.421023046655424, "grad_norm": 0.5653720498085022, "learning_rate": 8.058353349628877e-06, "loss": 0.7484, "step": 15730 }, { "epoch": 4.422428330522766, "grad_norm": 0.524649441242218, "learning_rate": 8.0198134312911e-06, "loss": 0.7613, "step": 15735 }, { "epoch": 4.423833614390107, "grad_norm": 0.5777130722999573, "learning_rate": 7.981362044304074e-06, "loss": 0.7628, "step": 15740 }, { "epoch": 4.425238898257448, "grad_norm": 0.4954436123371124, "learning_rate": 7.942999225677394e-06, "loss": 0.7615, "step": 15745 }, { "epoch": 4.4266441821247895, "grad_norm": 0.5425832867622375, "learning_rate": 7.904725012335457e-06, "loss": 0.7525, "step": 15750 }, { "epoch": 4.428049465992131, "grad_norm": 0.607391893863678, "learning_rate": 7.866539441117383e-06, "loss": 0.7587, "step": 15755 }, { "epoch": 4.429454749859472, "grad_norm": 0.5088168382644653, "learning_rate": 7.82844254877697e-06, "loss": 0.7697, "step": 15760 }, { "epoch": 4.4308600337268125, "grad_norm": 0.5161483287811279, "learning_rate": 7.790434371982624e-06, "loss": 0.7699, "step": 15765 }, { "epoch": 4.432265317594154, "grad_norm": 0.5147720575332642, "learning_rate": 7.75251494731739e-06, "loss": 0.7515, "step": 15770 }, { "epoch": 4.433670601461495, "grad_norm": 0.519303023815155, "learning_rate": 7.714684311278908e-06, "loss": 0.7599, "step": 15775 }, { "epoch": 4.435075885328836, "grad_norm": 0.5051060318946838, "learning_rate": 7.676942500279316e-06, "loss": 0.7647, "step": 15780 }, { "epoch": 4.436481169196178, "grad_norm": 0.5474300384521484, "learning_rate": 7.6392895506453e-06, "loss": 0.7635, "step": 15785 }, { "epoch": 4.437886453063519, "grad_norm": 0.5233967304229736, "learning_rate": 7.601725498617985e-06, "loss": 0.7639, "step": 15790 }, { "epoch": 4.43929173693086, "grad_norm": 0.5138359665870667, "learning_rate": 7.564250380352966e-06, "loss": 0.7558, "step": 15795 }, { "epoch": 4.4406970207982015, "grad_norm": 0.5092753171920776, "learning_rate": 7.526864231920183e-06, "loss": 0.7662, "step": 15800 }, { "epoch": 4.442102304665543, "grad_norm": 0.5444674491882324, "learning_rate": 7.489567089303984e-06, "loss": 0.763, "step": 15805 }, { "epoch": 4.443507588532883, "grad_norm": 0.5607460141181946, "learning_rate": 7.452358988403063e-06, "loss": 0.7821, "step": 15810 }, { "epoch": 4.4449128724002245, "grad_norm": 0.5317428112030029, "learning_rate": 7.4152399650303515e-06, "loss": 0.7736, "step": 15815 }, { "epoch": 4.446318156267566, "grad_norm": 0.5346295833587646, "learning_rate": 7.378210054913104e-06, "loss": 0.7567, "step": 15820 }, { "epoch": 4.447723440134907, "grad_norm": 0.5458088517189026, "learning_rate": 7.341269293692765e-06, "loss": 0.7599, "step": 15825 }, { "epoch": 4.449128724002248, "grad_norm": 0.5532664656639099, "learning_rate": 7.304417716924994e-06, "loss": 0.7883, "step": 15830 }, { "epoch": 4.45053400786959, "grad_norm": 0.6144757866859436, "learning_rate": 7.267655360079595e-06, "loss": 0.7551, "step": 15835 }, { "epoch": 4.451939291736931, "grad_norm": 0.5150646567344666, "learning_rate": 7.230982258540508e-06, "loss": 0.7669, "step": 15840 }, { "epoch": 4.453344575604272, "grad_norm": 0.5442806482315063, "learning_rate": 7.194398447605788e-06, "loss": 0.761, "step": 15845 }, { "epoch": 4.4547498594716135, "grad_norm": 0.5361512899398804, "learning_rate": 7.157903962487489e-06, "loss": 0.7558, "step": 15850 }, { "epoch": 4.456155143338955, "grad_norm": 0.506851851940155, "learning_rate": 7.121498838311713e-06, "loss": 0.7636, "step": 15855 }, { "epoch": 4.457560427206296, "grad_norm": 0.5137316584587097, "learning_rate": 7.0851831101185764e-06, "loss": 0.766, "step": 15860 }, { "epoch": 4.4589657110736365, "grad_norm": 0.5157634615898132, "learning_rate": 7.048956812862151e-06, "loss": 0.7577, "step": 15865 }, { "epoch": 4.460370994940978, "grad_norm": 0.6385879516601562, "learning_rate": 7.012819981410379e-06, "loss": 0.7752, "step": 15870 }, { "epoch": 4.461776278808319, "grad_norm": 0.5130940675735474, "learning_rate": 6.976772650545138e-06, "loss": 0.7726, "step": 15875 }, { "epoch": 4.46318156267566, "grad_norm": 0.5212185382843018, "learning_rate": 6.940814854962141e-06, "loss": 0.7591, "step": 15880 }, { "epoch": 4.464586846543002, "grad_norm": 0.5533390045166016, "learning_rate": 6.9049466292709584e-06, "loss": 0.8123, "step": 15885 }, { "epoch": 4.465992130410343, "grad_norm": 0.5034365653991699, "learning_rate": 6.869168007994897e-06, "loss": 0.7644, "step": 15890 }, { "epoch": 4.467397414277684, "grad_norm": 0.5371700525283813, "learning_rate": 6.833479025571044e-06, "loss": 0.7671, "step": 15895 }, { "epoch": 4.4688026981450255, "grad_norm": 0.5383287072181702, "learning_rate": 6.797879716350242e-06, "loss": 0.7615, "step": 15900 }, { "epoch": 4.470207982012367, "grad_norm": 0.5258095264434814, "learning_rate": 6.7623701145969495e-06, "loss": 0.7732, "step": 15905 }, { "epoch": 4.471613265879708, "grad_norm": 0.5264952778816223, "learning_rate": 6.726950254489328e-06, "loss": 0.7599, "step": 15910 }, { "epoch": 4.4730185497470485, "grad_norm": 0.557831883430481, "learning_rate": 6.691620170119173e-06, "loss": 0.7707, "step": 15915 }, { "epoch": 4.47442383361439, "grad_norm": 0.5028601884841919, "learning_rate": 6.656379895491826e-06, "loss": 0.7693, "step": 15920 }, { "epoch": 4.475829117481731, "grad_norm": 0.5211167931556702, "learning_rate": 6.621229464526235e-06, "loss": 0.7557, "step": 15925 }, { "epoch": 4.477234401349072, "grad_norm": 0.5280676484107971, "learning_rate": 6.586168911054835e-06, "loss": 0.7496, "step": 15930 }, { "epoch": 4.478639685216414, "grad_norm": 0.5282173156738281, "learning_rate": 6.551198268823588e-06, "loss": 0.7588, "step": 15935 }, { "epoch": 4.480044969083755, "grad_norm": 0.5561384558677673, "learning_rate": 6.51631757149187e-06, "loss": 0.8083, "step": 15940 }, { "epoch": 4.481450252951096, "grad_norm": 0.5195598006248474, "learning_rate": 6.4815268526325465e-06, "loss": 0.7791, "step": 15945 }, { "epoch": 4.4828555368184375, "grad_norm": 0.5344316363334656, "learning_rate": 6.44682614573181e-06, "loss": 0.7518, "step": 15950 }, { "epoch": 4.484260820685779, "grad_norm": 0.5257565975189209, "learning_rate": 6.412215484189288e-06, "loss": 0.7538, "step": 15955 }, { "epoch": 4.48566610455312, "grad_norm": 0.4952334463596344, "learning_rate": 6.377694901317865e-06, "loss": 0.7582, "step": 15960 }, { "epoch": 4.4870713884204605, "grad_norm": 0.5309807062149048, "learning_rate": 6.343264430343776e-06, "loss": 0.7604, "step": 15965 }, { "epoch": 4.488476672287802, "grad_norm": 0.5829389691352844, "learning_rate": 6.308924104406511e-06, "loss": 0.8324, "step": 15970 }, { "epoch": 4.489881956155143, "grad_norm": 0.5203680992126465, "learning_rate": 6.274673956558774e-06, "loss": 0.7751, "step": 15975 }, { "epoch": 4.491287240022484, "grad_norm": 0.5616741180419922, "learning_rate": 6.240514019766497e-06, "loss": 0.7647, "step": 15980 }, { "epoch": 4.492692523889826, "grad_norm": 0.5452646613121033, "learning_rate": 6.206444326908778e-06, "loss": 0.7675, "step": 15985 }, { "epoch": 4.494097807757167, "grad_norm": 0.5431065559387207, "learning_rate": 6.172464910777853e-06, "loss": 0.7525, "step": 15990 }, { "epoch": 4.495503091624508, "grad_norm": 0.550834596157074, "learning_rate": 6.138575804079072e-06, "loss": 0.7584, "step": 15995 }, { "epoch": 4.4969083754918495, "grad_norm": 0.5268097519874573, "learning_rate": 6.104777039430842e-06, "loss": 0.7632, "step": 16000 }, { "epoch": 4.498313659359191, "grad_norm": 0.5832206606864929, "learning_rate": 6.071068649364642e-06, "loss": 0.7646, "step": 16005 }, { "epoch": 4.499718943226532, "grad_norm": 0.5139409303665161, "learning_rate": 6.037450666324939e-06, "loss": 0.8085, "step": 16010 }, { "epoch": 4.501124227093873, "grad_norm": 0.6051554679870605, "learning_rate": 6.0039231226691976e-06, "loss": 0.755, "step": 16015 }, { "epoch": 4.502529510961214, "grad_norm": 0.586642324924469, "learning_rate": 5.970486050667834e-06, "loss": 0.7656, "step": 16020 }, { "epoch": 4.503934794828555, "grad_norm": 0.5114891529083252, "learning_rate": 5.937139482504206e-06, "loss": 0.7529, "step": 16025 }, { "epoch": 4.505340078695896, "grad_norm": 0.5245956778526306, "learning_rate": 5.903883450274506e-06, "loss": 0.752, "step": 16030 }, { "epoch": 4.506745362563238, "grad_norm": 0.5149771571159363, "learning_rate": 5.870717985987817e-06, "loss": 0.7527, "step": 16035 }, { "epoch": 4.508150646430579, "grad_norm": 0.5146948099136353, "learning_rate": 5.837643121566072e-06, "loss": 0.757, "step": 16040 }, { "epoch": 4.50955593029792, "grad_norm": 0.5637561678886414, "learning_rate": 5.804658888843961e-06, "loss": 0.7752, "step": 16045 }, { "epoch": 4.5109612141652615, "grad_norm": 0.57065349817276, "learning_rate": 5.771765319568967e-06, "loss": 0.7598, "step": 16050 }, { "epoch": 4.512366498032603, "grad_norm": 0.6010316014289856, "learning_rate": 5.738962445401308e-06, "loss": 0.8019, "step": 16055 }, { "epoch": 4.513771781899944, "grad_norm": 0.5177924036979675, "learning_rate": 5.7062502979138955e-06, "loss": 0.7554, "step": 16060 }, { "epoch": 4.5151770657672845, "grad_norm": 0.5299884080886841, "learning_rate": 5.673628908592321e-06, "loss": 0.7604, "step": 16065 }, { "epoch": 4.516582349634627, "grad_norm": 0.535160481929779, "learning_rate": 5.641098308834802e-06, "loss": 0.7599, "step": 16070 }, { "epoch": 4.517987633501967, "grad_norm": 0.5174487829208374, "learning_rate": 5.608658529952238e-06, "loss": 0.7566, "step": 16075 }, { "epoch": 4.519392917369308, "grad_norm": 0.510744035243988, "learning_rate": 5.576309603168017e-06, "loss": 0.7561, "step": 16080 }, { "epoch": 4.52079820123665, "grad_norm": 0.5133626461029053, "learning_rate": 5.544051559618135e-06, "loss": 0.7474, "step": 16085 }, { "epoch": 4.522203485103991, "grad_norm": 0.5179252624511719, "learning_rate": 5.511884430351111e-06, "loss": 0.7613, "step": 16090 }, { "epoch": 4.523608768971332, "grad_norm": 0.5430054664611816, "learning_rate": 5.479808246327989e-06, "loss": 0.761, "step": 16095 }, { "epoch": 4.5250140528386735, "grad_norm": 0.530794084072113, "learning_rate": 5.447823038422206e-06, "loss": 0.7604, "step": 16100 }, { "epoch": 4.526419336706015, "grad_norm": 0.5674378871917725, "learning_rate": 5.4159288374196705e-06, "loss": 0.774, "step": 16105 }, { "epoch": 4.527824620573356, "grad_norm": 0.5048345327377319, "learning_rate": 5.384125674018725e-06, "loss": 0.8147, "step": 16110 }, { "epoch": 4.529229904440697, "grad_norm": 0.5289924144744873, "learning_rate": 5.352413578830029e-06, "loss": 0.7554, "step": 16115 }, { "epoch": 4.530635188308038, "grad_norm": 0.5061425566673279, "learning_rate": 5.320792582376622e-06, "loss": 0.7521, "step": 16120 }, { "epoch": 4.532040472175379, "grad_norm": 0.5277791023254395, "learning_rate": 5.289262715093879e-06, "loss": 0.7567, "step": 16125 }, { "epoch": 4.53344575604272, "grad_norm": 0.5125671625137329, "learning_rate": 5.257824007329437e-06, "loss": 0.7617, "step": 16130 }, { "epoch": 4.534851039910062, "grad_norm": 0.5968344807624817, "learning_rate": 5.226476489343168e-06, "loss": 0.7655, "step": 16135 }, { "epoch": 4.536256323777403, "grad_norm": 0.5773058533668518, "learning_rate": 5.195220191307226e-06, "loss": 0.7576, "step": 16140 }, { "epoch": 4.537661607644744, "grad_norm": 0.5427569150924683, "learning_rate": 5.1640551433059685e-06, "loss": 0.771, "step": 16145 }, { "epoch": 4.5390668915120855, "grad_norm": 0.5206560492515564, "learning_rate": 5.132981375335843e-06, "loss": 0.7581, "step": 16150 }, { "epoch": 4.540472175379427, "grad_norm": 0.5196772813796997, "learning_rate": 5.101998917305517e-06, "loss": 0.768, "step": 16155 }, { "epoch": 4.541877459246768, "grad_norm": 0.5120740532875061, "learning_rate": 5.071107799035746e-06, "loss": 0.745, "step": 16160 }, { "epoch": 4.543282743114109, "grad_norm": 0.5352237820625305, "learning_rate": 5.04030805025939e-06, "loss": 0.7531, "step": 16165 }, { "epoch": 4.544688026981451, "grad_norm": 0.5228151082992554, "learning_rate": 5.009599700621314e-06, "loss": 0.7602, "step": 16170 }, { "epoch": 4.546093310848791, "grad_norm": 0.5458948612213135, "learning_rate": 4.978982779678476e-06, "loss": 0.7561, "step": 16175 }, { "epoch": 4.547498594716132, "grad_norm": 0.5529809594154358, "learning_rate": 4.94845731689979e-06, "loss": 0.8094, "step": 16180 }, { "epoch": 4.548903878583474, "grad_norm": 0.5764121413230896, "learning_rate": 4.918023341666145e-06, "loss": 0.7596, "step": 16185 }, { "epoch": 4.550309162450815, "grad_norm": 0.5057691335678101, "learning_rate": 4.887680883270385e-06, "loss": 0.7736, "step": 16190 }, { "epoch": 4.551714446318156, "grad_norm": 0.5106030106544495, "learning_rate": 4.857429970917282e-06, "loss": 0.7563, "step": 16195 }, { "epoch": 4.5531197301854975, "grad_norm": 0.49489980936050415, "learning_rate": 4.82727063372348e-06, "loss": 0.7626, "step": 16200 }, { "epoch": 4.554525014052839, "grad_norm": 0.5165312886238098, "learning_rate": 4.797202900717457e-06, "loss": 0.756, "step": 16205 }, { "epoch": 4.55593029792018, "grad_norm": 0.5062229633331299, "learning_rate": 4.7672268008395415e-06, "loss": 0.7612, "step": 16210 }, { "epoch": 4.557335581787521, "grad_norm": 0.5089703798294067, "learning_rate": 4.737342362941899e-06, "loss": 0.7699, "step": 16215 }, { "epoch": 4.558740865654862, "grad_norm": 0.5935402512550354, "learning_rate": 4.707549615788398e-06, "loss": 0.7597, "step": 16220 }, { "epoch": 4.560146149522204, "grad_norm": 0.55452561378479, "learning_rate": 4.6778485880547115e-06, "loss": 0.7709, "step": 16225 }, { "epoch": 4.561551433389544, "grad_norm": 0.5225324034690857, "learning_rate": 4.648239308328228e-06, "loss": 0.762, "step": 16230 }, { "epoch": 4.562956717256886, "grad_norm": 0.519692063331604, "learning_rate": 4.618721805107995e-06, "loss": 0.7638, "step": 16235 }, { "epoch": 4.564362001124227, "grad_norm": 0.5288437008857727, "learning_rate": 4.589296106804753e-06, "loss": 0.76, "step": 16240 }, { "epoch": 4.565767284991568, "grad_norm": 0.5157180428504944, "learning_rate": 4.559962241740867e-06, "loss": 0.7628, "step": 16245 }, { "epoch": 4.5671725688589095, "grad_norm": 0.5235899686813354, "learning_rate": 4.530720238150332e-06, "loss": 0.7634, "step": 16250 }, { "epoch": 4.568577852726251, "grad_norm": 0.5051559805870056, "learning_rate": 4.501570124178689e-06, "loss": 0.7726, "step": 16255 }, { "epoch": 4.569983136593592, "grad_norm": 0.5335587859153748, "learning_rate": 4.472511927883072e-06, "loss": 0.763, "step": 16260 }, { "epoch": 4.571388420460933, "grad_norm": 0.5303399562835693, "learning_rate": 4.4435456772321085e-06, "loss": 0.7794, "step": 16265 }, { "epoch": 4.572793704328275, "grad_norm": 0.5151443481445312, "learning_rate": 4.414671400105985e-06, "loss": 0.7641, "step": 16270 }, { "epoch": 4.574198988195615, "grad_norm": 0.5121344327926636, "learning_rate": 4.3858891242962825e-06, "loss": 0.7451, "step": 16275 }, { "epoch": 4.575604272062956, "grad_norm": 0.5286407470703125, "learning_rate": 4.357198877506086e-06, "loss": 0.7743, "step": 16280 }, { "epoch": 4.577009555930298, "grad_norm": 0.5202147364616394, "learning_rate": 4.328600687349904e-06, "loss": 0.764, "step": 16285 }, { "epoch": 4.578414839797639, "grad_norm": 0.5142115354537964, "learning_rate": 4.30009458135362e-06, "loss": 0.7661, "step": 16290 }, { "epoch": 4.57982012366498, "grad_norm": 0.5535638332366943, "learning_rate": 4.271680586954474e-06, "loss": 0.7589, "step": 16295 }, { "epoch": 4.5812254075323215, "grad_norm": 0.5315068960189819, "learning_rate": 4.2433587315010905e-06, "loss": 0.7642, "step": 16300 }, { "epoch": 4.582630691399663, "grad_norm": 0.5588862299919128, "learning_rate": 4.2151290422533855e-06, "loss": 0.7632, "step": 16305 }, { "epoch": 4.584035975267004, "grad_norm": 0.5527219772338867, "learning_rate": 4.186991546382535e-06, "loss": 0.7657, "step": 16310 }, { "epoch": 4.585441259134345, "grad_norm": 0.5488102436065674, "learning_rate": 4.158946270971031e-06, "loss": 0.7743, "step": 16315 }, { "epoch": 4.586846543001687, "grad_norm": 0.5361760258674622, "learning_rate": 4.130993243012582e-06, "loss": 0.7533, "step": 16320 }, { "epoch": 4.588251826869028, "grad_norm": 0.4912406802177429, "learning_rate": 4.103132489412087e-06, "loss": 0.7597, "step": 16325 }, { "epoch": 4.589657110736368, "grad_norm": 0.5646995902061462, "learning_rate": 4.075364036985663e-06, "loss": 0.7607, "step": 16330 }, { "epoch": 4.59106239460371, "grad_norm": 0.5502670407295227, "learning_rate": 4.047687912460563e-06, "loss": 0.7692, "step": 16335 }, { "epoch": 4.592467678471051, "grad_norm": 0.5219792127609253, "learning_rate": 4.0201041424752006e-06, "loss": 0.8079, "step": 16340 }, { "epoch": 4.593872962338392, "grad_norm": 0.5074188709259033, "learning_rate": 3.992612753579061e-06, "loss": 0.8031, "step": 16345 }, { "epoch": 4.5952782462057336, "grad_norm": 0.5782954096794128, "learning_rate": 3.965213772232745e-06, "loss": 0.7648, "step": 16350 }, { "epoch": 4.596683530073075, "grad_norm": 0.5999060869216919, "learning_rate": 3.93790722480788e-06, "loss": 0.7574, "step": 16355 }, { "epoch": 4.598088813940416, "grad_norm": 0.5622010827064514, "learning_rate": 3.910693137587185e-06, "loss": 0.777, "step": 16360 }, { "epoch": 4.599494097807757, "grad_norm": 0.5037088990211487, "learning_rate": 3.883571536764297e-06, "loss": 0.7594, "step": 16365 }, { "epoch": 4.600899381675099, "grad_norm": 0.5346062779426575, "learning_rate": 3.856542448443889e-06, "loss": 0.7749, "step": 16370 }, { "epoch": 4.602304665542439, "grad_norm": 0.5186667442321777, "learning_rate": 3.8296058986416196e-06, "loss": 0.7553, "step": 16375 }, { "epoch": 4.60370994940978, "grad_norm": 0.5188003778457642, "learning_rate": 3.802761913283992e-06, "loss": 0.771, "step": 16380 }, { "epoch": 4.605115233277122, "grad_norm": 0.5112647414207458, "learning_rate": 3.7760105182084838e-06, "loss": 0.7583, "step": 16385 }, { "epoch": 4.606520517144463, "grad_norm": 0.5383247137069702, "learning_rate": 3.7493517391634426e-06, "loss": 0.7726, "step": 16390 }, { "epoch": 4.607925801011804, "grad_norm": 0.5470320582389832, "learning_rate": 3.7227856018080655e-06, "loss": 0.7592, "step": 16395 }, { "epoch": 4.609331084879146, "grad_norm": 0.5709768533706665, "learning_rate": 3.696312131712376e-06, "loss": 0.7748, "step": 16400 }, { "epoch": 4.610736368746487, "grad_norm": 0.5174989104270935, "learning_rate": 3.6699313543572034e-06, "loss": 0.7677, "step": 16405 }, { "epoch": 4.612141652613828, "grad_norm": 0.5489538311958313, "learning_rate": 3.6436432951341914e-06, "loss": 0.7547, "step": 16410 }, { "epoch": 4.613546936481169, "grad_norm": 0.565827488899231, "learning_rate": 3.6174479793456894e-06, "loss": 0.77, "step": 16415 }, { "epoch": 4.614952220348511, "grad_norm": 0.5434299111366272, "learning_rate": 3.591345432204807e-06, "loss": 0.7526, "step": 16420 }, { "epoch": 4.616357504215852, "grad_norm": 0.5446549654006958, "learning_rate": 3.565335678835391e-06, "loss": 0.7752, "step": 16425 }, { "epoch": 4.617762788083192, "grad_norm": 0.5367806553840637, "learning_rate": 3.5394187442719494e-06, "loss": 0.7712, "step": 16430 }, { "epoch": 4.619168071950534, "grad_norm": 0.5381150245666504, "learning_rate": 3.5135946534596175e-06, "loss": 0.81, "step": 16435 }, { "epoch": 4.620573355817875, "grad_norm": 0.5752198100090027, "learning_rate": 3.4878634312542125e-06, "loss": 0.7584, "step": 16440 }, { "epoch": 4.621978639685216, "grad_norm": 0.5078454613685608, "learning_rate": 3.4622251024221674e-06, "loss": 0.7533, "step": 16445 }, { "epoch": 4.623383923552558, "grad_norm": 0.5012295842170715, "learning_rate": 3.4366796916404875e-06, "loss": 0.753, "step": 16450 }, { "epoch": 4.624789207419899, "grad_norm": 0.507514476776123, "learning_rate": 3.411227223496749e-06, "loss": 0.7937, "step": 16455 }, { "epoch": 4.62619449128724, "grad_norm": 0.49216654896736145, "learning_rate": 3.3858677224890557e-06, "loss": 0.7496, "step": 16460 }, { "epoch": 4.627599775154581, "grad_norm": 0.5676223039627075, "learning_rate": 3.36060121302606e-06, "loss": 0.77, "step": 16465 }, { "epoch": 4.629005059021923, "grad_norm": 0.5882121920585632, "learning_rate": 3.3354277194268755e-06, "loss": 0.7718, "step": 16470 }, { "epoch": 4.630410342889264, "grad_norm": 0.5617997646331787, "learning_rate": 3.310347265921121e-06, "loss": 0.7586, "step": 16475 }, { "epoch": 4.631815626756605, "grad_norm": 0.5538641214370728, "learning_rate": 3.2853598766488523e-06, "loss": 0.7652, "step": 16480 }, { "epoch": 4.633220910623946, "grad_norm": 0.5209192633628845, "learning_rate": 3.2604655756605206e-06, "loss": 0.768, "step": 16485 }, { "epoch": 4.634626194491287, "grad_norm": 0.6011056303977966, "learning_rate": 3.2356643869170254e-06, "loss": 0.7707, "step": 16490 }, { "epoch": 4.636031478358628, "grad_norm": 0.5389314889907837, "learning_rate": 3.2109563342896053e-06, "loss": 0.7543, "step": 16495 }, { "epoch": 4.63743676222597, "grad_norm": 0.541045069694519, "learning_rate": 3.1863414415598923e-06, "loss": 0.7532, "step": 16500 }, { "epoch": 4.638842046093311, "grad_norm": 0.5399346351623535, "learning_rate": 3.1618197324198352e-06, "loss": 0.7775, "step": 16505 }, { "epoch": 4.640247329960652, "grad_norm": 0.5333517789840698, "learning_rate": 3.1373912304716758e-06, "loss": 0.7716, "step": 16510 }, { "epoch": 4.641652613827993, "grad_norm": 0.5069199800491333, "learning_rate": 3.113055959227984e-06, "loss": 0.7525, "step": 16515 }, { "epoch": 4.643057897695335, "grad_norm": 0.5578119158744812, "learning_rate": 3.0888139421115347e-06, "loss": 0.7621, "step": 16520 }, { "epoch": 4.644463181562676, "grad_norm": 0.5046953558921814, "learning_rate": 3.064665202455408e-06, "loss": 0.7598, "step": 16525 }, { "epoch": 4.645868465430016, "grad_norm": 0.5236426591873169, "learning_rate": 3.040609763502866e-06, "loss": 0.7749, "step": 16530 }, { "epoch": 4.647273749297358, "grad_norm": 0.5197000503540039, "learning_rate": 3.016647648407389e-06, "loss": 0.7596, "step": 16535 }, { "epoch": 4.648679033164699, "grad_norm": 0.5232204794883728, "learning_rate": 2.992778880232594e-06, "loss": 0.8043, "step": 16540 }, { "epoch": 4.65008431703204, "grad_norm": 0.506645679473877, "learning_rate": 2.969003481952315e-06, "loss": 0.7428, "step": 16545 }, { "epoch": 4.651489600899382, "grad_norm": 0.5069267749786377, "learning_rate": 2.9453214764504576e-06, "loss": 0.7707, "step": 16550 }, { "epoch": 4.652894884766723, "grad_norm": 0.5028716325759888, "learning_rate": 2.921732886521067e-06, "loss": 0.7609, "step": 16555 }, { "epoch": 4.654300168634064, "grad_norm": 0.5739719867706299, "learning_rate": 2.8982377348682697e-06, "loss": 0.7573, "step": 16560 }, { "epoch": 4.655705452501405, "grad_norm": 0.5546180009841919, "learning_rate": 2.874836044106266e-06, "loss": 0.762, "step": 16565 }, { "epoch": 4.657110736368747, "grad_norm": 0.5253195762634277, "learning_rate": 2.8515278367592823e-06, "loss": 0.7555, "step": 16570 }, { "epoch": 4.658516020236088, "grad_norm": 0.5367646217346191, "learning_rate": 2.828313135261573e-06, "loss": 0.7514, "step": 16575 }, { "epoch": 4.659921304103429, "grad_norm": 0.5407754778862, "learning_rate": 2.8051919619573986e-06, "loss": 0.7546, "step": 16580 }, { "epoch": 4.66132658797077, "grad_norm": 0.5303405523300171, "learning_rate": 2.782164339101001e-06, "loss": 0.7582, "step": 16585 }, { "epoch": 4.662731871838111, "grad_norm": 0.5342952609062195, "learning_rate": 2.7592302888565514e-06, "loss": 0.7662, "step": 16590 }, { "epoch": 4.664137155705452, "grad_norm": 0.5367206335067749, "learning_rate": 2.7363898332981696e-06, "loss": 0.7698, "step": 16595 }, { "epoch": 4.665542439572794, "grad_norm": 0.5240176916122437, "learning_rate": 2.7136429944099483e-06, "loss": 0.8022, "step": 16600 }, { "epoch": 4.666947723440135, "grad_norm": 0.5152767300605774, "learning_rate": 2.6909897940857966e-06, "loss": 0.7678, "step": 16605 }, { "epoch": 4.668353007307476, "grad_norm": 0.551674485206604, "learning_rate": 2.668430254129506e-06, "loss": 0.7693, "step": 16610 }, { "epoch": 4.669758291174817, "grad_norm": 0.5180559754371643, "learning_rate": 2.645964396254763e-06, "loss": 0.7608, "step": 16615 }, { "epoch": 4.671163575042159, "grad_norm": 0.5306651592254639, "learning_rate": 2.62359224208506e-06, "loss": 0.7579, "step": 16620 }, { "epoch": 4.6725688589095, "grad_norm": 0.5154826045036316, "learning_rate": 2.6013138131536717e-06, "loss": 0.7538, "step": 16625 }, { "epoch": 4.67397414277684, "grad_norm": 0.5123304724693298, "learning_rate": 2.579129130903701e-06, "loss": 0.7611, "step": 16630 }, { "epoch": 4.675379426644183, "grad_norm": 0.5388631224632263, "learning_rate": 2.5570382166880126e-06, "loss": 0.7551, "step": 16635 }, { "epoch": 4.676784710511523, "grad_norm": 0.5057893991470337, "learning_rate": 2.535041091769219e-06, "loss": 0.7537, "step": 16640 }, { "epoch": 4.678189994378864, "grad_norm": 0.5687735080718994, "learning_rate": 2.5131377773196184e-06, "loss": 0.7613, "step": 16645 }, { "epoch": 4.679595278246206, "grad_norm": 0.5253791809082031, "learning_rate": 2.4913282944212914e-06, "loss": 0.8126, "step": 16650 }, { "epoch": 4.681000562113547, "grad_norm": 0.5506506562232971, "learning_rate": 2.4696126640659566e-06, "loss": 0.7567, "step": 16655 }, { "epoch": 4.682405845980888, "grad_norm": 0.5498470664024353, "learning_rate": 2.4479909071549954e-06, "loss": 0.7759, "step": 16660 }, { "epoch": 4.6838111298482294, "grad_norm": 0.6022109389305115, "learning_rate": 2.4264630444994498e-06, "loss": 0.8043, "step": 16665 }, { "epoch": 4.685216413715571, "grad_norm": 0.5312721729278564, "learning_rate": 2.4050290968199884e-06, "loss": 0.7698, "step": 16670 }, { "epoch": 4.686621697582912, "grad_norm": 0.6051803827285767, "learning_rate": 2.3836890847468873e-06, "loss": 0.7579, "step": 16675 }, { "epoch": 4.688026981450253, "grad_norm": 0.5202509164810181, "learning_rate": 2.362443028820005e-06, "loss": 0.763, "step": 16680 }, { "epoch": 4.689432265317594, "grad_norm": 0.5393158793449402, "learning_rate": 2.3412909494887613e-06, "loss": 0.769, "step": 16685 }, { "epoch": 4.690837549184935, "grad_norm": 0.5265124440193176, "learning_rate": 2.3202328671121376e-06, "loss": 0.7679, "step": 16690 }, { "epoch": 4.692242833052276, "grad_norm": 0.5325393080711365, "learning_rate": 2.2992688019586206e-06, "loss": 0.7597, "step": 16695 }, { "epoch": 4.693648116919618, "grad_norm": 0.5232254266738892, "learning_rate": 2.2783987742062475e-06, "loss": 0.7658, "step": 16700 }, { "epoch": 4.695053400786959, "grad_norm": 0.579431414604187, "learning_rate": 2.257622803942483e-06, "loss": 0.7673, "step": 16705 }, { "epoch": 4.6964586846543, "grad_norm": 0.5365939736366272, "learning_rate": 2.23694091116432e-06, "loss": 0.7516, "step": 16710 }, { "epoch": 4.6978639685216415, "grad_norm": 0.5156635046005249, "learning_rate": 2.216353115778158e-06, "loss": 0.7662, "step": 16715 }, { "epoch": 4.699269252388983, "grad_norm": 0.5380986928939819, "learning_rate": 2.195859437599845e-06, "loss": 0.7537, "step": 16720 }, { "epoch": 4.700674536256324, "grad_norm": 0.5444085001945496, "learning_rate": 2.175459896354659e-06, "loss": 0.7707, "step": 16725 }, { "epoch": 4.702079820123665, "grad_norm": 0.5282695293426514, "learning_rate": 2.1551545116772265e-06, "loss": 0.7597, "step": 16730 }, { "epoch": 4.703485103991007, "grad_norm": 0.5188681483268738, "learning_rate": 2.1349433031115807e-06, "loss": 0.7615, "step": 16735 }, { "epoch": 4.704890387858347, "grad_norm": 0.5118913650512695, "learning_rate": 2.114826290111116e-06, "loss": 0.766, "step": 16740 }, { "epoch": 4.706295671725688, "grad_norm": 0.5214533805847168, "learning_rate": 2.094803492038533e-06, "loss": 0.75, "step": 16745 }, { "epoch": 4.70770095559303, "grad_norm": 0.5205959677696228, "learning_rate": 2.074874928165871e-06, "loss": 0.7511, "step": 16750 }, { "epoch": 4.709106239460371, "grad_norm": 0.5414964556694031, "learning_rate": 2.055040617674464e-06, "loss": 0.7702, "step": 16755 }, { "epoch": 4.710511523327712, "grad_norm": 0.5163237452507019, "learning_rate": 2.035300579654931e-06, "loss": 0.7565, "step": 16760 }, { "epoch": 4.7119168071950535, "grad_norm": 0.5173978805541992, "learning_rate": 2.015654833107161e-06, "loss": 0.7656, "step": 16765 }, { "epoch": 4.713322091062395, "grad_norm": 0.5129281282424927, "learning_rate": 1.996103396940252e-06, "loss": 0.752, "step": 16770 }, { "epoch": 4.714727374929736, "grad_norm": 0.5729324221611023, "learning_rate": 1.97664628997255e-06, "loss": 0.7571, "step": 16775 }, { "epoch": 4.716132658797077, "grad_norm": 0.5637750625610352, "learning_rate": 1.957283530931631e-06, "loss": 0.7708, "step": 16780 }, { "epoch": 4.717537942664418, "grad_norm": 0.5214433073997498, "learning_rate": 1.938015138454219e-06, "loss": 0.7593, "step": 16785 }, { "epoch": 4.71894322653176, "grad_norm": 0.5149796605110168, "learning_rate": 1.9188411310862466e-06, "loss": 0.7578, "step": 16790 }, { "epoch": 4.7203485103991, "grad_norm": 0.5418787002563477, "learning_rate": 1.8997615272827617e-06, "loss": 0.7822, "step": 16795 }, { "epoch": 4.721753794266442, "grad_norm": 0.49819424748420715, "learning_rate": 1.8807763454079975e-06, "loss": 0.7682, "step": 16800 }, { "epoch": 4.723159078133783, "grad_norm": 0.5212000012397766, "learning_rate": 1.8618856037352584e-06, "loss": 0.769, "step": 16805 }, { "epoch": 4.724564362001124, "grad_norm": 0.5406200289726257, "learning_rate": 1.843089320446978e-06, "loss": 0.7569, "step": 16810 }, { "epoch": 4.7259696458684655, "grad_norm": 0.5158624649047852, "learning_rate": 1.8243875136346623e-06, "loss": 0.7518, "step": 16815 }, { "epoch": 4.727374929735807, "grad_norm": 0.532293975353241, "learning_rate": 1.80578020129889e-06, "loss": 0.7533, "step": 16820 }, { "epoch": 4.728780213603148, "grad_norm": 0.5103762745857239, "learning_rate": 1.7872674013492796e-06, "loss": 0.7643, "step": 16825 }, { "epoch": 4.730185497470489, "grad_norm": 0.5502645969390869, "learning_rate": 1.7688491316044776e-06, "loss": 0.7552, "step": 16830 }, { "epoch": 4.731590781337831, "grad_norm": 0.4943491816520691, "learning_rate": 1.7505254097921807e-06, "loss": 0.7571, "step": 16835 }, { "epoch": 4.732996065205171, "grad_norm": 0.5176577568054199, "learning_rate": 1.7322962535490262e-06, "loss": 0.7556, "step": 16840 }, { "epoch": 4.734401349072512, "grad_norm": 0.534931480884552, "learning_rate": 1.7141616804206784e-06, "loss": 0.7638, "step": 16845 }, { "epoch": 4.735806632939854, "grad_norm": 0.530157744884491, "learning_rate": 1.696121707861731e-06, "loss": 0.7522, "step": 16850 }, { "epoch": 4.737211916807195, "grad_norm": 0.49656274914741516, "learning_rate": 1.6781763532357498e-06, "loss": 0.7595, "step": 16855 }, { "epoch": 4.738617200674536, "grad_norm": 0.5944429636001587, "learning_rate": 1.6603256338152295e-06, "loss": 0.7526, "step": 16860 }, { "epoch": 4.7400224845418775, "grad_norm": 0.5462916493415833, "learning_rate": 1.642569566781549e-06, "loss": 0.7656, "step": 16865 }, { "epoch": 4.741427768409219, "grad_norm": 0.5489064455032349, "learning_rate": 1.6249081692250257e-06, "loss": 0.7569, "step": 16870 }, { "epoch": 4.74283305227656, "grad_norm": 0.5147420763969421, "learning_rate": 1.6073414581448288e-06, "loss": 0.7659, "step": 16875 }, { "epoch": 4.744238336143901, "grad_norm": 0.5433098673820496, "learning_rate": 1.5898694504489776e-06, "loss": 0.7681, "step": 16880 }, { "epoch": 4.745643620011243, "grad_norm": 0.5174381732940674, "learning_rate": 1.5724921629543977e-06, "loss": 0.7943, "step": 16885 }, { "epoch": 4.747048903878584, "grad_norm": 0.6421141624450684, "learning_rate": 1.5552096123867655e-06, "loss": 0.7744, "step": 16890 }, { "epoch": 4.748454187745924, "grad_norm": 0.5340020656585693, "learning_rate": 1.5380218153806526e-06, "loss": 0.7633, "step": 16895 }, { "epoch": 4.749859471613266, "grad_norm": 0.536148190498352, "learning_rate": 1.5209287884793588e-06, "loss": 0.758, "step": 16900 }, { "epoch": 4.751264755480607, "grad_norm": 0.49202027916908264, "learning_rate": 1.503930548135024e-06, "loss": 0.7628, "step": 16905 }, { "epoch": 4.752670039347948, "grad_norm": 0.5275793671607971, "learning_rate": 1.4870271107085388e-06, "loss": 0.7621, "step": 16910 }, { "epoch": 4.7540753232152895, "grad_norm": 0.5963732004165649, "learning_rate": 1.4702184924695107e-06, "loss": 0.7658, "step": 16915 }, { "epoch": 4.755480607082631, "grad_norm": 0.5299362540245056, "learning_rate": 1.4535047095963427e-06, "loss": 0.8079, "step": 16920 }, { "epoch": 4.756885890949972, "grad_norm": 0.5836244821548462, "learning_rate": 1.4368857781761003e-06, "loss": 0.7688, "step": 16925 }, { "epoch": 4.758291174817313, "grad_norm": 0.5270640254020691, "learning_rate": 1.420361714204599e-06, "loss": 0.7769, "step": 16930 }, { "epoch": 4.759696458684655, "grad_norm": 0.5261182188987732, "learning_rate": 1.4039325335863162e-06, "loss": 0.7707, "step": 16935 }, { "epoch": 4.761101742551995, "grad_norm": 0.5504453182220459, "learning_rate": 1.3875982521344145e-06, "loss": 0.7674, "step": 16940 }, { "epoch": 4.762507026419336, "grad_norm": 0.5098614692687988, "learning_rate": 1.3713588855707282e-06, "loss": 0.7963, "step": 16945 }, { "epoch": 4.763912310286678, "grad_norm": 0.5301799178123474, "learning_rate": 1.355214449525699e-06, "loss": 0.7508, "step": 16950 }, { "epoch": 4.765317594154019, "grad_norm": 0.5289101600646973, "learning_rate": 1.3391649595384303e-06, "loss": 0.7491, "step": 16955 }, { "epoch": 4.76672287802136, "grad_norm": 0.5116060376167297, "learning_rate": 1.323210431056643e-06, "loss": 0.7704, "step": 16960 }, { "epoch": 4.7681281618887015, "grad_norm": 0.5461031198501587, "learning_rate": 1.3073508794366306e-06, "loss": 0.7646, "step": 16965 }, { "epoch": 4.769533445756043, "grad_norm": 0.550803542137146, "learning_rate": 1.291586319943283e-06, "loss": 0.7597, "step": 16970 }, { "epoch": 4.770938729623384, "grad_norm": 0.5380566120147705, "learning_rate": 1.2759167677500738e-06, "loss": 0.8017, "step": 16975 }, { "epoch": 4.772344013490725, "grad_norm": 0.5564086437225342, "learning_rate": 1.260342237938994e-06, "loss": 0.7696, "step": 16980 }, { "epoch": 4.773749297358067, "grad_norm": 0.543734610080719, "learning_rate": 1.244862745500619e-06, "loss": 0.851, "step": 16985 }, { "epoch": 4.775154581225408, "grad_norm": 0.5428884625434875, "learning_rate": 1.2294783053340419e-06, "loss": 0.7553, "step": 16990 }, { "epoch": 4.776559865092748, "grad_norm": 0.5188602209091187, "learning_rate": 1.2141889322468293e-06, "loss": 0.7577, "step": 16995 }, { "epoch": 4.77796514896009, "grad_norm": 0.5654597282409668, "learning_rate": 1.1989946409550867e-06, "loss": 0.7672, "step": 17000 }, { "epoch": 4.779370432827431, "grad_norm": 0.5196244716644287, "learning_rate": 1.1838954460833828e-06, "loss": 0.7977, "step": 17005 }, { "epoch": 4.780775716694772, "grad_norm": 0.4975441098213196, "learning_rate": 1.16889136216477e-06, "loss": 0.7551, "step": 17010 }, { "epoch": 4.7821810005621135, "grad_norm": 0.553006649017334, "learning_rate": 1.1539824036407522e-06, "loss": 0.7525, "step": 17015 }, { "epoch": 4.783586284429455, "grad_norm": 0.5071210861206055, "learning_rate": 1.13916858486125e-06, "loss": 0.766, "step": 17020 }, { "epoch": 4.784991568296796, "grad_norm": 0.49615663290023804, "learning_rate": 1.1244499200846582e-06, "loss": 0.7962, "step": 17025 }, { "epoch": 4.786396852164137, "grad_norm": 0.5509187579154968, "learning_rate": 1.1098264234777446e-06, "loss": 0.7916, "step": 17030 }, { "epoch": 4.787802136031479, "grad_norm": 0.5368015766143799, "learning_rate": 1.0952981091156833e-06, "loss": 0.7638, "step": 17035 }, { "epoch": 4.78920741989882, "grad_norm": 0.5518297553062439, "learning_rate": 1.0808649909820557e-06, "loss": 0.7621, "step": 17040 }, { "epoch": 4.790612703766161, "grad_norm": 0.56369948387146, "learning_rate": 1.0665270829688046e-06, "loss": 0.7639, "step": 17045 }, { "epoch": 4.792017987633502, "grad_norm": 0.5085920095443726, "learning_rate": 1.0522843988762355e-06, "loss": 0.7602, "step": 17050 }, { "epoch": 4.793423271500843, "grad_norm": 0.5725511312484741, "learning_rate": 1.0381369524129714e-06, "loss": 0.7746, "step": 17055 }, { "epoch": 4.794828555368184, "grad_norm": 0.5052986145019531, "learning_rate": 1.0240847571960199e-06, "loss": 0.7486, "step": 17060 }, { "epoch": 4.7962338392355255, "grad_norm": 0.5754660964012146, "learning_rate": 1.0101278267506842e-06, "loss": 0.7621, "step": 17065 }, { "epoch": 4.797639123102867, "grad_norm": 0.5558141469955444, "learning_rate": 9.962661745105517e-07, "loss": 0.7641, "step": 17070 }, { "epoch": 4.799044406970208, "grad_norm": 0.6065548658370972, "learning_rate": 9.8249981381755e-07, "loss": 0.7843, "step": 17075 }, { "epoch": 4.800449690837549, "grad_norm": 0.5406213998794556, "learning_rate": 9.688287579218581e-07, "loss": 0.7666, "step": 17080 }, { "epoch": 4.801854974704891, "grad_norm": 0.5441781878471375, "learning_rate": 9.552530199819165e-07, "loss": 0.7665, "step": 17085 }, { "epoch": 4.803260258572232, "grad_norm": 0.5460792779922485, "learning_rate": 9.41772613064451e-07, "loss": 0.7691, "step": 17090 }, { "epoch": 4.804665542439572, "grad_norm": 0.5391635894775391, "learning_rate": 9.283875501444162e-07, "loss": 0.7603, "step": 17095 }, { "epoch": 4.806070826306914, "grad_norm": 0.5086688995361328, "learning_rate": 9.15097844104984e-07, "loss": 0.7548, "step": 17100 }, { "epoch": 4.807476110174255, "grad_norm": 0.5172122716903687, "learning_rate": 9.019035077375448e-07, "loss": 0.7606, "step": 17105 }, { "epoch": 4.808881394041596, "grad_norm": 0.5639137625694275, "learning_rate": 8.888045537417399e-07, "loss": 0.7713, "step": 17110 }, { "epoch": 4.8102866779089375, "grad_norm": 0.5356950163841248, "learning_rate": 8.75800994725362e-07, "loss": 0.7661, "step": 17115 }, { "epoch": 4.811691961776279, "grad_norm": 0.5866556167602539, "learning_rate": 8.628928432043881e-07, "loss": 0.7602, "step": 17120 }, { "epoch": 4.81309724564362, "grad_norm": 0.5141949653625488, "learning_rate": 8.5008011160298e-07, "loss": 0.7577, "step": 17125 }, { "epoch": 4.814502529510961, "grad_norm": 0.5439788699150085, "learning_rate": 8.373628122534505e-07, "loss": 0.7715, "step": 17130 }, { "epoch": 4.815907813378303, "grad_norm": 0.5305047035217285, "learning_rate": 8.247409573962527e-07, "loss": 0.7718, "step": 17135 }, { "epoch": 4.817313097245644, "grad_norm": 0.565777063369751, "learning_rate": 8.122145591799801e-07, "loss": 0.7725, "step": 17140 }, { "epoch": 4.818718381112985, "grad_norm": 0.5218404531478882, "learning_rate": 7.997836296613437e-07, "loss": 0.7636, "step": 17145 }, { "epoch": 4.820123664980326, "grad_norm": 0.552274763584137, "learning_rate": 7.874481808051836e-07, "loss": 0.7678, "step": 17150 }, { "epoch": 4.821528948847667, "grad_norm": 0.524766206741333, "learning_rate": 7.75208224484425e-07, "loss": 0.7554, "step": 17155 }, { "epoch": 4.822934232715008, "grad_norm": 0.5318744778633118, "learning_rate": 7.630637724800771e-07, "loss": 0.7619, "step": 17160 }, { "epoch": 4.8243395165823495, "grad_norm": 0.5492753982543945, "learning_rate": 7.51014836481223e-07, "loss": 0.8131, "step": 17165 }, { "epoch": 4.825744800449691, "grad_norm": 0.5176735520362854, "learning_rate": 7.390614280850306e-07, "loss": 0.7591, "step": 17170 }, { "epoch": 4.827150084317032, "grad_norm": 0.554851233959198, "learning_rate": 7.272035587967074e-07, "loss": 0.7583, "step": 17175 }, { "epoch": 4.828555368184373, "grad_norm": 0.6030601263046265, "learning_rate": 7.154412400294908e-07, "loss": 0.7663, "step": 17180 }, { "epoch": 4.829960652051715, "grad_norm": 0.5561255216598511, "learning_rate": 7.037744831047022e-07, "loss": 0.7515, "step": 17185 }, { "epoch": 4.831365935919056, "grad_norm": 0.5055898427963257, "learning_rate": 6.922032992516148e-07, "loss": 0.7699, "step": 17190 }, { "epoch": 4.832771219786397, "grad_norm": 0.5240102410316467, "learning_rate": 6.807276996075529e-07, "loss": 0.7639, "step": 17195 }, { "epoch": 4.8341765036537385, "grad_norm": 0.530353844165802, "learning_rate": 6.693476952178479e-07, "loss": 0.7516, "step": 17200 }, { "epoch": 4.835581787521079, "grad_norm": 0.521259605884552, "learning_rate": 6.580632970357937e-07, "loss": 0.7587, "step": 17205 }, { "epoch": 4.83698707138842, "grad_norm": 0.5423803329467773, "learning_rate": 6.468745159226796e-07, "loss": 0.7535, "step": 17210 }, { "epoch": 4.8383923552557615, "grad_norm": 0.5381777286529541, "learning_rate": 6.357813626477471e-07, "loss": 0.7516, "step": 17215 }, { "epoch": 4.839797639123103, "grad_norm": 0.5275468826293945, "learning_rate": 6.247838478882328e-07, "loss": 0.7552, "step": 17220 }, { "epoch": 4.841202922990444, "grad_norm": 0.49720627069473267, "learning_rate": 6.138819822292807e-07, "loss": 0.767, "step": 17225 }, { "epoch": 4.842608206857785, "grad_norm": 0.564487636089325, "learning_rate": 6.030757761639749e-07, "loss": 0.7639, "step": 17230 }, { "epoch": 4.844013490725127, "grad_norm": 0.5002022385597229, "learning_rate": 5.92365240093351e-07, "loss": 0.7498, "step": 17235 }, { "epoch": 4.845418774592468, "grad_norm": 0.5189710855484009, "learning_rate": 5.817503843263516e-07, "loss": 0.7557, "step": 17240 }, { "epoch": 4.846824058459809, "grad_norm": 0.5181299448013306, "learning_rate": 5.712312190798264e-07, "loss": 0.7734, "step": 17245 }, { "epoch": 4.84822934232715, "grad_norm": 0.5273287892341614, "learning_rate": 5.608077544784984e-07, "loss": 0.7593, "step": 17250 }, { "epoch": 4.849634626194491, "grad_norm": 0.5158643126487732, "learning_rate": 5.504800005550204e-07, "loss": 0.765, "step": 17255 }, { "epoch": 4.851039910061832, "grad_norm": 0.5436376929283142, "learning_rate": 5.402479672498961e-07, "loss": 0.7596, "step": 17260 }, { "epoch": 4.8524451939291735, "grad_norm": 0.5546701550483704, "learning_rate": 5.301116644115034e-07, "loss": 0.7546, "step": 17265 }, { "epoch": 4.853850477796515, "grad_norm": 0.5897644758224487, "learning_rate": 5.200711017960824e-07, "loss": 0.7785, "step": 17270 }, { "epoch": 4.855255761663856, "grad_norm": 0.5387979745864868, "learning_rate": 5.10126289067725e-07, "loss": 0.7668, "step": 17275 }, { "epoch": 4.856661045531197, "grad_norm": 0.5134298801422119, "learning_rate": 5.002772357983521e-07, "loss": 0.7686, "step": 17280 }, { "epoch": 4.858066329398539, "grad_norm": 0.5166084170341492, "learning_rate": 4.905239514677251e-07, "loss": 0.7672, "step": 17285 }, { "epoch": 4.85947161326588, "grad_norm": 0.5076904296875, "learning_rate": 4.808664454634238e-07, "loss": 0.7592, "step": 17290 }, { "epoch": 4.860876897133221, "grad_norm": 0.5120134353637695, "learning_rate": 4.7130472708084573e-07, "loss": 0.7596, "step": 17295 }, { "epoch": 4.8622821810005625, "grad_norm": 0.5964426398277283, "learning_rate": 4.618388055231848e-07, "loss": 0.7614, "step": 17300 }, { "epoch": 4.863687464867903, "grad_norm": 0.5426851511001587, "learning_rate": 4.5246868990143076e-07, "loss": 0.8021, "step": 17305 }, { "epoch": 4.865092748735244, "grad_norm": 0.5391839742660522, "learning_rate": 4.431943892343693e-07, "loss": 0.7574, "step": 17310 }, { "epoch": 4.8664980326025855, "grad_norm": 0.5277245044708252, "learning_rate": 4.340159124485488e-07, "loss": 0.836, "step": 17315 }, { "epoch": 4.867903316469927, "grad_norm": 0.5153777003288269, "learning_rate": 4.249332683783025e-07, "loss": 0.7579, "step": 17320 }, { "epoch": 4.869308600337268, "grad_norm": 0.5122922658920288, "learning_rate": 4.1594646576572636e-07, "loss": 0.7648, "step": 17325 }, { "epoch": 4.870713884204609, "grad_norm": 0.5131497979164124, "learning_rate": 4.070555132606346e-07, "loss": 0.7643, "step": 17330 }, { "epoch": 4.872119168071951, "grad_norm": 0.5253489017486572, "learning_rate": 3.982604194206263e-07, "loss": 0.7559, "step": 17335 }, { "epoch": 4.873524451939292, "grad_norm": 0.5008916258811951, "learning_rate": 3.8956119271101876e-07, "loss": 0.7966, "step": 17340 }, { "epoch": 4.874929735806633, "grad_norm": 0.5834038853645325, "learning_rate": 3.809578415048587e-07, "loss": 0.7749, "step": 17345 }, { "epoch": 4.876335019673974, "grad_norm": 0.5124272108078003, "learning_rate": 3.7245037408291104e-07, "loss": 0.7509, "step": 17350 }, { "epoch": 4.877740303541316, "grad_norm": 0.5225289463996887, "learning_rate": 3.6403879863363687e-07, "loss": 0.7534, "step": 17355 }, { "epoch": 4.879145587408656, "grad_norm": 0.5116289854049683, "learning_rate": 3.5572312325323766e-07, "loss": 0.7674, "step": 17360 }, { "epoch": 4.8805508712759975, "grad_norm": 0.5568757653236389, "learning_rate": 3.475033559455665e-07, "loss": 0.7473, "step": 17365 }, { "epoch": 4.881956155143339, "grad_norm": 0.5169786810874939, "learning_rate": 3.393795046222059e-07, "loss": 0.7732, "step": 17370 }, { "epoch": 4.88336143901068, "grad_norm": 0.5193935036659241, "learning_rate": 3.3135157710240116e-07, "loss": 0.7599, "step": 17375 }, { "epoch": 4.884766722878021, "grad_norm": 0.49602463841438293, "learning_rate": 3.2341958111306025e-07, "loss": 0.7547, "step": 17380 }, { "epoch": 4.886172006745363, "grad_norm": 0.518610954284668, "learning_rate": 3.15583524288765e-07, "loss": 0.7663, "step": 17385 }, { "epoch": 4.887577290612704, "grad_norm": 0.4968721568584442, "learning_rate": 3.07843414171749e-07, "loss": 0.7559, "step": 17390 }, { "epoch": 4.888982574480045, "grad_norm": 0.541677713394165, "learning_rate": 3.001992582119306e-07, "loss": 0.7627, "step": 17395 }, { "epoch": 4.8903878583473865, "grad_norm": 0.5368949770927429, "learning_rate": 2.926510637668134e-07, "loss": 0.7738, "step": 17400 }, { "epoch": 4.891793142214727, "grad_norm": 0.5182912945747375, "learning_rate": 2.851988381015858e-07, "loss": 0.7614, "step": 17405 }, { "epoch": 4.893198426082068, "grad_norm": 0.5060423612594604, "learning_rate": 2.7784258838905454e-07, "loss": 0.755, "step": 17410 }, { "epoch": 4.8946037099494095, "grad_norm": 0.4969828724861145, "learning_rate": 2.705823217096226e-07, "loss": 0.7437, "step": 17415 }, { "epoch": 4.896008993816751, "grad_norm": 0.5251031517982483, "learning_rate": 2.634180450513446e-07, "loss": 0.7609, "step": 17420 }, { "epoch": 4.897414277684092, "grad_norm": 0.5827402472496033, "learning_rate": 2.5634976530988233e-07, "loss": 0.7557, "step": 17425 }, { "epoch": 4.898819561551433, "grad_norm": 0.5410636067390442, "learning_rate": 2.493774892884604e-07, "loss": 0.7615, "step": 17430 }, { "epoch": 4.900224845418775, "grad_norm": 0.5140436887741089, "learning_rate": 2.4250122369794403e-07, "loss": 0.7606, "step": 17435 }, { "epoch": 4.901630129286116, "grad_norm": 0.5037211775779724, "learning_rate": 2.3572097515676127e-07, "loss": 0.7665, "step": 17440 }, { "epoch": 4.903035413153457, "grad_norm": 0.5306110978126526, "learning_rate": 2.290367501909363e-07, "loss": 0.7716, "step": 17445 }, { "epoch": 4.9044406970207985, "grad_norm": 0.5421374440193176, "learning_rate": 2.2244855523406716e-07, "loss": 0.7621, "step": 17450 }, { "epoch": 4.90584598088814, "grad_norm": 0.5292686223983765, "learning_rate": 2.1595639662732593e-07, "loss": 0.758, "step": 17455 }, { "epoch": 4.90725126475548, "grad_norm": 0.530255138874054, "learning_rate": 2.0956028061942523e-07, "loss": 0.7678, "step": 17460 }, { "epoch": 4.9086565486228215, "grad_norm": 0.4856848418712616, "learning_rate": 2.03260213366685e-07, "loss": 0.7534, "step": 17465 }, { "epoch": 4.910061832490163, "grad_norm": 0.5324820876121521, "learning_rate": 1.970562009329324e-07, "loss": 0.7562, "step": 17470 }, { "epoch": 4.911467116357504, "grad_norm": 0.5274339318275452, "learning_rate": 1.9094824928954646e-07, "loss": 0.7672, "step": 17475 }, { "epoch": 4.912872400224845, "grad_norm": 0.5117448568344116, "learning_rate": 1.849363643154911e-07, "loss": 0.7625, "step": 17480 }, { "epoch": 4.914277684092187, "grad_norm": 0.4951476454734802, "learning_rate": 1.7902055179720434e-07, "loss": 0.7583, "step": 17485 }, { "epoch": 4.915682967959528, "grad_norm": 0.6083093285560608, "learning_rate": 1.7320081742869808e-07, "loss": 0.7563, "step": 17490 }, { "epoch": 4.917088251826869, "grad_norm": 0.504711389541626, "learning_rate": 1.6747716681148052e-07, "loss": 0.7667, "step": 17495 }, { "epoch": 4.9184935356942105, "grad_norm": 0.5517157316207886, "learning_rate": 1.6184960545460037e-07, "loss": 0.7644, "step": 17500 }, { "epoch": 4.919898819561551, "grad_norm": 0.5390472412109375, "learning_rate": 1.5631813877461376e-07, "loss": 0.7691, "step": 17505 }, { "epoch": 4.921304103428893, "grad_norm": 0.5268747210502625, "learning_rate": 1.50882772095573e-07, "loss": 0.7622, "step": 17510 }, { "epoch": 4.9227093872962335, "grad_norm": 0.5161256790161133, "learning_rate": 1.455435106490488e-07, "loss": 0.7602, "step": 17515 }, { "epoch": 4.924114671163575, "grad_norm": 0.5164275169372559, "learning_rate": 1.4030035957410814e-07, "loss": 0.7576, "step": 17520 }, { "epoch": 4.925519955030916, "grad_norm": 0.5255956649780273, "learning_rate": 1.3515332391730306e-07, "loss": 0.7661, "step": 17525 }, { "epoch": 4.926925238898257, "grad_norm": 0.527873694896698, "learning_rate": 1.3010240863268187e-07, "loss": 0.7529, "step": 17530 }, { "epoch": 4.928330522765599, "grad_norm": 0.5253483653068542, "learning_rate": 1.2514761858177793e-07, "loss": 0.7727, "step": 17535 }, { "epoch": 4.92973580663294, "grad_norm": 0.5246861577033997, "learning_rate": 1.2028895853358756e-07, "loss": 0.763, "step": 17540 }, { "epoch": 4.931141090500281, "grad_norm": 0.5083796977996826, "learning_rate": 1.1552643316459222e-07, "loss": 0.7606, "step": 17545 }, { "epoch": 4.9325463743676226, "grad_norm": 0.49852460622787476, "learning_rate": 1.1086004705875841e-07, "loss": 0.755, "step": 17550 }, { "epoch": 4.933951658234964, "grad_norm": 0.5587095022201538, "learning_rate": 1.0628980470750449e-07, "loss": 0.7587, "step": 17555 }, { "epoch": 4.935356942102304, "grad_norm": 0.5445096492767334, "learning_rate": 1.0181571050968953e-07, "loss": 0.764, "step": 17560 }, { "epoch": 4.9367622259696455, "grad_norm": 0.5117522478103638, "learning_rate": 9.743776877166877e-08, "loss": 0.7614, "step": 17565 }, { "epoch": 4.938167509836987, "grad_norm": 0.5934692025184631, "learning_rate": 9.315598370722711e-08, "loss": 0.8096, "step": 17570 }, { "epoch": 4.939572793704328, "grad_norm": 0.5090463161468506, "learning_rate": 8.897035943760124e-08, "loss": 0.7632, "step": 17575 }, { "epoch": 4.940978077571669, "grad_norm": 0.5313629508018494, "learning_rate": 8.488089999146854e-08, "loss": 0.7703, "step": 17580 }, { "epoch": 4.942383361439011, "grad_norm": 0.5156370401382446, "learning_rate": 8.08876093049582e-08, "loss": 0.7559, "step": 17585 }, { "epoch": 4.943788645306352, "grad_norm": 0.505222737789154, "learning_rate": 7.699049122162904e-08, "loss": 0.7479, "step": 17590 }, { "epoch": 4.945193929173693, "grad_norm": 0.5333053469657898, "learning_rate": 7.318954949248059e-08, "loss": 0.764, "step": 17595 }, { "epoch": 4.946599213041035, "grad_norm": 0.5412734150886536, "learning_rate": 6.948478777591971e-08, "loss": 0.7544, "step": 17600 }, { "epoch": 4.948004496908376, "grad_norm": 0.5213012099266052, "learning_rate": 6.587620963781626e-08, "loss": 0.7571, "step": 17605 }, { "epoch": 4.949409780775717, "grad_norm": 0.514471173286438, "learning_rate": 6.236381855143636e-08, "loss": 0.7522, "step": 17610 }, { "epoch": 4.9508150646430575, "grad_norm": 0.5372034907341003, "learning_rate": 5.8947617897464614e-08, "loss": 0.7646, "step": 17615 }, { "epoch": 4.952220348510399, "grad_norm": 0.5589454174041748, "learning_rate": 5.562761096402636e-08, "loss": 0.7702, "step": 17620 }, { "epoch": 4.95362563237774, "grad_norm": 0.4952004551887512, "learning_rate": 5.2403800946621006e-08, "loss": 0.7715, "step": 17625 }, { "epoch": 4.955030916245081, "grad_norm": 0.5731898546218872, "learning_rate": 4.9276190948199795e-08, "loss": 0.7667, "step": 17630 }, { "epoch": 4.956436200112423, "grad_norm": 0.5195038318634033, "learning_rate": 4.624478397909915e-08, "loss": 0.7522, "step": 17635 }, { "epoch": 4.957841483979764, "grad_norm": 0.5436112284660339, "learning_rate": 4.330958295705179e-08, "loss": 0.7555, "step": 17640 }, { "epoch": 4.959246767847105, "grad_norm": 0.5222539901733398, "learning_rate": 4.047059070720893e-08, "loss": 0.7617, "step": 17645 }, { "epoch": 4.960652051714447, "grad_norm": 0.5810261964797974, "learning_rate": 3.772780996211811e-08, "loss": 0.7628, "step": 17650 }, { "epoch": 4.962057335581788, "grad_norm": 0.49842745065689087, "learning_rate": 3.508124336170093e-08, "loss": 0.7526, "step": 17655 }, { "epoch": 4.963462619449128, "grad_norm": 0.5396468043327332, "learning_rate": 3.253089345331972e-08, "loss": 0.7686, "step": 17660 }, { "epoch": 4.9648679033164695, "grad_norm": 0.5041926503181458, "learning_rate": 3.007676269166648e-08, "loss": 0.7646, "step": 17665 }, { "epoch": 4.966273187183811, "grad_norm": 0.5142960548400879, "learning_rate": 2.7718853438873926e-08, "loss": 0.7689, "step": 17670 }, { "epoch": 4.967678471051152, "grad_norm": 0.5062931180000305, "learning_rate": 2.5457167964426653e-08, "loss": 0.7526, "step": 17675 }, { "epoch": 4.969083754918493, "grad_norm": 0.5020378232002258, "learning_rate": 2.3291708445216664e-08, "loss": 0.7528, "step": 17680 }, { "epoch": 4.970489038785835, "grad_norm": 0.6097744703292847, "learning_rate": 2.1222476965510052e-08, "loss": 0.7603, "step": 17685 }, { "epoch": 4.971894322653176, "grad_norm": 0.5038420557975769, "learning_rate": 1.9249475516947002e-08, "loss": 0.7533, "step": 17690 }, { "epoch": 4.973299606520517, "grad_norm": 0.5745223760604858, "learning_rate": 1.7372705998552896e-08, "loss": 0.7661, "step": 17695 }, { "epoch": 4.974704890387859, "grad_norm": 0.5366801023483276, "learning_rate": 1.5592170216716105e-08, "loss": 0.7602, "step": 17700 }, { "epoch": 4.9761101742552, "grad_norm": 0.5210517048835754, "learning_rate": 1.3907869885232405e-08, "loss": 0.7607, "step": 17705 }, { "epoch": 4.977515458122541, "grad_norm": 0.5279877781867981, "learning_rate": 1.2319806625227248e-08, "loss": 0.8017, "step": 17710 }, { "epoch": 4.9789207419898815, "grad_norm": 0.517043948173523, "learning_rate": 1.0827981965233492e-08, "loss": 0.777, "step": 17715 }, { "epoch": 4.980326025857223, "grad_norm": 0.527386486530304, "learning_rate": 9.432397341124777e-09, "loss": 0.7605, "step": 17720 }, { "epoch": 4.981731309724564, "grad_norm": 0.5199275016784668, "learning_rate": 8.133054096171044e-09, "loss": 0.7482, "step": 17725 }, { "epoch": 4.983136593591905, "grad_norm": 0.5189145803451538, "learning_rate": 6.9299534809941224e-09, "loss": 0.7552, "step": 17730 }, { "epoch": 4.984541877459247, "grad_norm": 0.5272063612937927, "learning_rate": 5.82309665357883e-09, "loss": 0.75, "step": 17735 }, { "epoch": 4.985947161326588, "grad_norm": 0.5504851937294006, "learning_rate": 4.81248467928408e-09, "loss": 0.7652, "step": 17740 }, { "epoch": 4.987352445193929, "grad_norm": 0.5174840688705444, "learning_rate": 3.898118530820671e-09, "loss": 0.7568, "step": 17745 }, { "epoch": 4.988757729061271, "grad_norm": 0.5700994729995728, "learning_rate": 3.0799990882734995e-09, "loss": 0.7614, "step": 17750 }, { "epoch": 4.990163012928612, "grad_norm": 0.507964015007019, "learning_rate": 2.3581271391015512e-09, "loss": 0.7574, "step": 17755 }, { "epoch": 4.991568296795953, "grad_norm": 0.5576549172401428, "learning_rate": 1.7325033780934973e-09, "loss": 0.7551, "step": 17760 }, { "epoch": 4.992973580663294, "grad_norm": 0.5367130041122437, "learning_rate": 1.2031284074121019e-09, "loss": 0.7534, "step": 17765 }, { "epoch": 4.994378864530635, "grad_norm": 0.5104110836982727, "learning_rate": 7.70002736594222e-10, "loss": 0.7582, "step": 17770 }, { "epoch": 4.995784148397976, "grad_norm": 0.5402113795280457, "learning_rate": 4.3312678251750115e-10, "loss": 0.7562, "step": 17775 }, { "epoch": 4.997189432265317, "grad_norm": 0.5511764287948608, "learning_rate": 1.9250086943367607e-10, "loss": 0.7579, "step": 17780 }, { "epoch": 4.998594716132659, "grad_norm": 0.5197888612747192, "learning_rate": 4.8125228935269606e-11, "loss": 0.7629, "step": 17785 }, { "epoch": 5.0, "grad_norm": 0.5132002234458923, "learning_rate": 0.0, "loss": 0.7613, "step": 17790 }, { "epoch": 5.0, "eval_loss": 0.8528754711151123, "eval_runtime": 646.1588, "eval_samples_per_second": 6.96, "eval_steps_per_second": 0.58, "step": 17790 }, { "epoch": 5.0, "step": 17790, "total_flos": 2.1191156679948894e+19, "train_loss": 0.9175008542622121, "train_runtime": 172180.6812, "train_samples_per_second": 2.48, "train_steps_per_second": 0.103 } ], "logging_steps": 5, "max_steps": 17790, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.1191156679948894e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }