{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.023355122759651296, "eval_steps": 244340, "global_step": 21505, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0231654898558259e-06, "eval_loss": 2.5595579147338867, "eval_runtime": 1107.6262, "eval_samples_per_second": 18.009, "eval_steps_per_second": 18.009, "step": 1 }, { "epoch": 5.115827449279129e-06, "grad_norm": 46.644649505615234, "learning_rate": 2.5000000000000004e-07, "loss": 2.9488, "step": 5 }, { "epoch": 1.0231654898558258e-05, "grad_norm": 16.871562957763672, "learning_rate": 5.000000000000001e-07, "loss": 2.4425, "step": 10 }, { "epoch": 1.5347482347837386e-05, "grad_norm": 35.369422912597656, "learning_rate": 7.5e-07, "loss": 3.7072, "step": 15 }, { "epoch": 2.0463309797116515e-05, "grad_norm": 51.66082763671875, "learning_rate": 1.0000000000000002e-06, "loss": 3.785, "step": 20 }, { "epoch": 2.5579137246395645e-05, "grad_norm": 38.07048034667969, "learning_rate": 1.25e-06, "loss": 2.9775, "step": 25 }, { "epoch": 3.069496469567477e-05, "grad_norm": 47.24363327026367, "learning_rate": 1.5e-06, "loss": 3.1277, "step": 30 }, { "epoch": 3.5810792144953904e-05, "grad_norm": 19.41057014465332, "learning_rate": 1.75e-06, "loss": 2.5908, "step": 35 }, { "epoch": 4.092661959423303e-05, "grad_norm": 32.873905181884766, "learning_rate": 2.0000000000000003e-06, "loss": 2.8039, "step": 40 }, { "epoch": 4.604244704351216e-05, "grad_norm": 40.20779037475586, "learning_rate": 2.25e-06, "loss": 2.869, "step": 45 }, { "epoch": 5.115827449279129e-05, "grad_norm": 30.175168991088867, "learning_rate": 2.5e-06, "loss": 3.1713, "step": 50 }, { "epoch": 5.6274101942070417e-05, "grad_norm": 37.6412353515625, "learning_rate": 2.7500000000000004e-06, "loss": 2.0915, "step": 55 }, { "epoch": 6.138992939134954e-05, "grad_norm": 12.04383373260498, "learning_rate": 3e-06, "loss": 2.475, "step": 60 }, { "epoch": 6.650575684062868e-05, "grad_norm": 22.07406234741211, "learning_rate": 3.2500000000000002e-06, "loss": 2.6268, "step": 65 }, { "epoch": 7.162158428990781e-05, "grad_norm": 41.397945404052734, "learning_rate": 3.5e-06, "loss": 2.7724, "step": 70 }, { "epoch": 7.673741173918693e-05, "grad_norm": 40.13980484008789, "learning_rate": 3.7500000000000005e-06, "loss": 2.5739, "step": 75 }, { "epoch": 8.185323918846606e-05, "grad_norm": 142.86746215820312, "learning_rate": 4.000000000000001e-06, "loss": 2.9967, "step": 80 }, { "epoch": 8.69690666377452e-05, "grad_norm": 26.266443252563477, "learning_rate": 4.25e-06, "loss": 2.7889, "step": 85 }, { "epoch": 9.208489408702431e-05, "grad_norm": 21.810531616210938, "learning_rate": 4.5e-06, "loss": 2.3884, "step": 90 }, { "epoch": 9.720072153630345e-05, "grad_norm": 107.49293518066406, "learning_rate": 4.75e-06, "loss": 2.8609, "step": 95 }, { "epoch": 0.00010231654898558258, "grad_norm": 19.106281280517578, "learning_rate": 5e-06, "loss": 2.9142, "step": 100 }, { "epoch": 0.0001074323764348617, "grad_norm": 281.74713134765625, "learning_rate": 4.999999999677055e-06, "loss": 3.5651, "step": 105 }, { "epoch": 0.00011254820388414083, "grad_norm": 28.53888511657715, "learning_rate": 4.999999998708215e-06, "loss": 2.2284, "step": 110 }, { "epoch": 0.00011766403133341997, "grad_norm": 36.39362335205078, "learning_rate": 4.999999997093483e-06, "loss": 2.5929, "step": 115 }, { "epoch": 0.00012277985878269909, "grad_norm": 69.10511779785156, "learning_rate": 4.999999994832858e-06, "loss": 2.1816, "step": 120 }, { "epoch": 0.00012789568623197822, "grad_norm": 30.005699157714844, "learning_rate": 4.999999991926341e-06, "loss": 2.5823, "step": 125 }, { "epoch": 0.00013301151368125735, "grad_norm": 43.19683074951172, "learning_rate": 4.999999988373931e-06, "loss": 2.5083, "step": 130 }, { "epoch": 0.00013812734113053648, "grad_norm": 22.510648727416992, "learning_rate": 4.999999984175628e-06, "loss": 2.9616, "step": 135 }, { "epoch": 0.00014324316857981562, "grad_norm": 33.55472183227539, "learning_rate": 4.9999999793314326e-06, "loss": 3.0127, "step": 140 }, { "epoch": 0.00014835899602909472, "grad_norm": 40.5036735534668, "learning_rate": 4.9999999738413445e-06, "loss": 2.0424, "step": 145 }, { "epoch": 0.00015347482347837386, "grad_norm": 64.86343383789062, "learning_rate": 4.999999967705363e-06, "loss": 2.4291, "step": 150 }, { "epoch": 0.000158590650927653, "grad_norm": 40.0426025390625, "learning_rate": 4.99999996092349e-06, "loss": 2.3936, "step": 155 }, { "epoch": 0.00016370647837693212, "grad_norm": 44.559181213378906, "learning_rate": 4.999999953495723e-06, "loss": 1.9493, "step": 160 }, { "epoch": 0.00016882230582621126, "grad_norm": 53.382911682128906, "learning_rate": 4.9999999454220635e-06, "loss": 2.8357, "step": 165 }, { "epoch": 0.0001739381332754904, "grad_norm": 55.66870880126953, "learning_rate": 4.999999936702512e-06, "loss": 2.9805, "step": 170 }, { "epoch": 0.0001790539607247695, "grad_norm": 38.79623794555664, "learning_rate": 4.999999927337067e-06, "loss": 2.1999, "step": 175 }, { "epoch": 0.00018416978817404863, "grad_norm": 18.3375186920166, "learning_rate": 4.99999991732573e-06, "loss": 1.7293, "step": 180 }, { "epoch": 0.00018928561562332776, "grad_norm": 20.161148071289062, "learning_rate": 4.9999999066684994e-06, "loss": 2.408, "step": 185 }, { "epoch": 0.0001944014430726069, "grad_norm": 22.443607330322266, "learning_rate": 4.999999895365377e-06, "loss": 2.0933, "step": 190 }, { "epoch": 0.00019951727052188603, "grad_norm": 23.837753295898438, "learning_rate": 4.999999883416361e-06, "loss": 2.1861, "step": 195 }, { "epoch": 0.00020463309797116516, "grad_norm": 186.116455078125, "learning_rate": 4.999999870821453e-06, "loss": 2.5305, "step": 200 }, { "epoch": 0.00020974892542044427, "grad_norm": 20.640235900878906, "learning_rate": 4.9999998575806515e-06, "loss": 2.5576, "step": 205 }, { "epoch": 0.0002148647528697234, "grad_norm": 22.074827194213867, "learning_rate": 4.999999843693958e-06, "loss": 2.3475, "step": 210 }, { "epoch": 0.00021998058031900253, "grad_norm": 70.14079284667969, "learning_rate": 4.999999829161371e-06, "loss": 2.5909, "step": 215 }, { "epoch": 0.00022509640776828167, "grad_norm": 35.07932662963867, "learning_rate": 4.999999813982892e-06, "loss": 2.3572, "step": 220 }, { "epoch": 0.0002302122352175608, "grad_norm": 26.83456039428711, "learning_rate": 4.99999979815852e-06, "loss": 2.7933, "step": 225 }, { "epoch": 0.00023532806266683993, "grad_norm": 18.856271743774414, "learning_rate": 4.999999781688256e-06, "loss": 2.458, "step": 230 }, { "epoch": 0.00024044389011611904, "grad_norm": 27.045888900756836, "learning_rate": 4.999999764572099e-06, "loss": 1.8263, "step": 235 }, { "epoch": 0.00024555971756539817, "grad_norm": 60.80609893798828, "learning_rate": 4.999999746810049e-06, "loss": 2.2335, "step": 240 }, { "epoch": 0.00025067554501467733, "grad_norm": 14.654311180114746, "learning_rate": 4.999999728402107e-06, "loss": 2.2414, "step": 245 }, { "epoch": 0.00025579137246395644, "grad_norm": 18.02696418762207, "learning_rate": 4.999999709348271e-06, "loss": 2.006, "step": 250 }, { "epoch": 0.00026090719991323554, "grad_norm": 29.15259552001953, "learning_rate": 4.999999689648544e-06, "loss": 2.0507, "step": 255 }, { "epoch": 0.0002660230273625147, "grad_norm": 83.37858581542969, "learning_rate": 4.999999669302923e-06, "loss": 2.5173, "step": 260 }, { "epoch": 0.0002711388548117938, "grad_norm": 22.95817756652832, "learning_rate": 4.99999964831141e-06, "loss": 2.2709, "step": 265 }, { "epoch": 0.00027625468226107297, "grad_norm": 23.628463745117188, "learning_rate": 4.999999626674003e-06, "loss": 2.1772, "step": 270 }, { "epoch": 0.0002813705097103521, "grad_norm": 31.795122146606445, "learning_rate": 4.999999604390706e-06, "loss": 2.2625, "step": 275 }, { "epoch": 0.00028648633715963124, "grad_norm": 22.730119705200195, "learning_rate": 4.999999581461514e-06, "loss": 2.4128, "step": 280 }, { "epoch": 0.00029160216460891034, "grad_norm": 206.92648315429688, "learning_rate": 4.9999995578864305e-06, "loss": 2.3086, "step": 285 }, { "epoch": 0.00029671799205818945, "grad_norm": 24.268672943115234, "learning_rate": 4.999999533665454e-06, "loss": 1.9789, "step": 290 }, { "epoch": 0.0003018338195074686, "grad_norm": 45.00935745239258, "learning_rate": 4.999999508798585e-06, "loss": 1.8613, "step": 295 }, { "epoch": 0.0003069496469567477, "grad_norm": 19.23946762084961, "learning_rate": 4.999999483285823e-06, "loss": 2.1529, "step": 300 }, { "epoch": 0.0003120654744060269, "grad_norm": 40.294219970703125, "learning_rate": 4.999999457127169e-06, "loss": 1.7083, "step": 305 }, { "epoch": 0.000317181301855306, "grad_norm": 47.162139892578125, "learning_rate": 4.999999430322622e-06, "loss": 2.6769, "step": 310 }, { "epoch": 0.0003222971293045851, "grad_norm": 39.38190460205078, "learning_rate": 4.999999402872184e-06, "loss": 2.0037, "step": 315 }, { "epoch": 0.00032741295675386425, "grad_norm": 129.1549530029297, "learning_rate": 4.999999374775851e-06, "loss": 1.8901, "step": 320 }, { "epoch": 0.00033252878420314335, "grad_norm": 52.74744415283203, "learning_rate": 4.999999346033626e-06, "loss": 1.9771, "step": 325 }, { "epoch": 0.0003376446116524225, "grad_norm": 24.42874526977539, "learning_rate": 4.9999993166455095e-06, "loss": 1.6838, "step": 330 }, { "epoch": 0.0003427604391017016, "grad_norm": 53.49464797973633, "learning_rate": 4.999999286611499e-06, "loss": 2.1214, "step": 335 }, { "epoch": 0.0003478762665509808, "grad_norm": 21.949249267578125, "learning_rate": 4.999999255931597e-06, "loss": 2.3871, "step": 340 }, { "epoch": 0.0003529920940002599, "grad_norm": 24.99782943725586, "learning_rate": 4.999999224605802e-06, "loss": 2.2523, "step": 345 }, { "epoch": 0.000358107921449539, "grad_norm": 43.55204391479492, "learning_rate": 4.999999192634114e-06, "loss": 2.2636, "step": 350 }, { "epoch": 0.00036322374889881815, "grad_norm": 16.509601593017578, "learning_rate": 4.999999160016534e-06, "loss": 1.8609, "step": 355 }, { "epoch": 0.00036833957634809726, "grad_norm": 31.136999130249023, "learning_rate": 4.999999126753062e-06, "loss": 2.0149, "step": 360 }, { "epoch": 0.0003734554037973764, "grad_norm": 16.354148864746094, "learning_rate": 4.999999092843697e-06, "loss": 2.1959, "step": 365 }, { "epoch": 0.0003785712312466555, "grad_norm": 19.727338790893555, "learning_rate": 4.9999990582884395e-06, "loss": 2.8055, "step": 370 }, { "epoch": 0.0003836870586959347, "grad_norm": 45.016353607177734, "learning_rate": 4.99999902308729e-06, "loss": 1.8424, "step": 375 }, { "epoch": 0.0003888028861452138, "grad_norm": 26.046506881713867, "learning_rate": 4.999998987240247e-06, "loss": 1.9209, "step": 380 }, { "epoch": 0.0003939187135944929, "grad_norm": 55.724586486816406, "learning_rate": 4.999998950747312e-06, "loss": 2.1993, "step": 385 }, { "epoch": 0.00039903454104377206, "grad_norm": 30.233474731445312, "learning_rate": 4.999998913608485e-06, "loss": 2.7299, "step": 390 }, { "epoch": 0.00040415036849305116, "grad_norm": 30.5318546295166, "learning_rate": 4.999998875823765e-06, "loss": 1.7659, "step": 395 }, { "epoch": 0.0004092661959423303, "grad_norm": 54.03743362426758, "learning_rate": 4.999998837393152e-06, "loss": 1.5705, "step": 400 }, { "epoch": 0.00041438202339160943, "grad_norm": 78.10746765136719, "learning_rate": 4.999998798316648e-06, "loss": 2.7342, "step": 405 }, { "epoch": 0.00041949785084088853, "grad_norm": 32.36627197265625, "learning_rate": 4.99999875859425e-06, "loss": 1.7948, "step": 410 }, { "epoch": 0.0004246136782901677, "grad_norm": 20.181705474853516, "learning_rate": 4.999998718225961e-06, "loss": 2.3085, "step": 415 }, { "epoch": 0.0004297295057394468, "grad_norm": 38.90582275390625, "learning_rate": 4.999998677211778e-06, "loss": 2.025, "step": 420 }, { "epoch": 0.00043484533318872596, "grad_norm": 28.475059509277344, "learning_rate": 4.999998635551704e-06, "loss": 1.9979, "step": 425 }, { "epoch": 0.00043996116063800507, "grad_norm": 22.81328582763672, "learning_rate": 4.999998593245738e-06, "loss": 2.57, "step": 430 }, { "epoch": 0.0004450769880872842, "grad_norm": 33.096107482910156, "learning_rate": 4.999998550293878e-06, "loss": 1.7722, "step": 435 }, { "epoch": 0.00045019281553656333, "grad_norm": 29.399497985839844, "learning_rate": 4.999998506696126e-06, "loss": 2.3991, "step": 440 }, { "epoch": 0.00045530864298584244, "grad_norm": 59.01570510864258, "learning_rate": 4.999998462452483e-06, "loss": 1.9509, "step": 445 }, { "epoch": 0.0004604244704351216, "grad_norm": 25.994796752929688, "learning_rate": 4.999998417562946e-06, "loss": 2.1287, "step": 450 }, { "epoch": 0.0004655402978844007, "grad_norm": 30.943418502807617, "learning_rate": 4.9999983720275176e-06, "loss": 2.6081, "step": 455 }, { "epoch": 0.00047065612533367986, "grad_norm": 45.00712203979492, "learning_rate": 4.999998325846196e-06, "loss": 2.3558, "step": 460 }, { "epoch": 0.00047577195278295897, "grad_norm": 43.46238708496094, "learning_rate": 4.999998279018983e-06, "loss": 2.2729, "step": 465 }, { "epoch": 0.0004808877802322381, "grad_norm": 18.63754653930664, "learning_rate": 4.999998231545878e-06, "loss": 3.2118, "step": 470 }, { "epoch": 0.00048600360768151724, "grad_norm": 42.721580505371094, "learning_rate": 4.9999981834268795e-06, "loss": 2.1978, "step": 475 }, { "epoch": 0.0004911194351307963, "grad_norm": 16.98427391052246, "learning_rate": 4.99999813466199e-06, "loss": 2.1814, "step": 480 }, { "epoch": 0.0004962352625800754, "grad_norm": 53.57073211669922, "learning_rate": 4.999998085251207e-06, "loss": 2.4188, "step": 485 }, { "epoch": 0.0005013510900293547, "grad_norm": 29.25585174560547, "learning_rate": 4.999998035194532e-06, "loss": 1.888, "step": 490 }, { "epoch": 0.0005064669174786338, "grad_norm": 22.437776565551758, "learning_rate": 4.999997984491965e-06, "loss": 1.7224, "step": 495 }, { "epoch": 0.0005115827449279129, "grad_norm": 46.60063552856445, "learning_rate": 4.9999979331435064e-06, "loss": 2.3471, "step": 500 }, { "epoch": 0.000516698572377192, "grad_norm": 40.54884338378906, "learning_rate": 4.999997881149155e-06, "loss": 1.6443, "step": 505 }, { "epoch": 0.0005218143998264711, "grad_norm": 38.45527267456055, "learning_rate": 4.9999978285089115e-06, "loss": 2.2639, "step": 510 }, { "epoch": 0.0005269302272757503, "grad_norm": 32.1842155456543, "learning_rate": 4.999997775222776e-06, "loss": 1.7848, "step": 515 }, { "epoch": 0.0005320460547250294, "grad_norm": 25.46072769165039, "learning_rate": 4.999997721290748e-06, "loss": 1.7813, "step": 520 }, { "epoch": 0.0005371618821743085, "grad_norm": 28.95370864868164, "learning_rate": 4.999997666712828e-06, "loss": 1.6053, "step": 525 }, { "epoch": 0.0005422777096235876, "grad_norm": 54.294010162353516, "learning_rate": 4.999997611489016e-06, "loss": 2.2499, "step": 530 }, { "epoch": 0.0005473935370728667, "grad_norm": 177.93283081054688, "learning_rate": 4.999997555619312e-06, "loss": 1.7878, "step": 535 }, { "epoch": 0.0005525093645221459, "grad_norm": 116.10316467285156, "learning_rate": 4.999997499103715e-06, "loss": 2.6684, "step": 540 }, { "epoch": 0.000557625191971425, "grad_norm": 66.39447784423828, "learning_rate": 4.999997441942227e-06, "loss": 3.1307, "step": 545 }, { "epoch": 0.0005627410194207042, "grad_norm": 201.83526611328125, "learning_rate": 4.999997384134846e-06, "loss": 2.4231, "step": 550 }, { "epoch": 0.0005678568468699833, "grad_norm": 38.58273696899414, "learning_rate": 4.9999973256815734e-06, "loss": 1.9285, "step": 555 }, { "epoch": 0.0005729726743192625, "grad_norm": 25.204904556274414, "learning_rate": 4.9999972665824084e-06, "loss": 2.2168, "step": 560 }, { "epoch": 0.0005780885017685416, "grad_norm": 40.735469818115234, "learning_rate": 4.999997206837351e-06, "loss": 2.5506, "step": 565 }, { "epoch": 0.0005832043292178207, "grad_norm": 22.397306442260742, "learning_rate": 4.9999971464464025e-06, "loss": 2.1926, "step": 570 }, { "epoch": 0.0005883201566670998, "grad_norm": 19.597490310668945, "learning_rate": 4.999997085409562e-06, "loss": 2.2038, "step": 575 }, { "epoch": 0.0005934359841163789, "grad_norm": 25.287269592285156, "learning_rate": 4.999997023726829e-06, "loss": 1.9473, "step": 580 }, { "epoch": 0.0005985518115656581, "grad_norm": 23.409454345703125, "learning_rate": 4.9999969613982045e-06, "loss": 2.0936, "step": 585 }, { "epoch": 0.0006036676390149372, "grad_norm": 23.643157958984375, "learning_rate": 4.999996898423688e-06, "loss": 1.889, "step": 590 }, { "epoch": 0.0006087834664642163, "grad_norm": 14.63953971862793, "learning_rate": 4.999996834803279e-06, "loss": 2.2501, "step": 595 }, { "epoch": 0.0006138992939134954, "grad_norm": 24.40838623046875, "learning_rate": 4.9999967705369786e-06, "loss": 1.9913, "step": 600 }, { "epoch": 0.0006190151213627745, "grad_norm": 43.94314193725586, "learning_rate": 4.9999967056247865e-06, "loss": 1.9051, "step": 605 }, { "epoch": 0.0006241309488120537, "grad_norm": 53.3669319152832, "learning_rate": 4.999996640066702e-06, "loss": 1.8114, "step": 610 }, { "epoch": 0.0006292467762613329, "grad_norm": 17.35694122314453, "learning_rate": 4.999996573862726e-06, "loss": 1.9889, "step": 615 }, { "epoch": 0.000634362603710612, "grad_norm": 30.487985610961914, "learning_rate": 4.999996507012858e-06, "loss": 2.0996, "step": 620 }, { "epoch": 0.0006394784311598911, "grad_norm": 35.32040023803711, "learning_rate": 4.999996439517097e-06, "loss": 2.5399, "step": 625 }, { "epoch": 0.0006445942586091702, "grad_norm": 70.43932342529297, "learning_rate": 4.999996371375446e-06, "loss": 2.2487, "step": 630 }, { "epoch": 0.0006497100860584494, "grad_norm": 65.34407043457031, "learning_rate": 4.999996302587903e-06, "loss": 1.8758, "step": 635 }, { "epoch": 0.0006548259135077285, "grad_norm": 35.37931442260742, "learning_rate": 4.999996233154467e-06, "loss": 2.1782, "step": 640 }, { "epoch": 0.0006599417409570076, "grad_norm": 36.86280822753906, "learning_rate": 4.99999616307514e-06, "loss": 2.5149, "step": 645 }, { "epoch": 0.0006650575684062867, "grad_norm": 26.100961685180664, "learning_rate": 4.9999960923499205e-06, "loss": 2.3968, "step": 650 }, { "epoch": 0.0006701733958555659, "grad_norm": 39.144691467285156, "learning_rate": 4.99999602097881e-06, "loss": 2.3198, "step": 655 }, { "epoch": 0.000675289223304845, "grad_norm": 17.698461532592773, "learning_rate": 4.999995948961808e-06, "loss": 1.9116, "step": 660 }, { "epoch": 0.0006804050507541241, "grad_norm": 27.16783905029297, "learning_rate": 4.999995876298914e-06, "loss": 2.3217, "step": 665 }, { "epoch": 0.0006855208782034032, "grad_norm": 18.525087356567383, "learning_rate": 4.999995802990128e-06, "loss": 2.1678, "step": 670 }, { "epoch": 0.0006906367056526823, "grad_norm": 38.191551208496094, "learning_rate": 4.9999957290354505e-06, "loss": 2.2345, "step": 675 }, { "epoch": 0.0006957525331019616, "grad_norm": 19.29829978942871, "learning_rate": 4.999995654434881e-06, "loss": 1.6135, "step": 680 }, { "epoch": 0.0007008683605512407, "grad_norm": 39.211708068847656, "learning_rate": 4.999995579188421e-06, "loss": 2.3727, "step": 685 }, { "epoch": 0.0007059841880005198, "grad_norm": 45.596885681152344, "learning_rate": 4.999995503296069e-06, "loss": 2.2015, "step": 690 }, { "epoch": 0.0007111000154497989, "grad_norm": 44.08140563964844, "learning_rate": 4.999995426757826e-06, "loss": 2.2414, "step": 695 }, { "epoch": 0.000716215842899078, "grad_norm": 68.40155792236328, "learning_rate": 4.99999534957369e-06, "loss": 1.7362, "step": 700 }, { "epoch": 0.0007213316703483572, "grad_norm": 22.28540802001953, "learning_rate": 4.999995271743663e-06, "loss": 2.6493, "step": 705 }, { "epoch": 0.0007264474977976363, "grad_norm": 30.89618682861328, "learning_rate": 4.999995193267744e-06, "loss": 1.8376, "step": 710 }, { "epoch": 0.0007315633252469154, "grad_norm": 36.84446334838867, "learning_rate": 4.999995114145935e-06, "loss": 1.9947, "step": 715 }, { "epoch": 0.0007366791526961945, "grad_norm": 18.87799644470215, "learning_rate": 4.999995034378233e-06, "loss": 1.8442, "step": 720 }, { "epoch": 0.0007417949801454736, "grad_norm": 25.173625946044922, "learning_rate": 4.9999949539646394e-06, "loss": 1.7687, "step": 725 }, { "epoch": 0.0007469108075947528, "grad_norm": 33.23951721191406, "learning_rate": 4.999994872905156e-06, "loss": 1.5321, "step": 730 }, { "epoch": 0.0007520266350440319, "grad_norm": 37.704280853271484, "learning_rate": 4.99999479119978e-06, "loss": 1.616, "step": 735 }, { "epoch": 0.000757142462493311, "grad_norm": 52.02144241333008, "learning_rate": 4.999994708848513e-06, "loss": 2.4416, "step": 740 }, { "epoch": 0.0007622582899425902, "grad_norm": 23.63690757751465, "learning_rate": 4.999994625851354e-06, "loss": 1.9651, "step": 745 }, { "epoch": 0.0007673741173918694, "grad_norm": 70.5105209350586, "learning_rate": 4.999994542208304e-06, "loss": 2.7555, "step": 750 }, { "epoch": 0.0007724899448411485, "grad_norm": 27.378671646118164, "learning_rate": 4.999994457919363e-06, "loss": 1.9247, "step": 755 }, { "epoch": 0.0007776057722904276, "grad_norm": 24.71000099182129, "learning_rate": 4.999994372984531e-06, "loss": 1.4909, "step": 760 }, { "epoch": 0.0007827215997397067, "grad_norm": 18.179676055908203, "learning_rate": 4.999994287403807e-06, "loss": 2.0899, "step": 765 }, { "epoch": 0.0007878374271889858, "grad_norm": 25.926177978515625, "learning_rate": 4.999994201177192e-06, "loss": 2.039, "step": 770 }, { "epoch": 0.000792953254638265, "grad_norm": 78.46064758300781, "learning_rate": 4.999994114304686e-06, "loss": 3.3029, "step": 775 }, { "epoch": 0.0007980690820875441, "grad_norm": 25.967947006225586, "learning_rate": 4.999994026786288e-06, "loss": 2.1911, "step": 780 }, { "epoch": 0.0008031849095368232, "grad_norm": 34.92380905151367, "learning_rate": 4.999993938621999e-06, "loss": 1.9662, "step": 785 }, { "epoch": 0.0008083007369861023, "grad_norm": 40.42374801635742, "learning_rate": 4.99999384981182e-06, "loss": 1.9342, "step": 790 }, { "epoch": 0.0008134165644353814, "grad_norm": 45.6330451965332, "learning_rate": 4.999993760355748e-06, "loss": 2.0007, "step": 795 }, { "epoch": 0.0008185323918846606, "grad_norm": 24.03801918029785, "learning_rate": 4.999993670253786e-06, "loss": 2.1525, "step": 800 }, { "epoch": 0.0008236482193339397, "grad_norm": 35.476776123046875, "learning_rate": 4.999993579505933e-06, "loss": 1.6079, "step": 805 }, { "epoch": 0.0008287640467832189, "grad_norm": 40.44805908203125, "learning_rate": 4.999993488112188e-06, "loss": 2.2392, "step": 810 }, { "epoch": 0.000833879874232498, "grad_norm": 30.908016204833984, "learning_rate": 4.999993396072553e-06, "loss": 1.6448, "step": 815 }, { "epoch": 0.0008389957016817771, "grad_norm": 66.95894622802734, "learning_rate": 4.999993303387027e-06, "loss": 1.8044, "step": 820 }, { "epoch": 0.0008441115291310563, "grad_norm": 24.162912368774414, "learning_rate": 4.9999932100556095e-06, "loss": 2.1769, "step": 825 }, { "epoch": 0.0008492273565803354, "grad_norm": 23.905620574951172, "learning_rate": 4.9999931160783e-06, "loss": 1.7553, "step": 830 }, { "epoch": 0.0008543431840296145, "grad_norm": 28.471939086914062, "learning_rate": 4.999993021455101e-06, "loss": 3.063, "step": 835 }, { "epoch": 0.0008594590114788936, "grad_norm": 31.271305084228516, "learning_rate": 4.9999929261860105e-06, "loss": 2.1651, "step": 840 }, { "epoch": 0.0008645748389281727, "grad_norm": 118.99810028076172, "learning_rate": 4.999992830271029e-06, "loss": 1.9233, "step": 845 }, { "epoch": 0.0008696906663774519, "grad_norm": 18.52125358581543, "learning_rate": 4.999992733710157e-06, "loss": 2.2058, "step": 850 }, { "epoch": 0.000874806493826731, "grad_norm": 21.95071792602539, "learning_rate": 4.999992636503394e-06, "loss": 2.1759, "step": 855 }, { "epoch": 0.0008799223212760101, "grad_norm": 18.205928802490234, "learning_rate": 4.99999253865074e-06, "loss": 1.656, "step": 860 }, { "epoch": 0.0008850381487252892, "grad_norm": 20.836021423339844, "learning_rate": 4.999992440152196e-06, "loss": 2.5582, "step": 865 }, { "epoch": 0.0008901539761745685, "grad_norm": 13.655160903930664, "learning_rate": 4.99999234100776e-06, "loss": 2.8217, "step": 870 }, { "epoch": 0.0008952698036238476, "grad_norm": 15.36685848236084, "learning_rate": 4.999992241217433e-06, "loss": 2.0741, "step": 875 }, { "epoch": 0.0009003856310731267, "grad_norm": 18.998004913330078, "learning_rate": 4.999992140781217e-06, "loss": 2.3116, "step": 880 }, { "epoch": 0.0009055014585224058, "grad_norm": 18.93351173400879, "learning_rate": 4.999992039699109e-06, "loss": 2.232, "step": 885 }, { "epoch": 0.0009106172859716849, "grad_norm": 27.652629852294922, "learning_rate": 4.99999193797111e-06, "loss": 1.742, "step": 890 }, { "epoch": 0.0009157331134209641, "grad_norm": 27.58511734008789, "learning_rate": 4.999991835597221e-06, "loss": 2.2659, "step": 895 }, { "epoch": 0.0009208489408702432, "grad_norm": 42.58071517944336, "learning_rate": 4.9999917325774415e-06, "loss": 1.8706, "step": 900 }, { "epoch": 0.0009259647683195223, "grad_norm": 47.90303421020508, "learning_rate": 4.999991628911771e-06, "loss": 1.9292, "step": 905 }, { "epoch": 0.0009310805957688014, "grad_norm": 101.65077209472656, "learning_rate": 4.999991524600211e-06, "loss": 2.0334, "step": 910 }, { "epoch": 0.0009361964232180805, "grad_norm": 52.405216217041016, "learning_rate": 4.999991419642759e-06, "loss": 2.7342, "step": 915 }, { "epoch": 0.0009413122506673597, "grad_norm": 37.10912322998047, "learning_rate": 4.999991314039417e-06, "loss": 2.3093, "step": 920 }, { "epoch": 0.0009464280781166388, "grad_norm": 32.47056198120117, "learning_rate": 4.999991207790185e-06, "loss": 2.0898, "step": 925 }, { "epoch": 0.0009515439055659179, "grad_norm": 27.199281692504883, "learning_rate": 4.999991100895061e-06, "loss": 1.7634, "step": 930 }, { "epoch": 0.000956659733015197, "grad_norm": 18.594104766845703, "learning_rate": 4.999990993354048e-06, "loss": 2.4316, "step": 935 }, { "epoch": 0.0009617755604644762, "grad_norm": 28.679597854614258, "learning_rate": 4.9999908851671444e-06, "loss": 1.7815, "step": 940 }, { "epoch": 0.0009668913879137554, "grad_norm": 29.466014862060547, "learning_rate": 4.99999077633435e-06, "loss": 1.8508, "step": 945 }, { "epoch": 0.0009720072153630345, "grad_norm": 39.20371627807617, "learning_rate": 4.999990666855665e-06, "loss": 1.9046, "step": 950 }, { "epoch": 0.0009771230428123137, "grad_norm": 25.89055824279785, "learning_rate": 4.99999055673109e-06, "loss": 1.7934, "step": 955 }, { "epoch": 0.0009822388702615927, "grad_norm": 20.001237869262695, "learning_rate": 4.9999904459606255e-06, "loss": 1.5959, "step": 960 }, { "epoch": 0.000987354697710872, "grad_norm": 54.910518646240234, "learning_rate": 4.99999033454427e-06, "loss": 1.4751, "step": 965 }, { "epoch": 0.000992470525160151, "grad_norm": 17.680381774902344, "learning_rate": 4.999990222482024e-06, "loss": 1.6201, "step": 970 }, { "epoch": 0.0009975863526094301, "grad_norm": 38.746551513671875, "learning_rate": 4.999990109773888e-06, "loss": 1.9122, "step": 975 }, { "epoch": 0.0010027021800587093, "grad_norm": 38.26031494140625, "learning_rate": 4.999989996419862e-06, "loss": 1.7098, "step": 980 }, { "epoch": 0.0010078180075079883, "grad_norm": 20.34618377685547, "learning_rate": 4.999989882419945e-06, "loss": 2.1585, "step": 985 }, { "epoch": 0.0010129338349572675, "grad_norm": 21.369844436645508, "learning_rate": 4.999989767774139e-06, "loss": 2.2292, "step": 990 }, { "epoch": 0.0010180496624065465, "grad_norm": 81.74927520751953, "learning_rate": 4.9999896524824434e-06, "loss": 1.8355, "step": 995 }, { "epoch": 0.0010231654898558257, "grad_norm": 155.59019470214844, "learning_rate": 4.999989536544856e-06, "loss": 2.6615, "step": 1000 }, { "epoch": 0.001028281317305105, "grad_norm": 13.453683853149414, "learning_rate": 4.999989419961379e-06, "loss": 2.263, "step": 1005 }, { "epoch": 0.001033397144754384, "grad_norm": 29.900745391845703, "learning_rate": 4.999989302732013e-06, "loss": 2.1102, "step": 1010 }, { "epoch": 0.0010385129722036632, "grad_norm": 27.433685302734375, "learning_rate": 4.999989184856756e-06, "loss": 2.5465, "step": 1015 }, { "epoch": 0.0010436287996529422, "grad_norm": 42.718605041503906, "learning_rate": 4.99998906633561e-06, "loss": 2.1608, "step": 1020 }, { "epoch": 0.0010487446271022214, "grad_norm": 64.98743438720703, "learning_rate": 4.999988947168573e-06, "loss": 2.43, "step": 1025 }, { "epoch": 0.0010538604545515006, "grad_norm": 236.92457580566406, "learning_rate": 4.999988827355647e-06, "loss": 1.7434, "step": 1030 }, { "epoch": 0.0010589762820007796, "grad_norm": 33.03250503540039, "learning_rate": 4.999988706896831e-06, "loss": 2.0513, "step": 1035 }, { "epoch": 0.0010640921094500588, "grad_norm": 24.42076873779297, "learning_rate": 4.999988585792125e-06, "loss": 2.2066, "step": 1040 }, { "epoch": 0.0010692079368993378, "grad_norm": 79.73822021484375, "learning_rate": 4.999988464041528e-06, "loss": 2.203, "step": 1045 }, { "epoch": 0.001074323764348617, "grad_norm": 28.720699310302734, "learning_rate": 4.999988341645043e-06, "loss": 1.783, "step": 1050 }, { "epoch": 0.0010794395917978962, "grad_norm": 32.0167236328125, "learning_rate": 4.999988218602668e-06, "loss": 1.5053, "step": 1055 }, { "epoch": 0.0010845554192471752, "grad_norm": 50.33378601074219, "learning_rate": 4.999988094914403e-06, "loss": 2.1462, "step": 1060 }, { "epoch": 0.0010896712466964545, "grad_norm": 30.236129760742188, "learning_rate": 4.999987970580248e-06, "loss": 1.7047, "step": 1065 }, { "epoch": 0.0010947870741457335, "grad_norm": 30.875646591186523, "learning_rate": 4.9999878456002036e-06, "loss": 2.0262, "step": 1070 }, { "epoch": 0.0010999029015950127, "grad_norm": 117.4733657836914, "learning_rate": 4.99998771997427e-06, "loss": 2.0843, "step": 1075 }, { "epoch": 0.0011050187290442919, "grad_norm": 44.58800506591797, "learning_rate": 4.999987593702446e-06, "loss": 2.2522, "step": 1080 }, { "epoch": 0.0011101345564935709, "grad_norm": 20.899839401245117, "learning_rate": 4.999987466784733e-06, "loss": 1.7453, "step": 1085 }, { "epoch": 0.00111525038394285, "grad_norm": 36.437904357910156, "learning_rate": 4.999987339221131e-06, "loss": 1.9287, "step": 1090 }, { "epoch": 0.0011203662113921293, "grad_norm": 14.252959251403809, "learning_rate": 4.999987211011639e-06, "loss": 2.0808, "step": 1095 }, { "epoch": 0.0011254820388414083, "grad_norm": 17.861539840698242, "learning_rate": 4.999987082156257e-06, "loss": 1.7971, "step": 1100 }, { "epoch": 0.0011305978662906875, "grad_norm": 27.253488540649414, "learning_rate": 4.999986952654986e-06, "loss": 2.0245, "step": 1105 }, { "epoch": 0.0011357136937399665, "grad_norm": 25.46709442138672, "learning_rate": 4.999986822507826e-06, "loss": 2.6171, "step": 1110 }, { "epoch": 0.0011408295211892457, "grad_norm": 14.362956047058105, "learning_rate": 4.999986691714777e-06, "loss": 1.6328, "step": 1115 }, { "epoch": 0.001145945348638525, "grad_norm": 48.109825134277344, "learning_rate": 4.999986560275837e-06, "loss": 1.7387, "step": 1120 }, { "epoch": 0.001151061176087804, "grad_norm": 50.56632995605469, "learning_rate": 4.999986428191009e-06, "loss": 1.3007, "step": 1125 }, { "epoch": 0.0011561770035370832, "grad_norm": 33.44164276123047, "learning_rate": 4.999986295460292e-06, "loss": 2.4227, "step": 1130 }, { "epoch": 0.0011612928309863622, "grad_norm": 34.408748626708984, "learning_rate": 4.999986162083685e-06, "loss": 1.9347, "step": 1135 }, { "epoch": 0.0011664086584356414, "grad_norm": 21.713510513305664, "learning_rate": 4.99998602806119e-06, "loss": 2.3275, "step": 1140 }, { "epoch": 0.0011715244858849206, "grad_norm": 27.82878875732422, "learning_rate": 4.999985893392805e-06, "loss": 1.9127, "step": 1145 }, { "epoch": 0.0011766403133341996, "grad_norm": 31.478178024291992, "learning_rate": 4.999985758078531e-06, "loss": 1.8268, "step": 1150 }, { "epoch": 0.0011817561407834788, "grad_norm": 59.99496078491211, "learning_rate": 4.999985622118367e-06, "loss": 1.5123, "step": 1155 }, { "epoch": 0.0011868719682327578, "grad_norm": 56.81740188598633, "learning_rate": 4.999985485512315e-06, "loss": 1.7548, "step": 1160 }, { "epoch": 0.001191987795682037, "grad_norm": 23.728538513183594, "learning_rate": 4.999985348260375e-06, "loss": 2.4943, "step": 1165 }, { "epoch": 0.0011971036231313162, "grad_norm": 22.17827033996582, "learning_rate": 4.999985210362544e-06, "loss": 1.6343, "step": 1170 }, { "epoch": 0.0012022194505805952, "grad_norm": 17.064685821533203, "learning_rate": 4.999985071818825e-06, "loss": 2.2886, "step": 1175 }, { "epoch": 0.0012073352780298744, "grad_norm": 26.61735725402832, "learning_rate": 4.999984932629217e-06, "loss": 2.1643, "step": 1180 }, { "epoch": 0.0012124511054791534, "grad_norm": 23.97092056274414, "learning_rate": 4.999984792793721e-06, "loss": 2.0086, "step": 1185 }, { "epoch": 0.0012175669329284326, "grad_norm": 63.36018371582031, "learning_rate": 4.999984652312335e-06, "loss": 1.5969, "step": 1190 }, { "epoch": 0.0012226827603777119, "grad_norm": 17.489696502685547, "learning_rate": 4.999984511185061e-06, "loss": 2.0869, "step": 1195 }, { "epoch": 0.0012277985878269909, "grad_norm": 38.14609909057617, "learning_rate": 4.999984369411898e-06, "loss": 2.4618, "step": 1200 }, { "epoch": 0.00123291441527627, "grad_norm": 54.160430908203125, "learning_rate": 4.999984226992846e-06, "loss": 2.9638, "step": 1205 }, { "epoch": 0.001238030242725549, "grad_norm": 27.26688575744629, "learning_rate": 4.9999840839279055e-06, "loss": 2.2091, "step": 1210 }, { "epoch": 0.0012431460701748283, "grad_norm": 42.3342170715332, "learning_rate": 4.999983940217077e-06, "loss": 1.6613, "step": 1215 }, { "epoch": 0.0012482618976241075, "grad_norm": 31.36366844177246, "learning_rate": 4.9999837958603595e-06, "loss": 2.203, "step": 1220 }, { "epoch": 0.0012533777250733865, "grad_norm": 17.302810668945312, "learning_rate": 4.999983650857752e-06, "loss": 1.9153, "step": 1225 }, { "epoch": 0.0012584935525226657, "grad_norm": 19.862049102783203, "learning_rate": 4.999983505209258e-06, "loss": 1.8668, "step": 1230 }, { "epoch": 0.0012636093799719447, "grad_norm": 36.38582229614258, "learning_rate": 4.999983358914875e-06, "loss": 1.955, "step": 1235 }, { "epoch": 0.001268725207421224, "grad_norm": 24.003875732421875, "learning_rate": 4.999983211974603e-06, "loss": 1.6131, "step": 1240 }, { "epoch": 0.0012738410348705031, "grad_norm": 18.009445190429688, "learning_rate": 4.9999830643884425e-06, "loss": 2.3903, "step": 1245 }, { "epoch": 0.0012789568623197821, "grad_norm": 23.786048889160156, "learning_rate": 4.999982916156394e-06, "loss": 2.3797, "step": 1250 }, { "epoch": 0.0012840726897690613, "grad_norm": 279.4425048828125, "learning_rate": 4.999982767278458e-06, "loss": 1.6912, "step": 1255 }, { "epoch": 0.0012891885172183403, "grad_norm": 16.72315788269043, "learning_rate": 4.999982617754633e-06, "loss": 2.3699, "step": 1260 }, { "epoch": 0.0012943043446676196, "grad_norm": 20.403419494628906, "learning_rate": 4.99998246758492e-06, "loss": 2.2374, "step": 1265 }, { "epoch": 0.0012994201721168988, "grad_norm": 20.27910614013672, "learning_rate": 4.999982316769319e-06, "loss": 1.3972, "step": 1270 }, { "epoch": 0.0013045359995661778, "grad_norm": 90.23499298095703, "learning_rate": 4.999982165307829e-06, "loss": 2.4057, "step": 1275 }, { "epoch": 0.001309651827015457, "grad_norm": 34.15992736816406, "learning_rate": 4.999982013200451e-06, "loss": 1.9107, "step": 1280 }, { "epoch": 0.001314767654464736, "grad_norm": 103.80292510986328, "learning_rate": 4.999981860447185e-06, "loss": 2.1699, "step": 1285 }, { "epoch": 0.0013198834819140152, "grad_norm": 39.3248405456543, "learning_rate": 4.99998170704803e-06, "loss": 1.4623, "step": 1290 }, { "epoch": 0.0013249993093632944, "grad_norm": 21.913314819335938, "learning_rate": 4.999981553002988e-06, "loss": 3.0457, "step": 1295 }, { "epoch": 0.0013301151368125734, "grad_norm": 33.98173141479492, "learning_rate": 4.999981398312059e-06, "loss": 2.2006, "step": 1300 }, { "epoch": 0.0013352309642618526, "grad_norm": 33.70772171020508, "learning_rate": 4.999981242975241e-06, "loss": 2.3175, "step": 1305 }, { "epoch": 0.0013403467917111318, "grad_norm": 34.00438690185547, "learning_rate": 4.999981086992534e-06, "loss": 2.2941, "step": 1310 }, { "epoch": 0.0013454626191604108, "grad_norm": 58.431251525878906, "learning_rate": 4.999980930363941e-06, "loss": 1.3731, "step": 1315 }, { "epoch": 0.00135057844660969, "grad_norm": 31.435461044311523, "learning_rate": 4.99998077308946e-06, "loss": 1.6548, "step": 1320 }, { "epoch": 0.001355694274058969, "grad_norm": 40.38758850097656, "learning_rate": 4.999980615169091e-06, "loss": 1.7875, "step": 1325 }, { "epoch": 0.0013608101015082483, "grad_norm": 79.17440032958984, "learning_rate": 4.999980456602834e-06, "loss": 1.9586, "step": 1330 }, { "epoch": 0.0013659259289575275, "grad_norm": 19.951000213623047, "learning_rate": 4.999980297390688e-06, "loss": 1.8241, "step": 1335 }, { "epoch": 0.0013710417564068065, "grad_norm": 58.75507354736328, "learning_rate": 4.9999801375326565e-06, "loss": 2.1218, "step": 1340 }, { "epoch": 0.0013761575838560857, "grad_norm": 26.642562866210938, "learning_rate": 4.9999799770287365e-06, "loss": 2.0383, "step": 1345 }, { "epoch": 0.0013812734113053647, "grad_norm": 150.71414184570312, "learning_rate": 4.999979815878929e-06, "loss": 1.9711, "step": 1350 }, { "epoch": 0.001386389238754644, "grad_norm": 104.74163055419922, "learning_rate": 4.999979654083234e-06, "loss": 1.3387, "step": 1355 }, { "epoch": 0.0013915050662039231, "grad_norm": 30.208457946777344, "learning_rate": 4.999979491641652e-06, "loss": 2.2862, "step": 1360 }, { "epoch": 0.0013966208936532021, "grad_norm": 32.1755485534668, "learning_rate": 4.9999793285541814e-06, "loss": 2.3396, "step": 1365 }, { "epoch": 0.0014017367211024813, "grad_norm": 31.48807716369629, "learning_rate": 4.999979164820824e-06, "loss": 1.6899, "step": 1370 }, { "epoch": 0.0014068525485517603, "grad_norm": 15.240654945373535, "learning_rate": 4.999979000441579e-06, "loss": 1.229, "step": 1375 }, { "epoch": 0.0014119683760010395, "grad_norm": 173.07948303222656, "learning_rate": 4.999978835416447e-06, "loss": 2.1081, "step": 1380 }, { "epoch": 0.0014170842034503188, "grad_norm": 52.25848388671875, "learning_rate": 4.999978669745428e-06, "loss": 2.6468, "step": 1385 }, { "epoch": 0.0014222000308995978, "grad_norm": 26.527263641357422, "learning_rate": 4.999978503428522e-06, "loss": 2.1753, "step": 1390 }, { "epoch": 0.001427315858348877, "grad_norm": 20.365922927856445, "learning_rate": 4.999978336465727e-06, "loss": 1.7606, "step": 1395 }, { "epoch": 0.001432431685798156, "grad_norm": 17.185253143310547, "learning_rate": 4.999978168857047e-06, "loss": 2.2366, "step": 1400 }, { "epoch": 0.0014375475132474352, "grad_norm": 20.769306182861328, "learning_rate": 4.999978000602479e-06, "loss": 1.4728, "step": 1405 }, { "epoch": 0.0014426633406967144, "grad_norm": 23.805482864379883, "learning_rate": 4.999977831702024e-06, "loss": 1.9749, "step": 1410 }, { "epoch": 0.0014477791681459934, "grad_norm": 37.22270202636719, "learning_rate": 4.999977662155681e-06, "loss": 2.2264, "step": 1415 }, { "epoch": 0.0014528949955952726, "grad_norm": 26.72702980041504, "learning_rate": 4.999977491963453e-06, "loss": 2.0013, "step": 1420 }, { "epoch": 0.0014580108230445516, "grad_norm": 18.771821975708008, "learning_rate": 4.999977321125337e-06, "loss": 2.2074, "step": 1425 }, { "epoch": 0.0014631266504938308, "grad_norm": 34.00010681152344, "learning_rate": 4.999977149641334e-06, "loss": 1.7383, "step": 1430 }, { "epoch": 0.00146824247794311, "grad_norm": 35.661766052246094, "learning_rate": 4.999976977511444e-06, "loss": 1.2272, "step": 1435 }, { "epoch": 0.001473358305392389, "grad_norm": 72.74012756347656, "learning_rate": 4.999976804735668e-06, "loss": 1.9379, "step": 1440 }, { "epoch": 0.0014784741328416682, "grad_norm": 59.94070816040039, "learning_rate": 4.9999766313140045e-06, "loss": 2.0252, "step": 1445 }, { "epoch": 0.0014835899602909472, "grad_norm": 22.396297454833984, "learning_rate": 4.9999764572464544e-06, "loss": 2.2045, "step": 1450 }, { "epoch": 0.0014887057877402265, "grad_norm": 17.074033737182617, "learning_rate": 4.999976282533018e-06, "loss": 2.4884, "step": 1455 }, { "epoch": 0.0014938216151895057, "grad_norm": 47.30825424194336, "learning_rate": 4.999976107173694e-06, "loss": 2.2779, "step": 1460 }, { "epoch": 0.0014989374426387847, "grad_norm": 31.25780487060547, "learning_rate": 4.999975931168485e-06, "loss": 1.594, "step": 1465 }, { "epoch": 0.0015040532700880639, "grad_norm": 23.88001823425293, "learning_rate": 4.999975754517388e-06, "loss": 2.1745, "step": 1470 }, { "epoch": 0.0015091690975373429, "grad_norm": 25.972780227661133, "learning_rate": 4.999975577220405e-06, "loss": 1.7675, "step": 1475 }, { "epoch": 0.001514284924986622, "grad_norm": 31.607057571411133, "learning_rate": 4.999975399277536e-06, "loss": 1.9848, "step": 1480 }, { "epoch": 0.0015194007524359013, "grad_norm": 21.51637077331543, "learning_rate": 4.999975220688781e-06, "loss": 1.5795, "step": 1485 }, { "epoch": 0.0015245165798851803, "grad_norm": 212.26988220214844, "learning_rate": 4.999975041454138e-06, "loss": 2.2516, "step": 1490 }, { "epoch": 0.0015296324073344595, "grad_norm": 31.53080940246582, "learning_rate": 4.9999748615736105e-06, "loss": 2.0679, "step": 1495 }, { "epoch": 0.0015347482347837387, "grad_norm": 16.598777770996094, "learning_rate": 4.999974681047196e-06, "loss": 2.0786, "step": 1500 }, { "epoch": 0.0015398640622330177, "grad_norm": 49.823116302490234, "learning_rate": 4.999974499874895e-06, "loss": 1.8152, "step": 1505 }, { "epoch": 0.001544979889682297, "grad_norm": 32.701297760009766, "learning_rate": 4.999974318056709e-06, "loss": 2.1063, "step": 1510 }, { "epoch": 0.001550095717131576, "grad_norm": 18.362302780151367, "learning_rate": 4.999974135592635e-06, "loss": 1.7928, "step": 1515 }, { "epoch": 0.0015552115445808552, "grad_norm": 51.08883285522461, "learning_rate": 4.999973952482677e-06, "loss": 1.8052, "step": 1520 }, { "epoch": 0.0015603273720301344, "grad_norm": 44.36396026611328, "learning_rate": 4.9999737687268315e-06, "loss": 1.9612, "step": 1525 }, { "epoch": 0.0015654431994794134, "grad_norm": 24.798240661621094, "learning_rate": 4.9999735843251e-06, "loss": 2.3925, "step": 1530 }, { "epoch": 0.0015705590269286926, "grad_norm": 16.088239669799805, "learning_rate": 4.999973399277483e-06, "loss": 2.0287, "step": 1535 }, { "epoch": 0.0015756748543779716, "grad_norm": 26.319747924804688, "learning_rate": 4.999973213583981e-06, "loss": 2.0194, "step": 1540 }, { "epoch": 0.0015807906818272508, "grad_norm": 28.428298950195312, "learning_rate": 4.999973027244592e-06, "loss": 2.1673, "step": 1545 }, { "epoch": 0.00158590650927653, "grad_norm": 36.989017486572266, "learning_rate": 4.999972840259318e-06, "loss": 1.9936, "step": 1550 }, { "epoch": 0.001591022336725809, "grad_norm": 46.43477249145508, "learning_rate": 4.999972652628157e-06, "loss": 2.0324, "step": 1555 }, { "epoch": 0.0015961381641750882, "grad_norm": 19.083168029785156, "learning_rate": 4.999972464351112e-06, "loss": 2.0597, "step": 1560 }, { "epoch": 0.0016012539916243672, "grad_norm": 23.465576171875, "learning_rate": 4.99997227542818e-06, "loss": 1.8023, "step": 1565 }, { "epoch": 0.0016063698190736464, "grad_norm": 42.2997932434082, "learning_rate": 4.999972085859362e-06, "loss": 2.8618, "step": 1570 }, { "epoch": 0.0016114856465229256, "grad_norm": 32.204776763916016, "learning_rate": 4.9999718956446605e-06, "loss": 2.1625, "step": 1575 }, { "epoch": 0.0016166014739722046, "grad_norm": 43.07786560058594, "learning_rate": 4.999971704784072e-06, "loss": 1.6436, "step": 1580 }, { "epoch": 0.0016217173014214839, "grad_norm": 46.78351593017578, "learning_rate": 4.999971513277599e-06, "loss": 1.7706, "step": 1585 }, { "epoch": 0.0016268331288707629, "grad_norm": 20.138309478759766, "learning_rate": 4.99997132112524e-06, "loss": 1.8192, "step": 1590 }, { "epoch": 0.001631948956320042, "grad_norm": 29.333467483520508, "learning_rate": 4.999971128326996e-06, "loss": 2.6058, "step": 1595 }, { "epoch": 0.0016370647837693213, "grad_norm": 24.558216094970703, "learning_rate": 4.999970934882866e-06, "loss": 1.7405, "step": 1600 }, { "epoch": 0.0016421806112186003, "grad_norm": 48.540252685546875, "learning_rate": 4.999970740792851e-06, "loss": 1.9959, "step": 1605 }, { "epoch": 0.0016472964386678795, "grad_norm": 20.65495491027832, "learning_rate": 4.999970546056951e-06, "loss": 1.9823, "step": 1610 }, { "epoch": 0.0016524122661171585, "grad_norm": 15.533699989318848, "learning_rate": 4.999970350675166e-06, "loss": 2.5006, "step": 1615 }, { "epoch": 0.0016575280935664377, "grad_norm": 25.015609741210938, "learning_rate": 4.999970154647497e-06, "loss": 1.9932, "step": 1620 }, { "epoch": 0.001662643921015717, "grad_norm": 39.55202102661133, "learning_rate": 4.999969957973941e-06, "loss": 1.7569, "step": 1625 }, { "epoch": 0.001667759748464996, "grad_norm": 20.360933303833008, "learning_rate": 4.9999697606545015e-06, "loss": 2.1801, "step": 1630 }, { "epoch": 0.0016728755759142751, "grad_norm": 18.367870330810547, "learning_rate": 4.999969562689176e-06, "loss": 1.8302, "step": 1635 }, { "epoch": 0.0016779914033635541, "grad_norm": 37.595890045166016, "learning_rate": 4.999969364077966e-06, "loss": 1.5695, "step": 1640 }, { "epoch": 0.0016831072308128333, "grad_norm": 131.7490692138672, "learning_rate": 4.9999691648208714e-06, "loss": 1.736, "step": 1645 }, { "epoch": 0.0016882230582621126, "grad_norm": 17.704280853271484, "learning_rate": 4.9999689649178915e-06, "loss": 2.2124, "step": 1650 }, { "epoch": 0.0016933388857113916, "grad_norm": 50.01279830932617, "learning_rate": 4.999968764369028e-06, "loss": 2.2965, "step": 1655 }, { "epoch": 0.0016984547131606708, "grad_norm": 43.914642333984375, "learning_rate": 4.999968563174279e-06, "loss": 1.8774, "step": 1660 }, { "epoch": 0.0017035705406099498, "grad_norm": 16.064172744750977, "learning_rate": 4.999968361333645e-06, "loss": 1.5956, "step": 1665 }, { "epoch": 0.001708686368059229, "grad_norm": 18.485206604003906, "learning_rate": 4.999968158847127e-06, "loss": 2.4731, "step": 1670 }, { "epoch": 0.0017138021955085082, "grad_norm": 28.482746124267578, "learning_rate": 4.999967955714724e-06, "loss": 2.0509, "step": 1675 }, { "epoch": 0.0017189180229577872, "grad_norm": 18.967763900756836, "learning_rate": 4.999967751936437e-06, "loss": 2.132, "step": 1680 }, { "epoch": 0.0017240338504070664, "grad_norm": 25.05319595336914, "learning_rate": 4.999967547512266e-06, "loss": 2.5094, "step": 1685 }, { "epoch": 0.0017291496778563454, "grad_norm": 23.554901123046875, "learning_rate": 4.99996734244221e-06, "loss": 1.9741, "step": 1690 }, { "epoch": 0.0017342655053056246, "grad_norm": 35.07354736328125, "learning_rate": 4.9999671367262695e-06, "loss": 1.833, "step": 1695 }, { "epoch": 0.0017393813327549038, "grad_norm": 23.565120697021484, "learning_rate": 4.999966930364445e-06, "loss": 1.7698, "step": 1700 }, { "epoch": 0.0017444971602041828, "grad_norm": 15.039921760559082, "learning_rate": 4.9999667233567365e-06, "loss": 1.8166, "step": 1705 }, { "epoch": 0.001749612987653462, "grad_norm": 42.18770217895508, "learning_rate": 4.999966515703144e-06, "loss": 1.9051, "step": 1710 }, { "epoch": 0.0017547288151027413, "grad_norm": 20.408710479736328, "learning_rate": 4.999966307403667e-06, "loss": 1.8237, "step": 1715 }, { "epoch": 0.0017598446425520203, "grad_norm": 73.59622955322266, "learning_rate": 4.999966098458306e-06, "loss": 1.9357, "step": 1720 }, { "epoch": 0.0017649604700012995, "grad_norm": 24.029922485351562, "learning_rate": 4.999965888867061e-06, "loss": 1.7104, "step": 1725 }, { "epoch": 0.0017700762974505785, "grad_norm": 31.252532958984375, "learning_rate": 4.999965678629932e-06, "loss": 1.6505, "step": 1730 }, { "epoch": 0.0017751921248998577, "grad_norm": 48.73993682861328, "learning_rate": 4.999965467746919e-06, "loss": 2.2735, "step": 1735 }, { "epoch": 0.001780307952349137, "grad_norm": 17.346960067749023, "learning_rate": 4.9999652562180235e-06, "loss": 2.4909, "step": 1740 }, { "epoch": 0.001785423779798416, "grad_norm": 15.188125610351562, "learning_rate": 4.999965044043242e-06, "loss": 2.0477, "step": 1745 }, { "epoch": 0.0017905396072476951, "grad_norm": 267.7658386230469, "learning_rate": 4.999964831222579e-06, "loss": 1.8873, "step": 1750 }, { "epoch": 0.0017956554346969741, "grad_norm": 36.49431610107422, "learning_rate": 4.999964617756031e-06, "loss": 1.7055, "step": 1755 }, { "epoch": 0.0018007712621462533, "grad_norm": 17.473997116088867, "learning_rate": 4.9999644036436e-06, "loss": 1.5696, "step": 1760 }, { "epoch": 0.0018058870895955325, "grad_norm": 21.609394073486328, "learning_rate": 4.999964188885285e-06, "loss": 2.3425, "step": 1765 }, { "epoch": 0.0018110029170448115, "grad_norm": 20.953079223632812, "learning_rate": 4.999963973481087e-06, "loss": 2.4154, "step": 1770 }, { "epoch": 0.0018161187444940908, "grad_norm": 77.73824310302734, "learning_rate": 4.999963757431005e-06, "loss": 3.0099, "step": 1775 }, { "epoch": 0.0018212345719433698, "grad_norm": 23.025440216064453, "learning_rate": 4.99996354073504e-06, "loss": 1.6436, "step": 1780 }, { "epoch": 0.001826350399392649, "grad_norm": 15.245038986206055, "learning_rate": 4.999963323393191e-06, "loss": 2.3979, "step": 1785 }, { "epoch": 0.0018314662268419282, "grad_norm": 52.0065803527832, "learning_rate": 4.99996310540546e-06, "loss": 2.071, "step": 1790 }, { "epoch": 0.0018365820542912072, "grad_norm": 24.059059143066406, "learning_rate": 4.999962886771845e-06, "loss": 1.679, "step": 1795 }, { "epoch": 0.0018416978817404864, "grad_norm": 38.645042419433594, "learning_rate": 4.999962667492346e-06, "loss": 2.0825, "step": 1800 }, { "epoch": 0.0018468137091897654, "grad_norm": 15.86104965209961, "learning_rate": 4.999962447566965e-06, "loss": 1.5154, "step": 1805 }, { "epoch": 0.0018519295366390446, "grad_norm": 26.28799057006836, "learning_rate": 4.999962226995701e-06, "loss": 2.2037, "step": 1810 }, { "epoch": 0.0018570453640883238, "grad_norm": 45.791568756103516, "learning_rate": 4.999962005778553e-06, "loss": 1.6403, "step": 1815 }, { "epoch": 0.0018621611915376028, "grad_norm": 69.75652313232422, "learning_rate": 4.9999617839155236e-06, "loss": 2.4092, "step": 1820 }, { "epoch": 0.001867277018986882, "grad_norm": 38.95976638793945, "learning_rate": 4.999961561406611e-06, "loss": 2.2526, "step": 1825 }, { "epoch": 0.001872392846436161, "grad_norm": 60.79167175292969, "learning_rate": 4.999961338251815e-06, "loss": 1.4801, "step": 1830 }, { "epoch": 0.0018775086738854402, "grad_norm": 47.835899353027344, "learning_rate": 4.999961114451136e-06, "loss": 2.616, "step": 1835 }, { "epoch": 0.0018826245013347195, "grad_norm": 48.682098388671875, "learning_rate": 4.999960890004575e-06, "loss": 1.8155, "step": 1840 }, { "epoch": 0.0018877403287839985, "grad_norm": 18.791667938232422, "learning_rate": 4.999960664912132e-06, "loss": 1.8214, "step": 1845 }, { "epoch": 0.0018928561562332777, "grad_norm": 20.816709518432617, "learning_rate": 4.999960439173805e-06, "loss": 2.3244, "step": 1850 }, { "epoch": 0.0018979719836825567, "grad_norm": 20.281068801879883, "learning_rate": 4.999960212789596e-06, "loss": 1.5049, "step": 1855 }, { "epoch": 0.0019030878111318359, "grad_norm": 39.08038330078125, "learning_rate": 4.999959985759504e-06, "loss": 1.8219, "step": 1860 }, { "epoch": 0.001908203638581115, "grad_norm": 43.33205032348633, "learning_rate": 4.9999597580835305e-06, "loss": 1.5518, "step": 1865 }, { "epoch": 0.001913319466030394, "grad_norm": 40.8986930847168, "learning_rate": 4.9999595297616745e-06, "loss": 1.9617, "step": 1870 }, { "epoch": 0.0019184352934796733, "grad_norm": 50.462974548339844, "learning_rate": 4.9999593007939356e-06, "loss": 1.9692, "step": 1875 }, { "epoch": 0.0019235511209289523, "grad_norm": 19.13349723815918, "learning_rate": 4.9999590711803145e-06, "loss": 2.2123, "step": 1880 }, { "epoch": 0.0019286669483782315, "grad_norm": 20.72867202758789, "learning_rate": 4.999958840920812e-06, "loss": 1.9616, "step": 1885 }, { "epoch": 0.0019337827758275107, "grad_norm": 16.234466552734375, "learning_rate": 4.999958610015427e-06, "loss": 1.5623, "step": 1890 }, { "epoch": 0.0019388986032767897, "grad_norm": 23.931238174438477, "learning_rate": 4.9999583784641605e-06, "loss": 1.445, "step": 1895 }, { "epoch": 0.001944014430726069, "grad_norm": 22.996126174926758, "learning_rate": 4.999958146267011e-06, "loss": 1.9943, "step": 1900 }, { "epoch": 0.0019491302581753482, "grad_norm": 18.7641658782959, "learning_rate": 4.9999579134239795e-06, "loss": 1.8699, "step": 1905 }, { "epoch": 0.0019542460856246274, "grad_norm": 47.30205154418945, "learning_rate": 4.999957679935067e-06, "loss": 2.1765, "step": 1910 }, { "epoch": 0.001959361913073906, "grad_norm": 88.53414154052734, "learning_rate": 4.999957445800272e-06, "loss": 2.7706, "step": 1915 }, { "epoch": 0.0019644777405231854, "grad_norm": 228.85215759277344, "learning_rate": 4.9999572110195956e-06, "loss": 2.0127, "step": 1920 }, { "epoch": 0.0019695935679724646, "grad_norm": 50.268619537353516, "learning_rate": 4.999956975593038e-06, "loss": 2.0922, "step": 1925 }, { "epoch": 0.001974709395421744, "grad_norm": 61.23733901977539, "learning_rate": 4.999956739520599e-06, "loss": 2.2421, "step": 1930 }, { "epoch": 0.001979825222871023, "grad_norm": 27.620866775512695, "learning_rate": 4.999956502802277e-06, "loss": 1.8341, "step": 1935 }, { "epoch": 0.001984941050320302, "grad_norm": 29.914430618286133, "learning_rate": 4.999956265438074e-06, "loss": 2.0809, "step": 1940 }, { "epoch": 0.001990056877769581, "grad_norm": 14.38338565826416, "learning_rate": 4.99995602742799e-06, "loss": 1.9832, "step": 1945 }, { "epoch": 0.0019951727052188602, "grad_norm": 20.756608963012695, "learning_rate": 4.999955788772024e-06, "loss": 1.6528, "step": 1950 }, { "epoch": 0.0020002885326681394, "grad_norm": 29.781492233276367, "learning_rate": 4.999955549470177e-06, "loss": 1.907, "step": 1955 }, { "epoch": 0.0021286231392195556, "grad_norm": 247.9962158203125, "learning_rate": 4.999949648322882e-06, "loss": 2.0925, "step": 1960 }, { "epoch": 0.0021340533002889932, "grad_norm": 18.611543655395508, "learning_rate": 4.999949377252e-06, "loss": 1.9264, "step": 1965 }, { "epoch": 0.002139483461358431, "grad_norm": 98.91653442382812, "learning_rate": 4.999949105453423e-06, "loss": 1.7241, "step": 1970 }, { "epoch": 0.0021449136224278684, "grad_norm": 37.125675201416016, "learning_rate": 4.9999488329271465e-06, "loss": 2.0002, "step": 1975 }, { "epoch": 0.002150343783497306, "grad_norm": 27.03342628479004, "learning_rate": 4.999948559673173e-06, "loss": 2.5615, "step": 1980 }, { "epoch": 0.0021557739445667435, "grad_norm": 37.63926696777344, "learning_rate": 4.999948285691503e-06, "loss": 1.6305, "step": 1985 }, { "epoch": 0.0021612041056361816, "grad_norm": 42.60621643066406, "learning_rate": 4.999948010982136e-06, "loss": 1.9953, "step": 1990 }, { "epoch": 0.002166634266705619, "grad_norm": 21.01263427734375, "learning_rate": 4.9999477355450736e-06, "loss": 1.6886, "step": 1995 }, { "epoch": 0.0021720644277750567, "grad_norm": 48.50627136230469, "learning_rate": 4.999947459380312e-06, "loss": 1.939, "step": 2000 }, { "epoch": 0.0021774945888444943, "grad_norm": 47.228031158447266, "learning_rate": 4.999947182487855e-06, "loss": 1.6352, "step": 2005 }, { "epoch": 0.002182924749913932, "grad_norm": 21.68488121032715, "learning_rate": 4.9999469048677015e-06, "loss": 2.3359, "step": 2010 }, { "epoch": 0.0021883549109833695, "grad_norm": 50.57142639160156, "learning_rate": 4.999946626519852e-06, "loss": 1.8175, "step": 2015 }, { "epoch": 0.002193785072052807, "grad_norm": 24.081806182861328, "learning_rate": 4.999946347444306e-06, "loss": 1.7698, "step": 2020 }, { "epoch": 0.002199215233122245, "grad_norm": 30.824588775634766, "learning_rate": 4.999946067641063e-06, "loss": 1.8499, "step": 2025 }, { "epoch": 0.0022046453941916826, "grad_norm": 27.078004837036133, "learning_rate": 4.999945787110123e-06, "loss": 2.2203, "step": 2030 }, { "epoch": 0.00221007555526112, "grad_norm": 21.74220848083496, "learning_rate": 4.999945505851488e-06, "loss": 1.5938, "step": 2035 }, { "epoch": 0.002215505716330558, "grad_norm": 26.612152099609375, "learning_rate": 4.9999452238651565e-06, "loss": 1.5861, "step": 2040 }, { "epoch": 0.0022209358773999954, "grad_norm": 55.4253044128418, "learning_rate": 4.999944941151129e-06, "loss": 2.0007, "step": 2045 }, { "epoch": 0.002226366038469433, "grad_norm": 38.383827209472656, "learning_rate": 4.999944657709406e-06, "loss": 1.883, "step": 2050 }, { "epoch": 0.0022317961995388705, "grad_norm": 20.732208251953125, "learning_rate": 4.999944373539987e-06, "loss": 1.9547, "step": 2055 }, { "epoch": 0.0022372263606083086, "grad_norm": 19.54503631591797, "learning_rate": 4.999944088642872e-06, "loss": 1.7846, "step": 2060 }, { "epoch": 0.002242656521677746, "grad_norm": 20.220787048339844, "learning_rate": 4.999943803018061e-06, "loss": 2.0165, "step": 2065 }, { "epoch": 0.0022480866827471837, "grad_norm": 30.60348129272461, "learning_rate": 4.9999435166655545e-06, "loss": 1.3374, "step": 2070 }, { "epoch": 0.0022535168438166213, "grad_norm": 30.167631149291992, "learning_rate": 4.9999432295853525e-06, "loss": 1.8326, "step": 2075 }, { "epoch": 0.002258947004886059, "grad_norm": 19.988264083862305, "learning_rate": 4.999942941777455e-06, "loss": 1.8829, "step": 2080 }, { "epoch": 0.0022643771659554965, "grad_norm": 35.52172088623047, "learning_rate": 4.9999426532418624e-06, "loss": 1.7497, "step": 2085 }, { "epoch": 0.002269807327024934, "grad_norm": 24.402690887451172, "learning_rate": 4.999942363978575e-06, "loss": 1.951, "step": 2090 }, { "epoch": 0.002275237488094372, "grad_norm": 25.478397369384766, "learning_rate": 4.999942073987591e-06, "loss": 1.502, "step": 2095 }, { "epoch": 0.0022806676491638096, "grad_norm": 17.37947654724121, "learning_rate": 4.999941783268913e-06, "loss": 1.4524, "step": 2100 }, { "epoch": 0.002286097810233247, "grad_norm": 27.73431396484375, "learning_rate": 4.99994149182254e-06, "loss": 1.982, "step": 2105 }, { "epoch": 0.002291527971302685, "grad_norm": 28.483156204223633, "learning_rate": 4.9999411996484726e-06, "loss": 1.6876, "step": 2110 }, { "epoch": 0.0022969581323721224, "grad_norm": 13.800949096679688, "learning_rate": 4.999940906746709e-06, "loss": 1.8243, "step": 2115 }, { "epoch": 0.00230238829344156, "grad_norm": 21.70273780822754, "learning_rate": 4.999940613117251e-06, "loss": 1.5756, "step": 2120 }, { "epoch": 0.0023078184545109975, "grad_norm": 50.0323486328125, "learning_rate": 4.9999403187600995e-06, "loss": 1.7658, "step": 2125 }, { "epoch": 0.0023132486155804355, "grad_norm": 26.966022491455078, "learning_rate": 4.999940023675252e-06, "loss": 1.961, "step": 2130 }, { "epoch": 0.002318678776649873, "grad_norm": 51.64280700683594, "learning_rate": 4.999939727862711e-06, "loss": 2.3838, "step": 2135 }, { "epoch": 0.0023241089377193107, "grad_norm": 23.99488067626953, "learning_rate": 4.9999394313224745e-06, "loss": 2.0907, "step": 2140 }, { "epoch": 0.0023295390987887483, "grad_norm": 19.22940444946289, "learning_rate": 4.999939134054545e-06, "loss": 2.0032, "step": 2145 }, { "epoch": 0.002334969259858186, "grad_norm": 17.365747451782227, "learning_rate": 4.99993883605892e-06, "loss": 1.8995, "step": 2150 }, { "epoch": 0.0023403994209276234, "grad_norm": 31.37904167175293, "learning_rate": 4.999938537335601e-06, "loss": 2.2585, "step": 2155 }, { "epoch": 0.002345829581997061, "grad_norm": 33.94065856933594, "learning_rate": 4.9999382378845886e-06, "loss": 1.9811, "step": 2160 }, { "epoch": 0.002351259743066499, "grad_norm": 73.29490661621094, "learning_rate": 4.999937937705882e-06, "loss": 1.516, "step": 2165 }, { "epoch": 0.0023566899041359366, "grad_norm": 35.28907775878906, "learning_rate": 4.999937636799481e-06, "loss": 1.9786, "step": 2170 }, { "epoch": 0.002362120065205374, "grad_norm": 18.53374481201172, "learning_rate": 4.999937335165387e-06, "loss": 2.095, "step": 2175 }, { "epoch": 0.0023675502262748118, "grad_norm": 101.69239044189453, "learning_rate": 4.999937032803599e-06, "loss": 2.5732, "step": 2180 }, { "epoch": 0.0023729803873442494, "grad_norm": 134.85633850097656, "learning_rate": 4.9999367297141164e-06, "loss": 1.3642, "step": 2185 }, { "epoch": 0.002378410548413687, "grad_norm": 83.1693115234375, "learning_rate": 4.9999364258969415e-06, "loss": 1.3337, "step": 2190 }, { "epoch": 0.0023838407094831245, "grad_norm": 23.529193878173828, "learning_rate": 4.999936121352073e-06, "loss": 2.3832, "step": 2195 }, { "epoch": 0.0023892708705525625, "grad_norm": 30.599931716918945, "learning_rate": 4.999935816079511e-06, "loss": 1.9739, "step": 2200 }, { "epoch": 0.002394701031622, "grad_norm": 21.276742935180664, "learning_rate": 4.999935510079255e-06, "loss": 1.5245, "step": 2205 }, { "epoch": 0.0024001311926914377, "grad_norm": 34.094112396240234, "learning_rate": 4.999935203351306e-06, "loss": 1.3782, "step": 2210 }, { "epoch": 0.0024055613537608753, "grad_norm": 35.148040771484375, "learning_rate": 4.999934895895665e-06, "loss": 1.7358, "step": 2215 }, { "epoch": 0.002410991514830313, "grad_norm": 18.993867874145508, "learning_rate": 4.99993458771233e-06, "loss": 2.4298, "step": 2220 }, { "epoch": 0.0024164216758997504, "grad_norm": 57.52144241333008, "learning_rate": 4.999934278801303e-06, "loss": 1.8068, "step": 2225 }, { "epoch": 0.002421851836969188, "grad_norm": 24.586023330688477, "learning_rate": 4.999933969162582e-06, "loss": 1.5085, "step": 2230 }, { "epoch": 0.002427281998038626, "grad_norm": 34.26123809814453, "learning_rate": 4.999933658796169e-06, "loss": 2.2991, "step": 2235 }, { "epoch": 0.0024327121591080636, "grad_norm": 41.35533905029297, "learning_rate": 4.9999333477020625e-06, "loss": 1.8463, "step": 2240 }, { "epoch": 0.002438142320177501, "grad_norm": 26.151119232177734, "learning_rate": 4.9999330358802644e-06, "loss": 2.4202, "step": 2245 }, { "epoch": 0.0024435724812469388, "grad_norm": 122.51468658447266, "learning_rate": 4.9999327233307736e-06, "loss": 2.0888, "step": 2250 }, { "epoch": 0.0024490026423163763, "grad_norm": 84.96205139160156, "learning_rate": 4.99993241005359e-06, "loss": 1.8883, "step": 2255 }, { "epoch": 0.002454432803385814, "grad_norm": 72.89588165283203, "learning_rate": 4.999932096048714e-06, "loss": 1.9682, "step": 2260 }, { "epoch": 0.0024598629644552515, "grad_norm": 63.092491149902344, "learning_rate": 4.999931781316147e-06, "loss": 2.1394, "step": 2265 }, { "epoch": 0.0024652931255246895, "grad_norm": 39.80315017700195, "learning_rate": 4.9999314658558864e-06, "loss": 1.4364, "step": 2270 }, { "epoch": 0.002470723286594127, "grad_norm": 25.289823532104492, "learning_rate": 4.999931149667935e-06, "loss": 2.0335, "step": 2275 }, { "epoch": 0.0024761534476635647, "grad_norm": 21.807554244995117, "learning_rate": 4.999930832752291e-06, "loss": 1.3676, "step": 2280 }, { "epoch": 0.0024815836087330023, "grad_norm": 22.320205688476562, "learning_rate": 4.9999305151089556e-06, "loss": 1.6466, "step": 2285 }, { "epoch": 0.00248701376980244, "grad_norm": 61.7177848815918, "learning_rate": 4.9999301967379275e-06, "loss": 2.4226, "step": 2290 }, { "epoch": 0.0024924439308718774, "grad_norm": 149.8834991455078, "learning_rate": 4.999929877639209e-06, "loss": 2.2063, "step": 2295 }, { "epoch": 0.002497874091941315, "grad_norm": 28.230518341064453, "learning_rate": 4.999929557812798e-06, "loss": 1.4082, "step": 2300 }, { "epoch": 0.002503304253010753, "grad_norm": 34.31536102294922, "learning_rate": 4.999929237258696e-06, "loss": 1.9462, "step": 2305 }, { "epoch": 0.0025087344140801906, "grad_norm": 73.1697998046875, "learning_rate": 4.999928915976903e-06, "loss": 1.2675, "step": 2310 }, { "epoch": 0.002514164575149628, "grad_norm": 19.813806533813477, "learning_rate": 4.999928593967418e-06, "loss": 1.5346, "step": 2315 }, { "epoch": 0.0025195947362190657, "grad_norm": 23.057321548461914, "learning_rate": 4.999928271230242e-06, "loss": 1.8949, "step": 2320 }, { "epoch": 0.0025250248972885033, "grad_norm": 14.445368766784668, "learning_rate": 4.999927947765376e-06, "loss": 1.3939, "step": 2325 }, { "epoch": 0.002530455058357941, "grad_norm": 27.10457992553711, "learning_rate": 4.999927623572818e-06, "loss": 1.8804, "step": 2330 }, { "epoch": 0.0025358852194273785, "grad_norm": 16.13592529296875, "learning_rate": 4.9999272986525685e-06, "loss": 1.6278, "step": 2335 }, { "epoch": 0.0025413153804968165, "grad_norm": 17.088504791259766, "learning_rate": 4.999926973004629e-06, "loss": 1.7007, "step": 2340 }, { "epoch": 0.002546745541566254, "grad_norm": 26.47686195373535, "learning_rate": 4.999926646628999e-06, "loss": 1.9344, "step": 2345 }, { "epoch": 0.0025521757026356917, "grad_norm": 32.240787506103516, "learning_rate": 4.999926319525678e-06, "loss": 1.2406, "step": 2350 }, { "epoch": 0.0025576058637051292, "grad_norm": 23.460763931274414, "learning_rate": 4.999925991694666e-06, "loss": 2.2676, "step": 2355 }, { "epoch": 0.002563036024774567, "grad_norm": 34.591854095458984, "learning_rate": 4.999925663135965e-06, "loss": 1.8682, "step": 2360 }, { "epoch": 0.0025684661858440044, "grad_norm": 14.235818862915039, "learning_rate": 4.999925333849572e-06, "loss": 1.8425, "step": 2365 }, { "epoch": 0.002573896346913442, "grad_norm": 51.433258056640625, "learning_rate": 4.99992500383549e-06, "loss": 1.6818, "step": 2370 }, { "epoch": 0.00257932650798288, "grad_norm": 21.670801162719727, "learning_rate": 4.999924673093718e-06, "loss": 1.7315, "step": 2375 }, { "epoch": 0.0025847566690523176, "grad_norm": 25.600019454956055, "learning_rate": 4.999924341624255e-06, "loss": 1.522, "step": 2380 }, { "epoch": 0.002590186830121755, "grad_norm": 29.016921997070312, "learning_rate": 4.9999240094271015e-06, "loss": 2.0125, "step": 2385 }, { "epoch": 0.0025956169911911927, "grad_norm": 49.50724792480469, "learning_rate": 4.999923676502259e-06, "loss": 2.2529, "step": 2390 }, { "epoch": 0.0026010471522606303, "grad_norm": 29.878429412841797, "learning_rate": 4.999923342849728e-06, "loss": 2.0185, "step": 2395 }, { "epoch": 0.002606477313330068, "grad_norm": 20.417909622192383, "learning_rate": 4.999923008469506e-06, "loss": 1.9617, "step": 2400 }, { "epoch": 0.0026119074743995055, "grad_norm": 15.852913856506348, "learning_rate": 4.999922673361595e-06, "loss": 1.7975, "step": 2405 }, { "epoch": 0.0026173376354689435, "grad_norm": 20.492279052734375, "learning_rate": 4.999922337525993e-06, "loss": 2.256, "step": 2410 }, { "epoch": 0.002622767796538381, "grad_norm": 48.96660232543945, "learning_rate": 4.999922000962703e-06, "loss": 2.1168, "step": 2415 }, { "epoch": 0.0026281979576078186, "grad_norm": 35.75537109375, "learning_rate": 4.999921663671724e-06, "loss": 2.2904, "step": 2420 }, { "epoch": 0.0026336281186772562, "grad_norm": 27.852741241455078, "learning_rate": 4.999921325653055e-06, "loss": 1.7928, "step": 2425 }, { "epoch": 0.002639058279746694, "grad_norm": 22.530319213867188, "learning_rate": 4.999920986906698e-06, "loss": 1.4062, "step": 2430 }, { "epoch": 0.0026444884408161314, "grad_norm": 33.97126388549805, "learning_rate": 4.999920647432651e-06, "loss": 2.0387, "step": 2435 }, { "epoch": 0.002649918601885569, "grad_norm": 33.57851028442383, "learning_rate": 4.999920307230916e-06, "loss": 1.9289, "step": 2440 }, { "epoch": 0.002655348762955007, "grad_norm": 22.167356491088867, "learning_rate": 4.999919966301492e-06, "loss": 1.8863, "step": 2445 }, { "epoch": 0.0026607789240244446, "grad_norm": 30.278846740722656, "learning_rate": 4.999919624644379e-06, "loss": 1.6937, "step": 2450 }, { "epoch": 0.002666209085093882, "grad_norm": 68.3966064453125, "learning_rate": 4.999919282259578e-06, "loss": 2.692, "step": 2455 }, { "epoch": 0.0026716392461633197, "grad_norm": 28.31108283996582, "learning_rate": 4.999918939147087e-06, "loss": 1.231, "step": 2460 }, { "epoch": 0.0026770694072327573, "grad_norm": 43.30291748046875, "learning_rate": 4.999918595306909e-06, "loss": 1.8083, "step": 2465 }, { "epoch": 0.002682499568302195, "grad_norm": 33.43694305419922, "learning_rate": 4.999918250739042e-06, "loss": 1.4345, "step": 2470 }, { "epoch": 0.0026879297293716325, "grad_norm": 46.87275314331055, "learning_rate": 4.999917905443488e-06, "loss": 1.9558, "step": 2475 }, { "epoch": 0.0026933598904410705, "grad_norm": 23.07948875427246, "learning_rate": 4.999917559420245e-06, "loss": 2.3181, "step": 2480 }, { "epoch": 0.002698790051510508, "grad_norm": 28.706056594848633, "learning_rate": 4.999917212669314e-06, "loss": 1.897, "step": 2485 }, { "epoch": 0.0027042202125799456, "grad_norm": 56.42112731933594, "learning_rate": 4.999916865190696e-06, "loss": 2.8852, "step": 2490 }, { "epoch": 0.002709650373649383, "grad_norm": 41.922245025634766, "learning_rate": 4.999916516984389e-06, "loss": 1.6041, "step": 2495 }, { "epoch": 0.002715080534718821, "grad_norm": 20.10698699951172, "learning_rate": 4.999916168050395e-06, "loss": 1.9099, "step": 2500 }, { "epoch": 0.0027205106957882584, "grad_norm": 34.42890167236328, "learning_rate": 4.999915818388713e-06, "loss": 2.1123, "step": 2505 }, { "epoch": 0.002725940856857696, "grad_norm": 46.86060333251953, "learning_rate": 4.999915467999344e-06, "loss": 2.2664, "step": 2510 }, { "epoch": 0.002731371017927134, "grad_norm": 42.30652618408203, "learning_rate": 4.999915116882288e-06, "loss": 1.8983, "step": 2515 }, { "epoch": 0.0027368011789965715, "grad_norm": 14.582508087158203, "learning_rate": 4.999914765037544e-06, "loss": 1.7486, "step": 2520 }, { "epoch": 0.002742231340066009, "grad_norm": 23.24066162109375, "learning_rate": 4.999914412465113e-06, "loss": 1.6314, "step": 2525 }, { "epoch": 0.0027476615011354467, "grad_norm": 30.977283477783203, "learning_rate": 4.999914059164996e-06, "loss": 1.7428, "step": 2530 }, { "epoch": 0.0027530916622048843, "grad_norm": 17.458341598510742, "learning_rate": 4.999913705137191e-06, "loss": 1.7108, "step": 2535 }, { "epoch": 0.002758521823274322, "grad_norm": 127.93768310546875, "learning_rate": 4.999913350381699e-06, "loss": 1.985, "step": 2540 }, { "epoch": 0.0027639519843437594, "grad_norm": 45.76018524169922, "learning_rate": 4.999912994898521e-06, "loss": 1.8966, "step": 2545 }, { "epoch": 0.0027693821454131975, "grad_norm": 19.494455337524414, "learning_rate": 4.999912638687657e-06, "loss": 1.6349, "step": 2550 }, { "epoch": 0.002774812306482635, "grad_norm": 17.982257843017578, "learning_rate": 4.999912281749104e-06, "loss": 1.8508, "step": 2555 }, { "epoch": 0.0027802424675520726, "grad_norm": 28.472375869750977, "learning_rate": 4.999911924082867e-06, "loss": 1.8614, "step": 2560 }, { "epoch": 0.00278567262862151, "grad_norm": 52.70477294921875, "learning_rate": 4.9999115656889416e-06, "loss": 2.0365, "step": 2565 }, { "epoch": 0.0027911027896909478, "grad_norm": 24.838172912597656, "learning_rate": 4.999911206567332e-06, "loss": 1.6742, "step": 2570 }, { "epoch": 0.0027965329507603854, "grad_norm": 37.84638214111328, "learning_rate": 4.999910846718035e-06, "loss": 1.9398, "step": 2575 }, { "epoch": 0.002801963111829823, "grad_norm": 57.84357833862305, "learning_rate": 4.999910486141052e-06, "loss": 2.6613, "step": 2580 }, { "epoch": 0.0028073932728992605, "grad_norm": 18.914609909057617, "learning_rate": 4.999910124836383e-06, "loss": 2.2437, "step": 2585 }, { "epoch": 0.0028128234339686985, "grad_norm": 38.7525634765625, "learning_rate": 4.99990976280403e-06, "loss": 2.1208, "step": 2590 }, { "epoch": 0.002818253595038136, "grad_norm": 17.406803131103516, "learning_rate": 4.999909400043989e-06, "loss": 1.5017, "step": 2595 }, { "epoch": 0.0028236837561075737, "grad_norm": 31.431198120117188, "learning_rate": 4.999909036556263e-06, "loss": 2.0218, "step": 2600 }, { "epoch": 0.0028291139171770113, "grad_norm": 17.091012954711914, "learning_rate": 4.9999086723408514e-06, "loss": 2.043, "step": 2605 }, { "epoch": 0.002834544078246449, "grad_norm": 26.134004592895508, "learning_rate": 4.999908307397756e-06, "loss": 1.5922, "step": 2610 }, { "epoch": 0.0028399742393158864, "grad_norm": 24.30078887939453, "learning_rate": 4.999907941726974e-06, "loss": 1.4768, "step": 2615 }, { "epoch": 0.002845404400385324, "grad_norm": 51.668453216552734, "learning_rate": 4.999907575328507e-06, "loss": 2.1644, "step": 2620 }, { "epoch": 0.002850834561454762, "grad_norm": 22.920734405517578, "learning_rate": 4.9999072082023546e-06, "loss": 1.9812, "step": 2625 }, { "epoch": 0.0028562647225241996, "grad_norm": 20.959951400756836, "learning_rate": 4.999906840348517e-06, "loss": 1.7656, "step": 2630 }, { "epoch": 0.002861694883593637, "grad_norm": 22.755943298339844, "learning_rate": 4.999906471766996e-06, "loss": 1.6653, "step": 2635 }, { "epoch": 0.0028671250446630748, "grad_norm": 29.261995315551758, "learning_rate": 4.99990610245779e-06, "loss": 1.7559, "step": 2640 }, { "epoch": 0.0028725552057325123, "grad_norm": 53.40119171142578, "learning_rate": 4.999905732420898e-06, "loss": 2.174, "step": 2645 }, { "epoch": 0.00287798536680195, "grad_norm": 32.8975830078125, "learning_rate": 4.999905361656322e-06, "loss": 1.5108, "step": 2650 }, { "epoch": 0.0028834155278713875, "grad_norm": 36.12147903442383, "learning_rate": 4.9999049901640625e-06, "loss": 1.7485, "step": 2655 }, { "epoch": 0.0028888456889408255, "grad_norm": 21.089397430419922, "learning_rate": 4.999904617944118e-06, "loss": 1.5151, "step": 2660 }, { "epoch": 0.002894275850010263, "grad_norm": 38.926856994628906, "learning_rate": 4.999904244996489e-06, "loss": 2.1098, "step": 2665 }, { "epoch": 0.0028997060110797007, "grad_norm": 206.7702178955078, "learning_rate": 4.999903871321177e-06, "loss": 1.7251, "step": 2670 }, { "epoch": 0.0029051361721491383, "grad_norm": 49.235618591308594, "learning_rate": 4.99990349691818e-06, "loss": 1.4299, "step": 2675 }, { "epoch": 0.002910566333218576, "grad_norm": 22.279939651489258, "learning_rate": 4.999903121787499e-06, "loss": 2.3822, "step": 2680 }, { "epoch": 0.0029159964942880134, "grad_norm": 57.60578918457031, "learning_rate": 4.999902745929135e-06, "loss": 1.9115, "step": 2685 }, { "epoch": 0.002921426655357451, "grad_norm": 47.42727279663086, "learning_rate": 4.999902369343087e-06, "loss": 1.45, "step": 2690 }, { "epoch": 0.002926856816426889, "grad_norm": 32.06073760986328, "learning_rate": 4.999901992029355e-06, "loss": 1.6997, "step": 2695 }, { "epoch": 0.0029322869774963266, "grad_norm": 62.36907958984375, "learning_rate": 4.9999016139879394e-06, "loss": 2.301, "step": 2700 }, { "epoch": 0.002937717138565764, "grad_norm": 18.496145248413086, "learning_rate": 4.999901235218842e-06, "loss": 1.8442, "step": 2705 }, { "epoch": 0.0029431472996352018, "grad_norm": 29.36733055114746, "learning_rate": 4.9999008557220604e-06, "loss": 2.2081, "step": 2710 }, { "epoch": 0.0029485774607046393, "grad_norm": 58.15039825439453, "learning_rate": 4.999900475497596e-06, "loss": 1.6445, "step": 2715 }, { "epoch": 0.002954007621774077, "grad_norm": 23.706710815429688, "learning_rate": 4.999900094545448e-06, "loss": 1.8697, "step": 2720 }, { "epoch": 0.0029594377828435145, "grad_norm": 20.70639419555664, "learning_rate": 4.999899712865618e-06, "loss": 1.5734, "step": 2725 }, { "epoch": 0.0029648679439129525, "grad_norm": 16.032793045043945, "learning_rate": 4.999899330458104e-06, "loss": 2.0091, "step": 2730 }, { "epoch": 0.00297029810498239, "grad_norm": 38.163333892822266, "learning_rate": 4.999898947322908e-06, "loss": 1.6703, "step": 2735 }, { "epoch": 0.0029757282660518277, "grad_norm": 23.208463668823242, "learning_rate": 4.99989856346003e-06, "loss": 2.2028, "step": 2740 }, { "epoch": 0.0029811584271212653, "grad_norm": 25.827909469604492, "learning_rate": 4.999898178869469e-06, "loss": 1.9078, "step": 2745 }, { "epoch": 0.002986588588190703, "grad_norm": 34.32838821411133, "learning_rate": 4.9998977935512254e-06, "loss": 2.0958, "step": 2750 }, { "epoch": 0.0029920187492601404, "grad_norm": 21.95568084716797, "learning_rate": 4.999897407505301e-06, "loss": 1.5666, "step": 2755 }, { "epoch": 0.002997448910329578, "grad_norm": 25.15074348449707, "learning_rate": 4.9998970207316924e-06, "loss": 1.6871, "step": 2760 }, { "epoch": 0.003002879071399016, "grad_norm": 28.9196720123291, "learning_rate": 4.999896633230403e-06, "loss": 1.2694, "step": 2765 }, { "epoch": 0.0030083092324684536, "grad_norm": 34.04104995727539, "learning_rate": 4.9998962450014315e-06, "loss": 1.6127, "step": 2770 }, { "epoch": 0.003013739393537891, "grad_norm": 17.52210807800293, "learning_rate": 4.999895856044779e-06, "loss": 2.0516, "step": 2775 }, { "epoch": 0.0030191695546073287, "grad_norm": 43.6598014831543, "learning_rate": 4.9998954663604435e-06, "loss": 2.3655, "step": 2780 }, { "epoch": 0.0030245997156767663, "grad_norm": 32.072418212890625, "learning_rate": 4.999895075948427e-06, "loss": 1.6303, "step": 2785 }, { "epoch": 0.003030029876746204, "grad_norm": 48.97402572631836, "learning_rate": 4.999894684808729e-06, "loss": 2.1969, "step": 2790 }, { "epoch": 0.0030354600378156415, "grad_norm": 29.28865623474121, "learning_rate": 4.999894292941351e-06, "loss": 2.221, "step": 2795 }, { "epoch": 0.0030408901988850795, "grad_norm": 19.984825134277344, "learning_rate": 4.99989390034629e-06, "loss": 1.4478, "step": 2800 }, { "epoch": 0.003046320359954517, "grad_norm": 95.95281982421875, "learning_rate": 4.999893507023549e-06, "loss": 1.8581, "step": 2805 }, { "epoch": 0.0030517505210239547, "grad_norm": 28.101606369018555, "learning_rate": 4.999893112973126e-06, "loss": 1.815, "step": 2810 }, { "epoch": 0.0030571806820933922, "grad_norm": 20.291004180908203, "learning_rate": 4.999892718195023e-06, "loss": 2.2724, "step": 2815 }, { "epoch": 0.00306261084316283, "grad_norm": 21.283205032348633, "learning_rate": 4.999892322689239e-06, "loss": 1.1838, "step": 2820 }, { "epoch": 0.0030680410042322674, "grad_norm": 16.258079528808594, "learning_rate": 4.999891926455775e-06, "loss": 1.6203, "step": 2825 }, { "epoch": 0.003073471165301705, "grad_norm": 20.70055389404297, "learning_rate": 4.99989152949463e-06, "loss": 1.6969, "step": 2830 }, { "epoch": 0.003078901326371143, "grad_norm": 18.921104431152344, "learning_rate": 4.999891131805804e-06, "loss": 1.6842, "step": 2835 }, { "epoch": 0.0030843314874405806, "grad_norm": 27.024272918701172, "learning_rate": 4.999890733389298e-06, "loss": 1.7946, "step": 2840 }, { "epoch": 0.003089761648510018, "grad_norm": 31.993722915649414, "learning_rate": 4.999890334245113e-06, "loss": 2.3853, "step": 2845 }, { "epoch": 0.0030951918095794557, "grad_norm": 24.314903259277344, "learning_rate": 4.999889934373246e-06, "loss": 1.7426, "step": 2850 }, { "epoch": 0.0031006219706488933, "grad_norm": 347.2679748535156, "learning_rate": 4.9998895337737005e-06, "loss": 2.5961, "step": 2855 }, { "epoch": 0.003106052131718331, "grad_norm": 15.643810272216797, "learning_rate": 4.9998891324464755e-06, "loss": 1.283, "step": 2860 }, { "epoch": 0.0031114822927877685, "grad_norm": 24.617786407470703, "learning_rate": 4.99988873039157e-06, "loss": 1.6896, "step": 2865 }, { "epoch": 0.0031169124538572065, "grad_norm": 35.799285888671875, "learning_rate": 4.999888327608985e-06, "loss": 1.715, "step": 2870 }, { "epoch": 0.003122342614926644, "grad_norm": 18.64017677307129, "learning_rate": 4.999887924098721e-06, "loss": 2.0019, "step": 2875 }, { "epoch": 0.0031277727759960816, "grad_norm": 29.947084426879883, "learning_rate": 4.999887519860777e-06, "loss": 1.665, "step": 2880 }, { "epoch": 0.0031332029370655192, "grad_norm": 17.4774112701416, "learning_rate": 4.999887114895154e-06, "loss": 1.9515, "step": 2885 }, { "epoch": 0.003138633098134957, "grad_norm": 28.115110397338867, "learning_rate": 4.999886709201852e-06, "loss": 2.0472, "step": 2890 }, { "epoch": 0.0031440632592043944, "grad_norm": 26.596765518188477, "learning_rate": 4.999886302780871e-06, "loss": 1.908, "step": 2895 }, { "epoch": 0.003149493420273832, "grad_norm": 144.9949493408203, "learning_rate": 4.999885895632211e-06, "loss": 2.7367, "step": 2900 }, { "epoch": 0.00315492358134327, "grad_norm": 24.393753051757812, "learning_rate": 4.9998854877558725e-06, "loss": 2.1519, "step": 2905 }, { "epoch": 0.0031603537424127076, "grad_norm": 18.037137985229492, "learning_rate": 4.999885079151855e-06, "loss": 1.5562, "step": 2910 }, { "epoch": 0.003165783903482145, "grad_norm": 27.214448928833008, "learning_rate": 4.9998846698201595e-06, "loss": 2.03, "step": 2915 }, { "epoch": 0.0031712140645515827, "grad_norm": 30.776966094970703, "learning_rate": 4.9998842597607855e-06, "loss": 2.037, "step": 2920 }, { "epoch": 0.0031766442256210203, "grad_norm": 23.840837478637695, "learning_rate": 4.999883848973733e-06, "loss": 2.0467, "step": 2925 }, { "epoch": 0.003182074386690458, "grad_norm": 75.33116149902344, "learning_rate": 4.999883437459002e-06, "loss": 2.4714, "step": 2930 }, { "epoch": 0.0031875045477598955, "grad_norm": 14.618073463439941, "learning_rate": 4.999883025216594e-06, "loss": 2.3766, "step": 2935 }, { "epoch": 0.0031929347088293335, "grad_norm": 52.24959945678711, "learning_rate": 4.999882612246507e-06, "loss": 1.6185, "step": 2940 }, { "epoch": 0.003198364869898771, "grad_norm": 18.217262268066406, "learning_rate": 4.999882198548744e-06, "loss": 1.5291, "step": 2945 }, { "epoch": 0.0032037950309682086, "grad_norm": 97.78324890136719, "learning_rate": 4.999881784123301e-06, "loss": 2.4324, "step": 2950 }, { "epoch": 0.003209225192037646, "grad_norm": 40.57048416137695, "learning_rate": 4.999881368970182e-06, "loss": 1.9181, "step": 2955 }, { "epoch": 0.003214655353107084, "grad_norm": 38.040069580078125, "learning_rate": 4.999880953089384e-06, "loss": 1.7158, "step": 2960 }, { "epoch": 0.0032200855141765214, "grad_norm": 37.453704833984375, "learning_rate": 4.99988053648091e-06, "loss": 1.8093, "step": 2965 }, { "epoch": 0.003225515675245959, "grad_norm": 20.68849754333496, "learning_rate": 4.9998801191447585e-06, "loss": 1.7373, "step": 2970 }, { "epoch": 0.003230945836315397, "grad_norm": 29.34853172302246, "learning_rate": 4.99987970108093e-06, "loss": 2.3305, "step": 2975 }, { "epoch": 0.0032363759973848345, "grad_norm": 24.58540916442871, "learning_rate": 4.999879282289425e-06, "loss": 1.6675, "step": 2980 }, { "epoch": 0.003241806158454272, "grad_norm": 118.8993148803711, "learning_rate": 4.999878862770242e-06, "loss": 1.6382, "step": 2985 }, { "epoch": 0.0032472363195237097, "grad_norm": 19.683237075805664, "learning_rate": 4.999878442523384e-06, "loss": 1.6306, "step": 2990 }, { "epoch": 0.0032526664805931473, "grad_norm": 20.347923278808594, "learning_rate": 4.999878021548848e-06, "loss": 1.8837, "step": 2995 }, { "epoch": 0.003258096641662585, "grad_norm": 37.446388244628906, "learning_rate": 4.999877599846636e-06, "loss": 1.828, "step": 3000 }, { "epoch": 0.0032635268027320224, "grad_norm": 13.400245666503906, "learning_rate": 4.999877177416748e-06, "loss": 1.658, "step": 3005 }, { "epoch": 0.0032689569638014605, "grad_norm": 16.650066375732422, "learning_rate": 4.999876754259183e-06, "loss": 1.7154, "step": 3010 }, { "epoch": 0.003274387124870898, "grad_norm": 16.784832000732422, "learning_rate": 4.999876330373943e-06, "loss": 2.4992, "step": 3015 }, { "epoch": 0.0032798172859403356, "grad_norm": 21.57523536682129, "learning_rate": 4.9998759057610256e-06, "loss": 1.8206, "step": 3020 }, { "epoch": 0.003285247447009773, "grad_norm": 120.26985931396484, "learning_rate": 4.999875480420433e-06, "loss": 2.2466, "step": 3025 }, { "epoch": 0.0032906776080792108, "grad_norm": 50.32571792602539, "learning_rate": 4.999875054352165e-06, "loss": 2.1565, "step": 3030 }, { "epoch": 0.0032961077691486484, "grad_norm": 27.745473861694336, "learning_rate": 4.999874627556221e-06, "loss": 2.2583, "step": 3035 }, { "epoch": 0.003301537930218086, "grad_norm": 14.84918212890625, "learning_rate": 4.999874200032602e-06, "loss": 1.4078, "step": 3040 }, { "epoch": 0.003306968091287524, "grad_norm": 56.06150436401367, "learning_rate": 4.9998737717813074e-06, "loss": 1.6334, "step": 3045 }, { "epoch": 0.0033123982523569615, "grad_norm": 25.652420043945312, "learning_rate": 4.999873342802337e-06, "loss": 1.5873, "step": 3050 }, { "epoch": 0.003317828413426399, "grad_norm": 16.19859504699707, "learning_rate": 4.999872913095692e-06, "loss": 2.1209, "step": 3055 }, { "epoch": 0.0033232585744958367, "grad_norm": 85.86725616455078, "learning_rate": 4.999872482661373e-06, "loss": 1.4877, "step": 3060 }, { "epoch": 0.0033286887355652743, "grad_norm": 108.3898696899414, "learning_rate": 4.999872051499378e-06, "loss": 1.0426, "step": 3065 }, { "epoch": 0.003334118896634712, "grad_norm": 283.6915283203125, "learning_rate": 4.9998716196097085e-06, "loss": 1.8967, "step": 3070 }, { "epoch": 0.0033395490577041494, "grad_norm": 25.9902400970459, "learning_rate": 4.999871186992365e-06, "loss": 1.372, "step": 3075 }, { "epoch": 0.0033449792187735874, "grad_norm": 27.170734405517578, "learning_rate": 4.999870753647345e-06, "loss": 1.9319, "step": 3080 }, { "epoch": 0.003350409379843025, "grad_norm": 23.33572769165039, "learning_rate": 4.999870319574653e-06, "loss": 1.4404, "step": 3085 }, { "epoch": 0.0033558395409124626, "grad_norm": 28.157133102416992, "learning_rate": 4.999869884774285e-06, "loss": 1.2948, "step": 3090 }, { "epoch": 0.0033612697019819, "grad_norm": 22.186471939086914, "learning_rate": 4.999869449246245e-06, "loss": 1.8446, "step": 3095 }, { "epoch": 0.0033666998630513378, "grad_norm": 27.76140785217285, "learning_rate": 4.999869012990529e-06, "loss": 2.1967, "step": 3100 }, { "epoch": 0.0033721300241207753, "grad_norm": 15.510659217834473, "learning_rate": 4.999868576007141e-06, "loss": 1.694, "step": 3105 }, { "epoch": 0.003377560185190213, "grad_norm": 39.238746643066406, "learning_rate": 4.999868138296078e-06, "loss": 1.7329, "step": 3110 }, { "epoch": 0.003382990346259651, "grad_norm": 24.85637855529785, "learning_rate": 4.999867699857341e-06, "loss": 1.4832, "step": 3115 }, { "epoch": 0.0033884205073290885, "grad_norm": 302.2299499511719, "learning_rate": 4.999867260690931e-06, "loss": 2.368, "step": 3120 }, { "epoch": 0.003393850668398526, "grad_norm": 28.000377655029297, "learning_rate": 4.999866820796849e-06, "loss": 2.3057, "step": 3125 }, { "epoch": 0.0033992808294679637, "grad_norm": 111.83431243896484, "learning_rate": 4.9998663801750925e-06, "loss": 1.8131, "step": 3130 }, { "epoch": 0.0034047109905374013, "grad_norm": 32.693050384521484, "learning_rate": 4.999865938825663e-06, "loss": 1.4393, "step": 3135 }, { "epoch": 0.003410141151606839, "grad_norm": 31.253681182861328, "learning_rate": 4.9998654967485615e-06, "loss": 1.7749, "step": 3140 }, { "epoch": 0.0034155713126762764, "grad_norm": 31.79893684387207, "learning_rate": 4.9998650539437865e-06, "loss": 1.6091, "step": 3145 }, { "epoch": 0.0034210014737457144, "grad_norm": 17.543184280395508, "learning_rate": 4.999864610411338e-06, "loss": 1.743, "step": 3150 }, { "epoch": 0.003426431634815152, "grad_norm": 21.868974685668945, "learning_rate": 4.999864166151218e-06, "loss": 1.9205, "step": 3155 }, { "epoch": 0.0034318617958845896, "grad_norm": 16.42713737487793, "learning_rate": 4.999863721163425e-06, "loss": 1.9851, "step": 3160 }, { "epoch": 0.003437291956954027, "grad_norm": 25.354948043823242, "learning_rate": 4.99986327544796e-06, "loss": 1.575, "step": 3165 }, { "epoch": 0.0034427221180234648, "grad_norm": 234.32382202148438, "learning_rate": 4.999862829004824e-06, "loss": 1.7255, "step": 3170 }, { "epoch": 0.0034481522790929023, "grad_norm": 18.627294540405273, "learning_rate": 4.999862381834014e-06, "loss": 1.8941, "step": 3175 }, { "epoch": 0.00345358244016234, "grad_norm": 22.479291915893555, "learning_rate": 4.999861933935533e-06, "loss": 1.8786, "step": 3180 }, { "epoch": 0.003459012601231778, "grad_norm": 95.2927474975586, "learning_rate": 4.999861485309381e-06, "loss": 2.188, "step": 3185 }, { "epoch": 0.0034644427623012155, "grad_norm": 17.977439880371094, "learning_rate": 4.999861035955555e-06, "loss": 1.0662, "step": 3190 }, { "epoch": 0.003469872923370653, "grad_norm": 74.12982177734375, "learning_rate": 4.99986058587406e-06, "loss": 1.4749, "step": 3195 }, { "epoch": 0.0034753030844400907, "grad_norm": 46.284297943115234, "learning_rate": 4.999860135064892e-06, "loss": 1.5694, "step": 3200 }, { "epoch": 0.0034807332455095282, "grad_norm": 17.662809371948242, "learning_rate": 4.999859683528054e-06, "loss": 1.567, "step": 3205 }, { "epoch": 0.003486163406578966, "grad_norm": 60.78343963623047, "learning_rate": 4.999859231263544e-06, "loss": 1.7989, "step": 3210 }, { "epoch": 0.0034915935676484034, "grad_norm": 22.532367706298828, "learning_rate": 4.9998587782713635e-06, "loss": 2.3952, "step": 3215 }, { "epoch": 0.0034970237287178414, "grad_norm": 26.013442993164062, "learning_rate": 4.999858324551512e-06, "loss": 2.2677, "step": 3220 }, { "epoch": 0.003502453889787279, "grad_norm": 24.88296127319336, "learning_rate": 4.9998578701039894e-06, "loss": 1.8126, "step": 3225 }, { "epoch": 0.0035078840508567166, "grad_norm": 19.09382438659668, "learning_rate": 4.999857414928796e-06, "loss": 1.7502, "step": 3230 }, { "epoch": 0.003513314211926154, "grad_norm": 30.483482360839844, "learning_rate": 4.999856959025932e-06, "loss": 3.364, "step": 3235 }, { "epoch": 0.0035187443729955917, "grad_norm": 24.077871322631836, "learning_rate": 4.999856502395399e-06, "loss": 2.0782, "step": 3240 }, { "epoch": 0.0035241745340650293, "grad_norm": 17.456064224243164, "learning_rate": 4.999856045037195e-06, "loss": 1.4038, "step": 3245 }, { "epoch": 0.003529604695134467, "grad_norm": 25.440088272094727, "learning_rate": 4.999855586951321e-06, "loss": 2.106, "step": 3250 }, { "epoch": 0.003535034856203905, "grad_norm": 24.3858699798584, "learning_rate": 4.999855128137778e-06, "loss": 1.8963, "step": 3255 }, { "epoch": 0.0035404650172733425, "grad_norm": 17.395532608032227, "learning_rate": 4.999854668596563e-06, "loss": 2.6456, "step": 3260 }, { "epoch": 0.00354589517834278, "grad_norm": 16.16135025024414, "learning_rate": 4.99985420832768e-06, "loss": 1.9272, "step": 3265 }, { "epoch": 0.0035513253394122177, "grad_norm": 31.95172119140625, "learning_rate": 4.9998537473311274e-06, "loss": 2.1391, "step": 3270 }, { "epoch": 0.0035567555004816552, "grad_norm": 17.128015518188477, "learning_rate": 4.999853285606905e-06, "loss": 1.6957, "step": 3275 }, { "epoch": 0.003562185661551093, "grad_norm": 30.533050537109375, "learning_rate": 4.999852823155014e-06, "loss": 1.344, "step": 3280 }, { "epoch": 0.0035676158226205304, "grad_norm": 38.84669876098633, "learning_rate": 4.999852359975453e-06, "loss": 2.2864, "step": 3285 }, { "epoch": 0.0035730459836899684, "grad_norm": 20.525287628173828, "learning_rate": 4.999851896068223e-06, "loss": 1.9694, "step": 3290 }, { "epoch": 0.003578476144759406, "grad_norm": 31.058385848999023, "learning_rate": 4.999851431433325e-06, "loss": 1.9562, "step": 3295 }, { "epoch": 0.0035839063058288436, "grad_norm": 14.671404838562012, "learning_rate": 4.999850966070757e-06, "loss": 1.7819, "step": 3300 }, { "epoch": 0.003589336466898281, "grad_norm": 36.12254333496094, "learning_rate": 4.999850499980522e-06, "loss": 1.9966, "step": 3305 }, { "epoch": 0.0035947666279677187, "grad_norm": 20.859237670898438, "learning_rate": 4.999850033162617e-06, "loss": 1.5392, "step": 3310 }, { "epoch": 0.0036001967890371563, "grad_norm": 18.085086822509766, "learning_rate": 4.999849565617045e-06, "loss": 2.1173, "step": 3315 }, { "epoch": 0.003605626950106594, "grad_norm": 56.04669952392578, "learning_rate": 4.999849097343804e-06, "loss": 1.5495, "step": 3320 }, { "epoch": 0.003611057111176032, "grad_norm": 170.3017578125, "learning_rate": 4.999848628342896e-06, "loss": 1.6873, "step": 3325 }, { "epoch": 0.0036164872722454695, "grad_norm": 45.46292495727539, "learning_rate": 4.999848158614319e-06, "loss": 2.2979, "step": 3330 }, { "epoch": 0.003621917433314907, "grad_norm": 33.455841064453125, "learning_rate": 4.999847688158075e-06, "loss": 2.1606, "step": 3335 }, { "epoch": 0.0036273475943843446, "grad_norm": 25.087303161621094, "learning_rate": 4.9998472169741626e-06, "loss": 2.0596, "step": 3340 }, { "epoch": 0.0036327777554537822, "grad_norm": 65.46402740478516, "learning_rate": 4.9998467450625835e-06, "loss": 1.5399, "step": 3345 }, { "epoch": 0.00363820791652322, "grad_norm": 38.17913818359375, "learning_rate": 4.999846272423336e-06, "loss": 1.9641, "step": 3350 }, { "epoch": 0.0036436380775926574, "grad_norm": 18.20652961730957, "learning_rate": 4.999845799056422e-06, "loss": 2.0245, "step": 3355 }, { "epoch": 0.0036490682386620954, "grad_norm": 65.51870727539062, "learning_rate": 4.99984532496184e-06, "loss": 2.2562, "step": 3360 }, { "epoch": 0.003654498399731533, "grad_norm": 103.58392333984375, "learning_rate": 4.9998448501395925e-06, "loss": 1.3967, "step": 3365 }, { "epoch": 0.0036599285608009706, "grad_norm": 67.8432846069336, "learning_rate": 4.9998443745896774e-06, "loss": 1.5874, "step": 3370 }, { "epoch": 0.003665358721870408, "grad_norm": 38.68763732910156, "learning_rate": 4.999843898312096e-06, "loss": 1.5117, "step": 3375 }, { "epoch": 0.0036707888829398457, "grad_norm": 16.132299423217773, "learning_rate": 4.999843421306848e-06, "loss": 1.6737, "step": 3380 }, { "epoch": 0.0036762190440092833, "grad_norm": 33.25187683105469, "learning_rate": 4.999842943573934e-06, "loss": 1.8159, "step": 3385 }, { "epoch": 0.003681649205078721, "grad_norm": 13.638212203979492, "learning_rate": 4.999842465113353e-06, "loss": 2.0931, "step": 3390 }, { "epoch": 0.003687079366148159, "grad_norm": 50.731346130371094, "learning_rate": 4.999841985925106e-06, "loss": 2.2876, "step": 3395 }, { "epoch": 0.0036925095272175965, "grad_norm": 21.18352699279785, "learning_rate": 4.999841506009193e-06, "loss": 1.8047, "step": 3400 }, { "epoch": 0.003697939688287034, "grad_norm": 56.15159606933594, "learning_rate": 4.999841025365615e-06, "loss": 3.326, "step": 3405 }, { "epoch": 0.0037033698493564716, "grad_norm": 41.844783782958984, "learning_rate": 4.9998405439943694e-06, "loss": 1.6059, "step": 3410 }, { "epoch": 0.003708800010425909, "grad_norm": 26.94524383544922, "learning_rate": 4.99984006189546e-06, "loss": 1.5195, "step": 3415 }, { "epoch": 0.003714230171495347, "grad_norm": 20.75353240966797, "learning_rate": 4.999839579068885e-06, "loss": 2.184, "step": 3420 }, { "epoch": 0.0037196603325647844, "grad_norm": 25.103954315185547, "learning_rate": 4.9998390955146445e-06, "loss": 1.6145, "step": 3425 }, { "epoch": 0.0037250904936342224, "grad_norm": 48.730960845947266, "learning_rate": 4.999838611232739e-06, "loss": 2.1506, "step": 3430 }, { "epoch": 0.00373052065470366, "grad_norm": 15.65014934539795, "learning_rate": 4.999838126223168e-06, "loss": 1.8995, "step": 3435 }, { "epoch": 0.0037359508157730975, "grad_norm": 17.508237838745117, "learning_rate": 4.999837640485933e-06, "loss": 1.5812, "step": 3440 }, { "epoch": 0.003741380976842535, "grad_norm": 19.44622230529785, "learning_rate": 4.9998371540210325e-06, "loss": 1.752, "step": 3445 }, { "epoch": 0.0037468111379119727, "grad_norm": 34.60442352294922, "learning_rate": 4.9998366668284675e-06, "loss": 2.042, "step": 3450 }, { "epoch": 0.0037522412989814103, "grad_norm": 34.01153564453125, "learning_rate": 4.999836178908238e-06, "loss": 2.0563, "step": 3455 }, { "epoch": 0.003757671460050848, "grad_norm": 26.75284767150879, "learning_rate": 4.999835690260345e-06, "loss": 1.6681, "step": 3460 }, { "epoch": 0.003763101621120286, "grad_norm": 25.029403686523438, "learning_rate": 4.9998352008847875e-06, "loss": 1.7154, "step": 3465 }, { "epoch": 0.0037685317821897235, "grad_norm": 47.13941192626953, "learning_rate": 4.999834710781566e-06, "loss": 2.7037, "step": 3470 }, { "epoch": 0.003773961943259161, "grad_norm": 29.156686782836914, "learning_rate": 4.999834219950681e-06, "loss": 1.571, "step": 3475 }, { "epoch": 0.0037793921043285986, "grad_norm": 15.487994194030762, "learning_rate": 4.999833728392132e-06, "loss": 1.6728, "step": 3480 }, { "epoch": 0.003784822265398036, "grad_norm": 37.404685974121094, "learning_rate": 4.999833236105919e-06, "loss": 2.0984, "step": 3485 }, { "epoch": 0.0037902524264674738, "grad_norm": 16.294334411621094, "learning_rate": 4.999832743092043e-06, "loss": 1.7074, "step": 3490 }, { "epoch": 0.0037956825875369114, "grad_norm": 50.73017120361328, "learning_rate": 4.9998322493505035e-06, "loss": 2.4629, "step": 3495 }, { "epoch": 0.0038011127486063494, "grad_norm": 24.68677520751953, "learning_rate": 4.9998317548813e-06, "loss": 1.4194, "step": 3500 }, { "epoch": 0.003806542909675787, "grad_norm": 24.73664093017578, "learning_rate": 4.999831259684435e-06, "loss": 2.2831, "step": 3505 }, { "epoch": 0.0038119730707452245, "grad_norm": 13.540345191955566, "learning_rate": 4.999830763759906e-06, "loss": 1.7978, "step": 3510 }, { "epoch": 0.003817403231814662, "grad_norm": 14.738937377929688, "learning_rate": 4.999830267107716e-06, "loss": 1.3686, "step": 3515 }, { "epoch": 0.0038228333928840997, "grad_norm": 16.651103973388672, "learning_rate": 4.999829769727862e-06, "loss": 1.298, "step": 3520 }, { "epoch": 0.0038282635539535373, "grad_norm": 20.781064987182617, "learning_rate": 4.999829271620346e-06, "loss": 1.5557, "step": 3525 }, { "epoch": 0.003833693715022975, "grad_norm": 109.23353576660156, "learning_rate": 4.999828772785167e-06, "loss": 1.4489, "step": 3530 }, { "epoch": 0.003839123876092413, "grad_norm": 17.585599899291992, "learning_rate": 4.999828273222327e-06, "loss": 1.3889, "step": 3535 }, { "epoch": 0.0038445540371618504, "grad_norm": 26.93206787109375, "learning_rate": 4.999827772931825e-06, "loss": 2.2643, "step": 3540 }, { "epoch": 0.003849984198231288, "grad_norm": 21.660640716552734, "learning_rate": 4.99982727191366e-06, "loss": 2.6391, "step": 3545 }, { "epoch": 0.0038554143593007256, "grad_norm": 25.547443389892578, "learning_rate": 4.999826770167835e-06, "loss": 2.1522, "step": 3550 }, { "epoch": 0.003860844520370163, "grad_norm": 20.953516006469727, "learning_rate": 4.999826267694347e-06, "loss": 1.4267, "step": 3555 }, { "epoch": 0.0038662746814396008, "grad_norm": 24.47382354736328, "learning_rate": 4.999825764493198e-06, "loss": 1.9195, "step": 3560 }, { "epoch": 0.0038717048425090383, "grad_norm": 59.936309814453125, "learning_rate": 4.999825260564388e-06, "loss": 2.0237, "step": 3565 }, { "epoch": 0.0038771350035784764, "grad_norm": 22.804443359375, "learning_rate": 4.999824755907917e-06, "loss": 2.1094, "step": 3570 }, { "epoch": 0.003882565164647914, "grad_norm": 20.22281265258789, "learning_rate": 4.9998242505237845e-06, "loss": 1.6963, "step": 3575 }, { "epoch": 0.0038879953257173515, "grad_norm": 18.17652702331543, "learning_rate": 4.9998237444119914e-06, "loss": 1.919, "step": 3580 }, { "epoch": 0.003893425486786789, "grad_norm": 17.04357147216797, "learning_rate": 4.999823237572539e-06, "loss": 2.1971, "step": 3585 }, { "epoch": 0.0038988556478562267, "grad_norm": 19.04085350036621, "learning_rate": 4.9998227300054244e-06, "loss": 1.6283, "step": 3590 }, { "epoch": 0.0039042858089256643, "grad_norm": 33.35275650024414, "learning_rate": 4.99982222171065e-06, "loss": 1.8962, "step": 3595 }, { "epoch": 0.003909715969995102, "grad_norm": 30.535388946533203, "learning_rate": 4.999821712688215e-06, "loss": 2.2541, "step": 3600 }, { "epoch": 0.00391514613106454, "grad_norm": 20.485971450805664, "learning_rate": 4.99982120293812e-06, "loss": 2.2659, "step": 3605 }, { "epoch": 0.003920576292133977, "grad_norm": 44.67912292480469, "learning_rate": 4.999820692460365e-06, "loss": 1.7138, "step": 3610 }, { "epoch": 0.003926006453203415, "grad_norm": 24.48299789428711, "learning_rate": 4.99982018125495e-06, "loss": 2.002, "step": 3615 }, { "epoch": 0.003931436614272852, "grad_norm": 33.87007522583008, "learning_rate": 4.999819669321877e-06, "loss": 2.0767, "step": 3620 }, { "epoch": 0.00393686677534229, "grad_norm": 29.062358856201172, "learning_rate": 4.9998191566611435e-06, "loss": 1.6235, "step": 3625 }, { "epoch": 0.003942296936411728, "grad_norm": 15.427749633789062, "learning_rate": 4.9998186432727505e-06, "loss": 1.574, "step": 3630 }, { "epoch": 0.003947727097481165, "grad_norm": 24.290390014648438, "learning_rate": 4.9998181291566985e-06, "loss": 1.4348, "step": 3635 }, { "epoch": 0.003953157258550603, "grad_norm": 32.33269119262695, "learning_rate": 4.999817614312988e-06, "loss": 2.3007, "step": 3640 }, { "epoch": 0.0039585874196200405, "grad_norm": 16.556608200073242, "learning_rate": 4.999817098741617e-06, "loss": 1.6436, "step": 3645 }, { "epoch": 0.0039640175806894785, "grad_norm": 26.583860397338867, "learning_rate": 4.999816582442589e-06, "loss": 1.6429, "step": 3650 }, { "epoch": 0.003969447741758916, "grad_norm": 120.0948486328125, "learning_rate": 4.999816065415902e-06, "loss": 1.172, "step": 3655 }, { "epoch": 0.003974877902828354, "grad_norm": 25.258342742919922, "learning_rate": 4.999815547661556e-06, "loss": 1.7103, "step": 3660 }, { "epoch": 0.003980308063897792, "grad_norm": 17.074848175048828, "learning_rate": 4.9998150291795525e-06, "loss": 1.8737, "step": 3665 }, { "epoch": 0.003985738224967229, "grad_norm": 96.65465545654297, "learning_rate": 4.99981450996989e-06, "loss": 1.6857, "step": 3670 }, { "epoch": 0.003991168386036667, "grad_norm": 61.10310745239258, "learning_rate": 4.99981399003257e-06, "loss": 1.9109, "step": 3675 }, { "epoch": 0.003996598547106104, "grad_norm": 109.70652770996094, "learning_rate": 4.999813469367592e-06, "loss": 1.6686, "step": 3680 }, { "epoch": 0.004002028708175542, "grad_norm": 47.408077239990234, "learning_rate": 4.9998129479749566e-06, "loss": 1.8361, "step": 3685 }, { "epoch": 0.004007458869244979, "grad_norm": 28.075794219970703, "learning_rate": 4.9998124258546635e-06, "loss": 1.8578, "step": 3690 }, { "epoch": 0.004012889030314417, "grad_norm": 31.738311767578125, "learning_rate": 4.999811903006713e-06, "loss": 2.0079, "step": 3695 }, { "epoch": 0.004018319191383855, "grad_norm": 45.03678512573242, "learning_rate": 4.999811379431106e-06, "loss": 2.3902, "step": 3700 }, { "epoch": 0.004023749352453292, "grad_norm": 22.36641502380371, "learning_rate": 4.999810855127841e-06, "loss": 1.7244, "step": 3705 }, { "epoch": 0.00402917951352273, "grad_norm": 38.22343063354492, "learning_rate": 4.999810330096919e-06, "loss": 2.0926, "step": 3710 }, { "epoch": 0.0040346096745921675, "grad_norm": 27.632165908813477, "learning_rate": 4.999809804338342e-06, "loss": 1.9831, "step": 3715 }, { "epoch": 0.0040400398356616055, "grad_norm": 21.985363006591797, "learning_rate": 4.999809277852106e-06, "loss": 1.764, "step": 3720 }, { "epoch": 0.004045469996731043, "grad_norm": 20.790836334228516, "learning_rate": 4.9998087506382145e-06, "loss": 1.7826, "step": 3725 }, { "epoch": 0.004050900157800481, "grad_norm": 57.33964157104492, "learning_rate": 4.999808222696667e-06, "loss": 1.4568, "step": 3730 }, { "epoch": 0.004056330318869919, "grad_norm": 36.46984100341797, "learning_rate": 4.9998076940274635e-06, "loss": 1.5977, "step": 3735 }, { "epoch": 0.004061760479939356, "grad_norm": 19.436508178710938, "learning_rate": 4.9998071646306034e-06, "loss": 1.3679, "step": 3740 }, { "epoch": 0.004067190641008794, "grad_norm": 27.80879783630371, "learning_rate": 4.999806634506088e-06, "loss": 1.9952, "step": 3745 }, { "epoch": 0.004072620802078231, "grad_norm": 25.816741943359375, "learning_rate": 4.999806103653917e-06, "loss": 1.3246, "step": 3750 }, { "epoch": 0.004078050963147669, "grad_norm": 28.973247528076172, "learning_rate": 4.99980557207409e-06, "loss": 1.908, "step": 3755 }, { "epoch": 0.004083481124217106, "grad_norm": 26.63705062866211, "learning_rate": 4.999805039766607e-06, "loss": 1.7716, "step": 3760 }, { "epoch": 0.004088911285286544, "grad_norm": 37.86471176147461, "learning_rate": 4.9998045067314706e-06, "loss": 1.3973, "step": 3765 }, { "epoch": 0.004094341446355982, "grad_norm": 33.5548210144043, "learning_rate": 4.999803972968678e-06, "loss": 2.4229, "step": 3770 }, { "epoch": 0.004099771607425419, "grad_norm": 18.690467834472656, "learning_rate": 4.9998034384782305e-06, "loss": 1.7872, "step": 3775 }, { "epoch": 0.004105201768494857, "grad_norm": 36.09320068359375, "learning_rate": 4.999802903260129e-06, "loss": 1.7279, "step": 3780 }, { "epoch": 0.0041106319295642945, "grad_norm": 21.36130142211914, "learning_rate": 4.999802367314372e-06, "loss": 1.673, "step": 3785 }, { "epoch": 0.0041160620906337325, "grad_norm": 37.789615631103516, "learning_rate": 4.999801830640961e-06, "loss": 1.4627, "step": 3790 }, { "epoch": 0.00412149225170317, "grad_norm": 38.39377212524414, "learning_rate": 4.999801293239895e-06, "loss": 1.346, "step": 3795 }, { "epoch": 0.004126922412772608, "grad_norm": 26.421764373779297, "learning_rate": 4.999800755111176e-06, "loss": 2.0093, "step": 3800 }, { "epoch": 0.004132352573842046, "grad_norm": 40.08497619628906, "learning_rate": 4.999800216254803e-06, "loss": 1.9014, "step": 3805 }, { "epoch": 0.004137782734911483, "grad_norm": 18.044145584106445, "learning_rate": 4.999799676670775e-06, "loss": 2.0616, "step": 3810 }, { "epoch": 0.004143212895980921, "grad_norm": 27.08595848083496, "learning_rate": 4.999799136359094e-06, "loss": 1.8848, "step": 3815 }, { "epoch": 0.004148643057050358, "grad_norm": 38.97542190551758, "learning_rate": 4.99979859531976e-06, "loss": 1.8552, "step": 3820 }, { "epoch": 0.004154073218119796, "grad_norm": 21.618608474731445, "learning_rate": 4.9997980535527724e-06, "loss": 1.4958, "step": 3825 }, { "epoch": 0.004159503379189233, "grad_norm": 26.63603973388672, "learning_rate": 4.999797511058131e-06, "loss": 1.434, "step": 3830 }, { "epoch": 0.004164933540258671, "grad_norm": 26.964723587036133, "learning_rate": 4.999796967835837e-06, "loss": 2.119, "step": 3835 }, { "epoch": 0.004170363701328109, "grad_norm": 204.12608337402344, "learning_rate": 4.99979642388589e-06, "loss": 2.7151, "step": 3840 }, { "epoch": 0.004175793862397546, "grad_norm": 19.234338760375977, "learning_rate": 4.9997958792082904e-06, "loss": 1.5176, "step": 3845 }, { "epoch": 0.004181224023466984, "grad_norm": 38.292701721191406, "learning_rate": 4.999795333803039e-06, "loss": 1.694, "step": 3850 }, { "epoch": 0.0041866541845364215, "grad_norm": 47.24244689941406, "learning_rate": 4.999794787670134e-06, "loss": 1.521, "step": 3855 }, { "epoch": 0.0041920843456058595, "grad_norm": 17.876163482666016, "learning_rate": 4.999794240809577e-06, "loss": 1.6908, "step": 3860 }, { "epoch": 0.004197514506675297, "grad_norm": 18.77605438232422, "learning_rate": 4.9997936932213685e-06, "loss": 1.9959, "step": 3865 }, { "epoch": 0.004202944667744735, "grad_norm": 131.33444213867188, "learning_rate": 4.999793144905508e-06, "loss": 1.8055, "step": 3870 }, { "epoch": 0.004208374828814173, "grad_norm": 34.91744613647461, "learning_rate": 4.999792595861995e-06, "loss": 1.3742, "step": 3875 }, { "epoch": 0.00421380498988361, "grad_norm": 132.6812286376953, "learning_rate": 4.999792046090831e-06, "loss": 0.8999, "step": 3880 }, { "epoch": 0.004219235150953048, "grad_norm": 18.15677833557129, "learning_rate": 4.999791495592016e-06, "loss": 1.9931, "step": 3885 }, { "epoch": 0.004224665312022485, "grad_norm": 29.922033309936523, "learning_rate": 4.999790944365549e-06, "loss": 1.6814, "step": 3890 }, { "epoch": 0.004230095473091923, "grad_norm": 139.40650939941406, "learning_rate": 4.999790392411432e-06, "loss": 1.8136, "step": 3895 }, { "epoch": 0.00423552563416136, "grad_norm": 49.51626968383789, "learning_rate": 4.999789839729663e-06, "loss": 1.4564, "step": 3900 }, { "epoch": 0.004240955795230798, "grad_norm": 25.279361724853516, "learning_rate": 4.999789286320243e-06, "loss": 1.7533, "step": 3905 }, { "epoch": 0.004246385956300236, "grad_norm": 27.072219848632812, "learning_rate": 4.999788732183173e-06, "loss": 1.8279, "step": 3910 }, { "epoch": 0.004251816117369673, "grad_norm": 25.49057960510254, "learning_rate": 4.999788177318452e-06, "loss": 1.6408, "step": 3915 }, { "epoch": 0.004257246278439111, "grad_norm": 35.656593322753906, "learning_rate": 4.999787621726081e-06, "loss": 1.7324, "step": 3920 }, { "epoch": 0.0042626764395085484, "grad_norm": 21.441741943359375, "learning_rate": 4.99978706540606e-06, "loss": 1.242, "step": 3925 }, { "epoch": 0.0042681066005779865, "grad_norm": 26.47601890563965, "learning_rate": 4.999786508358389e-06, "loss": 1.8206, "step": 3930 }, { "epoch": 0.004273536761647424, "grad_norm": 25.763626098632812, "learning_rate": 4.999785950583068e-06, "loss": 2.4151, "step": 3935 }, { "epoch": 0.004278966922716862, "grad_norm": 19.95008659362793, "learning_rate": 4.9997853920800975e-06, "loss": 1.4263, "step": 3940 }, { "epoch": 0.0042843970837863, "grad_norm": 29.982303619384766, "learning_rate": 4.9997848328494775e-06, "loss": 2.3012, "step": 3945 }, { "epoch": 0.004289827244855737, "grad_norm": 23.364641189575195, "learning_rate": 4.999784272891208e-06, "loss": 1.608, "step": 3950 }, { "epoch": 0.004295257405925175, "grad_norm": 30.02018928527832, "learning_rate": 4.99978371220529e-06, "loss": 2.3979, "step": 3955 }, { "epoch": 0.004300687566994612, "grad_norm": 21.722848892211914, "learning_rate": 4.999783150791722e-06, "loss": 1.9166, "step": 3960 }, { "epoch": 0.00430611772806405, "grad_norm": 24.236392974853516, "learning_rate": 4.999782588650506e-06, "loss": 2.5619, "step": 3965 }, { "epoch": 0.004311547889133487, "grad_norm": 28.661348342895508, "learning_rate": 4.999782025781641e-06, "loss": 1.5538, "step": 3970 }, { "epoch": 0.004316978050202925, "grad_norm": 27.234350204467773, "learning_rate": 4.999781462185127e-06, "loss": 1.924, "step": 3975 }, { "epoch": 0.004322408211272363, "grad_norm": 28.538524627685547, "learning_rate": 4.999780897860965e-06, "loss": 1.4341, "step": 3980 }, { "epoch": 0.0043278383723418, "grad_norm": 34.05399703979492, "learning_rate": 4.999780332809155e-06, "loss": 1.788, "step": 3985 }, { "epoch": 0.004333268533411238, "grad_norm": 21.953807830810547, "learning_rate": 4.999779767029697e-06, "loss": 1.6732, "step": 3990 }, { "epoch": 0.004338698694480675, "grad_norm": 33.479705810546875, "learning_rate": 4.999779200522591e-06, "loss": 1.3944, "step": 3995 }, { "epoch": 0.0043441288555501134, "grad_norm": 22.20330238342285, "learning_rate": 4.999778633287837e-06, "loss": 2.3559, "step": 4000 }, { "epoch": 0.004349559016619551, "grad_norm": 98.79212951660156, "learning_rate": 4.9997780653254355e-06, "loss": 1.164, "step": 4005 }, { "epoch": 0.004354989177688989, "grad_norm": 22.032493591308594, "learning_rate": 4.999777496635387e-06, "loss": 1.5785, "step": 4010 }, { "epoch": 0.004360419338758427, "grad_norm": 55.8387565612793, "learning_rate": 4.999776927217691e-06, "loss": 1.7398, "step": 4015 }, { "epoch": 0.004365849499827864, "grad_norm": 31.456995010375977, "learning_rate": 4.999776357072348e-06, "loss": 1.9083, "step": 4020 }, { "epoch": 0.004371279660897302, "grad_norm": 32.39219284057617, "learning_rate": 4.999775786199359e-06, "loss": 1.5433, "step": 4025 }, { "epoch": 0.004376709821966739, "grad_norm": 24.553180694580078, "learning_rate": 4.999775214598722e-06, "loss": 2.1468, "step": 4030 }, { "epoch": 0.004382139983036177, "grad_norm": 38.06990432739258, "learning_rate": 4.99977464227044e-06, "loss": 1.6266, "step": 4035 }, { "epoch": 0.004387570144105614, "grad_norm": 26.416147232055664, "learning_rate": 4.99977406921451e-06, "loss": 1.8288, "step": 4040 }, { "epoch": 0.004393000305175052, "grad_norm": 19.085798263549805, "learning_rate": 4.999773495430935e-06, "loss": 2.0355, "step": 4045 }, { "epoch": 0.00439843046624449, "grad_norm": 49.82600021362305, "learning_rate": 4.9997729209197135e-06, "loss": 1.8866, "step": 4050 }, { "epoch": 0.004403860627313927, "grad_norm": 23.14927101135254, "learning_rate": 4.999772345680846e-06, "loss": 1.3662, "step": 4055 }, { "epoch": 0.004409290788383365, "grad_norm": 37.01432800292969, "learning_rate": 4.999771769714333e-06, "loss": 1.8571, "step": 4060 }, { "epoch": 0.004414720949452802, "grad_norm": 46.7080192565918, "learning_rate": 4.999771193020175e-06, "loss": 1.7654, "step": 4065 }, { "epoch": 0.00442015111052224, "grad_norm": 27.086233139038086, "learning_rate": 4.99977061559837e-06, "loss": 1.7642, "step": 4070 }, { "epoch": 0.004425581271591678, "grad_norm": 277.5721435546875, "learning_rate": 4.999770037448921e-06, "loss": 1.8295, "step": 4075 }, { "epoch": 0.004431011432661116, "grad_norm": 22.821870803833008, "learning_rate": 4.999769458571827e-06, "loss": 2.3557, "step": 4080 }, { "epoch": 0.004436441593730554, "grad_norm": 17.907442092895508, "learning_rate": 4.999768878967088e-06, "loss": 1.6507, "step": 4085 }, { "epoch": 0.004441871754799991, "grad_norm": 30.61693572998047, "learning_rate": 4.999768298634704e-06, "loss": 1.8075, "step": 4090 }, { "epoch": 0.004447301915869429, "grad_norm": 20.992740631103516, "learning_rate": 4.999767717574676e-06, "loss": 2.0449, "step": 4095 }, { "epoch": 0.004452732076938866, "grad_norm": 74.41835021972656, "learning_rate": 4.999767135787004e-06, "loss": 1.9513, "step": 4100 }, { "epoch": 0.004458162238008304, "grad_norm": 45.77877426147461, "learning_rate": 4.999766553271687e-06, "loss": 1.4259, "step": 4105 }, { "epoch": 0.004463592399077741, "grad_norm": 18.328489303588867, "learning_rate": 4.999765970028726e-06, "loss": 1.1327, "step": 4110 }, { "epoch": 0.004469022560147179, "grad_norm": 55.25654602050781, "learning_rate": 4.999765386058121e-06, "loss": 1.8757, "step": 4115 }, { "epoch": 0.004474452721216617, "grad_norm": 20.614349365234375, "learning_rate": 4.9997648013598735e-06, "loss": 1.5996, "step": 4120 }, { "epoch": 0.004479882882286054, "grad_norm": 21.922130584716797, "learning_rate": 4.999764215933981e-06, "loss": 1.8241, "step": 4125 }, { "epoch": 0.004485313043355492, "grad_norm": 24.2725830078125, "learning_rate": 4.999763629780446e-06, "loss": 1.9038, "step": 4130 }, { "epoch": 0.004490743204424929, "grad_norm": 95.07975006103516, "learning_rate": 4.999763042899268e-06, "loss": 2.6974, "step": 4135 }, { "epoch": 0.004496173365494367, "grad_norm": 176.9788055419922, "learning_rate": 4.999762455290447e-06, "loss": 2.0456, "step": 4140 }, { "epoch": 0.004501603526563805, "grad_norm": 17.36978530883789, "learning_rate": 4.999761866953983e-06, "loss": 1.6279, "step": 4145 }, { "epoch": 0.004507033687633243, "grad_norm": 25.533615112304688, "learning_rate": 4.999761277889876e-06, "loss": 1.9709, "step": 4150 }, { "epoch": 0.004512463848702681, "grad_norm": 14.804618835449219, "learning_rate": 4.999760688098127e-06, "loss": 2.1932, "step": 4155 }, { "epoch": 0.004517894009772118, "grad_norm": 42.66942596435547, "learning_rate": 4.999760097578736e-06, "loss": 2.4263, "step": 4160 }, { "epoch": 0.004523324170841556, "grad_norm": 34.15489196777344, "learning_rate": 4.999759506331702e-06, "loss": 1.5647, "step": 4165 }, { "epoch": 0.004528754331910993, "grad_norm": 27.552165985107422, "learning_rate": 4.999758914357026e-06, "loss": 1.9413, "step": 4170 }, { "epoch": 0.004534184492980431, "grad_norm": 21.480472564697266, "learning_rate": 4.999758321654709e-06, "loss": 1.4965, "step": 4175 }, { "epoch": 0.004539614654049868, "grad_norm": 18.5081844329834, "learning_rate": 4.999757728224749e-06, "loss": 2.3847, "step": 4180 }, { "epoch": 0.004545044815119306, "grad_norm": 25.345861434936523, "learning_rate": 4.999757134067149e-06, "loss": 1.9156, "step": 4185 }, { "epoch": 0.004550474976188744, "grad_norm": 20.065967559814453, "learning_rate": 4.999756539181907e-06, "loss": 1.6086, "step": 4190 }, { "epoch": 0.004555905137258181, "grad_norm": 38.94892501831055, "learning_rate": 4.999755943569025e-06, "loss": 1.6229, "step": 4195 }, { "epoch": 0.004561335298327619, "grad_norm": 42.25081253051758, "learning_rate": 4.9997553472285006e-06, "loss": 0.8147, "step": 4200 }, { "epoch": 0.004566765459397056, "grad_norm": 51.22443389892578, "learning_rate": 4.999754750160336e-06, "loss": 1.5572, "step": 4205 }, { "epoch": 0.004572195620466494, "grad_norm": 25.48631477355957, "learning_rate": 4.999754152364531e-06, "loss": 1.8986, "step": 4210 }, { "epoch": 0.0045776257815359316, "grad_norm": 16.505313873291016, "learning_rate": 4.9997535538410854e-06, "loss": 2.1041, "step": 4215 }, { "epoch": 0.00458305594260537, "grad_norm": 19.91929054260254, "learning_rate": 4.99975295459e-06, "loss": 1.7591, "step": 4220 }, { "epoch": 0.004588486103674808, "grad_norm": 15.821951866149902, "learning_rate": 4.999752354611275e-06, "loss": 0.9566, "step": 4225 }, { "epoch": 0.004593916264744245, "grad_norm": 22.23419761657715, "learning_rate": 4.999751753904909e-06, "loss": 1.5998, "step": 4230 }, { "epoch": 0.004599346425813683, "grad_norm": 37.01189422607422, "learning_rate": 4.9997511524709035e-06, "loss": 1.6665, "step": 4235 }, { "epoch": 0.00460477658688312, "grad_norm": 20.424470901489258, "learning_rate": 4.999750550309259e-06, "loss": 1.5844, "step": 4240 }, { "epoch": 0.004610206747952558, "grad_norm": 23.547033309936523, "learning_rate": 4.999749947419975e-06, "loss": 1.5607, "step": 4245 }, { "epoch": 0.004615636909021995, "grad_norm": 17.523618698120117, "learning_rate": 4.9997493438030505e-06, "loss": 1.0621, "step": 4250 }, { "epoch": 0.004621067070091433, "grad_norm": 23.064414978027344, "learning_rate": 4.999748739458488e-06, "loss": 1.8468, "step": 4255 }, { "epoch": 0.004626497231160871, "grad_norm": 17.253957748413086, "learning_rate": 4.999748134386288e-06, "loss": 1.7526, "step": 4260 }, { "epoch": 0.004631927392230308, "grad_norm": 15.474635124206543, "learning_rate": 4.999747528586448e-06, "loss": 1.8407, "step": 4265 }, { "epoch": 0.004637357553299746, "grad_norm": 25.431365966796875, "learning_rate": 4.999746922058969e-06, "loss": 1.8538, "step": 4270 }, { "epoch": 0.004642787714369183, "grad_norm": 172.71487426757812, "learning_rate": 4.999746314803853e-06, "loss": 1.1687, "step": 4275 }, { "epoch": 0.004648217875438621, "grad_norm": 20.020139694213867, "learning_rate": 4.999745706821099e-06, "loss": 1.658, "step": 4280 }, { "epoch": 0.0046536480365080585, "grad_norm": 38.354251861572266, "learning_rate": 4.999745098110705e-06, "loss": 1.9611, "step": 4285 }, { "epoch": 0.0046590781975774966, "grad_norm": 57.797576904296875, "learning_rate": 4.999744488672676e-06, "loss": 2.294, "step": 4290 }, { "epoch": 0.004664508358646935, "grad_norm": 45.13158416748047, "learning_rate": 4.9997438785070076e-06, "loss": 1.5096, "step": 4295 }, { "epoch": 0.004669938519716372, "grad_norm": 54.64434814453125, "learning_rate": 4.999743267613702e-06, "loss": 2.0173, "step": 4300 }, { "epoch": 0.00467536868078581, "grad_norm": 50.79085922241211, "learning_rate": 4.999742655992759e-06, "loss": 1.7671, "step": 4305 }, { "epoch": 0.004680798841855247, "grad_norm": 21.504562377929688, "learning_rate": 4.999742043644179e-06, "loss": 1.6754, "step": 4310 }, { "epoch": 0.004686229002924685, "grad_norm": 31.390865325927734, "learning_rate": 4.999741430567962e-06, "loss": 1.1285, "step": 4315 }, { "epoch": 0.004691659163994122, "grad_norm": 17.281015396118164, "learning_rate": 4.99974081676411e-06, "loss": 1.5523, "step": 4320 }, { "epoch": 0.00469708932506356, "grad_norm": 32.06979751586914, "learning_rate": 4.9997402022326195e-06, "loss": 2.3047, "step": 4325 }, { "epoch": 0.004702519486132998, "grad_norm": 17.321500778198242, "learning_rate": 4.999739586973493e-06, "loss": 1.4906, "step": 4330 }, { "epoch": 0.004707949647202435, "grad_norm": 42.13648223876953, "learning_rate": 4.999738970986732e-06, "loss": 2.0275, "step": 4335 }, { "epoch": 0.004713379808271873, "grad_norm": 19.812976837158203, "learning_rate": 4.9997383542723336e-06, "loss": 1.5575, "step": 4340 }, { "epoch": 0.00471880996934131, "grad_norm": 16.108808517456055, "learning_rate": 4.999737736830299e-06, "loss": 2.0331, "step": 4345 }, { "epoch": 0.004724240130410748, "grad_norm": 16.288692474365234, "learning_rate": 4.999737118660629e-06, "loss": 2.02, "step": 4350 }, { "epoch": 0.0047296702914801855, "grad_norm": 23.988309860229492, "learning_rate": 4.999736499763324e-06, "loss": 1.7238, "step": 4355 }, { "epoch": 0.0047351004525496235, "grad_norm": 17.052490234375, "learning_rate": 4.9997358801383835e-06, "loss": 1.6022, "step": 4360 }, { "epoch": 0.0047405306136190616, "grad_norm": 64.01812744140625, "learning_rate": 4.999735259785808e-06, "loss": 1.4948, "step": 4365 }, { "epoch": 0.004745960774688499, "grad_norm": 49.471763610839844, "learning_rate": 4.999734638705598e-06, "loss": 1.9436, "step": 4370 }, { "epoch": 0.004751390935757937, "grad_norm": 22.804981231689453, "learning_rate": 4.999734016897753e-06, "loss": 1.5359, "step": 4375 }, { "epoch": 0.004756821096827374, "grad_norm": 20.712995529174805, "learning_rate": 4.999733394362273e-06, "loss": 1.7558, "step": 4380 }, { "epoch": 0.004762251257896812, "grad_norm": 25.288484573364258, "learning_rate": 4.99973277109916e-06, "loss": 2.2518, "step": 4385 }, { "epoch": 0.004767681418966249, "grad_norm": 23.701669692993164, "learning_rate": 4.999732147108411e-06, "loss": 1.7298, "step": 4390 }, { "epoch": 0.004773111580035687, "grad_norm": 14.544864654541016, "learning_rate": 4.9997315223900286e-06, "loss": 2.4584, "step": 4395 }, { "epoch": 0.004778541741105125, "grad_norm": 72.53294372558594, "learning_rate": 4.999730896944013e-06, "loss": 1.174, "step": 4400 }, { "epoch": 0.004783971902174562, "grad_norm": 22.69927978515625, "learning_rate": 4.999730270770363e-06, "loss": 1.5532, "step": 4405 }, { "epoch": 0.004789402063244, "grad_norm": 37.94588088989258, "learning_rate": 4.99972964386908e-06, "loss": 1.7475, "step": 4410 }, { "epoch": 0.004794832224313437, "grad_norm": 16.08177947998047, "learning_rate": 4.9997290162401635e-06, "loss": 1.8872, "step": 4415 }, { "epoch": 0.004800262385382875, "grad_norm": 35.98625946044922, "learning_rate": 4.999728387883614e-06, "loss": 1.3897, "step": 4420 }, { "epoch": 0.0048056925464523125, "grad_norm": 49.840606689453125, "learning_rate": 4.999727758799432e-06, "loss": 2.0669, "step": 4425 }, { "epoch": 0.0048111227075217505, "grad_norm": 29.001855850219727, "learning_rate": 4.9997271289876174e-06, "loss": 2.0503, "step": 4430 }, { "epoch": 0.0048165528685911885, "grad_norm": 278.2001953125, "learning_rate": 4.999726498448169e-06, "loss": 1.9848, "step": 4435 }, { "epoch": 0.004821983029660626, "grad_norm": 37.90883255004883, "learning_rate": 4.999725867181089e-06, "loss": 0.85, "step": 4440 }, { "epoch": 0.004827413190730064, "grad_norm": 17.08104705810547, "learning_rate": 4.999725235186378e-06, "loss": 1.1279, "step": 4445 }, { "epoch": 0.004832843351799501, "grad_norm": 74.6081771850586, "learning_rate": 4.999724602464033e-06, "loss": 1.6868, "step": 4450 }, { "epoch": 0.004838273512868939, "grad_norm": 16.775508880615234, "learning_rate": 4.999723969014057e-06, "loss": 1.9755, "step": 4455 }, { "epoch": 0.004843703673938376, "grad_norm": 16.627090454101562, "learning_rate": 4.99972333483645e-06, "loss": 1.6903, "step": 4460 }, { "epoch": 0.004849133835007814, "grad_norm": 14.103394508361816, "learning_rate": 4.999722699931211e-06, "loss": 1.6718, "step": 4465 }, { "epoch": 0.004854563996077252, "grad_norm": 21.59641456604004, "learning_rate": 4.99972206429834e-06, "loss": 1.3331, "step": 4470 }, { "epoch": 0.004859994157146689, "grad_norm": 127.47725677490234, "learning_rate": 4.99972142793784e-06, "loss": 2.3194, "step": 4475 }, { "epoch": 0.004865424318216127, "grad_norm": 30.577579498291016, "learning_rate": 4.999720790849708e-06, "loss": 1.8474, "step": 4480 }, { "epoch": 0.004870854479285564, "grad_norm": 13.361328125, "learning_rate": 4.999720153033944e-06, "loss": 1.7648, "step": 4485 }, { "epoch": 0.004876284640355002, "grad_norm": 16.280019760131836, "learning_rate": 4.9997195144905515e-06, "loss": 1.2365, "step": 4490 }, { "epoch": 0.0048817148014244395, "grad_norm": 22.97209358215332, "learning_rate": 4.999718875219528e-06, "loss": 2.1418, "step": 4495 }, { "epoch": 0.0048871449624938775, "grad_norm": 26.84596061706543, "learning_rate": 4.999718235220874e-06, "loss": 1.6117, "step": 4500 }, { "epoch": 0.0048925751235633155, "grad_norm": 44.8737678527832, "learning_rate": 4.99971759449459e-06, "loss": 2.0858, "step": 4505 }, { "epoch": 0.004898005284632753, "grad_norm": 19.636995315551758, "learning_rate": 4.999716953040677e-06, "loss": 1.7424, "step": 4510 }, { "epoch": 0.004903435445702191, "grad_norm": 37.59233474731445, "learning_rate": 4.999716310859134e-06, "loss": 2.4267, "step": 4515 }, { "epoch": 0.004908865606771628, "grad_norm": 20.441091537475586, "learning_rate": 4.999715667949962e-06, "loss": 1.7553, "step": 4520 }, { "epoch": 0.004914295767841066, "grad_norm": 36.6260871887207, "learning_rate": 4.99971502431316e-06, "loss": 2.4067, "step": 4525 }, { "epoch": 0.004919725928910503, "grad_norm": 21.117691040039062, "learning_rate": 4.999714379948729e-06, "loss": 1.5839, "step": 4530 }, { "epoch": 0.004925156089979941, "grad_norm": 35.32769775390625, "learning_rate": 4.99971373485667e-06, "loss": 1.5226, "step": 4535 }, { "epoch": 0.004930586251049379, "grad_norm": 27.652795791625977, "learning_rate": 4.9997130890369816e-06, "loss": 1.9064, "step": 4540 }, { "epoch": 0.004936016412118816, "grad_norm": 17.082698822021484, "learning_rate": 4.9997124424896644e-06, "loss": 1.9506, "step": 4545 }, { "epoch": 0.004941446573188254, "grad_norm": 205.3570098876953, "learning_rate": 4.99971179521472e-06, "loss": 2.7213, "step": 4550 }, { "epoch": 0.004946876734257691, "grad_norm": 26.214683532714844, "learning_rate": 4.9997111472121475e-06, "loss": 1.6594, "step": 4555 }, { "epoch": 0.004952306895327129, "grad_norm": 16.37002944946289, "learning_rate": 4.999710498481947e-06, "loss": 1.4557, "step": 4560 }, { "epoch": 0.0049577370563965665, "grad_norm": 21.61284065246582, "learning_rate": 4.999709849024118e-06, "loss": 1.6022, "step": 4565 }, { "epoch": 0.0049631672174660045, "grad_norm": 33.77655029296875, "learning_rate": 4.999709198838663e-06, "loss": 1.9536, "step": 4570 }, { "epoch": 0.0049685973785354425, "grad_norm": 20.42025375366211, "learning_rate": 4.9997085479255795e-06, "loss": 2.0031, "step": 4575 }, { "epoch": 0.00497402753960488, "grad_norm": 25.92229461669922, "learning_rate": 4.999707896284869e-06, "loss": 1.9784, "step": 4580 }, { "epoch": 0.004979457700674318, "grad_norm": 24.21070098876953, "learning_rate": 4.999707243916532e-06, "loss": 1.6938, "step": 4585 }, { "epoch": 0.004984887861743755, "grad_norm": 29.894079208374023, "learning_rate": 4.9997065908205676e-06, "loss": 1.922, "step": 4590 }, { "epoch": 0.004990318022813193, "grad_norm": 20.52463722229004, "learning_rate": 4.9997059369969765e-06, "loss": 1.2614, "step": 4595 }, { "epoch": 0.00499574818388263, "grad_norm": 50.648799896240234, "learning_rate": 4.99970528244576e-06, "loss": 2.0274, "step": 4600 }, { "epoch": 0.005001178344952068, "grad_norm": 23.788875579833984, "learning_rate": 4.999704627166917e-06, "loss": 1.2743, "step": 4605 }, { "epoch": 0.005006608506021506, "grad_norm": 24.888708114624023, "learning_rate": 4.999703971160448e-06, "loss": 1.9645, "step": 4610 }, { "epoch": 0.005012038667090943, "grad_norm": 27.118000030517578, "learning_rate": 4.999703314426353e-06, "loss": 1.845, "step": 4615 }, { "epoch": 0.005017468828160381, "grad_norm": 53.124481201171875, "learning_rate": 4.9997026569646325e-06, "loss": 2.131, "step": 4620 }, { "epoch": 0.005022898989229818, "grad_norm": 32.018070220947266, "learning_rate": 4.999701998775286e-06, "loss": 1.2113, "step": 4625 }, { "epoch": 0.005028329150299256, "grad_norm": 22.29557228088379, "learning_rate": 4.999701339858315e-06, "loss": 2.0563, "step": 4630 }, { "epoch": 0.0050337593113686935, "grad_norm": 44.37444305419922, "learning_rate": 4.999700680213719e-06, "loss": 2.0754, "step": 4635 }, { "epoch": 0.0050391894724381315, "grad_norm": 16.36578941345215, "learning_rate": 4.999700019841498e-06, "loss": 2.6225, "step": 4640 }, { "epoch": 0.0050446196335075695, "grad_norm": 32.637229919433594, "learning_rate": 4.999699358741652e-06, "loss": 1.6873, "step": 4645 }, { "epoch": 0.005050049794577007, "grad_norm": 21.261987686157227, "learning_rate": 4.9996986969141824e-06, "loss": 1.4512, "step": 4650 }, { "epoch": 0.005055479955646445, "grad_norm": 26.72662925720215, "learning_rate": 4.999698034359088e-06, "loss": 1.3984, "step": 4655 }, { "epoch": 0.005060910116715882, "grad_norm": 15.990920066833496, "learning_rate": 4.99969737107637e-06, "loss": 1.8939, "step": 4660 }, { "epoch": 0.00506634027778532, "grad_norm": 17.405567169189453, "learning_rate": 4.999696707066028e-06, "loss": 1.659, "step": 4665 }, { "epoch": 0.005071770438854757, "grad_norm": 12.970836639404297, "learning_rate": 4.999696042328061e-06, "loss": 1.4175, "step": 4670 }, { "epoch": 0.005077200599924195, "grad_norm": 19.261566162109375, "learning_rate": 4.999695376862471e-06, "loss": 1.7401, "step": 4675 }, { "epoch": 0.005082630760993633, "grad_norm": 59.69552993774414, "learning_rate": 4.999694710669259e-06, "loss": 1.8743, "step": 4680 }, { "epoch": 0.00508806092206307, "grad_norm": 18.418954849243164, "learning_rate": 4.999694043748423e-06, "loss": 1.4204, "step": 4685 }, { "epoch": 0.005093491083132508, "grad_norm": 35.27416229248047, "learning_rate": 4.9996933760999646e-06, "loss": 2.066, "step": 4690 }, { "epoch": 0.005098921244201945, "grad_norm": 75.73809051513672, "learning_rate": 4.9996927077238825e-06, "loss": 1.6641, "step": 4695 }, { "epoch": 0.005104351405271383, "grad_norm": 22.639095306396484, "learning_rate": 4.999692038620179e-06, "loss": 1.5356, "step": 4700 }, { "epoch": 0.0051097815663408205, "grad_norm": 28.05038833618164, "learning_rate": 4.9996913687888526e-06, "loss": 2.2873, "step": 4705 }, { "epoch": 0.0051152117274102585, "grad_norm": 20.101383209228516, "learning_rate": 4.999690698229904e-06, "loss": 2.1318, "step": 4710 }, { "epoch": 0.0051206418884796965, "grad_norm": 125.32181549072266, "learning_rate": 4.999690026943334e-06, "loss": 1.4032, "step": 4715 }, { "epoch": 0.005126072049549134, "grad_norm": 37.49420928955078, "learning_rate": 4.999689354929142e-06, "loss": 1.8085, "step": 4720 }, { "epoch": 0.005131502210618572, "grad_norm": 25.748056411743164, "learning_rate": 4.999688682187328e-06, "loss": 2.2211, "step": 4725 }, { "epoch": 0.005136932371688009, "grad_norm": 21.98324966430664, "learning_rate": 4.999688008717893e-06, "loss": 1.6787, "step": 4730 }, { "epoch": 0.005142362532757447, "grad_norm": 17.084766387939453, "learning_rate": 4.999687334520836e-06, "loss": 1.7208, "step": 4735 }, { "epoch": 0.005147792693826884, "grad_norm": 18.00237464904785, "learning_rate": 4.999686659596159e-06, "loss": 1.9036, "step": 4740 }, { "epoch": 0.005153222854896322, "grad_norm": 34.239070892333984, "learning_rate": 4.999685983943862e-06, "loss": 2.7179, "step": 4745 }, { "epoch": 0.00515865301596576, "grad_norm": 19.180025100708008, "learning_rate": 4.9996853075639434e-06, "loss": 1.8275, "step": 4750 }, { "epoch": 0.005164083177035197, "grad_norm": 22.276296615600586, "learning_rate": 4.999684630456404e-06, "loss": 1.4989, "step": 4755 }, { "epoch": 0.005169513338104635, "grad_norm": 19.363197326660156, "learning_rate": 4.999683952621246e-06, "loss": 2.1967, "step": 4760 }, { "epoch": 0.005174943499174072, "grad_norm": 19.598718643188477, "learning_rate": 4.999683274058466e-06, "loss": 1.6025, "step": 4765 }, { "epoch": 0.00518037366024351, "grad_norm": 26.596248626708984, "learning_rate": 4.999682594768068e-06, "loss": 1.8413, "step": 4770 }, { "epoch": 0.0051858038213129475, "grad_norm": 21.489049911499023, "learning_rate": 4.999681914750049e-06, "loss": 1.7604, "step": 4775 }, { "epoch": 0.0051912339823823855, "grad_norm": 17.431522369384766, "learning_rate": 4.999681234004412e-06, "loss": 1.7261, "step": 4780 }, { "epoch": 0.0051966641434518235, "grad_norm": 24.8800106048584, "learning_rate": 4.999680552531155e-06, "loss": 1.7136, "step": 4785 }, { "epoch": 0.005202094304521261, "grad_norm": 22.14327049255371, "learning_rate": 4.99967987033028e-06, "loss": 1.1265, "step": 4790 }, { "epoch": 0.005207524465590699, "grad_norm": 21.169355392456055, "learning_rate": 4.999679187401786e-06, "loss": 1.5649, "step": 4795 }, { "epoch": 0.005212954626660136, "grad_norm": 21.720632553100586, "learning_rate": 4.999678503745672e-06, "loss": 1.743, "step": 4800 }, { "epoch": 0.005218384787729574, "grad_norm": 22.337194442749023, "learning_rate": 4.999677819361941e-06, "loss": 1.7047, "step": 4805 }, { "epoch": 0.005223814948799011, "grad_norm": 20.97328758239746, "learning_rate": 4.999677134250591e-06, "loss": 1.9097, "step": 4810 }, { "epoch": 0.005229245109868449, "grad_norm": 56.15665817260742, "learning_rate": 4.999676448411623e-06, "loss": 1.7994, "step": 4815 }, { "epoch": 0.005234675270937887, "grad_norm": 61.42665481567383, "learning_rate": 4.9996757618450376e-06, "loss": 1.9237, "step": 4820 }, { "epoch": 0.005240105432007324, "grad_norm": 71.12615203857422, "learning_rate": 4.999675074550835e-06, "loss": 1.6167, "step": 4825 }, { "epoch": 0.005245535593076762, "grad_norm": 27.565258026123047, "learning_rate": 4.999674386529014e-06, "loss": 1.6718, "step": 4830 }, { "epoch": 0.005250965754146199, "grad_norm": 51.94449234008789, "learning_rate": 4.9996736977795765e-06, "loss": 1.5015, "step": 4835 }, { "epoch": 0.005256395915215637, "grad_norm": 60.66372299194336, "learning_rate": 4.999673008302522e-06, "loss": 1.9267, "step": 4840 }, { "epoch": 0.0052618260762850744, "grad_norm": 17.500286102294922, "learning_rate": 4.999672318097851e-06, "loss": 1.9177, "step": 4845 }, { "epoch": 0.0052672562373545125, "grad_norm": 19.51017951965332, "learning_rate": 4.999671627165563e-06, "loss": 1.1525, "step": 4850 }, { "epoch": 0.0052726863984239505, "grad_norm": 33.62522888183594, "learning_rate": 4.999670935505659e-06, "loss": 2.2874, "step": 4855 }, { "epoch": 0.005278116559493388, "grad_norm": 441.5694274902344, "learning_rate": 4.9996702431181386e-06, "loss": 1.9843, "step": 4860 }, { "epoch": 0.005283546720562826, "grad_norm": 30.55267906188965, "learning_rate": 4.9996695500030015e-06, "loss": 1.5206, "step": 4865 }, { "epoch": 0.005288976881632263, "grad_norm": 57.731231689453125, "learning_rate": 4.99966885616025e-06, "loss": 1.5419, "step": 4870 }, { "epoch": 0.005294407042701701, "grad_norm": 22.69487762451172, "learning_rate": 4.999668161589882e-06, "loss": 1.8394, "step": 4875 }, { "epoch": 0.005299837203771138, "grad_norm": 14.523727416992188, "learning_rate": 4.999667466291899e-06, "loss": 1.8172, "step": 4880 }, { "epoch": 0.005305267364840576, "grad_norm": 33.3651123046875, "learning_rate": 4.9996667702663005e-06, "loss": 2.0074, "step": 4885 }, { "epoch": 0.005310697525910014, "grad_norm": 23.77447509765625, "learning_rate": 4.9996660735130875e-06, "loss": 1.5289, "step": 4890 }, { "epoch": 0.005316127686979451, "grad_norm": 192.78375244140625, "learning_rate": 4.999665376032259e-06, "loss": 2.2148, "step": 4895 }, { "epoch": 0.005321557848048889, "grad_norm": 22.635337829589844, "learning_rate": 4.999664677823817e-06, "loss": 1.8075, "step": 4900 }, { "epoch": 0.005326988009118326, "grad_norm": 20.11060333251953, "learning_rate": 4.999663978887759e-06, "loss": 2.1737, "step": 4905 }, { "epoch": 0.005332418170187764, "grad_norm": 20.32866668701172, "learning_rate": 4.9996632792240886e-06, "loss": 1.0807, "step": 4910 }, { "epoch": 0.005337848331257201, "grad_norm": 30.49724578857422, "learning_rate": 4.999662578832804e-06, "loss": 1.5334, "step": 4915 }, { "epoch": 0.0053432784923266394, "grad_norm": 29.018848419189453, "learning_rate": 4.999661877713904e-06, "loss": 2.1873, "step": 4920 }, { "epoch": 0.0053487086533960775, "grad_norm": 53.38876724243164, "learning_rate": 4.999661175867392e-06, "loss": 1.5538, "step": 4925 }, { "epoch": 0.005354138814465515, "grad_norm": 49.9700813293457, "learning_rate": 4.999660473293266e-06, "loss": 1.9419, "step": 4930 }, { "epoch": 0.005359568975534953, "grad_norm": 17.268016815185547, "learning_rate": 4.999659769991527e-06, "loss": 1.4056, "step": 4935 }, { "epoch": 0.00536499913660439, "grad_norm": 16.556962966918945, "learning_rate": 4.999659065962176e-06, "loss": 2.0934, "step": 4940 }, { "epoch": 0.005370429297673828, "grad_norm": 174.4562530517578, "learning_rate": 4.999658361205211e-06, "loss": 2.042, "step": 4945 }, { "epoch": 0.005375859458743265, "grad_norm": 24.421916961669922, "learning_rate": 4.9996576557206344e-06, "loss": 2.8982, "step": 4950 }, { "epoch": 0.005381289619812703, "grad_norm": 20.59016227722168, "learning_rate": 4.999656949508445e-06, "loss": 1.8078, "step": 4955 }, { "epoch": 0.005386719780882141, "grad_norm": 36.114227294921875, "learning_rate": 4.999656242568644e-06, "loss": 1.8091, "step": 4960 }, { "epoch": 0.005392149941951578, "grad_norm": 28.61829376220703, "learning_rate": 4.99965553490123e-06, "loss": 1.6193, "step": 4965 }, { "epoch": 0.005397580103021016, "grad_norm": 19.966970443725586, "learning_rate": 4.999654826506205e-06, "loss": 2.0242, "step": 4970 }, { "epoch": 0.005403010264090453, "grad_norm": 29.692081451416016, "learning_rate": 4.999654117383569e-06, "loss": 1.4974, "step": 4975 }, { "epoch": 0.005408440425159891, "grad_norm": 22.056415557861328, "learning_rate": 4.999653407533321e-06, "loss": 1.4826, "step": 4980 }, { "epoch": 0.005413870586229328, "grad_norm": 66.81109619140625, "learning_rate": 4.999652696955462e-06, "loss": 2.2023, "step": 4985 }, { "epoch": 0.005419300747298766, "grad_norm": 20.28089714050293, "learning_rate": 4.999651985649992e-06, "loss": 1.1524, "step": 4990 }, { "epoch": 0.0054247309083682044, "grad_norm": 12.43229866027832, "learning_rate": 4.999651273616912e-06, "loss": 1.9316, "step": 4995 }, { "epoch": 0.005430161069437642, "grad_norm": 48.258663177490234, "learning_rate": 4.9996505608562215e-06, "loss": 1.9972, "step": 5000 }, { "epoch": 0.00543559123050708, "grad_norm": 41.02811050415039, "learning_rate": 4.99964984736792e-06, "loss": 1.675, "step": 5005 }, { "epoch": 0.005441021391576517, "grad_norm": 55.572261810302734, "learning_rate": 4.999649133152009e-06, "loss": 1.9857, "step": 5010 }, { "epoch": 0.005446451552645955, "grad_norm": 14.304669380187988, "learning_rate": 4.999648418208488e-06, "loss": 1.6192, "step": 5015 }, { "epoch": 0.005451881713715392, "grad_norm": 30.158506393432617, "learning_rate": 4.999647702537358e-06, "loss": 2.3979, "step": 5020 }, { "epoch": 0.00545731187478483, "grad_norm": 22.55425262451172, "learning_rate": 4.999646986138617e-06, "loss": 1.2372, "step": 5025 }, { "epoch": 0.005462742035854268, "grad_norm": 22.605337142944336, "learning_rate": 4.999646269012268e-06, "loss": 2.0878, "step": 5030 }, { "epoch": 0.005468172196923705, "grad_norm": 22.594636917114258, "learning_rate": 4.99964555115831e-06, "loss": 1.0002, "step": 5035 }, { "epoch": 0.005473602357993143, "grad_norm": 24.818784713745117, "learning_rate": 4.999644832576743e-06, "loss": 1.1722, "step": 5040 }, { "epoch": 0.00547903251906258, "grad_norm": 25.07884979248047, "learning_rate": 4.999644113267567e-06, "loss": 2.3877, "step": 5045 }, { "epoch": 0.005484462680132018, "grad_norm": 53.54447555541992, "learning_rate": 4.999643393230784e-06, "loss": 1.7181, "step": 5050 }, { "epoch": 0.005489892841201455, "grad_norm": 204.2445526123047, "learning_rate": 4.999642672466392e-06, "loss": 1.216, "step": 5055 }, { "epoch": 0.005495323002270893, "grad_norm": 25.86240577697754, "learning_rate": 4.9996419509743914e-06, "loss": 1.6493, "step": 5060 }, { "epoch": 0.005500753163340331, "grad_norm": 18.79839324951172, "learning_rate": 4.999641228754784e-06, "loss": 1.5332, "step": 5065 }, { "epoch": 0.005506183324409769, "grad_norm": 16.341054916381836, "learning_rate": 4.999640505807568e-06, "loss": 1.8186, "step": 5070 }, { "epoch": 0.005511613485479207, "grad_norm": 21.626359939575195, "learning_rate": 4.999639782132746e-06, "loss": 1.4845, "step": 5075 }, { "epoch": 0.005517043646548644, "grad_norm": 51.40708923339844, "learning_rate": 4.999639057730316e-06, "loss": 2.0806, "step": 5080 }, { "epoch": 0.005522473807618082, "grad_norm": 47.12931442260742, "learning_rate": 4.99963833260028e-06, "loss": 2.2025, "step": 5085 }, { "epoch": 0.005527903968687519, "grad_norm": 26.211687088012695, "learning_rate": 4.999637606742636e-06, "loss": 1.3442, "step": 5090 }, { "epoch": 0.005533334129756957, "grad_norm": 45.3824348449707, "learning_rate": 4.999636880157387e-06, "loss": 2.1084, "step": 5095 }, { "epoch": 0.005538764290826395, "grad_norm": 31.1392879486084, "learning_rate": 4.99963615284453e-06, "loss": 1.601, "step": 5100 }, { "epoch": 0.005544194451895832, "grad_norm": 116.12617492675781, "learning_rate": 4.999635424804068e-06, "loss": 2.5103, "step": 5105 }, { "epoch": 0.00554962461296527, "grad_norm": 30.404809951782227, "learning_rate": 4.999634696036e-06, "loss": 1.6435, "step": 5110 }, { "epoch": 0.005555054774034707, "grad_norm": 58.083805084228516, "learning_rate": 4.999633966540327e-06, "loss": 1.8711, "step": 5115 }, { "epoch": 0.005560484935104145, "grad_norm": 26.654043197631836, "learning_rate": 4.999633236317047e-06, "loss": 1.5364, "step": 5120 }, { "epoch": 0.005565915096173582, "grad_norm": 19.98863983154297, "learning_rate": 4.999632505366163e-06, "loss": 1.7119, "step": 5125 }, { "epoch": 0.00557134525724302, "grad_norm": 62.049774169921875, "learning_rate": 4.999631773687674e-06, "loss": 1.2378, "step": 5130 }, { "epoch": 0.0055767754183124576, "grad_norm": 21.323989868164062, "learning_rate": 4.99963104128158e-06, "loss": 1.8326, "step": 5135 }, { "epoch": 0.0055822055793818956, "grad_norm": 36.489295959472656, "learning_rate": 4.9996303081478816e-06, "loss": 1.6989, "step": 5140 }, { "epoch": 0.005587635740451334, "grad_norm": 34.178436279296875, "learning_rate": 4.999629574286579e-06, "loss": 2.4721, "step": 5145 }, { "epoch": 0.005593065901520771, "grad_norm": 14.41700553894043, "learning_rate": 4.999628839697672e-06, "loss": 2.2307, "step": 5150 }, { "epoch": 0.005598496062590209, "grad_norm": 16.582836151123047, "learning_rate": 4.9996281043811604e-06, "loss": 2.0465, "step": 5155 }, { "epoch": 0.005603926223659646, "grad_norm": 22.36189842224121, "learning_rate": 4.999627368337046e-06, "loss": 2.1519, "step": 5160 }, { "epoch": 0.005609356384729084, "grad_norm": 24.20349884033203, "learning_rate": 4.999626631565328e-06, "loss": 1.2293, "step": 5165 }, { "epoch": 0.005614786545798521, "grad_norm": 24.174203872680664, "learning_rate": 4.999625894066006e-06, "loss": 2.2941, "step": 5170 }, { "epoch": 0.005620216706867959, "grad_norm": 15.321023941040039, "learning_rate": 4.999625155839082e-06, "loss": 1.6027, "step": 5175 }, { "epoch": 0.005625646867937397, "grad_norm": 55.662471771240234, "learning_rate": 4.999624416884554e-06, "loss": 1.6922, "step": 5180 }, { "epoch": 0.005631077029006834, "grad_norm": 26.813804626464844, "learning_rate": 4.9996236772024245e-06, "loss": 1.8775, "step": 5185 }, { "epoch": 0.005636507190076272, "grad_norm": 34.96736526489258, "learning_rate": 4.999622936792692e-06, "loss": 2.3541, "step": 5190 }, { "epoch": 0.005641937351145709, "grad_norm": 16.03090476989746, "learning_rate": 4.9996221956553575e-06, "loss": 1.7449, "step": 5195 }, { "epoch": 0.005647367512215147, "grad_norm": 33.330238342285156, "learning_rate": 4.999621453790421e-06, "loss": 1.5792, "step": 5200 }, { "epoch": 0.0056527976732845845, "grad_norm": 24.272037506103516, "learning_rate": 4.999620711197882e-06, "loss": 1.7657, "step": 5205 }, { "epoch": 0.0056582278343540226, "grad_norm": 18.43645477294922, "learning_rate": 4.999619967877743e-06, "loss": 1.3882, "step": 5210 }, { "epoch": 0.0056636579954234606, "grad_norm": 107.7060546875, "learning_rate": 4.999619223830001e-06, "loss": 2.0194, "step": 5215 }, { "epoch": 0.005669088156492898, "grad_norm": 196.6104736328125, "learning_rate": 4.999618479054658e-06, "loss": 1.4904, "step": 5220 }, { "epoch": 0.005674518317562336, "grad_norm": 21.928171157836914, "learning_rate": 4.999617733551716e-06, "loss": 1.5223, "step": 5225 }, { "epoch": 0.005679948478631773, "grad_norm": 48.38093566894531, "learning_rate": 4.999616987321172e-06, "loss": 2.5768, "step": 5230 }, { "epoch": 0.005685378639701211, "grad_norm": 18.006013870239258, "learning_rate": 4.999616240363027e-06, "loss": 1.6252, "step": 5235 }, { "epoch": 0.005690808800770648, "grad_norm": 20.76980972290039, "learning_rate": 4.999615492677282e-06, "loss": 1.2313, "step": 5240 }, { "epoch": 0.005696238961840086, "grad_norm": 53.61490249633789, "learning_rate": 4.999614744263938e-06, "loss": 1.843, "step": 5245 }, { "epoch": 0.005701669122909524, "grad_norm": 44.79423904418945, "learning_rate": 4.999613995122993e-06, "loss": 1.4645, "step": 5250 }, { "epoch": 0.005707099283978961, "grad_norm": 14.564753532409668, "learning_rate": 4.999613245254449e-06, "loss": 1.6534, "step": 5255 }, { "epoch": 0.005712529445048399, "grad_norm": 22.07400894165039, "learning_rate": 4.999612494658305e-06, "loss": 1.8354, "step": 5260 }, { "epoch": 0.005717959606117836, "grad_norm": 20.23527717590332, "learning_rate": 4.999611743334562e-06, "loss": 1.8488, "step": 5265 }, { "epoch": 0.005723389767187274, "grad_norm": 19.022342681884766, "learning_rate": 4.999610991283221e-06, "loss": 1.6624, "step": 5270 }, { "epoch": 0.0057288199282567115, "grad_norm": 19.907268524169922, "learning_rate": 4.999610238504281e-06, "loss": 1.9812, "step": 5275 }, { "epoch": 0.0057342500893261495, "grad_norm": 43.43062973022461, "learning_rate": 4.999609484997742e-06, "loss": 1.9832, "step": 5280 }, { "epoch": 0.0057396802503955876, "grad_norm": 21.29763412475586, "learning_rate": 4.999608730763604e-06, "loss": 2.7616, "step": 5285 }, { "epoch": 0.005745110411465025, "grad_norm": 39.30599594116211, "learning_rate": 4.999607975801869e-06, "loss": 1.2984, "step": 5290 }, { "epoch": 0.005750540572534463, "grad_norm": 29.78785514831543, "learning_rate": 4.999607220112536e-06, "loss": 1.9975, "step": 5295 }, { "epoch": 0.0057559707336039, "grad_norm": 80.25061798095703, "learning_rate": 4.999606463695605e-06, "loss": 1.8407, "step": 5300 }, { "epoch": 0.005761400894673338, "grad_norm": 21.0229549407959, "learning_rate": 4.999605706551078e-06, "loss": 1.5693, "step": 5305 }, { "epoch": 0.005766831055742775, "grad_norm": 31.699092864990234, "learning_rate": 4.999604948678951e-06, "loss": 2.1826, "step": 5310 }, { "epoch": 0.005772261216812213, "grad_norm": 21.70425033569336, "learning_rate": 4.9996041900792295e-06, "loss": 1.364, "step": 5315 }, { "epoch": 0.005777691377881651, "grad_norm": 28.446739196777344, "learning_rate": 4.999603430751911e-06, "loss": 2.3308, "step": 5320 }, { "epoch": 0.005783121538951088, "grad_norm": 35.6738395690918, "learning_rate": 4.999602670696995e-06, "loss": 2.4406, "step": 5325 }, { "epoch": 0.005788551700020526, "grad_norm": 39.0118408203125, "learning_rate": 4.999601909914482e-06, "loss": 2.1011, "step": 5330 }, { "epoch": 0.005793981861089963, "grad_norm": 29.346073150634766, "learning_rate": 4.999601148404374e-06, "loss": 1.3872, "step": 5335 }, { "epoch": 0.005799412022159401, "grad_norm": 23.66858673095703, "learning_rate": 4.9996003861666705e-06, "loss": 1.3341, "step": 5340 }, { "epoch": 0.0058048421832288385, "grad_norm": 48.519222259521484, "learning_rate": 4.9995996232013715e-06, "loss": 1.8406, "step": 5345 }, { "epoch": 0.0058102723442982765, "grad_norm": 38.47585678100586, "learning_rate": 4.999598859508476e-06, "loss": 1.6092, "step": 5350 }, { "epoch": 0.0058157025053677145, "grad_norm": 48.56809997558594, "learning_rate": 4.999598095087986e-06, "loss": 1.6689, "step": 5355 }, { "epoch": 0.005821132666437152, "grad_norm": 25.596527099609375, "learning_rate": 4.999597329939901e-06, "loss": 2.1055, "step": 5360 }, { "epoch": 0.00582656282750659, "grad_norm": 25.698974609375, "learning_rate": 4.9995965640642205e-06, "loss": 1.9524, "step": 5365 }, { "epoch": 0.005831992988576027, "grad_norm": 18.064311981201172, "learning_rate": 4.999595797460946e-06, "loss": 1.7057, "step": 5370 }, { "epoch": 0.005837423149645465, "grad_norm": 25.069490432739258, "learning_rate": 4.9995950301300775e-06, "loss": 1.7008, "step": 5375 }, { "epoch": 0.005842853310714902, "grad_norm": 19.074604034423828, "learning_rate": 4.999594262071615e-06, "loss": 1.6405, "step": 5380 }, { "epoch": 0.00584828347178434, "grad_norm": 46.19169616699219, "learning_rate": 4.999593493285558e-06, "loss": 1.4147, "step": 5385 }, { "epoch": 0.005853713632853778, "grad_norm": 18.601667404174805, "learning_rate": 4.999592723771908e-06, "loss": 0.9088, "step": 5390 }, { "epoch": 0.005859143793923215, "grad_norm": 46.879180908203125, "learning_rate": 4.999591953530665e-06, "loss": 1.6992, "step": 5395 }, { "epoch": 0.005864573954992653, "grad_norm": 20.413469314575195, "learning_rate": 4.9995911825618284e-06, "loss": 1.4144, "step": 5400 }, { "epoch": 0.00587000411606209, "grad_norm": 26.497671127319336, "learning_rate": 4.999590410865398e-06, "loss": 1.5866, "step": 5405 }, { "epoch": 0.005875434277131528, "grad_norm": 28.83824920654297, "learning_rate": 4.999589638441376e-06, "loss": 1.8768, "step": 5410 }, { "epoch": 0.0058808644382009655, "grad_norm": 14.301694869995117, "learning_rate": 4.999588865289761e-06, "loss": 2.4273, "step": 5415 }, { "epoch": 0.0058862945992704035, "grad_norm": 31.37520980834961, "learning_rate": 4.999588091410553e-06, "loss": 1.562, "step": 5420 }, { "epoch": 0.0058917247603398415, "grad_norm": 24.07052230834961, "learning_rate": 4.9995873168037535e-06, "loss": 2.836, "step": 5425 }, { "epoch": 0.005897154921409279, "grad_norm": 41.5803108215332, "learning_rate": 4.999586541469362e-06, "loss": 2.28, "step": 5430 }, { "epoch": 0.005902585082478717, "grad_norm": 18.013010025024414, "learning_rate": 4.999585765407379e-06, "loss": 1.4069, "step": 5435 }, { "epoch": 0.005908015243548154, "grad_norm": 21.76613426208496, "learning_rate": 4.999584988617805e-06, "loss": 1.0699, "step": 5440 }, { "epoch": 0.005913445404617592, "grad_norm": 19.28229522705078, "learning_rate": 4.99958421110064e-06, "loss": 1.8789, "step": 5445 }, { "epoch": 0.005918875565687029, "grad_norm": 29.779239654541016, "learning_rate": 4.999583432855883e-06, "loss": 1.6855, "step": 5450 }, { "epoch": 0.005924305726756467, "grad_norm": 35.33720779418945, "learning_rate": 4.999582653883536e-06, "loss": 1.5128, "step": 5455 }, { "epoch": 0.005929735887825905, "grad_norm": 22.140111923217773, "learning_rate": 4.999581874183599e-06, "loss": 1.4754, "step": 5460 }, { "epoch": 0.005935166048895342, "grad_norm": 15.077241897583008, "learning_rate": 4.99958109375607e-06, "loss": 1.6228, "step": 5465 }, { "epoch": 0.00594059620996478, "grad_norm": 22.65277862548828, "learning_rate": 4.999580312600953e-06, "loss": 1.8797, "step": 5470 }, { "epoch": 0.005946026371034217, "grad_norm": 28.859771728515625, "learning_rate": 4.9995795307182454e-06, "loss": 2.1541, "step": 5475 }, { "epoch": 0.005951456532103655, "grad_norm": 40.33438491821289, "learning_rate": 4.999578748107948e-06, "loss": 1.6498, "step": 5480 }, { "epoch": 0.0059568866931730925, "grad_norm": 25.7384033203125, "learning_rate": 4.999577964770062e-06, "loss": 1.8289, "step": 5485 }, { "epoch": 0.0059623168542425305, "grad_norm": 23.229909896850586, "learning_rate": 4.999577180704586e-06, "loss": 1.4999, "step": 5490 }, { "epoch": 0.0059677470153119685, "grad_norm": 24.172466278076172, "learning_rate": 4.999576395911521e-06, "loss": 2.0593, "step": 5495 }, { "epoch": 0.005973177176381406, "grad_norm": 34.43540573120117, "learning_rate": 4.999575610390868e-06, "loss": 2.1684, "step": 5500 }, { "epoch": 0.005978607337450844, "grad_norm": 82.13909149169922, "learning_rate": 4.9995748241426265e-06, "loss": 1.5526, "step": 5505 }, { "epoch": 0.005984037498520281, "grad_norm": 14.93838882446289, "learning_rate": 4.999574037166797e-06, "loss": 2.0822, "step": 5510 }, { "epoch": 0.005989467659589719, "grad_norm": 18.698463439941406, "learning_rate": 4.999573249463379e-06, "loss": 1.3967, "step": 5515 }, { "epoch": 0.005994897820659156, "grad_norm": 35.92490005493164, "learning_rate": 4.999572461032374e-06, "loss": 1.8119, "step": 5520 }, { "epoch": 0.006000327981728594, "grad_norm": 30.46004867553711, "learning_rate": 4.999571671873781e-06, "loss": 2.0987, "step": 5525 }, { "epoch": 0.006005758142798032, "grad_norm": 23.209774017333984, "learning_rate": 4.9995708819876e-06, "loss": 2.1424, "step": 5530 }, { "epoch": 0.006011188303867469, "grad_norm": 22.51923370361328, "learning_rate": 4.9995700913738325e-06, "loss": 2.1409, "step": 5535 }, { "epoch": 0.006016618464936907, "grad_norm": 32.1729850769043, "learning_rate": 4.999569300032478e-06, "loss": 1.5138, "step": 5540 }, { "epoch": 0.006022048626006344, "grad_norm": 22.5428524017334, "learning_rate": 4.999568507963538e-06, "loss": 2.3072, "step": 5545 }, { "epoch": 0.006027478787075782, "grad_norm": 71.06879425048828, "learning_rate": 4.999567715167011e-06, "loss": 1.195, "step": 5550 }, { "epoch": 0.0060329089481452195, "grad_norm": 24.040325164794922, "learning_rate": 4.999566921642897e-06, "loss": 1.8098, "step": 5555 }, { "epoch": 0.0060383391092146575, "grad_norm": 23.54729652404785, "learning_rate": 4.999566127391198e-06, "loss": 1.2727, "step": 5560 }, { "epoch": 0.0060437692702840955, "grad_norm": 81.89497375488281, "learning_rate": 4.999565332411913e-06, "loss": 1.9711, "step": 5565 }, { "epoch": 0.006049199431353533, "grad_norm": 47.18026351928711, "learning_rate": 4.999564536705043e-06, "loss": 1.5809, "step": 5570 }, { "epoch": 0.006054629592422971, "grad_norm": 23.486042022705078, "learning_rate": 4.999563740270588e-06, "loss": 2.1346, "step": 5575 }, { "epoch": 0.006060059753492408, "grad_norm": 36.46920394897461, "learning_rate": 4.999562943108547e-06, "loss": 1.7139, "step": 5580 }, { "epoch": 0.006065489914561846, "grad_norm": 31.517602920532227, "learning_rate": 4.999562145218921e-06, "loss": 1.3576, "step": 5585 }, { "epoch": 0.006070920075631283, "grad_norm": 39.242591857910156, "learning_rate": 4.9995613466017124e-06, "loss": 1.3657, "step": 5590 }, { "epoch": 0.006076350236700721, "grad_norm": 38.803470611572266, "learning_rate": 4.999560547256918e-06, "loss": 1.7715, "step": 5595 }, { "epoch": 0.006081780397770159, "grad_norm": 20.473873138427734, "learning_rate": 4.999559747184541e-06, "loss": 2.0462, "step": 5600 }, { "epoch": 0.006087210558839596, "grad_norm": 21.158288955688477, "learning_rate": 4.999558946384579e-06, "loss": 2.0317, "step": 5605 }, { "epoch": 0.006092640719909034, "grad_norm": 11.615714073181152, "learning_rate": 4.999558144857033e-06, "loss": 1.5301, "step": 5610 }, { "epoch": 0.006098070880978471, "grad_norm": 14.928739547729492, "learning_rate": 4.999557342601904e-06, "loss": 2.1252, "step": 5615 }, { "epoch": 0.006103501042047909, "grad_norm": 36.02782440185547, "learning_rate": 4.999556539619192e-06, "loss": 1.8088, "step": 5620 }, { "epoch": 0.0061089312031173465, "grad_norm": 28.048370361328125, "learning_rate": 4.999555735908897e-06, "loss": 1.9231, "step": 5625 }, { "epoch": 0.0061143613641867845, "grad_norm": 39.22415542602539, "learning_rate": 4.999554931471019e-06, "loss": 2.3541, "step": 5630 }, { "epoch": 0.0061197915252562225, "grad_norm": 47.66825866699219, "learning_rate": 4.99955412630556e-06, "loss": 2.1991, "step": 5635 }, { "epoch": 0.00612522168632566, "grad_norm": 17.334524154663086, "learning_rate": 4.999553320412518e-06, "loss": 1.2456, "step": 5640 }, { "epoch": 0.006130651847395098, "grad_norm": 18.925661087036133, "learning_rate": 4.999552513791894e-06, "loss": 1.8752, "step": 5645 }, { "epoch": 0.006136082008464535, "grad_norm": 22.744773864746094, "learning_rate": 4.999551706443688e-06, "loss": 1.6553, "step": 5650 }, { "epoch": 0.006141512169533973, "grad_norm": 51.061092376708984, "learning_rate": 4.999550898367901e-06, "loss": 1.5392, "step": 5655 }, { "epoch": 0.00614694233060341, "grad_norm": 32.41295623779297, "learning_rate": 4.999550089564532e-06, "loss": 1.3374, "step": 5660 }, { "epoch": 0.006152372491672848, "grad_norm": 23.429479598999023, "learning_rate": 4.999549280033583e-06, "loss": 1.9371, "step": 5665 }, { "epoch": 0.006157802652742286, "grad_norm": 42.98960876464844, "learning_rate": 4.999548469775053e-06, "loss": 1.6122, "step": 5670 }, { "epoch": 0.006163232813811723, "grad_norm": 37.9432487487793, "learning_rate": 4.999547658788943e-06, "loss": 1.5594, "step": 5675 }, { "epoch": 0.006168662974881161, "grad_norm": 71.33613586425781, "learning_rate": 4.999546847075252e-06, "loss": 1.8191, "step": 5680 }, { "epoch": 0.006174093135950598, "grad_norm": 41.13054275512695, "learning_rate": 4.9995460346339805e-06, "loss": 1.7056, "step": 5685 }, { "epoch": 0.006179523297020036, "grad_norm": 18.591978073120117, "learning_rate": 4.999545221465129e-06, "loss": 1.5339, "step": 5690 }, { "epoch": 0.0061849534580894735, "grad_norm": 13.271710395812988, "learning_rate": 4.9995444075687e-06, "loss": 1.7495, "step": 5695 }, { "epoch": 0.0061903836191589115, "grad_norm": 183.6235809326172, "learning_rate": 4.99954359294469e-06, "loss": 1.5333, "step": 5700 }, { "epoch": 0.0061958137802283495, "grad_norm": 19.234933853149414, "learning_rate": 4.999542777593101e-06, "loss": 2.0156, "step": 5705 }, { "epoch": 0.006201243941297787, "grad_norm": 28.108686447143555, "learning_rate": 4.999541961513933e-06, "loss": 1.7889, "step": 5710 }, { "epoch": 0.006206674102367225, "grad_norm": 74.52554321289062, "learning_rate": 4.9995411447071864e-06, "loss": 1.5487, "step": 5715 }, { "epoch": 0.006212104263436662, "grad_norm": 42.19317626953125, "learning_rate": 4.999540327172861e-06, "loss": 1.9319, "step": 5720 }, { "epoch": 0.0062175344245061, "grad_norm": 40.939247131347656, "learning_rate": 4.9995395089109584e-06, "loss": 1.8966, "step": 5725 }, { "epoch": 0.006222964585575537, "grad_norm": 13.394136428833008, "learning_rate": 4.999538689921478e-06, "loss": 1.2079, "step": 5730 }, { "epoch": 0.006228394746644975, "grad_norm": 23.69133186340332, "learning_rate": 4.999537870204419e-06, "loss": 1.1739, "step": 5735 }, { "epoch": 0.006233824907714413, "grad_norm": 36.94432067871094, "learning_rate": 4.999537049759784e-06, "loss": 1.4681, "step": 5740 }, { "epoch": 0.00623925506878385, "grad_norm": 20.519529342651367, "learning_rate": 4.9995362285875705e-06, "loss": 1.1232, "step": 5745 }, { "epoch": 0.006244685229853288, "grad_norm": 32.03989791870117, "learning_rate": 4.99953540668778e-06, "loss": 1.8593, "step": 5750 }, { "epoch": 0.006250115390922725, "grad_norm": 282.17169189453125, "learning_rate": 4.999534584060413e-06, "loss": 1.4717, "step": 5755 }, { "epoch": 0.006255545551992163, "grad_norm": 32.84487533569336, "learning_rate": 4.99953376070547e-06, "loss": 1.7153, "step": 5760 }, { "epoch": 0.0062609757130616004, "grad_norm": 13.314151763916016, "learning_rate": 4.99953293662295e-06, "loss": 2.1657, "step": 5765 }, { "epoch": 0.0062664058741310385, "grad_norm": 38.68479537963867, "learning_rate": 4.999532111812855e-06, "loss": 1.635, "step": 5770 }, { "epoch": 0.0062718360352004765, "grad_norm": 21.97728157043457, "learning_rate": 4.9995312862751836e-06, "loss": 1.6496, "step": 5775 }, { "epoch": 0.006277266196269914, "grad_norm": 26.21125602722168, "learning_rate": 4.999530460009936e-06, "loss": 2.2446, "step": 5780 }, { "epoch": 0.006282696357339352, "grad_norm": 80.70079803466797, "learning_rate": 4.999529633017114e-06, "loss": 2.3647, "step": 5785 }, { "epoch": 0.006288126518408789, "grad_norm": 139.7001953125, "learning_rate": 4.999528805296717e-06, "loss": 1.0282, "step": 5790 }, { "epoch": 0.006293556679478227, "grad_norm": 26.441373825073242, "learning_rate": 4.999527976848745e-06, "loss": 2.0669, "step": 5795 }, { "epoch": 0.006298986840547664, "grad_norm": 20.404075622558594, "learning_rate": 4.999527147673198e-06, "loss": 2.0888, "step": 5800 }, { "epoch": 0.006304417001617102, "grad_norm": 19.49995994567871, "learning_rate": 4.999526317770077e-06, "loss": 1.3928, "step": 5805 }, { "epoch": 0.00630984716268654, "grad_norm": 22.383337020874023, "learning_rate": 4.999525487139382e-06, "loss": 1.338, "step": 5810 }, { "epoch": 0.006315277323755977, "grad_norm": 17.41474151611328, "learning_rate": 4.999524655781113e-06, "loss": 0.7635, "step": 5815 }, { "epoch": 0.006320707484825415, "grad_norm": 20.24514389038086, "learning_rate": 4.99952382369527e-06, "loss": 2.5273, "step": 5820 }, { "epoch": 0.006326137645894852, "grad_norm": 79.37544250488281, "learning_rate": 4.999522990881855e-06, "loss": 1.6377, "step": 5825 }, { "epoch": 0.00633156780696429, "grad_norm": 47.47299575805664, "learning_rate": 4.9995221573408655e-06, "loss": 1.5598, "step": 5830 }, { "epoch": 0.006336997968033727, "grad_norm": 64.12039184570312, "learning_rate": 4.999521323072304e-06, "loss": 1.9607, "step": 5835 }, { "epoch": 0.0063424281291031654, "grad_norm": 27.67397689819336, "learning_rate": 4.999520488076169e-06, "loss": 2.3138, "step": 5840 }, { "epoch": 0.0063478582901726035, "grad_norm": 21.307403564453125, "learning_rate": 4.999519652352462e-06, "loss": 2.5509, "step": 5845 }, { "epoch": 0.006353288451242041, "grad_norm": 172.9713592529297, "learning_rate": 4.999518815901184e-06, "loss": 2.1055, "step": 5850 }, { "epoch": 0.006358718612311479, "grad_norm": 19.18326187133789, "learning_rate": 4.999517978722333e-06, "loss": 1.5505, "step": 5855 }, { "epoch": 0.006364148773380916, "grad_norm": 30.263763427734375, "learning_rate": 4.9995171408159105e-06, "loss": 1.4005, "step": 5860 }, { "epoch": 0.006369578934450354, "grad_norm": 19.29738426208496, "learning_rate": 4.999516302181917e-06, "loss": 1.8609, "step": 5865 }, { "epoch": 0.006375009095519791, "grad_norm": 14.42725944519043, "learning_rate": 4.999515462820351e-06, "loss": 1.883, "step": 5870 }, { "epoch": 0.006380439256589229, "grad_norm": 17.89107894897461, "learning_rate": 4.999514622731216e-06, "loss": 1.5576, "step": 5875 }, { "epoch": 0.006385869417658667, "grad_norm": 11.541342735290527, "learning_rate": 4.999513781914509e-06, "loss": 1.5676, "step": 5880 }, { "epoch": 0.006391299578728104, "grad_norm": 18.568519592285156, "learning_rate": 4.999512940370231e-06, "loss": 1.7225, "step": 5885 }, { "epoch": 0.006396729739797542, "grad_norm": 24.619037628173828, "learning_rate": 4.999512098098384e-06, "loss": 1.9667, "step": 5890 }, { "epoch": 0.006402159900866979, "grad_norm": 12.519689559936523, "learning_rate": 4.999511255098967e-06, "loss": 1.4307, "step": 5895 }, { "epoch": 0.006407590061936417, "grad_norm": 22.99030113220215, "learning_rate": 4.99951041137198e-06, "loss": 1.047, "step": 5900 }, { "epoch": 0.006413020223005854, "grad_norm": 21.73295021057129, "learning_rate": 4.999509566917423e-06, "loss": 1.653, "step": 5905 }, { "epoch": 0.006418450384075292, "grad_norm": 17.478986740112305, "learning_rate": 4.999508721735297e-06, "loss": 1.8753, "step": 5910 }, { "epoch": 0.0064238805451447304, "grad_norm": 29.355045318603516, "learning_rate": 4.999507875825603e-06, "loss": 2.0869, "step": 5915 }, { "epoch": 0.006429310706214168, "grad_norm": 32.20018005371094, "learning_rate": 4.9995070291883405e-06, "loss": 1.8121, "step": 5920 }, { "epoch": 0.006434740867283606, "grad_norm": 27.907434463500977, "learning_rate": 4.999506181823509e-06, "loss": 1.6656, "step": 5925 }, { "epoch": 0.006440171028353043, "grad_norm": 22.325401306152344, "learning_rate": 4.999505333731108e-06, "loss": 1.3389, "step": 5930 }, { "epoch": 0.006445601189422481, "grad_norm": 43.76414108276367, "learning_rate": 4.9995044849111405e-06, "loss": 1.7889, "step": 5935 }, { "epoch": 0.006451031350491918, "grad_norm": 27.623891830444336, "learning_rate": 4.999503635363605e-06, "loss": 1.7799, "step": 5940 }, { "epoch": 0.006456461511561356, "grad_norm": 32.4687385559082, "learning_rate": 4.999502785088502e-06, "loss": 1.5402, "step": 5945 }, { "epoch": 0.006461891672630794, "grad_norm": 51.59307098388672, "learning_rate": 4.999501934085831e-06, "loss": 2.2155, "step": 5950 }, { "epoch": 0.006467321833700231, "grad_norm": 66.9980239868164, "learning_rate": 4.999501082355594e-06, "loss": 1.6825, "step": 5955 }, { "epoch": 0.006472751994769669, "grad_norm": 18.256093978881836, "learning_rate": 4.99950022989779e-06, "loss": 2.0051, "step": 5960 }, { "epoch": 0.006478182155839106, "grad_norm": 24.71274757385254, "learning_rate": 4.9994993767124195e-06, "loss": 1.5041, "step": 5965 }, { "epoch": 0.006483612316908544, "grad_norm": 22.75724983215332, "learning_rate": 4.999498522799483e-06, "loss": 1.6289, "step": 5970 }, { "epoch": 0.006489042477977981, "grad_norm": 55.56850814819336, "learning_rate": 4.99949766815898e-06, "loss": 2.6878, "step": 5975 }, { "epoch": 0.006494472639047419, "grad_norm": 15.127692222595215, "learning_rate": 4.999496812790912e-06, "loss": 1.4708, "step": 5980 }, { "epoch": 0.006499902800116857, "grad_norm": 41.02385711669922, "learning_rate": 4.999495956695277e-06, "loss": 1.7706, "step": 5985 }, { "epoch": 0.006505332961186295, "grad_norm": 27.14217758178711, "learning_rate": 4.999495099872078e-06, "loss": 1.4803, "step": 5990 }, { "epoch": 0.006510763122255733, "grad_norm": 19.354516983032227, "learning_rate": 4.999494242321314e-06, "loss": 1.9146, "step": 5995 }, { "epoch": 0.00651619328332517, "grad_norm": 17.13602066040039, "learning_rate": 4.999493384042985e-06, "loss": 2.0401, "step": 6000 }, { "epoch": 0.006521623444394608, "grad_norm": 15.087470054626465, "learning_rate": 4.999492525037092e-06, "loss": 2.2264, "step": 6005 }, { "epoch": 0.006527053605464045, "grad_norm": 15.525076866149902, "learning_rate": 4.999491665303633e-06, "loss": 2.3122, "step": 6010 }, { "epoch": 0.006532483766533483, "grad_norm": 16.198570251464844, "learning_rate": 4.999490804842612e-06, "loss": 1.2415, "step": 6015 }, { "epoch": 0.006537913927602921, "grad_norm": 15.608357429504395, "learning_rate": 4.999489943654026e-06, "loss": 1.4772, "step": 6020 }, { "epoch": 0.006543344088672358, "grad_norm": 44.09362030029297, "learning_rate": 4.999489081737877e-06, "loss": 1.3927, "step": 6025 }, { "epoch": 0.006548774249741796, "grad_norm": 15.196873664855957, "learning_rate": 4.999488219094165e-06, "loss": 1.8107, "step": 6030 }, { "epoch": 0.006554204410811233, "grad_norm": 20.73935890197754, "learning_rate": 4.999487355722889e-06, "loss": 1.7609, "step": 6035 }, { "epoch": 0.006559634571880671, "grad_norm": 35.231082916259766, "learning_rate": 4.999486491624052e-06, "loss": 1.6656, "step": 6040 }, { "epoch": 0.006565064732950108, "grad_norm": 26.257505416870117, "learning_rate": 4.999485626797651e-06, "loss": 1.2113, "step": 6045 }, { "epoch": 0.006570494894019546, "grad_norm": 20.34503936767578, "learning_rate": 4.999484761243688e-06, "loss": 1.3171, "step": 6050 }, { "epoch": 0.006575925055088984, "grad_norm": 18.22838020324707, "learning_rate": 4.999483894962164e-06, "loss": 1.7471, "step": 6055 }, { "epoch": 0.0065813552161584216, "grad_norm": 59.310760498046875, "learning_rate": 4.999483027953078e-06, "loss": 1.6784, "step": 6060 }, { "epoch": 0.00658678537722786, "grad_norm": 47.864952087402344, "learning_rate": 4.9994821602164294e-06, "loss": 1.6561, "step": 6065 }, { "epoch": 0.006592215538297297, "grad_norm": 16.675025939941406, "learning_rate": 4.9994812917522205e-06, "loss": 1.4573, "step": 6070 }, { "epoch": 0.006597645699366735, "grad_norm": 23.08821678161621, "learning_rate": 4.99948042256045e-06, "loss": 1.3102, "step": 6075 }, { "epoch": 0.006603075860436172, "grad_norm": 17.036731719970703, "learning_rate": 4.99947955264112e-06, "loss": 1.9107, "step": 6080 }, { "epoch": 0.00660850602150561, "grad_norm": 20.849905014038086, "learning_rate": 4.999478681994229e-06, "loss": 2.2087, "step": 6085 }, { "epoch": 0.006613936182575048, "grad_norm": 26.82684326171875, "learning_rate": 4.999477810619777e-06, "loss": 2.7484, "step": 6090 }, { "epoch": 0.006619366343644485, "grad_norm": 29.420316696166992, "learning_rate": 4.999476938517765e-06, "loss": 1.9574, "step": 6095 }, { "epoch": 0.006624796504713923, "grad_norm": 22.29788589477539, "learning_rate": 4.999476065688194e-06, "loss": 2.4505, "step": 6100 }, { "epoch": 0.00663022666578336, "grad_norm": 17.594818115234375, "learning_rate": 4.999475192131064e-06, "loss": 1.7982, "step": 6105 }, { "epoch": 0.006635656826852798, "grad_norm": 41.79448318481445, "learning_rate": 4.999474317846373e-06, "loss": 1.6205, "step": 6110 }, { "epoch": 0.006641086987922235, "grad_norm": 22.47249412536621, "learning_rate": 4.999473442834125e-06, "loss": 0.9757, "step": 6115 }, { "epoch": 0.006646517148991673, "grad_norm": 19.47974395751953, "learning_rate": 4.999472567094318e-06, "loss": 1.9228, "step": 6120 }, { "epoch": 0.006651947310061111, "grad_norm": 21.4582462310791, "learning_rate": 4.9994716906269515e-06, "loss": 1.7531, "step": 6125 }, { "epoch": 0.0066573774711305485, "grad_norm": 68.37345886230469, "learning_rate": 4.9994708134320275e-06, "loss": 0.9064, "step": 6130 }, { "epoch": 0.0066628076321999866, "grad_norm": 20.388202667236328, "learning_rate": 4.9994699355095454e-06, "loss": 1.4413, "step": 6135 }, { "epoch": 0.006668237793269424, "grad_norm": 17.351266860961914, "learning_rate": 4.999469056859506e-06, "loss": 1.6789, "step": 6140 }, { "epoch": 0.006673667954338862, "grad_norm": 28.45676612854004, "learning_rate": 4.999468177481909e-06, "loss": 1.5632, "step": 6145 }, { "epoch": 0.006679098115408299, "grad_norm": 56.57532501220703, "learning_rate": 4.999467297376755e-06, "loss": 2.1119, "step": 6150 }, { "epoch": 0.006684528276477737, "grad_norm": 23.76050567626953, "learning_rate": 4.999466416544043e-06, "loss": 1.4857, "step": 6155 }, { "epoch": 0.006689958437547175, "grad_norm": 51.12370300292969, "learning_rate": 4.999465534983775e-06, "loss": 1.5809, "step": 6160 }, { "epoch": 0.006695388598616612, "grad_norm": 45.4416389465332, "learning_rate": 4.999464652695951e-06, "loss": 1.8819, "step": 6165 }, { "epoch": 0.00670081875968605, "grad_norm": 21.09233856201172, "learning_rate": 4.999463769680572e-06, "loss": 1.7708, "step": 6170 }, { "epoch": 0.006706248920755487, "grad_norm": 22.615032196044922, "learning_rate": 4.999462885937635e-06, "loss": 2.0663, "step": 6175 }, { "epoch": 0.006711679081824925, "grad_norm": 27.720378875732422, "learning_rate": 4.999462001467144e-06, "loss": 2.1748, "step": 6180 }, { "epoch": 0.006717109242894362, "grad_norm": 19.432281494140625, "learning_rate": 4.999461116269097e-06, "loss": 1.7658, "step": 6185 }, { "epoch": 0.0067225394039638, "grad_norm": 19.599939346313477, "learning_rate": 4.999460230343494e-06, "loss": 2.3363, "step": 6190 }, { "epoch": 0.006727969565033238, "grad_norm": 18.5716552734375, "learning_rate": 4.9994593436903375e-06, "loss": 1.8125, "step": 6195 }, { "epoch": 0.0067333997261026755, "grad_norm": 22.10057258605957, "learning_rate": 4.999458456309625e-06, "loss": 1.9672, "step": 6200 }, { "epoch": 0.0067388298871721136, "grad_norm": 60.02240753173828, "learning_rate": 4.999457568201359e-06, "loss": 1.5477, "step": 6205 }, { "epoch": 0.006744260048241551, "grad_norm": 22.65485954284668, "learning_rate": 4.999456679365538e-06, "loss": 1.5058, "step": 6210 }, { "epoch": 0.006749690209310989, "grad_norm": 48.723812103271484, "learning_rate": 4.999455789802165e-06, "loss": 1.2281, "step": 6215 }, { "epoch": 0.006755120370380426, "grad_norm": 25.84229850769043, "learning_rate": 4.999454899511238e-06, "loss": 1.9847, "step": 6220 }, { "epoch": 0.006760550531449864, "grad_norm": 28.711185455322266, "learning_rate": 4.9994540084927565e-06, "loss": 1.3359, "step": 6225 }, { "epoch": 0.006765980692519302, "grad_norm": 28.732019424438477, "learning_rate": 4.999453116746723e-06, "loss": 1.7733, "step": 6230 }, { "epoch": 0.006771410853588739, "grad_norm": 85.62967681884766, "learning_rate": 4.999452224273137e-06, "loss": 1.2875, "step": 6235 }, { "epoch": 0.006776841014658177, "grad_norm": 16.945819854736328, "learning_rate": 4.999451331071997e-06, "loss": 1.7941, "step": 6240 }, { "epoch": 0.006782271175727614, "grad_norm": 28.82146644592285, "learning_rate": 4.999450437143306e-06, "loss": 1.6303, "step": 6245 }, { "epoch": 0.006787701336797052, "grad_norm": 27.64118194580078, "learning_rate": 4.999449542487063e-06, "loss": 2.1886, "step": 6250 }, { "epoch": 0.006793131497866489, "grad_norm": 12.998029708862305, "learning_rate": 4.999448647103268e-06, "loss": 1.7115, "step": 6255 }, { "epoch": 0.006798561658935927, "grad_norm": 25.392589569091797, "learning_rate": 4.999447750991922e-06, "loss": 1.5558, "step": 6260 }, { "epoch": 0.006803991820005365, "grad_norm": 52.13328552246094, "learning_rate": 4.999446854153024e-06, "loss": 2.356, "step": 6265 }, { "epoch": 0.0068094219810748025, "grad_norm": 164.6450653076172, "learning_rate": 4.999445956586575e-06, "loss": 2.0923, "step": 6270 }, { "epoch": 0.0068148521421442405, "grad_norm": 117.76384735107422, "learning_rate": 4.999445058292576e-06, "loss": 1.7621, "step": 6275 }, { "epoch": 0.006820282303213678, "grad_norm": 36.27488327026367, "learning_rate": 4.999444159271026e-06, "loss": 2.1167, "step": 6280 }, { "epoch": 0.006825712464283116, "grad_norm": 26.483287811279297, "learning_rate": 4.999443259521926e-06, "loss": 1.594, "step": 6285 }, { "epoch": 0.006831142625352553, "grad_norm": 19.27322006225586, "learning_rate": 4.9994423590452756e-06, "loss": 2.1136, "step": 6290 }, { "epoch": 0.006836572786421991, "grad_norm": 11.991548538208008, "learning_rate": 4.9994414578410765e-06, "loss": 1.5683, "step": 6295 }, { "epoch": 0.006842002947491429, "grad_norm": 22.308025360107422, "learning_rate": 4.999440555909327e-06, "loss": 1.336, "step": 6300 }, { "epoch": 0.006847433108560866, "grad_norm": 21.666074752807617, "learning_rate": 4.99943965325003e-06, "loss": 1.8792, "step": 6305 }, { "epoch": 0.006852863269630304, "grad_norm": 30.34619140625, "learning_rate": 4.999438749863182e-06, "loss": 2.2373, "step": 6310 }, { "epoch": 0.006858293430699741, "grad_norm": 43.26150894165039, "learning_rate": 4.999437845748787e-06, "loss": 1.9748, "step": 6315 }, { "epoch": 0.006863723591769179, "grad_norm": 92.05634307861328, "learning_rate": 4.999436940906843e-06, "loss": 1.9678, "step": 6320 }, { "epoch": 0.006869153752838616, "grad_norm": 26.12843132019043, "learning_rate": 4.9994360353373515e-06, "loss": 1.5053, "step": 6325 }, { "epoch": 0.006874583913908054, "grad_norm": 25.901996612548828, "learning_rate": 4.999435129040312e-06, "loss": 1.7353, "step": 6330 }, { "epoch": 0.006880014074977492, "grad_norm": 32.6561164855957, "learning_rate": 4.999434222015725e-06, "loss": 1.3235, "step": 6335 }, { "epoch": 0.0068854442360469295, "grad_norm": 25.886857986450195, "learning_rate": 4.99943331426359e-06, "loss": 1.6109, "step": 6340 }, { "epoch": 0.0068908743971163675, "grad_norm": 13.66329288482666, "learning_rate": 4.999432405783909e-06, "loss": 1.7729, "step": 6345 }, { "epoch": 0.006896304558185805, "grad_norm": 24.905851364135742, "learning_rate": 4.999431496576681e-06, "loss": 1.499, "step": 6350 }, { "epoch": 0.006901734719255243, "grad_norm": 13.622884750366211, "learning_rate": 4.999430586641906e-06, "loss": 1.2391, "step": 6355 }, { "epoch": 0.00690716488032468, "grad_norm": 56.846778869628906, "learning_rate": 4.999429675979585e-06, "loss": 1.7322, "step": 6360 }, { "epoch": 0.006912595041394118, "grad_norm": 32.21195983886719, "learning_rate": 4.999428764589718e-06, "loss": 2.1086, "step": 6365 }, { "epoch": 0.006918025202463556, "grad_norm": 26.778751373291016, "learning_rate": 4.999427852472306e-06, "loss": 1.6993, "step": 6370 }, { "epoch": 0.006923455363532993, "grad_norm": 41.61622619628906, "learning_rate": 4.999426939627348e-06, "loss": 1.4249, "step": 6375 }, { "epoch": 0.006928885524602431, "grad_norm": 21.620895385742188, "learning_rate": 4.999426026054845e-06, "loss": 1.8149, "step": 6380 }, { "epoch": 0.006934315685671868, "grad_norm": 28.307281494140625, "learning_rate": 4.999425111754796e-06, "loss": 1.2132, "step": 6385 }, { "epoch": 0.006939745846741306, "grad_norm": 21.96520233154297, "learning_rate": 4.999424196727204e-06, "loss": 1.5518, "step": 6390 }, { "epoch": 0.006945176007810743, "grad_norm": 21.605833053588867, "learning_rate": 4.999423280972067e-06, "loss": 1.8046, "step": 6395 }, { "epoch": 0.006950606168880181, "grad_norm": 70.3662338256836, "learning_rate": 4.999422364489386e-06, "loss": 1.7286, "step": 6400 }, { "epoch": 0.006956036329949619, "grad_norm": 148.39869689941406, "learning_rate": 4.999421447279161e-06, "loss": 1.486, "step": 6405 }, { "epoch": 0.0069614664910190565, "grad_norm": 28.187028884887695, "learning_rate": 4.999420529341393e-06, "loss": 2.0168, "step": 6410 }, { "epoch": 0.0069668966520884945, "grad_norm": 22.861196517944336, "learning_rate": 4.999419610676081e-06, "loss": 2.3335, "step": 6415 }, { "epoch": 0.006972326813157932, "grad_norm": 13.712096214294434, "learning_rate": 4.999418691283227e-06, "loss": 1.5122, "step": 6420 }, { "epoch": 0.00697775697422737, "grad_norm": 13.692708015441895, "learning_rate": 4.999417771162829e-06, "loss": 1.3862, "step": 6425 }, { "epoch": 0.006983187135296807, "grad_norm": 34.281654357910156, "learning_rate": 4.999416850314889e-06, "loss": 1.9781, "step": 6430 }, { "epoch": 0.006988617296366245, "grad_norm": 27.226043701171875, "learning_rate": 4.9994159287394065e-06, "loss": 2.2918, "step": 6435 }, { "epoch": 0.006994047457435683, "grad_norm": 21.070411682128906, "learning_rate": 4.9994150064363824e-06, "loss": 1.4276, "step": 6440 }, { "epoch": 0.00699947761850512, "grad_norm": 52.48470687866211, "learning_rate": 4.999414083405817e-06, "loss": 1.5806, "step": 6445 }, { "epoch": 0.007004907779574558, "grad_norm": 12.976173400878906, "learning_rate": 4.999413159647709e-06, "loss": 2.5325, "step": 6450 }, { "epoch": 0.007010337940643995, "grad_norm": 23.1564998626709, "learning_rate": 4.9994122351620615e-06, "loss": 1.5655, "step": 6455 }, { "epoch": 0.007015768101713433, "grad_norm": 24.22195816040039, "learning_rate": 4.999411309948872e-06, "loss": 1.3717, "step": 6460 }, { "epoch": 0.00702119826278287, "grad_norm": 203.91058349609375, "learning_rate": 4.999410384008142e-06, "loss": 1.8104, "step": 6465 }, { "epoch": 0.007026628423852308, "grad_norm": 22.06757926940918, "learning_rate": 4.999409457339872e-06, "loss": 1.5958, "step": 6470 }, { "epoch": 0.007032058584921746, "grad_norm": 49.863059997558594, "learning_rate": 4.999408529944061e-06, "loss": 1.2672, "step": 6475 }, { "epoch": 0.0070374887459911835, "grad_norm": 19.22406768798828, "learning_rate": 4.999407601820711e-06, "loss": 1.4253, "step": 6480 }, { "epoch": 0.0070429189070606215, "grad_norm": 22.643869400024414, "learning_rate": 4.999406672969822e-06, "loss": 2.1483, "step": 6485 }, { "epoch": 0.007048349068130059, "grad_norm": 18.5252742767334, "learning_rate": 4.999405743391393e-06, "loss": 1.4845, "step": 6490 }, { "epoch": 0.007053779229199497, "grad_norm": 20.296655654907227, "learning_rate": 4.999404813085425e-06, "loss": 2.1831, "step": 6495 }, { "epoch": 0.007059209390268934, "grad_norm": 17.400114059448242, "learning_rate": 4.999403882051918e-06, "loss": 2.1997, "step": 6500 }, { "epoch": 0.007064639551338372, "grad_norm": 21.154830932617188, "learning_rate": 4.9994029502908735e-06, "loss": 1.6569, "step": 6505 }, { "epoch": 0.00707006971240781, "grad_norm": 51.98811721801758, "learning_rate": 4.999402017802291e-06, "loss": 1.334, "step": 6510 }, { "epoch": 0.007075499873477247, "grad_norm": 21.370132446289062, "learning_rate": 4.99940108458617e-06, "loss": 1.539, "step": 6515 }, { "epoch": 0.007080930034546685, "grad_norm": 21.90231704711914, "learning_rate": 4.999400150642511e-06, "loss": 1.7762, "step": 6520 }, { "epoch": 0.007086360195616122, "grad_norm": 25.35392189025879, "learning_rate": 4.999399215971315e-06, "loss": 1.8282, "step": 6525 }, { "epoch": 0.00709179035668556, "grad_norm": 20.01601219177246, "learning_rate": 4.999398280572581e-06, "loss": 1.548, "step": 6530 }, { "epoch": 0.007097220517754997, "grad_norm": 42.10734176635742, "learning_rate": 4.9993973444463114e-06, "loss": 2.5315, "step": 6535 }, { "epoch": 0.007102650678824435, "grad_norm": 20.630338668823242, "learning_rate": 4.999396407592505e-06, "loss": 1.1793, "step": 6540 }, { "epoch": 0.007108080839893873, "grad_norm": 12.3633394241333, "learning_rate": 4.999395470011162e-06, "loss": 1.6367, "step": 6545 }, { "epoch": 0.0071135110009633105, "grad_norm": 40.88344955444336, "learning_rate": 4.999394531702284e-06, "loss": 1.5575, "step": 6550 }, { "epoch": 0.0071189411620327485, "grad_norm": 17.615230560302734, "learning_rate": 4.999393592665869e-06, "loss": 1.3027, "step": 6555 }, { "epoch": 0.007124371323102186, "grad_norm": 19.47707176208496, "learning_rate": 4.999392652901919e-06, "loss": 1.8684, "step": 6560 }, { "epoch": 0.007129801484171624, "grad_norm": 18.542768478393555, "learning_rate": 4.999391712410434e-06, "loss": 1.7071, "step": 6565 }, { "epoch": 0.007135231645241061, "grad_norm": 25.23841094970703, "learning_rate": 4.999390771191415e-06, "loss": 1.9993, "step": 6570 }, { "epoch": 0.007140661806310499, "grad_norm": 19.596086502075195, "learning_rate": 4.99938982924486e-06, "loss": 2.8719, "step": 6575 }, { "epoch": 0.007146091967379937, "grad_norm": 16.3028507232666, "learning_rate": 4.999388886570771e-06, "loss": 1.4374, "step": 6580 }, { "epoch": 0.007151522128449374, "grad_norm": 16.76361083984375, "learning_rate": 4.999387943169148e-06, "loss": 1.3013, "step": 6585 }, { "epoch": 0.007156952289518812, "grad_norm": 52.32975387573242, "learning_rate": 4.999386999039991e-06, "loss": 2.2284, "step": 6590 }, { "epoch": 0.007162382450588249, "grad_norm": 17.35318374633789, "learning_rate": 4.999386054183301e-06, "loss": 1.7078, "step": 6595 }, { "epoch": 0.007167812611657687, "grad_norm": 33.697296142578125, "learning_rate": 4.999385108599077e-06, "loss": 1.8003, "step": 6600 }, { "epoch": 0.007173242772727124, "grad_norm": 32.93210220336914, "learning_rate": 4.999384162287321e-06, "loss": 1.4218, "step": 6605 }, { "epoch": 0.007178672933796562, "grad_norm": 53.157169342041016, "learning_rate": 4.999383215248031e-06, "loss": 1.9452, "step": 6610 }, { "epoch": 0.007184103094866, "grad_norm": 29.841367721557617, "learning_rate": 4.99938226748121e-06, "loss": 1.2841, "step": 6615 }, { "epoch": 0.0071895332559354375, "grad_norm": 33.2380256652832, "learning_rate": 4.999381318986855e-06, "loss": 1.5926, "step": 6620 }, { "epoch": 0.0071949634170048755, "grad_norm": 18.57761001586914, "learning_rate": 4.99938036976497e-06, "loss": 1.7745, "step": 6625 }, { "epoch": 0.007200393578074313, "grad_norm": 14.265168190002441, "learning_rate": 4.999379419815553e-06, "loss": 1.977, "step": 6630 }, { "epoch": 0.007205823739143751, "grad_norm": 62.224605560302734, "learning_rate": 4.999378469138604e-06, "loss": 1.8061, "step": 6635 }, { "epoch": 0.007211253900213188, "grad_norm": 57.25526809692383, "learning_rate": 4.999377517734124e-06, "loss": 1.5967, "step": 6640 }, { "epoch": 0.007216684061282626, "grad_norm": 75.79637908935547, "learning_rate": 4.999376565602114e-06, "loss": 1.9496, "step": 6645 }, { "epoch": 0.007222114222352064, "grad_norm": 16.49116325378418, "learning_rate": 4.999375612742573e-06, "loss": 1.034, "step": 6650 }, { "epoch": 0.007227544383421501, "grad_norm": 19.849903106689453, "learning_rate": 4.999374659155501e-06, "loss": 2.1318, "step": 6655 }, { "epoch": 0.007232974544490939, "grad_norm": 18.213890075683594, "learning_rate": 4.999373704840901e-06, "loss": 2.2409, "step": 6660 }, { "epoch": 0.007238404705560376, "grad_norm": 19.566059112548828, "learning_rate": 4.99937274979877e-06, "loss": 1.4825, "step": 6665 }, { "epoch": 0.007243834866629814, "grad_norm": 54.684715270996094, "learning_rate": 4.99937179402911e-06, "loss": 1.5233, "step": 6670 }, { "epoch": 0.007249265027699251, "grad_norm": 92.60920715332031, "learning_rate": 4.9993708375319205e-06, "loss": 1.8914, "step": 6675 }, { "epoch": 0.007254695188768689, "grad_norm": 12.893360137939453, "learning_rate": 4.999369880307202e-06, "loss": 1.6157, "step": 6680 }, { "epoch": 0.007260125349838127, "grad_norm": 21.871240615844727, "learning_rate": 4.999368922354956e-06, "loss": 2.3405, "step": 6685 }, { "epoch": 0.0072655555109075644, "grad_norm": 19.46041488647461, "learning_rate": 4.999367963675182e-06, "loss": 1.5002, "step": 6690 }, { "epoch": 0.0072709856719770025, "grad_norm": 36.11216735839844, "learning_rate": 4.999367004267878e-06, "loss": 1.5599, "step": 6695 }, { "epoch": 0.00727641583304644, "grad_norm": 43.97473907470703, "learning_rate": 4.999366044133047e-06, "loss": 1.6647, "step": 6700 }, { "epoch": 0.007281845994115878, "grad_norm": 64.80052947998047, "learning_rate": 4.999365083270689e-06, "loss": 1.8503, "step": 6705 }, { "epoch": 0.007287276155185315, "grad_norm": 50.69807434082031, "learning_rate": 4.999364121680804e-06, "loss": 1.2528, "step": 6710 }, { "epoch": 0.007292706316254753, "grad_norm": 97.84739685058594, "learning_rate": 4.999363159363392e-06, "loss": 1.1174, "step": 6715 }, { "epoch": 0.007298136477324191, "grad_norm": 97.35795593261719, "learning_rate": 4.9993621963184535e-06, "loss": 1.2093, "step": 6720 }, { "epoch": 0.007303566638393628, "grad_norm": 22.67673683166504, "learning_rate": 4.999361232545989e-06, "loss": 1.3794, "step": 6725 }, { "epoch": 0.007308996799463066, "grad_norm": 55.696800231933594, "learning_rate": 4.999360268045998e-06, "loss": 2.24, "step": 6730 }, { "epoch": 0.007314426960532503, "grad_norm": 42.27450180053711, "learning_rate": 4.999359302818481e-06, "loss": 1.442, "step": 6735 }, { "epoch": 0.007319857121601941, "grad_norm": 26.433969497680664, "learning_rate": 4.999358336863439e-06, "loss": 1.4246, "step": 6740 }, { "epoch": 0.007325287282671378, "grad_norm": 24.460527420043945, "learning_rate": 4.999357370180872e-06, "loss": 1.6184, "step": 6745 }, { "epoch": 0.007330717443740816, "grad_norm": 29.439836502075195, "learning_rate": 4.99935640277078e-06, "loss": 2.4388, "step": 6750 }, { "epoch": 0.007336147604810254, "grad_norm": 16.528995513916016, "learning_rate": 4.999355434633163e-06, "loss": 2.2586, "step": 6755 }, { "epoch": 0.0073415777658796914, "grad_norm": 14.015754699707031, "learning_rate": 4.999354465768021e-06, "loss": 1.8458, "step": 6760 }, { "epoch": 0.0073470079269491294, "grad_norm": 44.99337387084961, "learning_rate": 4.999353496175357e-06, "loss": 1.732, "step": 6765 }, { "epoch": 0.007352438088018567, "grad_norm": 20.54554557800293, "learning_rate": 4.9993525258551666e-06, "loss": 1.6771, "step": 6770 }, { "epoch": 0.007357868249088005, "grad_norm": 33.62420654296875, "learning_rate": 4.999351554807455e-06, "loss": 1.8472, "step": 6775 }, { "epoch": 0.007363298410157442, "grad_norm": 18.95176124572754, "learning_rate": 4.9993505830322185e-06, "loss": 2.3057, "step": 6780 }, { "epoch": 0.00736872857122688, "grad_norm": 32.85442352294922, "learning_rate": 4.999349610529461e-06, "loss": 1.8111, "step": 6785 }, { "epoch": 0.007374158732296318, "grad_norm": 84.52377319335938, "learning_rate": 4.999348637299179e-06, "loss": 1.9619, "step": 6790 }, { "epoch": 0.007379588893365755, "grad_norm": 22.14383888244629, "learning_rate": 4.999347663341375e-06, "loss": 1.9601, "step": 6795 }, { "epoch": 0.007385019054435193, "grad_norm": 32.43605422973633, "learning_rate": 4.9993466886560496e-06, "loss": 2.0733, "step": 6800 }, { "epoch": 0.00739044921550463, "grad_norm": 33.16332244873047, "learning_rate": 4.9993457132432025e-06, "loss": 1.6853, "step": 6805 }, { "epoch": 0.007395879376574068, "grad_norm": 227.20494079589844, "learning_rate": 4.999344737102833e-06, "loss": 1.9713, "step": 6810 }, { "epoch": 0.007401309537643505, "grad_norm": 13.688126564025879, "learning_rate": 4.9993437602349424e-06, "loss": 1.3642, "step": 6815 }, { "epoch": 0.007406739698712943, "grad_norm": 42.294639587402344, "learning_rate": 4.999342782639531e-06, "loss": 1.478, "step": 6820 }, { "epoch": 0.007412169859782381, "grad_norm": 27.14398765563965, "learning_rate": 4.9993418043166e-06, "loss": 1.5146, "step": 6825 }, { "epoch": 0.007417600020851818, "grad_norm": 16.918760299682617, "learning_rate": 4.999340825266147e-06, "loss": 1.3535, "step": 6830 }, { "epoch": 0.0074230301819212564, "grad_norm": 19.78348159790039, "learning_rate": 4.999339845488175e-06, "loss": 1.6857, "step": 6835 }, { "epoch": 0.007428460342990694, "grad_norm": 31.43338394165039, "learning_rate": 4.999338864982682e-06, "loss": 1.7982, "step": 6840 }, { "epoch": 0.007433890504060132, "grad_norm": 35.45477294921875, "learning_rate": 4.999337883749671e-06, "loss": 1.5226, "step": 6845 }, { "epoch": 0.007439320665129569, "grad_norm": 45.64082717895508, "learning_rate": 4.9993369017891404e-06, "loss": 1.8561, "step": 6850 }, { "epoch": 0.007444750826199007, "grad_norm": 15.640351295471191, "learning_rate": 4.99933591910109e-06, "loss": 1.1833, "step": 6855 }, { "epoch": 0.007450180987268445, "grad_norm": 26.955915451049805, "learning_rate": 4.999334935685522e-06, "loss": 2.7007, "step": 6860 }, { "epoch": 0.007455611148337882, "grad_norm": 14.926719665527344, "learning_rate": 4.9993339515424346e-06, "loss": 1.0109, "step": 6865 }, { "epoch": 0.00746104130940732, "grad_norm": 31.675172805786133, "learning_rate": 4.99933296667183e-06, "loss": 0.9632, "step": 6870 }, { "epoch": 0.007466471470476757, "grad_norm": 17.1876163482666, "learning_rate": 4.9993319810737075e-06, "loss": 1.6663, "step": 6875 }, { "epoch": 0.007471901631546195, "grad_norm": 69.91344451904297, "learning_rate": 4.9993309947480674e-06, "loss": 2.4829, "step": 6880 }, { "epoch": 0.007477331792615632, "grad_norm": 32.80873489379883, "learning_rate": 4.99933000769491e-06, "loss": 1.9497, "step": 6885 }, { "epoch": 0.00748276195368507, "grad_norm": 29.218788146972656, "learning_rate": 4.999329019914235e-06, "loss": 1.3771, "step": 6890 }, { "epoch": 0.007488192114754508, "grad_norm": 35.428226470947266, "learning_rate": 4.999328031406044e-06, "loss": 1.4416, "step": 6895 }, { "epoch": 0.007493622275823945, "grad_norm": 186.7178497314453, "learning_rate": 4.999327042170337e-06, "loss": 2.1771, "step": 6900 }, { "epoch": 0.007499052436893383, "grad_norm": 22.27720832824707, "learning_rate": 4.999326052207113e-06, "loss": 1.5778, "step": 6905 }, { "epoch": 0.007504482597962821, "grad_norm": 35.726261138916016, "learning_rate": 4.999325061516373e-06, "loss": 1.6553, "step": 6910 }, { "epoch": 0.007509912759032259, "grad_norm": 13.563847541809082, "learning_rate": 4.9993240700981195e-06, "loss": 1.7392, "step": 6915 }, { "epoch": 0.007515342920101696, "grad_norm": 14.31415843963623, "learning_rate": 4.999323077952349e-06, "loss": 1.9281, "step": 6920 }, { "epoch": 0.007520773081171134, "grad_norm": 20.940759658813477, "learning_rate": 4.999322085079064e-06, "loss": 2.1168, "step": 6925 }, { "epoch": 0.007526203242240572, "grad_norm": 24.764060974121094, "learning_rate": 4.9993210914782645e-06, "loss": 1.6144, "step": 6930 }, { "epoch": 0.007531633403310009, "grad_norm": 35.25798416137695, "learning_rate": 4.99932009714995e-06, "loss": 2.6073, "step": 6935 }, { "epoch": 0.007537063564379447, "grad_norm": 38.44682312011719, "learning_rate": 4.999319102094123e-06, "loss": 1.645, "step": 6940 }, { "epoch": 0.007542493725448884, "grad_norm": 41.0965690612793, "learning_rate": 4.999318106310781e-06, "loss": 1.8959, "step": 6945 }, { "epoch": 0.007547923886518322, "grad_norm": 43.34303665161133, "learning_rate": 4.999317109799926e-06, "loss": 1.576, "step": 6950 }, { "epoch": 0.007553354047587759, "grad_norm": 72.90728759765625, "learning_rate": 4.999316112561557e-06, "loss": 2.2789, "step": 6955 }, { "epoch": 0.007558784208657197, "grad_norm": 160.86468505859375, "learning_rate": 4.999315114595676e-06, "loss": 1.4735, "step": 6960 }, { "epoch": 0.007564214369726635, "grad_norm": 134.8756561279297, "learning_rate": 4.9993141159022826e-06, "loss": 1.7094, "step": 6965 }, { "epoch": 0.007569644530796072, "grad_norm": 19.874544143676758, "learning_rate": 4.999313116481376e-06, "loss": 1.664, "step": 6970 }, { "epoch": 0.00757507469186551, "grad_norm": 23.262039184570312, "learning_rate": 4.999312116332958e-06, "loss": 1.7051, "step": 6975 }, { "epoch": 0.0075805048529349476, "grad_norm": 62.25078201293945, "learning_rate": 4.999311115457028e-06, "loss": 1.2332, "step": 6980 }, { "epoch": 0.007585935014004386, "grad_norm": 15.50937271118164, "learning_rate": 4.999310113853587e-06, "loss": 2.2411, "step": 6985 }, { "epoch": 0.007591365175073823, "grad_norm": 17.419004440307617, "learning_rate": 4.999309111522635e-06, "loss": 1.5957, "step": 6990 }, { "epoch": 0.007596795336143261, "grad_norm": 59.0344352722168, "learning_rate": 4.999308108464171e-06, "loss": 1.2744, "step": 6995 }, { "epoch": 0.007602225497212699, "grad_norm": 29.459693908691406, "learning_rate": 4.999307104678197e-06, "loss": 1.5515, "step": 7000 }, { "epoch": 0.007607655658282136, "grad_norm": 23.414657592773438, "learning_rate": 4.999306100164713e-06, "loss": 1.5386, "step": 7005 }, { "epoch": 0.007613085819351574, "grad_norm": 60.79243850708008, "learning_rate": 4.999305094923718e-06, "loss": 1.9197, "step": 7010 }, { "epoch": 0.007618515980421011, "grad_norm": 26.288984298706055, "learning_rate": 4.9993040889552145e-06, "loss": 1.3913, "step": 7015 }, { "epoch": 0.007623946141490449, "grad_norm": 21.29357147216797, "learning_rate": 4.999303082259201e-06, "loss": 1.9474, "step": 7020 }, { "epoch": 0.007629376302559886, "grad_norm": 17.498573303222656, "learning_rate": 4.999302074835679e-06, "loss": 1.6452, "step": 7025 }, { "epoch": 0.007634806463629324, "grad_norm": 14.475834846496582, "learning_rate": 4.999301066684647e-06, "loss": 1.4419, "step": 7030 }, { "epoch": 0.007640236624698762, "grad_norm": 16.193618774414062, "learning_rate": 4.999300057806108e-06, "loss": 1.5672, "step": 7035 }, { "epoch": 0.007645666785768199, "grad_norm": 20.271873474121094, "learning_rate": 4.99929904820006e-06, "loss": 2.0481, "step": 7040 }, { "epoch": 0.007651096946837637, "grad_norm": 21.469329833984375, "learning_rate": 4.999298037866504e-06, "loss": 1.7868, "step": 7045 }, { "epoch": 0.0076565271079070745, "grad_norm": 34.08251953125, "learning_rate": 4.999297026805441e-06, "loss": 1.609, "step": 7050 }, { "epoch": 0.0076619572689765126, "grad_norm": 13.935778617858887, "learning_rate": 4.99929601501687e-06, "loss": 1.4722, "step": 7055 }, { "epoch": 0.00766738743004595, "grad_norm": 15.994159698486328, "learning_rate": 4.999295002500792e-06, "loss": 1.7712, "step": 7060 }, { "epoch": 0.007672817591115388, "grad_norm": 26.39081382751465, "learning_rate": 4.999293989257207e-06, "loss": 1.7253, "step": 7065 }, { "epoch": 0.007678247752184826, "grad_norm": 15.049460411071777, "learning_rate": 4.999292975286116e-06, "loss": 1.0848, "step": 7070 }, { "epoch": 0.007683677913254263, "grad_norm": 23.779199600219727, "learning_rate": 4.999291960587519e-06, "loss": 2.3549, "step": 7075 }, { "epoch": 0.007689108074323701, "grad_norm": 21.462970733642578, "learning_rate": 4.999290945161416e-06, "loss": 2.0475, "step": 7080 }, { "epoch": 0.007694538235393138, "grad_norm": 33.3776969909668, "learning_rate": 4.999289929007807e-06, "loss": 1.5524, "step": 7085 }, { "epoch": 0.007699968396462576, "grad_norm": 22.325952529907227, "learning_rate": 4.9992889121266926e-06, "loss": 1.3029, "step": 7090 }, { "epoch": 0.007705398557532013, "grad_norm": 22.920398712158203, "learning_rate": 4.999287894518074e-06, "loss": 1.8023, "step": 7095 }, { "epoch": 0.007710828718601451, "grad_norm": 17.90821647644043, "learning_rate": 4.999286876181951e-06, "loss": 1.8641, "step": 7100 }, { "epoch": 0.007716258879670889, "grad_norm": 13.37690544128418, "learning_rate": 4.9992858571183224e-06, "loss": 1.5349, "step": 7105 }, { "epoch": 0.007721689040740326, "grad_norm": 20.30241584777832, "learning_rate": 4.999284837327191e-06, "loss": 1.4336, "step": 7110 }, { "epoch": 0.007727119201809764, "grad_norm": 18.840646743774414, "learning_rate": 4.999283816808555e-06, "loss": 1.2605, "step": 7115 }, { "epoch": 0.0077325493628792015, "grad_norm": 16.87798500061035, "learning_rate": 4.9992827955624156e-06, "loss": 1.745, "step": 7120 }, { "epoch": 0.0077379795239486395, "grad_norm": 28.860624313354492, "learning_rate": 4.999281773588773e-06, "loss": 1.9617, "step": 7125 }, { "epoch": 0.007743409685018077, "grad_norm": 23.440385818481445, "learning_rate": 4.999280750887627e-06, "loss": 1.5175, "step": 7130 }, { "epoch": 0.007748839846087515, "grad_norm": 55.329952239990234, "learning_rate": 4.99927972745898e-06, "loss": 1.8366, "step": 7135 }, { "epoch": 0.007754270007156953, "grad_norm": 45.4221076965332, "learning_rate": 4.99927870330283e-06, "loss": 2.1751, "step": 7140 }, { "epoch": 0.00775970016822639, "grad_norm": 15.6482515335083, "learning_rate": 4.999277678419177e-06, "loss": 1.8115, "step": 7145 }, { "epoch": 0.007765130329295828, "grad_norm": 55.267112731933594, "learning_rate": 4.999276652808023e-06, "loss": 2.0385, "step": 7150 }, { "epoch": 0.007770560490365265, "grad_norm": 29.94124984741211, "learning_rate": 4.999275626469367e-06, "loss": 1.8919, "step": 7155 }, { "epoch": 0.007775990651434703, "grad_norm": 59.34672546386719, "learning_rate": 4.999274599403211e-06, "loss": 1.4367, "step": 7160 }, { "epoch": 0.00778142081250414, "grad_norm": 20.059955596923828, "learning_rate": 4.999273571609553e-06, "loss": 1.2101, "step": 7165 }, { "epoch": 0.007786850973573578, "grad_norm": 25.33367919921875, "learning_rate": 4.999272543088396e-06, "loss": 1.7896, "step": 7170 }, { "epoch": 0.007792281134643016, "grad_norm": 21.700008392333984, "learning_rate": 4.9992715138397375e-06, "loss": 1.7869, "step": 7175 }, { "epoch": 0.007797711295712453, "grad_norm": 18.134170532226562, "learning_rate": 4.99927048386358e-06, "loss": 1.4995, "step": 7180 }, { "epoch": 0.007803141456781891, "grad_norm": 36.34523391723633, "learning_rate": 4.999269453159922e-06, "loss": 1.8453, "step": 7185 }, { "epoch": 0.0078085716178513285, "grad_norm": 23.157955169677734, "learning_rate": 4.999268421728765e-06, "loss": 1.4024, "step": 7190 }, { "epoch": 0.007814001778920766, "grad_norm": 21.55681610107422, "learning_rate": 4.999267389570109e-06, "loss": 1.5406, "step": 7195 }, { "epoch": 0.007819431939990204, "grad_norm": 62.54039001464844, "learning_rate": 4.999266356683955e-06, "loss": 1.5628, "step": 7200 }, { "epoch": 0.007824862101059642, "grad_norm": 26.548545837402344, "learning_rate": 4.9992653230703014e-06, "loss": 1.9354, "step": 7205 }, { "epoch": 0.00783029226212908, "grad_norm": 16.04022216796875, "learning_rate": 4.99926428872915e-06, "loss": 1.8559, "step": 7210 }, { "epoch": 0.007835722423198518, "grad_norm": 33.090782165527344, "learning_rate": 4.999263253660501e-06, "loss": 1.4774, "step": 7215 }, { "epoch": 0.007841152584267954, "grad_norm": 41.623619079589844, "learning_rate": 4.999262217864355e-06, "loss": 2.3277, "step": 7220 }, { "epoch": 0.007846582745337392, "grad_norm": 15.60271167755127, "learning_rate": 4.999261181340711e-06, "loss": 1.5947, "step": 7225 }, { "epoch": 0.00785201290640683, "grad_norm": 23.090167999267578, "learning_rate": 4.99926014408957e-06, "loss": 1.2503, "step": 7230 }, { "epoch": 0.007857443067476268, "grad_norm": 17.2906551361084, "learning_rate": 4.999259106110933e-06, "loss": 2.1404, "step": 7235 }, { "epoch": 0.007862873228545704, "grad_norm": 19.913612365722656, "learning_rate": 4.9992580674048e-06, "loss": 1.6225, "step": 7240 }, { "epoch": 0.007868303389615142, "grad_norm": 16.553131103515625, "learning_rate": 4.9992570279711705e-06, "loss": 1.4574, "step": 7245 }, { "epoch": 0.00787373355068458, "grad_norm": 26.323829650878906, "learning_rate": 4.999255987810046e-06, "loss": 1.4869, "step": 7250 }, { "epoch": 0.007879163711754018, "grad_norm": 14.648614883422852, "learning_rate": 4.999254946921425e-06, "loss": 1.5094, "step": 7255 }, { "epoch": 0.007884593872823456, "grad_norm": 17.32176399230957, "learning_rate": 4.999253905305309e-06, "loss": 2.1873, "step": 7260 }, { "epoch": 0.007890024033892893, "grad_norm": 114.56478118896484, "learning_rate": 4.999252862961698e-06, "loss": 1.6882, "step": 7265 }, { "epoch": 0.00789545419496233, "grad_norm": 67.80133819580078, "learning_rate": 4.999251819890594e-06, "loss": 1.983, "step": 7270 }, { "epoch": 0.007900884356031769, "grad_norm": 22.68665313720703, "learning_rate": 4.9992507760919944e-06, "loss": 2.8475, "step": 7275 }, { "epoch": 0.007906314517101207, "grad_norm": 53.25872039794922, "learning_rate": 4.9992497315659015e-06, "loss": 0.937, "step": 7280 }, { "epoch": 0.007911744678170645, "grad_norm": 25.666378021240234, "learning_rate": 4.999248686312315e-06, "loss": 1.9012, "step": 7285 }, { "epoch": 0.007917174839240081, "grad_norm": 36.669918060302734, "learning_rate": 4.999247640331235e-06, "loss": 1.6133, "step": 7290 }, { "epoch": 0.007922605000309519, "grad_norm": 95.02339172363281, "learning_rate": 4.999246593622662e-06, "loss": 0.9331, "step": 7295 }, { "epoch": 0.007928035161378957, "grad_norm": 14.573845863342285, "learning_rate": 4.9992455461865965e-06, "loss": 2.0858, "step": 7300 }, { "epoch": 0.007933465322448395, "grad_norm": 87.0354995727539, "learning_rate": 4.999244498023039e-06, "loss": 1.4778, "step": 7305 }, { "epoch": 0.007938895483517831, "grad_norm": 21.546096801757812, "learning_rate": 4.999243449131989e-06, "loss": 1.6554, "step": 7310 }, { "epoch": 0.00794432564458727, "grad_norm": 19.99115562438965, "learning_rate": 4.999242399513447e-06, "loss": 1.4496, "step": 7315 }, { "epoch": 0.007949755805656707, "grad_norm": 24.822229385375977, "learning_rate": 4.999241349167414e-06, "loss": 1.4224, "step": 7320 }, { "epoch": 0.007955185966726145, "grad_norm": 28.51142120361328, "learning_rate": 4.99924029809389e-06, "loss": 1.3066, "step": 7325 }, { "epoch": 0.007960616127795583, "grad_norm": 72.89680480957031, "learning_rate": 4.999239246292875e-06, "loss": 1.9333, "step": 7330 }, { "epoch": 0.00796604628886502, "grad_norm": 18.678848266601562, "learning_rate": 4.99923819376437e-06, "loss": 2.0042, "step": 7335 }, { "epoch": 0.007971476449934458, "grad_norm": 16.115215301513672, "learning_rate": 4.999237140508374e-06, "loss": 2.0929, "step": 7340 }, { "epoch": 0.007976906611003896, "grad_norm": 13.4520263671875, "learning_rate": 4.999236086524889e-06, "loss": 1.7554, "step": 7345 }, { "epoch": 0.007982336772073334, "grad_norm": 19.130603790283203, "learning_rate": 4.999235031813913e-06, "loss": 1.5452, "step": 7350 }, { "epoch": 0.007987766933142772, "grad_norm": 52.595703125, "learning_rate": 4.999233976375449e-06, "loss": 1.3971, "step": 7355 }, { "epoch": 0.007993197094212208, "grad_norm": 14.612600326538086, "learning_rate": 4.999232920209495e-06, "loss": 1.8944, "step": 7360 }, { "epoch": 0.007998627255281646, "grad_norm": 149.70977783203125, "learning_rate": 4.999231863316053e-06, "loss": 1.3685, "step": 7365 }, { "epoch": 0.008004057416351084, "grad_norm": 18.421743392944336, "learning_rate": 4.999230805695122e-06, "loss": 1.7471, "step": 7370 }, { "epoch": 0.008009487577420522, "grad_norm": 26.299591064453125, "learning_rate": 4.999229747346704e-06, "loss": 1.6078, "step": 7375 }, { "epoch": 0.008014917738489958, "grad_norm": 19.11115264892578, "learning_rate": 4.9992286882707975e-06, "loss": 1.0714, "step": 7380 }, { "epoch": 0.008020347899559396, "grad_norm": 32.66044235229492, "learning_rate": 4.999227628467403e-06, "loss": 1.8709, "step": 7385 }, { "epoch": 0.008025778060628834, "grad_norm": 22.927562713623047, "learning_rate": 4.999226567936523e-06, "loss": 1.9027, "step": 7390 }, { "epoch": 0.008031208221698272, "grad_norm": 18.011560440063477, "learning_rate": 4.999225506678154e-06, "loss": 2.0884, "step": 7395 }, { "epoch": 0.00803663838276771, "grad_norm": 111.5927505493164, "learning_rate": 4.999224444692301e-06, "loss": 1.5587, "step": 7400 }, { "epoch": 0.008042068543837147, "grad_norm": 13.64093017578125, "learning_rate": 4.99922338197896e-06, "loss": 1.2155, "step": 7405 }, { "epoch": 0.008047498704906585, "grad_norm": 30.477033615112305, "learning_rate": 4.999222318538134e-06, "loss": 1.4853, "step": 7410 }, { "epoch": 0.008052928865976023, "grad_norm": 47.297725677490234, "learning_rate": 4.999221254369822e-06, "loss": 1.8094, "step": 7415 }, { "epoch": 0.00805835902704546, "grad_norm": 24.44301986694336, "learning_rate": 4.999220189474025e-06, "loss": 2.0078, "step": 7420 }, { "epoch": 0.008063789188114899, "grad_norm": 14.775474548339844, "learning_rate": 4.9992191238507425e-06, "loss": 2.1014, "step": 7425 }, { "epoch": 0.008069219349184335, "grad_norm": 15.236822128295898, "learning_rate": 4.999218057499975e-06, "loss": 2.285, "step": 7430 }, { "epoch": 0.008074649510253773, "grad_norm": 18.365652084350586, "learning_rate": 4.999216990421724e-06, "loss": 1.5668, "step": 7435 }, { "epoch": 0.008080079671323211, "grad_norm": 18.357196807861328, "learning_rate": 4.999215922615989e-06, "loss": 0.9988, "step": 7440 }, { "epoch": 0.008085509832392649, "grad_norm": 15.008543968200684, "learning_rate": 4.99921485408277e-06, "loss": 1.4187, "step": 7445 }, { "epoch": 0.008090939993462085, "grad_norm": 46.89785385131836, "learning_rate": 4.999213784822067e-06, "loss": 2.3248, "step": 7450 }, { "epoch": 0.008096370154531523, "grad_norm": 46.5635871887207, "learning_rate": 4.9992127148338824e-06, "loss": 2.6935, "step": 7455 }, { "epoch": 0.008101800315600961, "grad_norm": 31.084001541137695, "learning_rate": 4.9992116441182134e-06, "loss": 1.6261, "step": 7460 }, { "epoch": 0.0081072304766704, "grad_norm": 55.325687408447266, "learning_rate": 4.9992105726750625e-06, "loss": 1.2259, "step": 7465 }, { "epoch": 0.008112660637739837, "grad_norm": 29.109437942504883, "learning_rate": 4.99920950050443e-06, "loss": 2.0653, "step": 7470 }, { "epoch": 0.008118090798809274, "grad_norm": 55.37721633911133, "learning_rate": 4.999208427606314e-06, "loss": 1.2211, "step": 7475 }, { "epoch": 0.008123520959878712, "grad_norm": 19.254913330078125, "learning_rate": 4.999207353980719e-06, "loss": 1.7881, "step": 7480 }, { "epoch": 0.00812895112094815, "grad_norm": 18.618703842163086, "learning_rate": 4.999206279627641e-06, "loss": 1.5517, "step": 7485 }, { "epoch": 0.008134381282017588, "grad_norm": 22.181346893310547, "learning_rate": 4.999205204547082e-06, "loss": 1.5525, "step": 7490 }, { "epoch": 0.008139811443087026, "grad_norm": 185.8705596923828, "learning_rate": 4.999204128739042e-06, "loss": 1.7431, "step": 7495 }, { "epoch": 0.008145241604156462, "grad_norm": 40.4719352722168, "learning_rate": 4.9992030522035225e-06, "loss": 1.2582, "step": 7500 }, { "epoch": 0.0081506717652259, "grad_norm": 15.060558319091797, "learning_rate": 4.999201974940524e-06, "loss": 1.6322, "step": 7505 }, { "epoch": 0.008156101926295338, "grad_norm": 168.7138214111328, "learning_rate": 4.999200896950044e-06, "loss": 1.5144, "step": 7510 }, { "epoch": 0.008161532087364776, "grad_norm": 32.989036560058594, "learning_rate": 4.9991998182320865e-06, "loss": 2.3921, "step": 7515 }, { "epoch": 0.008166962248434212, "grad_norm": 17.259639739990234, "learning_rate": 4.999198738786648e-06, "loss": 1.6532, "step": 7520 }, { "epoch": 0.00817239240950365, "grad_norm": 51.31061935424805, "learning_rate": 4.999197658613732e-06, "loss": 1.9703, "step": 7525 }, { "epoch": 0.008177822570573088, "grad_norm": 15.322998046875, "learning_rate": 4.999196577713337e-06, "loss": 1.8729, "step": 7530 }, { "epoch": 0.008183252731642526, "grad_norm": 19.202308654785156, "learning_rate": 4.999195496085465e-06, "loss": 1.7057, "step": 7535 }, { "epoch": 0.008188682892711964, "grad_norm": 12.310423851013184, "learning_rate": 4.999194413730114e-06, "loss": 1.6587, "step": 7540 }, { "epoch": 0.0081941130537814, "grad_norm": 35.54924392700195, "learning_rate": 4.999193330647286e-06, "loss": 1.543, "step": 7545 }, { "epoch": 0.008199543214850839, "grad_norm": 18.064285278320312, "learning_rate": 4.999192246836982e-06, "loss": 1.9609, "step": 7550 }, { "epoch": 0.008204973375920277, "grad_norm": 136.45506286621094, "learning_rate": 4.999191162299199e-06, "loss": 1.819, "step": 7555 }, { "epoch": 0.008210403536989715, "grad_norm": 23.847679138183594, "learning_rate": 4.9991900770339404e-06, "loss": 1.6564, "step": 7560 }, { "epoch": 0.008215833698059153, "grad_norm": 64.13664245605469, "learning_rate": 4.9991889910412055e-06, "loss": 1.0023, "step": 7565 }, { "epoch": 0.008221263859128589, "grad_norm": 24.967849731445312, "learning_rate": 4.999187904320995e-06, "loss": 1.9055, "step": 7570 }, { "epoch": 0.008226694020198027, "grad_norm": 25.43021011352539, "learning_rate": 4.999186816873309e-06, "loss": 1.9377, "step": 7575 }, { "epoch": 0.008232124181267465, "grad_norm": 45.955841064453125, "learning_rate": 4.999185728698147e-06, "loss": 2.0263, "step": 7580 }, { "epoch": 0.008237554342336903, "grad_norm": 20.313274383544922, "learning_rate": 4.999184639795511e-06, "loss": 1.761, "step": 7585 }, { "epoch": 0.00824298450340634, "grad_norm": 112.87864685058594, "learning_rate": 4.9991835501653995e-06, "loss": 2.0765, "step": 7590 }, { "epoch": 0.008248414664475777, "grad_norm": 46.01885223388672, "learning_rate": 4.999182459807814e-06, "loss": 1.4215, "step": 7595 }, { "epoch": 0.008253844825545215, "grad_norm": 97.19223022460938, "learning_rate": 4.999181368722755e-06, "loss": 1.6689, "step": 7600 }, { "epoch": 0.008259274986614653, "grad_norm": 16.73203468322754, "learning_rate": 4.999180276910222e-06, "loss": 1.801, "step": 7605 }, { "epoch": 0.008264705147684091, "grad_norm": 24.389034271240234, "learning_rate": 4.9991791843702155e-06, "loss": 1.474, "step": 7610 }, { "epoch": 0.008270135308753528, "grad_norm": 22.474971771240234, "learning_rate": 4.999178091102736e-06, "loss": 1.5879, "step": 7615 }, { "epoch": 0.008275565469822966, "grad_norm": 19.38718605041504, "learning_rate": 4.999176997107784e-06, "loss": 1.5605, "step": 7620 }, { "epoch": 0.008280995630892404, "grad_norm": 22.22490692138672, "learning_rate": 4.999175902385359e-06, "loss": 0.9384, "step": 7625 }, { "epoch": 0.008286425791961842, "grad_norm": 31.726179122924805, "learning_rate": 4.999174806935463e-06, "loss": 2.0496, "step": 7630 }, { "epoch": 0.00829185595303128, "grad_norm": 38.64930725097656, "learning_rate": 4.9991737107580945e-06, "loss": 2.4823, "step": 7635 }, { "epoch": 0.008297286114100716, "grad_norm": 173.4932403564453, "learning_rate": 4.999172613853254e-06, "loss": 1.9684, "step": 7640 }, { "epoch": 0.008302716275170154, "grad_norm": 34.37493896484375, "learning_rate": 4.999171516220944e-06, "loss": 1.518, "step": 7645 }, { "epoch": 0.008308146436239592, "grad_norm": 34.73685836791992, "learning_rate": 4.999170417861162e-06, "loss": 2.0144, "step": 7650 }, { "epoch": 0.00831357659730903, "grad_norm": 18.3474063873291, "learning_rate": 4.99916931877391e-06, "loss": 1.362, "step": 7655 }, { "epoch": 0.008319006758378466, "grad_norm": 20.8380184173584, "learning_rate": 4.999168218959187e-06, "loss": 1.5382, "step": 7660 }, { "epoch": 0.008324436919447904, "grad_norm": 58.70833206176758, "learning_rate": 4.999167118416995e-06, "loss": 1.6959, "step": 7665 }, { "epoch": 0.008329867080517342, "grad_norm": 31.176759719848633, "learning_rate": 4.999166017147333e-06, "loss": 1.5678, "step": 7670 }, { "epoch": 0.00833529724158678, "grad_norm": 22.980697631835938, "learning_rate": 4.999164915150203e-06, "loss": 1.9113, "step": 7675 }, { "epoch": 0.008340727402656218, "grad_norm": 16.073904037475586, "learning_rate": 4.999163812425603e-06, "loss": 1.9699, "step": 7680 }, { "epoch": 0.008346157563725655, "grad_norm": 17.532196044921875, "learning_rate": 4.999162708973534e-06, "loss": 2.4115, "step": 7685 }, { "epoch": 0.008351587724795093, "grad_norm": 48.101131439208984, "learning_rate": 4.999161604793998e-06, "loss": 1.5681, "step": 7690 }, { "epoch": 0.00835701788586453, "grad_norm": 16.792217254638672, "learning_rate": 4.999160499886994e-06, "loss": 1.6555, "step": 7695 }, { "epoch": 0.008362448046933969, "grad_norm": 94.47759246826172, "learning_rate": 4.999159394252521e-06, "loss": 2.033, "step": 7700 }, { "epoch": 0.008367878208003407, "grad_norm": 32.540889739990234, "learning_rate": 4.999158287890582e-06, "loss": 1.4758, "step": 7705 }, { "epoch": 0.008373308369072843, "grad_norm": 25.180665969848633, "learning_rate": 4.999157180801175e-06, "loss": 1.6709, "step": 7710 }, { "epoch": 0.008378738530142281, "grad_norm": 25.248170852661133, "learning_rate": 4.9991560729843025e-06, "loss": 2.0929, "step": 7715 }, { "epoch": 0.008384168691211719, "grad_norm": 15.83434772491455, "learning_rate": 4.9991549644399625e-06, "loss": 2.2257, "step": 7720 }, { "epoch": 0.008389598852281157, "grad_norm": 77.1236343383789, "learning_rate": 4.9991538551681576e-06, "loss": 1.8497, "step": 7725 }, { "epoch": 0.008395029013350593, "grad_norm": 13.21204662322998, "learning_rate": 4.999152745168887e-06, "loss": 1.255, "step": 7730 }, { "epoch": 0.008400459174420031, "grad_norm": 33.04648208618164, "learning_rate": 4.99915163444215e-06, "loss": 1.5627, "step": 7735 }, { "epoch": 0.00840588933548947, "grad_norm": 17.807849884033203, "learning_rate": 4.999150522987949e-06, "loss": 1.8229, "step": 7740 }, { "epoch": 0.008411319496558907, "grad_norm": 54.603816986083984, "learning_rate": 4.9991494108062835e-06, "loss": 1.5677, "step": 7745 }, { "epoch": 0.008416749657628345, "grad_norm": 27.60224723815918, "learning_rate": 4.999148297897153e-06, "loss": 1.7246, "step": 7750 }, { "epoch": 0.008422179818697782, "grad_norm": 14.092010498046875, "learning_rate": 4.9991471842605585e-06, "loss": 1.5484, "step": 7755 }, { "epoch": 0.00842760997976722, "grad_norm": 29.87077522277832, "learning_rate": 4.9991460698965e-06, "loss": 1.2379, "step": 7760 }, { "epoch": 0.008433040140836658, "grad_norm": 13.489496231079102, "learning_rate": 4.999144954804979e-06, "loss": 1.5303, "step": 7765 }, { "epoch": 0.008438470301906096, "grad_norm": 17.435985565185547, "learning_rate": 4.999143838985994e-06, "loss": 2.4684, "step": 7770 }, { "epoch": 0.008443900462975534, "grad_norm": 13.154239654541016, "learning_rate": 4.9991427224395465e-06, "loss": 2.0033, "step": 7775 }, { "epoch": 0.00844933062404497, "grad_norm": 21.528141021728516, "learning_rate": 4.999141605165637e-06, "loss": 1.8733, "step": 7780 }, { "epoch": 0.008454760785114408, "grad_norm": 35.7089958190918, "learning_rate": 4.999140487164264e-06, "loss": 1.5332, "step": 7785 }, { "epoch": 0.008460190946183846, "grad_norm": 20.545330047607422, "learning_rate": 4.9991393684354315e-06, "loss": 1.779, "step": 7790 }, { "epoch": 0.008465621107253284, "grad_norm": 26.591970443725586, "learning_rate": 4.999138248979136e-06, "loss": 1.1917, "step": 7795 }, { "epoch": 0.00847105126832272, "grad_norm": 79.05941009521484, "learning_rate": 4.999137128795379e-06, "loss": 1.8137, "step": 7800 }, { "epoch": 0.008476481429392158, "grad_norm": 38.60193634033203, "learning_rate": 4.999136007884163e-06, "loss": 1.5542, "step": 7805 }, { "epoch": 0.008481911590461596, "grad_norm": 13.375686645507812, "learning_rate": 4.999134886245485e-06, "loss": 1.8231, "step": 7810 }, { "epoch": 0.008487341751531034, "grad_norm": 24.585912704467773, "learning_rate": 4.999133763879347e-06, "loss": 1.1781, "step": 7815 }, { "epoch": 0.008492771912600472, "grad_norm": 39.27122116088867, "learning_rate": 4.99913264078575e-06, "loss": 1.9061, "step": 7820 }, { "epoch": 0.008498202073669909, "grad_norm": 22.785993576049805, "learning_rate": 4.999131516964693e-06, "loss": 2.2173, "step": 7825 }, { "epoch": 0.008503632234739347, "grad_norm": 22.469411849975586, "learning_rate": 4.999130392416177e-06, "loss": 1.4775, "step": 7830 }, { "epoch": 0.008509062395808785, "grad_norm": 57.438899993896484, "learning_rate": 4.999129267140202e-06, "loss": 2.4903, "step": 7835 }, { "epoch": 0.008514492556878223, "grad_norm": 29.802305221557617, "learning_rate": 4.999128141136768e-06, "loss": 1.8939, "step": 7840 }, { "epoch": 0.00851992271794766, "grad_norm": 21.25376319885254, "learning_rate": 4.999127014405877e-06, "loss": 1.4303, "step": 7845 }, { "epoch": 0.008525352879017097, "grad_norm": 22.276304244995117, "learning_rate": 4.999125886947528e-06, "loss": 1.4176, "step": 7850 }, { "epoch": 0.008530783040086535, "grad_norm": 58.219886779785156, "learning_rate": 4.999124758761721e-06, "loss": 2.0183, "step": 7855 }, { "epoch": 0.008536213201155973, "grad_norm": 15.438048362731934, "learning_rate": 4.999123629848456e-06, "loss": 1.7828, "step": 7860 }, { "epoch": 0.008541643362225411, "grad_norm": 25.366613388061523, "learning_rate": 4.999122500207736e-06, "loss": 1.9694, "step": 7865 }, { "epoch": 0.008547073523294847, "grad_norm": 37.81720733642578, "learning_rate": 4.999121369839558e-06, "loss": 1.8919, "step": 7870 }, { "epoch": 0.008552503684364285, "grad_norm": 81.37352752685547, "learning_rate": 4.9991202387439245e-06, "loss": 2.463, "step": 7875 }, { "epoch": 0.008557933845433723, "grad_norm": 36.66537857055664, "learning_rate": 4.999119106920835e-06, "loss": 1.7124, "step": 7880 }, { "epoch": 0.008563364006503161, "grad_norm": 18.058813095092773, "learning_rate": 4.999117974370289e-06, "loss": 1.5106, "step": 7885 }, { "epoch": 0.0085687941675726, "grad_norm": 18.47439193725586, "learning_rate": 4.999116841092288e-06, "loss": 1.754, "step": 7890 }, { "epoch": 0.008574224328642036, "grad_norm": 29.617116928100586, "learning_rate": 4.999115707086833e-06, "loss": 2.0592, "step": 7895 }, { "epoch": 0.008579654489711474, "grad_norm": 34.512451171875, "learning_rate": 4.9991145723539234e-06, "loss": 1.5214, "step": 7900 }, { "epoch": 0.008585084650780912, "grad_norm": 32.752872467041016, "learning_rate": 4.999113436893559e-06, "loss": 1.6237, "step": 7905 }, { "epoch": 0.00859051481185035, "grad_norm": 108.36131286621094, "learning_rate": 4.999112300705741e-06, "loss": 1.9148, "step": 7910 }, { "epoch": 0.008595944972919788, "grad_norm": 18.0410213470459, "learning_rate": 4.99911116379047e-06, "loss": 1.4917, "step": 7915 }, { "epoch": 0.008601375133989224, "grad_norm": 14.293110847473145, "learning_rate": 4.999110026147745e-06, "loss": 1.6145, "step": 7920 }, { "epoch": 0.008606805295058662, "grad_norm": 41.297882080078125, "learning_rate": 4.999108887777568e-06, "loss": 2.3159, "step": 7925 }, { "epoch": 0.0086122354561281, "grad_norm": 45.22941207885742, "learning_rate": 4.999107748679937e-06, "loss": 1.3544, "step": 7930 }, { "epoch": 0.008617665617197538, "grad_norm": 21.719587326049805, "learning_rate": 4.999106608854855e-06, "loss": 1.9246, "step": 7935 }, { "epoch": 0.008623095778266974, "grad_norm": 12.734528541564941, "learning_rate": 4.99910546830232e-06, "loss": 1.9338, "step": 7940 }, { "epoch": 0.008628525939336412, "grad_norm": 24.431562423706055, "learning_rate": 4.999104327022334e-06, "loss": 1.1313, "step": 7945 }, { "epoch": 0.00863395610040585, "grad_norm": 14.78453254699707, "learning_rate": 4.999103185014896e-06, "loss": 2.0159, "step": 7950 }, { "epoch": 0.008639386261475288, "grad_norm": 21.46977424621582, "learning_rate": 4.9991020422800085e-06, "loss": 1.5765, "step": 7955 }, { "epoch": 0.008644816422544726, "grad_norm": 19.287315368652344, "learning_rate": 4.9991008988176695e-06, "loss": 1.5996, "step": 7960 }, { "epoch": 0.008650246583614163, "grad_norm": 12.960758209228516, "learning_rate": 4.999099754627881e-06, "loss": 1.7038, "step": 7965 }, { "epoch": 0.0086556767446836, "grad_norm": 76.26851654052734, "learning_rate": 4.999098609710642e-06, "loss": 1.4668, "step": 7970 }, { "epoch": 0.008661106905753039, "grad_norm": 22.83784294128418, "learning_rate": 4.999097464065954e-06, "loss": 1.7677, "step": 7975 }, { "epoch": 0.008666537066822477, "grad_norm": 37.607933044433594, "learning_rate": 4.999096317693816e-06, "loss": 1.8728, "step": 7980 }, { "epoch": 0.008671967227891915, "grad_norm": 33.14974594116211, "learning_rate": 4.999095170594229e-06, "loss": 2.5947, "step": 7985 }, { "epoch": 0.00867739738896135, "grad_norm": 23.50336265563965, "learning_rate": 4.999094022767194e-06, "loss": 1.7086, "step": 7990 }, { "epoch": 0.008682827550030789, "grad_norm": 86.94734191894531, "learning_rate": 4.99909287421271e-06, "loss": 1.1481, "step": 7995 }, { "epoch": 0.008688257711100227, "grad_norm": 24.385639190673828, "learning_rate": 4.999091724930779e-06, "loss": 1.9853, "step": 8000 }, { "epoch": 0.008693687872169665, "grad_norm": 17.221662521362305, "learning_rate": 4.9990905749214e-06, "loss": 1.0468, "step": 8005 }, { "epoch": 0.008699118033239101, "grad_norm": 20.3345947265625, "learning_rate": 4.9990894241845734e-06, "loss": 2.0832, "step": 8010 }, { "epoch": 0.00870454819430854, "grad_norm": 183.61053466796875, "learning_rate": 4.999088272720301e-06, "loss": 2.5515, "step": 8015 }, { "epoch": 0.008709978355377977, "grad_norm": 29.827402114868164, "learning_rate": 4.999087120528581e-06, "loss": 1.5449, "step": 8020 }, { "epoch": 0.008715408516447415, "grad_norm": 41.41845703125, "learning_rate": 4.999085967609415e-06, "loss": 1.9768, "step": 8025 }, { "epoch": 0.008720838677516853, "grad_norm": 45.70603561401367, "learning_rate": 4.999084813962803e-06, "loss": 1.7672, "step": 8030 }, { "epoch": 0.00872626883858629, "grad_norm": 58.876991271972656, "learning_rate": 4.999083659588746e-06, "loss": 0.9201, "step": 8035 }, { "epoch": 0.008731698999655728, "grad_norm": 181.5756072998047, "learning_rate": 4.999082504487242e-06, "loss": 1.6636, "step": 8040 }, { "epoch": 0.008737129160725166, "grad_norm": 37.07171630859375, "learning_rate": 4.999081348658296e-06, "loss": 1.5964, "step": 8045 }, { "epoch": 0.008742559321794604, "grad_norm": 25.96610450744629, "learning_rate": 4.999080192101903e-06, "loss": 1.6995, "step": 8050 }, { "epoch": 0.008747989482864042, "grad_norm": 21.46846580505371, "learning_rate": 4.999079034818067e-06, "loss": 1.7622, "step": 8055 }, { "epoch": 0.008753419643933478, "grad_norm": 34.03228759765625, "learning_rate": 4.999077876806787e-06, "loss": 1.6733, "step": 8060 }, { "epoch": 0.008758849805002916, "grad_norm": 25.63414764404297, "learning_rate": 4.999076718068063e-06, "loss": 1.5583, "step": 8065 }, { "epoch": 0.008764279966072354, "grad_norm": 63.75238800048828, "learning_rate": 4.9990755586018955e-06, "loss": 1.7739, "step": 8070 }, { "epoch": 0.008769710127141792, "grad_norm": 34.067962646484375, "learning_rate": 4.999074398408287e-06, "loss": 1.8477, "step": 8075 }, { "epoch": 0.008775140288211228, "grad_norm": 12.600729942321777, "learning_rate": 4.999073237487234e-06, "loss": 1.5176, "step": 8080 }, { "epoch": 0.008780570449280666, "grad_norm": 20.399429321289062, "learning_rate": 4.999072075838739e-06, "loss": 1.9939, "step": 8085 }, { "epoch": 0.008786000610350104, "grad_norm": 20.724525451660156, "learning_rate": 4.999070913462803e-06, "loss": 1.921, "step": 8090 }, { "epoch": 0.008791430771419542, "grad_norm": 19.829160690307617, "learning_rate": 4.999069750359425e-06, "loss": 1.6, "step": 8095 }, { "epoch": 0.00879686093248898, "grad_norm": 53.9341926574707, "learning_rate": 4.999068586528606e-06, "loss": 1.0852, "step": 8100 }, { "epoch": 0.008802291093558417, "grad_norm": 27.516576766967773, "learning_rate": 4.999067421970346e-06, "loss": 1.8536, "step": 8105 }, { "epoch": 0.008807721254627855, "grad_norm": 23.92047882080078, "learning_rate": 4.9990662566846446e-06, "loss": 1.5596, "step": 8110 }, { "epoch": 0.008813151415697293, "grad_norm": 16.49608612060547, "learning_rate": 4.999065090671504e-06, "loss": 2.0201, "step": 8115 }, { "epoch": 0.00881858157676673, "grad_norm": 25.979291915893555, "learning_rate": 4.999063923930924e-06, "loss": 1.6418, "step": 8120 }, { "epoch": 0.008824011737836169, "grad_norm": 22.58542251586914, "learning_rate": 4.9990627564629045e-06, "loss": 1.8115, "step": 8125 }, { "epoch": 0.008829441898905605, "grad_norm": 42.39257049560547, "learning_rate": 4.999061588267446e-06, "loss": 2.0729, "step": 8130 }, { "epoch": 0.008834872059975043, "grad_norm": 29.63323211669922, "learning_rate": 4.9990604193445465e-06, "loss": 1.5795, "step": 8135 }, { "epoch": 0.00884030222104448, "grad_norm": 23.949913024902344, "learning_rate": 4.999059249694211e-06, "loss": 1.7183, "step": 8140 }, { "epoch": 0.008845732382113919, "grad_norm": 45.619205474853516, "learning_rate": 4.9990580793164365e-06, "loss": 1.3903, "step": 8145 }, { "epoch": 0.008851162543183355, "grad_norm": 15.319489479064941, "learning_rate": 4.999056908211224e-06, "loss": 1.6723, "step": 8150 }, { "epoch": 0.008856592704252793, "grad_norm": 68.36441802978516, "learning_rate": 4.999055736378574e-06, "loss": 1.7441, "step": 8155 }, { "epoch": 0.008862022865322231, "grad_norm": 16.86027717590332, "learning_rate": 4.999054563818488e-06, "loss": 1.4406, "step": 8160 }, { "epoch": 0.00886745302639167, "grad_norm": 27.573381423950195, "learning_rate": 4.999053390530964e-06, "loss": 1.4452, "step": 8165 }, { "epoch": 0.008872883187461107, "grad_norm": 17.50723648071289, "learning_rate": 4.999052216516004e-06, "loss": 1.3067, "step": 8170 }, { "epoch": 0.008878313348530543, "grad_norm": 11.881651878356934, "learning_rate": 4.999051041773609e-06, "loss": 2.0333, "step": 8175 }, { "epoch": 0.008883743509599982, "grad_norm": 22.59203338623047, "learning_rate": 4.999049866303776e-06, "loss": 1.8097, "step": 8180 }, { "epoch": 0.00888917367066942, "grad_norm": 15.522335052490234, "learning_rate": 4.999048690106509e-06, "loss": 1.931, "step": 8185 }, { "epoch": 0.008894603831738858, "grad_norm": 252.6591339111328, "learning_rate": 4.999047513181807e-06, "loss": 1.6164, "step": 8190 }, { "epoch": 0.008900033992808296, "grad_norm": 13.865987777709961, "learning_rate": 4.99904633552967e-06, "loss": 1.8144, "step": 8195 }, { "epoch": 0.008905464153877732, "grad_norm": 25.156538009643555, "learning_rate": 4.999045157150099e-06, "loss": 1.5238, "step": 8200 }, { "epoch": 0.00891089431494717, "grad_norm": 15.954425811767578, "learning_rate": 4.999043978043094e-06, "loss": 1.5411, "step": 8205 }, { "epoch": 0.008916324476016608, "grad_norm": 13.602270126342773, "learning_rate": 4.999042798208654e-06, "loss": 1.7911, "step": 8210 }, { "epoch": 0.008921754637086046, "grad_norm": 43.360374450683594, "learning_rate": 4.999041617646781e-06, "loss": 2.2795, "step": 8215 }, { "epoch": 0.008927184798155482, "grad_norm": 19.74899673461914, "learning_rate": 4.999040436357476e-06, "loss": 1.7423, "step": 8220 }, { "epoch": 0.00893261495922492, "grad_norm": 30.853069305419922, "learning_rate": 4.999039254340739e-06, "loss": 1.4851, "step": 8225 }, { "epoch": 0.008938045120294358, "grad_norm": 40.98056411743164, "learning_rate": 4.999038071596568e-06, "loss": 1.6805, "step": 8230 }, { "epoch": 0.008943475281363796, "grad_norm": 117.35905456542969, "learning_rate": 4.999036888124966e-06, "loss": 1.3548, "step": 8235 }, { "epoch": 0.008948905442433234, "grad_norm": 26.41805076599121, "learning_rate": 4.999035703925932e-06, "loss": 1.7522, "step": 8240 }, { "epoch": 0.00895433560350267, "grad_norm": 12.696758270263672, "learning_rate": 4.999034518999467e-06, "loss": 1.5687, "step": 8245 }, { "epoch": 0.008959765764572108, "grad_norm": 16.03137969970703, "learning_rate": 4.999033333345571e-06, "loss": 1.7175, "step": 8250 }, { "epoch": 0.008965195925641547, "grad_norm": 30.946744918823242, "learning_rate": 4.999032146964244e-06, "loss": 1.6844, "step": 8255 }, { "epoch": 0.008970626086710985, "grad_norm": 19.123014450073242, "learning_rate": 4.999030959855487e-06, "loss": 1.5666, "step": 8260 }, { "epoch": 0.00897605624778042, "grad_norm": 17.968978881835938, "learning_rate": 4.9990297720193e-06, "loss": 2.1109, "step": 8265 }, { "epoch": 0.008981486408849859, "grad_norm": 13.16151237487793, "learning_rate": 4.999028583455684e-06, "loss": 1.9496, "step": 8270 }, { "epoch": 0.008986916569919297, "grad_norm": 19.519994735717773, "learning_rate": 4.999027394164639e-06, "loss": 1.7826, "step": 8275 }, { "epoch": 0.008992346730988735, "grad_norm": 17.081226348876953, "learning_rate": 4.999026204146164e-06, "loss": 2.1492, "step": 8280 }, { "epoch": 0.008997776892058173, "grad_norm": 29.318876266479492, "learning_rate": 4.999025013400261e-06, "loss": 1.5137, "step": 8285 }, { "epoch": 0.00900320705312761, "grad_norm": 44.24159622192383, "learning_rate": 4.99902382192693e-06, "loss": 1.625, "step": 8290 }, { "epoch": 0.009008637214197047, "grad_norm": 16.587265014648438, "learning_rate": 4.999022629726171e-06, "loss": 1.4361, "step": 8295 }, { "epoch": 0.009014067375266485, "grad_norm": 19.846797943115234, "learning_rate": 4.999021436797985e-06, "loss": 1.7206, "step": 8300 }, { "epoch": 0.009019497536335923, "grad_norm": 21.181072235107422, "learning_rate": 4.999020243142372e-06, "loss": 1.4431, "step": 8305 }, { "epoch": 0.009024927697405361, "grad_norm": 36.0201530456543, "learning_rate": 4.9990190487593316e-06, "loss": 1.503, "step": 8310 }, { "epoch": 0.009030357858474797, "grad_norm": 22.633039474487305, "learning_rate": 4.999017853648864e-06, "loss": 1.8096, "step": 8315 }, { "epoch": 0.009035788019544235, "grad_norm": 33.014060974121094, "learning_rate": 4.999016657810972e-06, "loss": 1.7282, "step": 8320 }, { "epoch": 0.009041218180613673, "grad_norm": 22.819868087768555, "learning_rate": 4.999015461245653e-06, "loss": 2.1421, "step": 8325 }, { "epoch": 0.009046648341683112, "grad_norm": 160.66685485839844, "learning_rate": 4.999014263952909e-06, "loss": 1.901, "step": 8330 }, { "epoch": 0.009052078502752548, "grad_norm": 11.567634582519531, "learning_rate": 4.999013065932741e-06, "loss": 1.9472, "step": 8335 }, { "epoch": 0.009057508663821986, "grad_norm": 20.303396224975586, "learning_rate": 4.999011867185148e-06, "loss": 1.6805, "step": 8340 }, { "epoch": 0.009062938824891424, "grad_norm": 24.169160842895508, "learning_rate": 4.999010667710129e-06, "loss": 1.5902, "step": 8345 }, { "epoch": 0.009068368985960862, "grad_norm": 37.51234817504883, "learning_rate": 4.9990094675076875e-06, "loss": 1.656, "step": 8350 }, { "epoch": 0.0090737991470303, "grad_norm": 24.280128479003906, "learning_rate": 4.999008266577821e-06, "loss": 1.7687, "step": 8355 }, { "epoch": 0.009079229308099736, "grad_norm": 38.182857513427734, "learning_rate": 4.9990070649205335e-06, "loss": 1.9782, "step": 8360 }, { "epoch": 0.009084659469169174, "grad_norm": 24.08686065673828, "learning_rate": 4.999005862535821e-06, "loss": 1.9, "step": 8365 }, { "epoch": 0.009090089630238612, "grad_norm": 27.141740798950195, "learning_rate": 4.9990046594236876e-06, "loss": 1.7635, "step": 8370 }, { "epoch": 0.00909551979130805, "grad_norm": 38.502288818359375, "learning_rate": 4.99900345558413e-06, "loss": 1.6121, "step": 8375 }, { "epoch": 0.009100949952377488, "grad_norm": 20.119665145874023, "learning_rate": 4.999002251017152e-06, "loss": 1.5742, "step": 8380 }, { "epoch": 0.009106380113446924, "grad_norm": 24.126747131347656, "learning_rate": 4.999001045722752e-06, "loss": 1.4151, "step": 8385 }, { "epoch": 0.009111810274516362, "grad_norm": 43.51616287231445, "learning_rate": 4.998999839700931e-06, "loss": 1.241, "step": 8390 }, { "epoch": 0.0091172404355858, "grad_norm": 19.502750396728516, "learning_rate": 4.998998632951689e-06, "loss": 1.4592, "step": 8395 }, { "epoch": 0.009122670596655238, "grad_norm": 18.31578826904297, "learning_rate": 4.9989974254750264e-06, "loss": 1.8756, "step": 8400 }, { "epoch": 0.009128100757724675, "grad_norm": 12.338918685913086, "learning_rate": 4.998996217270945e-06, "loss": 1.7286, "step": 8405 }, { "epoch": 0.009133530918794113, "grad_norm": 45.18296813964844, "learning_rate": 4.998995008339442e-06, "loss": 1.757, "step": 8410 }, { "epoch": 0.00913896107986355, "grad_norm": 15.716870307922363, "learning_rate": 4.99899379868052e-06, "loss": 2.4209, "step": 8415 }, { "epoch": 0.009144391240932989, "grad_norm": 30.474384307861328, "learning_rate": 4.998992588294179e-06, "loss": 1.6878, "step": 8420 }, { "epoch": 0.009149821402002427, "grad_norm": 33.146217346191406, "learning_rate": 4.99899137718042e-06, "loss": 1.6395, "step": 8425 }, { "epoch": 0.009155251563071863, "grad_norm": 24.674135208129883, "learning_rate": 4.998990165339242e-06, "loss": 1.7052, "step": 8430 }, { "epoch": 0.009160681724141301, "grad_norm": 53.53852462768555, "learning_rate": 4.9989889527706455e-06, "loss": 2.2794, "step": 8435 }, { "epoch": 0.00916611188521074, "grad_norm": 97.64529418945312, "learning_rate": 4.998987739474632e-06, "loss": 1.5909, "step": 8440 }, { "epoch": 0.009171542046280177, "grad_norm": 21.233366012573242, "learning_rate": 4.998986525451202e-06, "loss": 1.4815, "step": 8445 }, { "epoch": 0.009176972207349615, "grad_norm": 94.1326675415039, "learning_rate": 4.998985310700354e-06, "loss": 1.7906, "step": 8450 }, { "epoch": 0.009182402368419051, "grad_norm": 16.273571014404297, "learning_rate": 4.998984095222089e-06, "loss": 1.2933, "step": 8455 }, { "epoch": 0.00918783252948849, "grad_norm": 28.001834869384766, "learning_rate": 4.998982879016409e-06, "loss": 1.5462, "step": 8460 }, { "epoch": 0.009193262690557927, "grad_norm": 27.296607971191406, "learning_rate": 4.998981662083312e-06, "loss": 1.7731, "step": 8465 }, { "epoch": 0.009198692851627365, "grad_norm": 26.759706497192383, "learning_rate": 4.9989804444228e-06, "loss": 2.1575, "step": 8470 }, { "epoch": 0.009204123012696802, "grad_norm": 38.9648551940918, "learning_rate": 4.998979226034872e-06, "loss": 1.2963, "step": 8475 }, { "epoch": 0.00920955317376624, "grad_norm": 21.49458885192871, "learning_rate": 4.9989780069195305e-06, "loss": 1.6885, "step": 8480 }, { "epoch": 0.009214983334835678, "grad_norm": 27.11126136779785, "learning_rate": 4.998976787076774e-06, "loss": 2.6253, "step": 8485 }, { "epoch": 0.009220413495905116, "grad_norm": 21.1065731048584, "learning_rate": 4.998975566506603e-06, "loss": 1.4379, "step": 8490 }, { "epoch": 0.009225843656974554, "grad_norm": 17.88996696472168, "learning_rate": 4.9989743452090176e-06, "loss": 1.6735, "step": 8495 }, { "epoch": 0.00923127381804399, "grad_norm": 18.463180541992188, "learning_rate": 4.9989731231840195e-06, "loss": 2.2496, "step": 8500 }, { "epoch": 0.009236703979113428, "grad_norm": 18.458919525146484, "learning_rate": 4.99897190043161e-06, "loss": 1.7709, "step": 8505 }, { "epoch": 0.009242134140182866, "grad_norm": 24.93008804321289, "learning_rate": 4.998970676951785e-06, "loss": 1.192, "step": 8510 }, { "epoch": 0.009247564301252304, "grad_norm": 26.237445831298828, "learning_rate": 4.99896945274455e-06, "loss": 2.344, "step": 8515 }, { "epoch": 0.009252994462321742, "grad_norm": 73.57440185546875, "learning_rate": 4.9989682278099015e-06, "loss": 1.3405, "step": 8520 }, { "epoch": 0.009258424623391178, "grad_norm": 12.460958480834961, "learning_rate": 4.998967002147842e-06, "loss": 1.5011, "step": 8525 }, { "epoch": 0.009263854784460616, "grad_norm": 18.792449951171875, "learning_rate": 4.9989657757583706e-06, "loss": 1.7795, "step": 8530 }, { "epoch": 0.009269284945530054, "grad_norm": 19.5899658203125, "learning_rate": 4.998964548641489e-06, "loss": 1.6587, "step": 8535 }, { "epoch": 0.009274715106599492, "grad_norm": 17.975248336791992, "learning_rate": 4.998963320797197e-06, "loss": 1.7166, "step": 8540 }, { "epoch": 0.009280145267668929, "grad_norm": 16.93927764892578, "learning_rate": 4.998962092225495e-06, "loss": 1.7265, "step": 8545 }, { "epoch": 0.009285575428738367, "grad_norm": 18.029043197631836, "learning_rate": 4.998960862926382e-06, "loss": 1.9749, "step": 8550 }, { "epoch": 0.009291005589807805, "grad_norm": 166.98948669433594, "learning_rate": 4.99895963289986e-06, "loss": 1.2687, "step": 8555 }, { "epoch": 0.009296435750877243, "grad_norm": 20.76036834716797, "learning_rate": 4.998958402145929e-06, "loss": 1.8301, "step": 8560 }, { "epoch": 0.00930186591194668, "grad_norm": 25.021751403808594, "learning_rate": 4.998957170664589e-06, "loss": 1.1827, "step": 8565 }, { "epoch": 0.009307296073016117, "grad_norm": 23.23174476623535, "learning_rate": 4.998955938455841e-06, "loss": 2.2178, "step": 8570 }, { "epoch": 0.009312726234085555, "grad_norm": 62.662784576416016, "learning_rate": 4.998954705519684e-06, "loss": 2.076, "step": 8575 }, { "epoch": 0.009318156395154993, "grad_norm": 21.427583694458008, "learning_rate": 4.998953471856121e-06, "loss": 2.3292, "step": 8580 }, { "epoch": 0.009323586556224431, "grad_norm": 23.11036491394043, "learning_rate": 4.998952237465149e-06, "loss": 1.5994, "step": 8585 }, { "epoch": 0.00932901671729387, "grad_norm": 13.423990249633789, "learning_rate": 4.99895100234677e-06, "loss": 1.485, "step": 8590 }, { "epoch": 0.009334446878363305, "grad_norm": 17.512332916259766, "learning_rate": 4.998949766500985e-06, "loss": 1.9965, "step": 8595 }, { "epoch": 0.009339877039432743, "grad_norm": 46.299407958984375, "learning_rate": 4.998948529927795e-06, "loss": 1.9629, "step": 8600 }, { "epoch": 0.009345307200502181, "grad_norm": 32.41817855834961, "learning_rate": 4.998947292627198e-06, "loss": 1.6205, "step": 8605 }, { "epoch": 0.00935073736157162, "grad_norm": 16.902584075927734, "learning_rate": 4.9989460545991954e-06, "loss": 1.4931, "step": 8610 }, { "epoch": 0.009356167522641056, "grad_norm": 27.710004806518555, "learning_rate": 4.998944815843787e-06, "loss": 1.6141, "step": 8615 }, { "epoch": 0.009361597683710494, "grad_norm": 51.12255859375, "learning_rate": 4.998943576360974e-06, "loss": 1.684, "step": 8620 }, { "epoch": 0.009367027844779932, "grad_norm": 87.46617126464844, "learning_rate": 4.998942336150757e-06, "loss": 2.0348, "step": 8625 }, { "epoch": 0.00937245800584937, "grad_norm": 50.443946838378906, "learning_rate": 4.9989410952131366e-06, "loss": 2.1431, "step": 8630 }, { "epoch": 0.009377888166918808, "grad_norm": 72.76702117919922, "learning_rate": 4.998939853548112e-06, "loss": 1.543, "step": 8635 }, { "epoch": 0.009383318327988244, "grad_norm": 65.05448150634766, "learning_rate": 4.998938611155683e-06, "loss": 2.3777, "step": 8640 }, { "epoch": 0.009388748489057682, "grad_norm": 22.75779151916504, "learning_rate": 4.998937368035852e-06, "loss": 1.9478, "step": 8645 }, { "epoch": 0.00939417865012712, "grad_norm": 38.440731048583984, "learning_rate": 4.998936124188618e-06, "loss": 1.8323, "step": 8650 }, { "epoch": 0.009399608811196558, "grad_norm": 23.561548233032227, "learning_rate": 4.998934879613982e-06, "loss": 1.6814, "step": 8655 }, { "epoch": 0.009405038972265996, "grad_norm": 29.118911743164062, "learning_rate": 4.998933634311944e-06, "loss": 1.6305, "step": 8660 }, { "epoch": 0.009410469133335432, "grad_norm": 22.849157333374023, "learning_rate": 4.998932388282504e-06, "loss": 2.3112, "step": 8665 }, { "epoch": 0.00941589929440487, "grad_norm": 203.41893005371094, "learning_rate": 4.9989311415256635e-06, "loss": 1.9174, "step": 8670 }, { "epoch": 0.009421329455474308, "grad_norm": 18.682907104492188, "learning_rate": 4.9989298940414215e-06, "loss": 1.3058, "step": 8675 }, { "epoch": 0.009426759616543746, "grad_norm": 17.62217903137207, "learning_rate": 4.998928645829779e-06, "loss": 1.1653, "step": 8680 }, { "epoch": 0.009432189777613183, "grad_norm": 39.739559173583984, "learning_rate": 4.998927396890737e-06, "loss": 1.9617, "step": 8685 }, { "epoch": 0.00943761993868262, "grad_norm": 31.617021560668945, "learning_rate": 4.998926147224295e-06, "loss": 2.087, "step": 8690 }, { "epoch": 0.009443050099752059, "grad_norm": 16.051267623901367, "learning_rate": 4.998924896830454e-06, "loss": 2.0961, "step": 8695 }, { "epoch": 0.009448480260821497, "grad_norm": 55.97182846069336, "learning_rate": 4.998923645709213e-06, "loss": 1.989, "step": 8700 }, { "epoch": 0.009453910421890935, "grad_norm": 19.184673309326172, "learning_rate": 4.998922393860574e-06, "loss": 2.0161, "step": 8705 }, { "epoch": 0.009459340582960371, "grad_norm": 22.634912490844727, "learning_rate": 4.998921141284537e-06, "loss": 2.0923, "step": 8710 }, { "epoch": 0.009464770744029809, "grad_norm": 18.506601333618164, "learning_rate": 4.998919887981102e-06, "loss": 1.3251, "step": 8715 }, { "epoch": 0.009470200905099247, "grad_norm": 73.42236328125, "learning_rate": 4.998918633950269e-06, "loss": 2.2562, "step": 8720 }, { "epoch": 0.009475631066168685, "grad_norm": 15.226593971252441, "learning_rate": 4.998917379192039e-06, "loss": 1.9405, "step": 8725 }, { "epoch": 0.009481061227238123, "grad_norm": 16.9351806640625, "learning_rate": 4.998916123706412e-06, "loss": 1.9237, "step": 8730 }, { "epoch": 0.00948649138830756, "grad_norm": 35.22544860839844, "learning_rate": 4.998914867493389e-06, "loss": 1.4916, "step": 8735 }, { "epoch": 0.009491921549376997, "grad_norm": 30.753877639770508, "learning_rate": 4.998913610552969e-06, "loss": 1.8376, "step": 8740 }, { "epoch": 0.009497351710446435, "grad_norm": 16.031009674072266, "learning_rate": 4.998912352885153e-06, "loss": 1.5763, "step": 8745 }, { "epoch": 0.009502781871515873, "grad_norm": 48.03716278076172, "learning_rate": 4.998911094489944e-06, "loss": 1.8073, "step": 8750 }, { "epoch": 0.00950821203258531, "grad_norm": 63.08102035522461, "learning_rate": 4.998909835367339e-06, "loss": 1.9789, "step": 8755 }, { "epoch": 0.009513642193654748, "grad_norm": 31.296607971191406, "learning_rate": 4.998908575517338e-06, "loss": 1.3705, "step": 8760 }, { "epoch": 0.009519072354724186, "grad_norm": 39.05985641479492, "learning_rate": 4.998907314939944e-06, "loss": 1.7632, "step": 8765 }, { "epoch": 0.009524502515793624, "grad_norm": 41.813594818115234, "learning_rate": 4.998906053635156e-06, "loss": 1.9571, "step": 8770 }, { "epoch": 0.009529932676863062, "grad_norm": 30.4575138092041, "learning_rate": 4.998904791602973e-06, "loss": 1.7893, "step": 8775 }, { "epoch": 0.009535362837932498, "grad_norm": 17.204696655273438, "learning_rate": 4.998903528843398e-06, "loss": 1.4872, "step": 8780 }, { "epoch": 0.009540792999001936, "grad_norm": 15.45145034790039, "learning_rate": 4.99890226535643e-06, "loss": 1.6127, "step": 8785 }, { "epoch": 0.009546223160071374, "grad_norm": 15.840797424316406, "learning_rate": 4.9989010011420705e-06, "loss": 1.3924, "step": 8790 }, { "epoch": 0.009551653321140812, "grad_norm": 40.90594482421875, "learning_rate": 4.998899736200318e-06, "loss": 1.8226, "step": 8795 }, { "epoch": 0.00955708348221025, "grad_norm": 18.75249671936035, "learning_rate": 4.998898470531174e-06, "loss": 1.484, "step": 8800 }, { "epoch": 0.009562513643279686, "grad_norm": 188.29086303710938, "learning_rate": 4.9988972041346386e-06, "loss": 1.3169, "step": 8805 }, { "epoch": 0.009567943804349124, "grad_norm": 18.111217498779297, "learning_rate": 4.998895937010712e-06, "loss": 1.7521, "step": 8810 }, { "epoch": 0.009573373965418562, "grad_norm": 44.515785217285156, "learning_rate": 4.998894669159395e-06, "loss": 2.0956, "step": 8815 }, { "epoch": 0.009578804126488, "grad_norm": 19.888521194458008, "learning_rate": 4.998893400580688e-06, "loss": 1.3578, "step": 8820 }, { "epoch": 0.009584234287557437, "grad_norm": 104.6119384765625, "learning_rate": 4.998892131274591e-06, "loss": 1.9759, "step": 8825 }, { "epoch": 0.009589664448626875, "grad_norm": 22.433483123779297, "learning_rate": 4.998890861241105e-06, "loss": 1.4981, "step": 8830 }, { "epoch": 0.009595094609696313, "grad_norm": 15.799570083618164, "learning_rate": 4.998889590480229e-06, "loss": 1.6285, "step": 8835 }, { "epoch": 0.00960052477076575, "grad_norm": 42.55540084838867, "learning_rate": 4.998888318991965e-06, "loss": 1.8967, "step": 8840 }, { "epoch": 0.009605954931835189, "grad_norm": 27.623022079467773, "learning_rate": 4.998887046776313e-06, "loss": 1.3348, "step": 8845 }, { "epoch": 0.009611385092904625, "grad_norm": 22.504501342773438, "learning_rate": 4.998885773833272e-06, "loss": 1.3874, "step": 8850 }, { "epoch": 0.009616815253974063, "grad_norm": 51.829566955566406, "learning_rate": 4.9988845001628436e-06, "loss": 1.45, "step": 8855 }, { "epoch": 0.009622245415043501, "grad_norm": 29.936769485473633, "learning_rate": 4.9988832257650284e-06, "loss": 1.3377, "step": 8860 }, { "epoch": 0.009627675576112939, "grad_norm": 43.02104187011719, "learning_rate": 4.9988819506398255e-06, "loss": 1.2474, "step": 8865 }, { "epoch": 0.009633105737182377, "grad_norm": 30.571073532104492, "learning_rate": 4.998880674787236e-06, "loss": 2.1802, "step": 8870 }, { "epoch": 0.009638535898251813, "grad_norm": 55.63262176513672, "learning_rate": 4.998879398207261e-06, "loss": 2.1038, "step": 8875 }, { "epoch": 0.009643966059321251, "grad_norm": 24.052160263061523, "learning_rate": 4.998878120899901e-06, "loss": 2.0738, "step": 8880 }, { "epoch": 0.00964939622039069, "grad_norm": 29.381080627441406, "learning_rate": 4.9988768428651545e-06, "loss": 2.1878, "step": 8885 }, { "epoch": 0.009654826381460127, "grad_norm": 53.82161331176758, "learning_rate": 4.998875564103023e-06, "loss": 1.4259, "step": 8890 }, { "epoch": 0.009660256542529564, "grad_norm": 30.629545211791992, "learning_rate": 4.998874284613508e-06, "loss": 2.1707, "step": 8895 }, { "epoch": 0.009665686703599002, "grad_norm": 25.882469177246094, "learning_rate": 4.9988730043966075e-06, "loss": 1.9842, "step": 8900 }, { "epoch": 0.00967111686466844, "grad_norm": 20.3397274017334, "learning_rate": 4.998871723452323e-06, "loss": 1.8161, "step": 8905 }, { "epoch": 0.009676547025737878, "grad_norm": 95.24870300292969, "learning_rate": 4.9988704417806556e-06, "loss": 1.9993, "step": 8910 }, { "epoch": 0.009681977186807316, "grad_norm": 94.81773376464844, "learning_rate": 4.998869159381605e-06, "loss": 1.8263, "step": 8915 }, { "epoch": 0.009687407347876752, "grad_norm": 21.738405227661133, "learning_rate": 4.998867876255171e-06, "loss": 1.29, "step": 8920 }, { "epoch": 0.00969283750894619, "grad_norm": 39.78136444091797, "learning_rate": 4.998866592401355e-06, "loss": 1.9923, "step": 8925 }, { "epoch": 0.009698267670015628, "grad_norm": 103.39131927490234, "learning_rate": 4.998865307820157e-06, "loss": 1.3508, "step": 8930 }, { "epoch": 0.009703697831085066, "grad_norm": 22.8281307220459, "learning_rate": 4.998864022511577e-06, "loss": 1.685, "step": 8935 }, { "epoch": 0.009709127992154504, "grad_norm": 21.082117080688477, "learning_rate": 4.998862736475616e-06, "loss": 1.3608, "step": 8940 }, { "epoch": 0.00971455815322394, "grad_norm": 16.224872589111328, "learning_rate": 4.998861449712274e-06, "loss": 1.8536, "step": 8945 }, { "epoch": 0.009719988314293378, "grad_norm": 25.1973876953125, "learning_rate": 4.9988601622215514e-06, "loss": 1.8734, "step": 8950 }, { "epoch": 0.009725418475362816, "grad_norm": 107.20414733886719, "learning_rate": 4.998858874003449e-06, "loss": 1.5495, "step": 8955 }, { "epoch": 0.009730848636432254, "grad_norm": 90.45844268798828, "learning_rate": 4.998857585057967e-06, "loss": 1.4436, "step": 8960 }, { "epoch": 0.00973627879750169, "grad_norm": 11.677785873413086, "learning_rate": 4.998856295385105e-06, "loss": 1.7574, "step": 8965 }, { "epoch": 0.009741708958571129, "grad_norm": 81.18023681640625, "learning_rate": 4.998855004984864e-06, "loss": 1.616, "step": 8970 }, { "epoch": 0.009747139119640567, "grad_norm": 30.863901138305664, "learning_rate": 4.998853713857244e-06, "loss": 2.2926, "step": 8975 }, { "epoch": 0.009752569280710005, "grad_norm": 24.330198287963867, "learning_rate": 4.9988524220022464e-06, "loss": 2.283, "step": 8980 }, { "epoch": 0.009757999441779443, "grad_norm": 16.245590209960938, "learning_rate": 4.998851129419871e-06, "loss": 1.4356, "step": 8985 }, { "epoch": 0.009763429602848879, "grad_norm": 17.16288185119629, "learning_rate": 4.998849836110117e-06, "loss": 1.465, "step": 8990 }, { "epoch": 0.009768859763918317, "grad_norm": 21.655797958374023, "learning_rate": 4.998848542072987e-06, "loss": 1.4186, "step": 8995 }, { "epoch": 0.009774289924987755, "grad_norm": 16.3906307220459, "learning_rate": 4.9988472473084795e-06, "loss": 1.9997, "step": 9000 }, { "epoch": 0.009779720086057193, "grad_norm": 51.08323669433594, "learning_rate": 4.9988459518165955e-06, "loss": 1.5247, "step": 9005 }, { "epoch": 0.009785150247126631, "grad_norm": 21.127389907836914, "learning_rate": 4.9988446555973355e-06, "loss": 1.5429, "step": 9010 }, { "epoch": 0.009790580408196067, "grad_norm": 26.07276153564453, "learning_rate": 4.9988433586507e-06, "loss": 1.2566, "step": 9015 }, { "epoch": 0.009796010569265505, "grad_norm": 49.454044342041016, "learning_rate": 4.99884206097669e-06, "loss": 1.4516, "step": 9020 }, { "epoch": 0.009801440730334943, "grad_norm": 32.88904571533203, "learning_rate": 4.998840762575304e-06, "loss": 2.2616, "step": 9025 }, { "epoch": 0.009806870891404381, "grad_norm": 48.63734817504883, "learning_rate": 4.998839463446543e-06, "loss": 2.8604, "step": 9030 }, { "epoch": 0.009812301052473818, "grad_norm": 14.246026039123535, "learning_rate": 4.998838163590409e-06, "loss": 2.7655, "step": 9035 }, { "epoch": 0.009817731213543256, "grad_norm": 23.254133224487305, "learning_rate": 4.9988368630069015e-06, "loss": 1.4571, "step": 9040 }, { "epoch": 0.009823161374612694, "grad_norm": 39.212493896484375, "learning_rate": 4.99883556169602e-06, "loss": 1.7854, "step": 9045 }, { "epoch": 0.009828591535682132, "grad_norm": 49.54895782470703, "learning_rate": 4.998834259657765e-06, "loss": 1.4902, "step": 9050 }, { "epoch": 0.00983402169675157, "grad_norm": 36.726646423339844, "learning_rate": 4.9988329568921375e-06, "loss": 1.697, "step": 9055 }, { "epoch": 0.009839451857821006, "grad_norm": 41.702964782714844, "learning_rate": 4.998831653399139e-06, "loss": 1.9481, "step": 9060 }, { "epoch": 0.009844882018890444, "grad_norm": 48.42988967895508, "learning_rate": 4.998830349178767e-06, "loss": 1.5439, "step": 9065 }, { "epoch": 0.009850312179959882, "grad_norm": 70.58255767822266, "learning_rate": 4.998829044231025e-06, "loss": 1.2535, "step": 9070 }, { "epoch": 0.00985574234102932, "grad_norm": 20.657838821411133, "learning_rate": 4.998827738555911e-06, "loss": 1.8714, "step": 9075 }, { "epoch": 0.009861172502098758, "grad_norm": 22.987871170043945, "learning_rate": 4.998826432153426e-06, "loss": 1.7348, "step": 9080 }, { "epoch": 0.009866602663168194, "grad_norm": 62.39743423461914, "learning_rate": 4.998825125023571e-06, "loss": 1.8402, "step": 9085 }, { "epoch": 0.009872032824237632, "grad_norm": 34.378814697265625, "learning_rate": 4.998823817166346e-06, "loss": 1.6419, "step": 9090 }, { "epoch": 0.00987746298530707, "grad_norm": 27.190773010253906, "learning_rate": 4.998822508581752e-06, "loss": 2.1186, "step": 9095 }, { "epoch": 0.009882893146376508, "grad_norm": 26.255891799926758, "learning_rate": 4.9988211992697874e-06, "loss": 1.8976, "step": 9100 }, { "epoch": 0.009888323307445945, "grad_norm": 30.234561920166016, "learning_rate": 4.998819889230455e-06, "loss": 1.7504, "step": 9105 }, { "epoch": 0.009893753468515383, "grad_norm": 49.32392883300781, "learning_rate": 4.9988185784637545e-06, "loss": 1.8818, "step": 9110 }, { "epoch": 0.00989918362958482, "grad_norm": 36.446563720703125, "learning_rate": 4.998817266969686e-06, "loss": 2.0875, "step": 9115 }, { "epoch": 0.009904613790654259, "grad_norm": 24.60091209411621, "learning_rate": 4.998815954748248e-06, "loss": 1.5838, "step": 9120 }, { "epoch": 0.009910043951723697, "grad_norm": 49.73931121826172, "learning_rate": 4.9988146417994445e-06, "loss": 1.8, "step": 9125 }, { "epoch": 0.009915474112793133, "grad_norm": 23.766387939453125, "learning_rate": 4.998813328123273e-06, "loss": 2.1173, "step": 9130 }, { "epoch": 0.009920904273862571, "grad_norm": 15.528092384338379, "learning_rate": 4.998812013719736e-06, "loss": 1.7811, "step": 9135 }, { "epoch": 0.009926334434932009, "grad_norm": 37.076416015625, "learning_rate": 4.998810698588832e-06, "loss": 1.2446, "step": 9140 }, { "epoch": 0.009931764596001447, "grad_norm": 20.91008758544922, "learning_rate": 4.998809382730563e-06, "loss": 2.0287, "step": 9145 }, { "epoch": 0.009937194757070885, "grad_norm": 30.27454376220703, "learning_rate": 4.998808066144928e-06, "loss": 1.7267, "step": 9150 }, { "epoch": 0.009942624918140321, "grad_norm": 29.702556610107422, "learning_rate": 4.9988067488319285e-06, "loss": 1.8528, "step": 9155 }, { "epoch": 0.00994805507920976, "grad_norm": 17.61436653137207, "learning_rate": 4.998805430791563e-06, "loss": 1.4933, "step": 9160 }, { "epoch": 0.009953485240279197, "grad_norm": 40.51751708984375, "learning_rate": 4.9988041120238355e-06, "loss": 2.508, "step": 9165 }, { "epoch": 0.009958915401348635, "grad_norm": 59.23405456542969, "learning_rate": 4.998802792528742e-06, "loss": 1.9608, "step": 9170 }, { "epoch": 0.009964345562418072, "grad_norm": 19.674224853515625, "learning_rate": 4.998801472306286e-06, "loss": 1.4432, "step": 9175 }, { "epoch": 0.00996977572348751, "grad_norm": 32.845420837402344, "learning_rate": 4.998800151356467e-06, "loss": 1.782, "step": 9180 }, { "epoch": 0.009975205884556948, "grad_norm": 49.683372497558594, "learning_rate": 4.998798829679285e-06, "loss": 2.1579, "step": 9185 }, { "epoch": 0.009980636045626386, "grad_norm": 24.900060653686523, "learning_rate": 4.998797507274741e-06, "loss": 1.6367, "step": 9190 }, { "epoch": 0.009986066206695824, "grad_norm": 17.374101638793945, "learning_rate": 4.998796184142834e-06, "loss": 1.9032, "step": 9195 }, { "epoch": 0.00999149636776526, "grad_norm": 17.527029037475586, "learning_rate": 4.998794860283567e-06, "loss": 1.2779, "step": 9200 }, { "epoch": 0.009996926528834698, "grad_norm": 14.446876525878906, "learning_rate": 4.998793535696938e-06, "loss": 1.4908, "step": 9205 }, { "epoch": 0.010002356689904136, "grad_norm": 22.69999122619629, "learning_rate": 4.9987922103829485e-06, "loss": 1.5381, "step": 9210 }, { "epoch": 0.010007786850973574, "grad_norm": 21.200729370117188, "learning_rate": 4.998790884341598e-06, "loss": 1.957, "step": 9215 }, { "epoch": 0.010013217012043012, "grad_norm": 19.62489128112793, "learning_rate": 4.998789557572889e-06, "loss": 1.6813, "step": 9220 }, { "epoch": 0.010018647173112448, "grad_norm": 66.23180389404297, "learning_rate": 4.9987882300768185e-06, "loss": 1.7611, "step": 9225 }, { "epoch": 0.010024077334181886, "grad_norm": 19.582347869873047, "learning_rate": 4.99878690185339e-06, "loss": 1.7026, "step": 9230 }, { "epoch": 0.010029507495251324, "grad_norm": 14.199991226196289, "learning_rate": 4.998785572902603e-06, "loss": 1.5066, "step": 9235 }, { "epoch": 0.010034937656320762, "grad_norm": 12.311741828918457, "learning_rate": 4.998784243224456e-06, "loss": 1.5515, "step": 9240 }, { "epoch": 0.010040367817390199, "grad_norm": 27.02737808227539, "learning_rate": 4.9987829128189514e-06, "loss": 1.6421, "step": 9245 }, { "epoch": 0.010045797978459637, "grad_norm": 20.213274002075195, "learning_rate": 4.998781581686089e-06, "loss": 0.9477, "step": 9250 }, { "epoch": 0.010051228139529075, "grad_norm": 18.520423889160156, "learning_rate": 4.99878024982587e-06, "loss": 1.8538, "step": 9255 }, { "epoch": 0.010056658300598513, "grad_norm": 33.1057014465332, "learning_rate": 4.998778917238294e-06, "loss": 2.441, "step": 9260 }, { "epoch": 0.01006208846166795, "grad_norm": 28.508975982666016, "learning_rate": 4.998777583923362e-06, "loss": 1.3944, "step": 9265 }, { "epoch": 0.010067518622737387, "grad_norm": 39.07247543334961, "learning_rate": 4.998776249881073e-06, "loss": 1.5604, "step": 9270 }, { "epoch": 0.010072948783806825, "grad_norm": 28.10300064086914, "learning_rate": 4.998774915111428e-06, "loss": 1.8421, "step": 9275 }, { "epoch": 0.010078378944876263, "grad_norm": 20.59554672241211, "learning_rate": 4.998773579614428e-06, "loss": 1.6258, "step": 9280 }, { "epoch": 0.010083809105945701, "grad_norm": 25.107728958129883, "learning_rate": 4.998772243390073e-06, "loss": 1.6858, "step": 9285 }, { "epoch": 0.010089239267015139, "grad_norm": 18.393814086914062, "learning_rate": 4.998770906438364e-06, "loss": 1.5996, "step": 9290 }, { "epoch": 0.010094669428084575, "grad_norm": 22.859451293945312, "learning_rate": 4.9987695687593e-06, "loss": 1.4196, "step": 9295 }, { "epoch": 0.010100099589154013, "grad_norm": 19.275739669799805, "learning_rate": 4.998768230352883e-06, "loss": 2.3234, "step": 9300 }, { "epoch": 0.010105529750223451, "grad_norm": 89.4050064086914, "learning_rate": 4.998766891219112e-06, "loss": 2.3825, "step": 9305 }, { "epoch": 0.01011095991129289, "grad_norm": 44.68021774291992, "learning_rate": 4.998765551357988e-06, "loss": 1.6651, "step": 9310 }, { "epoch": 0.010116390072362326, "grad_norm": 56.238800048828125, "learning_rate": 4.9987642107695115e-06, "loss": 1.1329, "step": 9315 }, { "epoch": 0.010121820233431764, "grad_norm": 32.83275604248047, "learning_rate": 4.998762869453683e-06, "loss": 1.3963, "step": 9320 }, { "epoch": 0.010127250394501202, "grad_norm": 123.61528015136719, "learning_rate": 4.998761527410502e-06, "loss": 2.2204, "step": 9325 }, { "epoch": 0.01013268055557064, "grad_norm": 15.55739974975586, "learning_rate": 4.99876018463997e-06, "loss": 1.3346, "step": 9330 }, { "epoch": 0.010138110716640078, "grad_norm": 20.810808181762695, "learning_rate": 4.998758841142086e-06, "loss": 2.0372, "step": 9335 }, { "epoch": 0.010143540877709514, "grad_norm": 37.64890670776367, "learning_rate": 4.9987574969168525e-06, "loss": 1.5758, "step": 9340 }, { "epoch": 0.010148971038778952, "grad_norm": 21.55962562561035, "learning_rate": 4.998756151964268e-06, "loss": 1.7754, "step": 9345 }, { "epoch": 0.01015440119984839, "grad_norm": 38.248172760009766, "learning_rate": 4.998754806284335e-06, "loss": 2.1799, "step": 9350 }, { "epoch": 0.010159831360917828, "grad_norm": 116.74456787109375, "learning_rate": 4.998753459877051e-06, "loss": 2.6799, "step": 9355 }, { "epoch": 0.010165261521987266, "grad_norm": 26.052515029907227, "learning_rate": 4.998752112742418e-06, "loss": 1.6377, "step": 9360 }, { "epoch": 0.010170691683056702, "grad_norm": 13.811615943908691, "learning_rate": 4.9987507648804366e-06, "loss": 1.7411, "step": 9365 }, { "epoch": 0.01017612184412614, "grad_norm": 23.367277145385742, "learning_rate": 4.998749416291107e-06, "loss": 1.6135, "step": 9370 }, { "epoch": 0.010181552005195578, "grad_norm": 54.51013946533203, "learning_rate": 4.998748066974429e-06, "loss": 1.9064, "step": 9375 }, { "epoch": 0.010186982166265016, "grad_norm": 13.195215225219727, "learning_rate": 4.998746716930404e-06, "loss": 1.8884, "step": 9380 }, { "epoch": 0.010192412327334453, "grad_norm": 22.894681930541992, "learning_rate": 4.998745366159032e-06, "loss": 2.1997, "step": 9385 }, { "epoch": 0.01019784248840389, "grad_norm": 16.0703125, "learning_rate": 4.998744014660313e-06, "loss": 1.6049, "step": 9390 }, { "epoch": 0.010203272649473329, "grad_norm": 96.32728576660156, "learning_rate": 4.998742662434248e-06, "loss": 2.0203, "step": 9395 }, { "epoch": 0.010208702810542767, "grad_norm": 28.328651428222656, "learning_rate": 4.998741309480836e-06, "loss": 1.41, "step": 9400 }, { "epoch": 0.010214132971612205, "grad_norm": 30.232540130615234, "learning_rate": 4.998739955800079e-06, "loss": 1.7287, "step": 9405 }, { "epoch": 0.010219563132681641, "grad_norm": 29.080411911010742, "learning_rate": 4.998738601391976e-06, "loss": 1.4281, "step": 9410 }, { "epoch": 0.010224993293751079, "grad_norm": 16.036361694335938, "learning_rate": 4.99873724625653e-06, "loss": 1.3588, "step": 9415 }, { "epoch": 0.010230423454820517, "grad_norm": 17.637073516845703, "learning_rate": 4.998735890393738e-06, "loss": 1.2129, "step": 9420 }, { "epoch": 0.010235853615889955, "grad_norm": 54.050079345703125, "learning_rate": 4.998734533803603e-06, "loss": 1.3412, "step": 9425 }, { "epoch": 0.010241283776959393, "grad_norm": 20.73946189880371, "learning_rate": 4.998733176486123e-06, "loss": 1.7485, "step": 9430 }, { "epoch": 0.01024671393802883, "grad_norm": 29.12095832824707, "learning_rate": 4.998731818441302e-06, "loss": 1.8134, "step": 9435 }, { "epoch": 0.010252144099098267, "grad_norm": 213.30775451660156, "learning_rate": 4.998730459669136e-06, "loss": 1.8484, "step": 9440 }, { "epoch": 0.010257574260167705, "grad_norm": 24.139705657958984, "learning_rate": 4.9987291001696296e-06, "loss": 1.5437, "step": 9445 }, { "epoch": 0.010263004421237143, "grad_norm": 15.576598167419434, "learning_rate": 4.998727739942779e-06, "loss": 1.8793, "step": 9450 }, { "epoch": 0.01026843458230658, "grad_norm": 16.5748233795166, "learning_rate": 4.998726378988589e-06, "loss": 1.4953, "step": 9455 }, { "epoch": 0.010273864743376018, "grad_norm": 27.82956314086914, "learning_rate": 4.998725017307056e-06, "loss": 1.5929, "step": 9460 }, { "epoch": 0.010279294904445456, "grad_norm": 16.396127700805664, "learning_rate": 4.998723654898183e-06, "loss": 1.5394, "step": 9465 }, { "epoch": 0.010284725065514894, "grad_norm": 23.010379791259766, "learning_rate": 4.998722291761969e-06, "loss": 1.7062, "step": 9470 }, { "epoch": 0.010290155226584332, "grad_norm": 14.450254440307617, "learning_rate": 4.998720927898416e-06, "loss": 1.6067, "step": 9475 }, { "epoch": 0.010295585387653768, "grad_norm": 23.562820434570312, "learning_rate": 4.998719563307522e-06, "loss": 1.691, "step": 9480 }, { "epoch": 0.010301015548723206, "grad_norm": 36.722686767578125, "learning_rate": 4.9987181979892895e-06, "loss": 1.6429, "step": 9485 }, { "epoch": 0.010306445709792644, "grad_norm": 19.98993682861328, "learning_rate": 4.998716831943718e-06, "loss": 1.8169, "step": 9490 }, { "epoch": 0.010311875870862082, "grad_norm": 43.6855354309082, "learning_rate": 4.998715465170809e-06, "loss": 2.0317, "step": 9495 }, { "epoch": 0.01031730603193152, "grad_norm": 31.667301177978516, "learning_rate": 4.99871409767056e-06, "loss": 1.6939, "step": 9500 }, { "epoch": 0.010322736193000956, "grad_norm": 45.85348892211914, "learning_rate": 4.9987127294429745e-06, "loss": 1.8241, "step": 9505 }, { "epoch": 0.010328166354070394, "grad_norm": 18.202075958251953, "learning_rate": 4.9987113604880526e-06, "loss": 2.1755, "step": 9510 }, { "epoch": 0.010333596515139832, "grad_norm": 14.863726615905762, "learning_rate": 4.998709990805793e-06, "loss": 1.7792, "step": 9515 }, { "epoch": 0.01033902667620927, "grad_norm": 15.036576271057129, "learning_rate": 4.998708620396197e-06, "loss": 1.7098, "step": 9520 }, { "epoch": 0.010344456837278707, "grad_norm": 16.949687957763672, "learning_rate": 4.9987072492592645e-06, "loss": 1.2353, "step": 9525 }, { "epoch": 0.010349886998348145, "grad_norm": 21.056861877441406, "learning_rate": 4.998705877394997e-06, "loss": 1.7309, "step": 9530 }, { "epoch": 0.010355317159417583, "grad_norm": 16.513608932495117, "learning_rate": 4.998704504803393e-06, "loss": 1.8742, "step": 9535 }, { "epoch": 0.01036074732048702, "grad_norm": 89.46842956542969, "learning_rate": 4.998703131484456e-06, "loss": 1.7776, "step": 9540 }, { "epoch": 0.010366177481556459, "grad_norm": 23.716297149658203, "learning_rate": 4.998701757438183e-06, "loss": 1.3131, "step": 9545 }, { "epoch": 0.010371607642625895, "grad_norm": 46.57939910888672, "learning_rate": 4.998700382664576e-06, "loss": 1.7581, "step": 9550 }, { "epoch": 0.010377037803695333, "grad_norm": 15.756389617919922, "learning_rate": 4.998699007163636e-06, "loss": 1.09, "step": 9555 }, { "epoch": 0.010382467964764771, "grad_norm": 35.05914306640625, "learning_rate": 4.9986976309353626e-06, "loss": 1.7966, "step": 9560 }, { "epoch": 0.010387898125834209, "grad_norm": 39.429969787597656, "learning_rate": 4.998696253979757e-06, "loss": 1.5836, "step": 9565 }, { "epoch": 0.010393328286903647, "grad_norm": 26.50185203552246, "learning_rate": 4.998694876296818e-06, "loss": 1.5896, "step": 9570 }, { "epoch": 0.010398758447973083, "grad_norm": 13.887907028198242, "learning_rate": 4.998693497886547e-06, "loss": 2.0304, "step": 9575 }, { "epoch": 0.010404188609042521, "grad_norm": 16.499900817871094, "learning_rate": 4.9986921187489445e-06, "loss": 1.3949, "step": 9580 }, { "epoch": 0.01040961877011196, "grad_norm": 33.90262222290039, "learning_rate": 4.998690738884011e-06, "loss": 2.143, "step": 9585 }, { "epoch": 0.010415048931181397, "grad_norm": 32.041595458984375, "learning_rate": 4.998689358291746e-06, "loss": 2.0022, "step": 9590 }, { "epoch": 0.010420479092250834, "grad_norm": 35.7755241394043, "learning_rate": 4.998687976972151e-06, "loss": 1.6544, "step": 9595 }, { "epoch": 0.010425909253320272, "grad_norm": 36.71112823486328, "learning_rate": 4.998686594925226e-06, "loss": 2.0628, "step": 9600 }, { "epoch": 0.01043133941438971, "grad_norm": 26.75010871887207, "learning_rate": 4.998685212150971e-06, "loss": 1.9384, "step": 9605 }, { "epoch": 0.010436769575459148, "grad_norm": 26.59108543395996, "learning_rate": 4.9986838286493874e-06, "loss": 1.4074, "step": 9610 }, { "epoch": 0.010442199736528586, "grad_norm": 13.448664665222168, "learning_rate": 4.998682444420474e-06, "loss": 2.0747, "step": 9615 }, { "epoch": 0.010447629897598022, "grad_norm": 17.29410171508789, "learning_rate": 4.9986810594642335e-06, "loss": 2.2539, "step": 9620 }, { "epoch": 0.01045306005866746, "grad_norm": 16.06539535522461, "learning_rate": 4.998679673780664e-06, "loss": 1.8507, "step": 9625 }, { "epoch": 0.010458490219736898, "grad_norm": 15.122481346130371, "learning_rate": 4.998678287369767e-06, "loss": 2.5204, "step": 9630 }, { "epoch": 0.010463920380806336, "grad_norm": 33.21748733520508, "learning_rate": 4.998676900231543e-06, "loss": 2.0932, "step": 9635 }, { "epoch": 0.010469350541875774, "grad_norm": 23.794919967651367, "learning_rate": 4.998675512365991e-06, "loss": 1.4692, "step": 9640 }, { "epoch": 0.01047478070294521, "grad_norm": 69.32677459716797, "learning_rate": 4.998674123773114e-06, "loss": 1.6618, "step": 9645 }, { "epoch": 0.010480210864014648, "grad_norm": 53.567054748535156, "learning_rate": 4.998672734452911e-06, "loss": 1.341, "step": 9650 }, { "epoch": 0.010485641025084086, "grad_norm": 30.544111251831055, "learning_rate": 4.998671344405381e-06, "loss": 1.7942, "step": 9655 }, { "epoch": 0.010491071186153524, "grad_norm": 11.769539833068848, "learning_rate": 4.998669953630527e-06, "loss": 1.6765, "step": 9660 }, { "epoch": 0.01049650134722296, "grad_norm": 15.731956481933594, "learning_rate": 4.9986685621283486e-06, "loss": 1.9989, "step": 9665 }, { "epoch": 0.010501931508292399, "grad_norm": 33.2796745300293, "learning_rate": 4.998667169898845e-06, "loss": 1.6844, "step": 9670 }, { "epoch": 0.010507361669361837, "grad_norm": 14.710127830505371, "learning_rate": 4.998665776942017e-06, "loss": 1.9841, "step": 9675 }, { "epoch": 0.010512791830431275, "grad_norm": 18.448516845703125, "learning_rate": 4.998664383257867e-06, "loss": 1.8894, "step": 9680 }, { "epoch": 0.010518221991500713, "grad_norm": 27.699247360229492, "learning_rate": 4.998662988846392e-06, "loss": 1.5122, "step": 9685 }, { "epoch": 0.010523652152570149, "grad_norm": 17.385719299316406, "learning_rate": 4.998661593707595e-06, "loss": 2.1921, "step": 9690 }, { "epoch": 0.010529082313639587, "grad_norm": 47.38220977783203, "learning_rate": 4.998660197841475e-06, "loss": 1.2856, "step": 9695 }, { "epoch": 0.010534512474709025, "grad_norm": 23.162839889526367, "learning_rate": 4.998658801248034e-06, "loss": 1.4537, "step": 9700 }, { "epoch": 0.010539942635778463, "grad_norm": 34.65904235839844, "learning_rate": 4.998657403927271e-06, "loss": 1.7448, "step": 9705 }, { "epoch": 0.010545372796847901, "grad_norm": 19.68937110900879, "learning_rate": 4.998656005879187e-06, "loss": 1.8022, "step": 9710 }, { "epoch": 0.010550802957917337, "grad_norm": 16.281152725219727, "learning_rate": 4.998654607103782e-06, "loss": 1.95, "step": 9715 }, { "epoch": 0.010556233118986775, "grad_norm": 76.97811126708984, "learning_rate": 4.9986532076010566e-06, "loss": 2.5018, "step": 9720 }, { "epoch": 0.010561663280056213, "grad_norm": 15.890995025634766, "learning_rate": 4.998651807371012e-06, "loss": 0.9399, "step": 9725 }, { "epoch": 0.010567093441125651, "grad_norm": 17.519668579101562, "learning_rate": 4.998650406413647e-06, "loss": 1.6617, "step": 9730 }, { "epoch": 0.010572523602195088, "grad_norm": 15.2011079788208, "learning_rate": 4.9986490047289635e-06, "loss": 1.6462, "step": 9735 }, { "epoch": 0.010577953763264526, "grad_norm": 38.900630950927734, "learning_rate": 4.998647602316961e-06, "loss": 2.0313, "step": 9740 }, { "epoch": 0.010583383924333964, "grad_norm": 19.559261322021484, "learning_rate": 4.99864619917764e-06, "loss": 1.3943, "step": 9745 }, { "epoch": 0.010588814085403402, "grad_norm": 40.5893669128418, "learning_rate": 4.9986447953110015e-06, "loss": 2.1949, "step": 9750 }, { "epoch": 0.01059424424647284, "grad_norm": 23.477930068969727, "learning_rate": 4.998643390717045e-06, "loss": 1.6944, "step": 9755 }, { "epoch": 0.010599674407542276, "grad_norm": 107.88645935058594, "learning_rate": 4.998641985395773e-06, "loss": 1.7859, "step": 9760 }, { "epoch": 0.010605104568611714, "grad_norm": 14.730772018432617, "learning_rate": 4.998640579347182e-06, "loss": 2.0661, "step": 9765 }, { "epoch": 0.010610534729681152, "grad_norm": 31.101484298706055, "learning_rate": 4.9986391725712755e-06, "loss": 1.53, "step": 9770 }, { "epoch": 0.01061596489075059, "grad_norm": 15.471879959106445, "learning_rate": 4.9986377650680536e-06, "loss": 1.6014, "step": 9775 }, { "epoch": 0.010621395051820028, "grad_norm": 47.86062240600586, "learning_rate": 4.998636356837517e-06, "loss": 1.701, "step": 9780 }, { "epoch": 0.010626825212889464, "grad_norm": 44.33843231201172, "learning_rate": 4.998634947879664e-06, "loss": 1.5235, "step": 9785 }, { "epoch": 0.010632255373958902, "grad_norm": 17.127622604370117, "learning_rate": 4.998633538194498e-06, "loss": 1.3053, "step": 9790 }, { "epoch": 0.01063768553502834, "grad_norm": 51.466102600097656, "learning_rate": 4.998632127782016e-06, "loss": 3.0214, "step": 9795 }, { "epoch": 0.010643115696097778, "grad_norm": 45.32978820800781, "learning_rate": 4.998630716642221e-06, "loss": 1.3865, "step": 9800 }, { "epoch": 0.010648545857167215, "grad_norm": 18.738683700561523, "learning_rate": 4.9986293047751125e-06, "loss": 1.7116, "step": 9805 }, { "epoch": 0.010653976018236653, "grad_norm": 24.805465698242188, "learning_rate": 4.998627892180691e-06, "loss": 2.0027, "step": 9810 }, { "epoch": 0.01065940617930609, "grad_norm": 17.188539505004883, "learning_rate": 4.998626478858957e-06, "loss": 2.1564, "step": 9815 }, { "epoch": 0.010664836340375529, "grad_norm": 25.929595947265625, "learning_rate": 4.99862506480991e-06, "loss": 1.585, "step": 9820 }, { "epoch": 0.010670266501444967, "grad_norm": 20.864601135253906, "learning_rate": 4.998623650033553e-06, "loss": 1.3274, "step": 9825 }, { "epoch": 0.010675696662514403, "grad_norm": 26.310338973999023, "learning_rate": 4.998622234529883e-06, "loss": 1.5335, "step": 9830 }, { "epoch": 0.010681126823583841, "grad_norm": 34.14971160888672, "learning_rate": 4.998620818298903e-06, "loss": 1.5962, "step": 9835 }, { "epoch": 0.010686556984653279, "grad_norm": 17.146526336669922, "learning_rate": 4.9986194013406125e-06, "loss": 1.5546, "step": 9840 }, { "epoch": 0.010691987145722717, "grad_norm": 23.46977996826172, "learning_rate": 4.998617983655012e-06, "loss": 1.8338, "step": 9845 }, { "epoch": 0.010697417306792155, "grad_norm": 37.05940628051758, "learning_rate": 4.998616565242101e-06, "loss": 1.5913, "step": 9850 }, { "epoch": 0.010702847467861591, "grad_norm": 23.515714645385742, "learning_rate": 4.9986151461018815e-06, "loss": 0.939, "step": 9855 }, { "epoch": 0.01070827762893103, "grad_norm": 14.88895034790039, "learning_rate": 4.998613726234353e-06, "loss": 1.61, "step": 9860 }, { "epoch": 0.010713707790000467, "grad_norm": 89.73543548583984, "learning_rate": 4.998612305639517e-06, "loss": 2.1453, "step": 9865 }, { "epoch": 0.010719137951069905, "grad_norm": 18.386018753051758, "learning_rate": 4.9986108843173705e-06, "loss": 1.5026, "step": 9870 }, { "epoch": 0.010724568112139342, "grad_norm": 22.430578231811523, "learning_rate": 4.998609462267918e-06, "loss": 1.3393, "step": 9875 }, { "epoch": 0.01072999827320878, "grad_norm": 19.514535903930664, "learning_rate": 4.998608039491159e-06, "loss": 2.2158, "step": 9880 }, { "epoch": 0.010735428434278218, "grad_norm": 16.65749740600586, "learning_rate": 4.998606615987092e-06, "loss": 1.4956, "step": 9885 }, { "epoch": 0.010740858595347656, "grad_norm": 14.160118103027344, "learning_rate": 4.998605191755719e-06, "loss": 2.2328, "step": 9890 }, { "epoch": 0.010746288756417094, "grad_norm": 17.678308486938477, "learning_rate": 4.9986037667970396e-06, "loss": 1.5071, "step": 9895 }, { "epoch": 0.01075171891748653, "grad_norm": 17.69506072998047, "learning_rate": 4.998602341111056e-06, "loss": 2.1378, "step": 9900 }, { "epoch": 0.010757149078555968, "grad_norm": 20.55043601989746, "learning_rate": 4.998600914697767e-06, "loss": 1.5662, "step": 9905 }, { "epoch": 0.010762579239625406, "grad_norm": 143.1421661376953, "learning_rate": 4.998599487557172e-06, "loss": 1.0813, "step": 9910 }, { "epoch": 0.010768009400694844, "grad_norm": 17.728538513183594, "learning_rate": 4.998598059689274e-06, "loss": 1.8918, "step": 9915 }, { "epoch": 0.010773439561764282, "grad_norm": 14.152349472045898, "learning_rate": 4.998596631094071e-06, "loss": 0.9852, "step": 9920 }, { "epoch": 0.010778869722833718, "grad_norm": 15.391342163085938, "learning_rate": 4.998595201771565e-06, "loss": 1.5391, "step": 9925 }, { "epoch": 0.010784299883903156, "grad_norm": 11.277433395385742, "learning_rate": 4.998593771721756e-06, "loss": 1.4004, "step": 9930 }, { "epoch": 0.010789730044972594, "grad_norm": 12.13476276397705, "learning_rate": 4.998592340944645e-06, "loss": 1.9633, "step": 9935 }, { "epoch": 0.010795160206042032, "grad_norm": 35.446754455566406, "learning_rate": 4.998590909440231e-06, "loss": 1.6124, "step": 9940 }, { "epoch": 0.010800590367111468, "grad_norm": 28.002567291259766, "learning_rate": 4.998589477208515e-06, "loss": 1.5456, "step": 9945 }, { "epoch": 0.010806020528180907, "grad_norm": 19.324352264404297, "learning_rate": 4.998588044249498e-06, "loss": 2.0043, "step": 9950 }, { "epoch": 0.010811450689250345, "grad_norm": 34.730316162109375, "learning_rate": 4.99858661056318e-06, "loss": 1.8676, "step": 9955 }, { "epoch": 0.010816880850319783, "grad_norm": 15.403035163879395, "learning_rate": 4.998585176149561e-06, "loss": 1.634, "step": 9960 }, { "epoch": 0.01082231101138922, "grad_norm": 126.96319580078125, "learning_rate": 4.998583741008642e-06, "loss": 1.4266, "step": 9965 }, { "epoch": 0.010827741172458657, "grad_norm": 16.894466400146484, "learning_rate": 4.998582305140424e-06, "loss": 2.2575, "step": 9970 }, { "epoch": 0.010833171333528095, "grad_norm": 21.21478271484375, "learning_rate": 4.998580868544907e-06, "loss": 1.9039, "step": 9975 }, { "epoch": 0.010838601494597533, "grad_norm": 10.81640338897705, "learning_rate": 4.998579431222091e-06, "loss": 2.2085, "step": 9980 }, { "epoch": 0.010844031655666971, "grad_norm": 28.900428771972656, "learning_rate": 4.998577993171976e-06, "loss": 1.8431, "step": 9985 }, { "epoch": 0.010849461816736409, "grad_norm": 25.604833602905273, "learning_rate": 4.998576554394562e-06, "loss": 1.6683, "step": 9990 }, { "epoch": 0.010854891977805845, "grad_norm": 20.463220596313477, "learning_rate": 4.998575114889851e-06, "loss": 2.0613, "step": 9995 }, { "epoch": 0.010860322138875283, "grad_norm": 50.138545989990234, "learning_rate": 4.998573674657844e-06, "loss": 1.1894, "step": 10000 }, { "epoch": 0.010865752299944721, "grad_norm": 29.21415138244629, "learning_rate": 4.998572233698539e-06, "loss": 1.686, "step": 10005 }, { "epoch": 0.01087118246101416, "grad_norm": 20.347198486328125, "learning_rate": 4.998570792011938e-06, "loss": 1.9022, "step": 10010 }, { "epoch": 0.010876612622083595, "grad_norm": 45.79477310180664, "learning_rate": 4.998569349598041e-06, "loss": 1.8815, "step": 10015 }, { "epoch": 0.010882042783153033, "grad_norm": 82.03829193115234, "learning_rate": 4.998567906456848e-06, "loss": 1.268, "step": 10020 }, { "epoch": 0.010887472944222472, "grad_norm": 25.870573043823242, "learning_rate": 4.998566462588361e-06, "loss": 1.8686, "step": 10025 }, { "epoch": 0.01089290310529191, "grad_norm": 59.940391540527344, "learning_rate": 4.998565017992579e-06, "loss": 1.7993, "step": 10030 }, { "epoch": 0.010898333266361348, "grad_norm": 71.3803939819336, "learning_rate": 4.998563572669503e-06, "loss": 2.2053, "step": 10035 }, { "epoch": 0.010903763427430784, "grad_norm": 23.81072235107422, "learning_rate": 4.998562126619132e-06, "loss": 1.3135, "step": 10040 }, { "epoch": 0.010909193588500222, "grad_norm": 15.710099220275879, "learning_rate": 4.998560679841468e-06, "loss": 1.9582, "step": 10045 }, { "epoch": 0.01091462374956966, "grad_norm": 27.831790924072266, "learning_rate": 4.998559232336512e-06, "loss": 1.8777, "step": 10050 }, { "epoch": 0.010920053910639098, "grad_norm": 54.47487258911133, "learning_rate": 4.998557784104262e-06, "loss": 1.4775, "step": 10055 }, { "epoch": 0.010925484071708536, "grad_norm": 32.540016174316406, "learning_rate": 4.998556335144721e-06, "loss": 1.9206, "step": 10060 }, { "epoch": 0.010930914232777972, "grad_norm": 37.81696319580078, "learning_rate": 4.998554885457888e-06, "loss": 1.5694, "step": 10065 }, { "epoch": 0.01093634439384741, "grad_norm": 24.228673934936523, "learning_rate": 4.9985534350437624e-06, "loss": 1.7749, "step": 10070 }, { "epoch": 0.010941774554916848, "grad_norm": 20.721277236938477, "learning_rate": 4.9985519839023475e-06, "loss": 1.3616, "step": 10075 }, { "epoch": 0.010947204715986286, "grad_norm": 21.66297721862793, "learning_rate": 4.998550532033641e-06, "loss": 1.4309, "step": 10080 }, { "epoch": 0.010952634877055722, "grad_norm": 116.86261749267578, "learning_rate": 4.998549079437645e-06, "loss": 2.0444, "step": 10085 }, { "epoch": 0.01095806503812516, "grad_norm": 49.860740661621094, "learning_rate": 4.99854762611436e-06, "loss": 1.3283, "step": 10090 }, { "epoch": 0.010963495199194599, "grad_norm": 19.93017578125, "learning_rate": 4.998546172063785e-06, "loss": 2.3722, "step": 10095 }, { "epoch": 0.010968925360264037, "grad_norm": 14.76366901397705, "learning_rate": 4.998544717285921e-06, "loss": 1.3959, "step": 10100 }, { "epoch": 0.010974355521333475, "grad_norm": 22.39974594116211, "learning_rate": 4.998543261780769e-06, "loss": 1.868, "step": 10105 }, { "epoch": 0.01097978568240291, "grad_norm": 33.69144058227539, "learning_rate": 4.998541805548331e-06, "loss": 2.1212, "step": 10110 }, { "epoch": 0.010985215843472349, "grad_norm": 16.82862091064453, "learning_rate": 4.998540348588602e-06, "loss": 1.51, "step": 10115 }, { "epoch": 0.010990646004541787, "grad_norm": 67.18833923339844, "learning_rate": 4.9985388909015886e-06, "loss": 1.5199, "step": 10120 }, { "epoch": 0.010996076165611225, "grad_norm": 18.981151580810547, "learning_rate": 4.998537432487287e-06, "loss": 1.8634, "step": 10125 }, { "epoch": 0.011001506326680663, "grad_norm": 16.081239700317383, "learning_rate": 4.998535973345699e-06, "loss": 1.7667, "step": 10130 }, { "epoch": 0.0110069364877501, "grad_norm": 19.963266372680664, "learning_rate": 4.998534513476827e-06, "loss": 1.7715, "step": 10135 }, { "epoch": 0.011012366648819537, "grad_norm": 21.37301254272461, "learning_rate": 4.998533052880668e-06, "loss": 1.6857, "step": 10140 }, { "epoch": 0.011017796809888975, "grad_norm": 28.537439346313477, "learning_rate": 4.998531591557225e-06, "loss": 2.1375, "step": 10145 }, { "epoch": 0.011023226970958413, "grad_norm": 70.7808609008789, "learning_rate": 4.998530129506497e-06, "loss": 2.079, "step": 10150 }, { "epoch": 0.01102865713202785, "grad_norm": 30.81490707397461, "learning_rate": 4.998528666728485e-06, "loss": 1.7582, "step": 10155 }, { "epoch": 0.011034087293097287, "grad_norm": 14.220453262329102, "learning_rate": 4.998527203223189e-06, "loss": 1.7273, "step": 10160 }, { "epoch": 0.011039517454166725, "grad_norm": 38.057373046875, "learning_rate": 4.99852573899061e-06, "loss": 1.8986, "step": 10165 }, { "epoch": 0.011044947615236164, "grad_norm": 36.40947341918945, "learning_rate": 4.998524274030748e-06, "loss": 2.1603, "step": 10170 }, { "epoch": 0.011050377776305602, "grad_norm": 19.93641471862793, "learning_rate": 4.998522808343604e-06, "loss": 1.3332, "step": 10175 }, { "epoch": 0.011055807937375038, "grad_norm": 16.28131675720215, "learning_rate": 4.998521341929178e-06, "loss": 1.6372, "step": 10180 }, { "epoch": 0.011061238098444476, "grad_norm": 14.816411972045898, "learning_rate": 4.99851987478747e-06, "loss": 2.3215, "step": 10185 }, { "epoch": 0.011066668259513914, "grad_norm": 17.268495559692383, "learning_rate": 4.998518406918481e-06, "loss": 0.9006, "step": 10190 }, { "epoch": 0.011072098420583352, "grad_norm": 37.76838302612305, "learning_rate": 4.998516938322211e-06, "loss": 1.6629, "step": 10195 }, { "epoch": 0.01107752858165279, "grad_norm": 24.111223220825195, "learning_rate": 4.9985154689986615e-06, "loss": 1.8835, "step": 10200 }, { "epoch": 0.011082958742722226, "grad_norm": 39.25551986694336, "learning_rate": 4.998513998947832e-06, "loss": 1.9798, "step": 10205 }, { "epoch": 0.011088388903791664, "grad_norm": 25.74469566345215, "learning_rate": 4.998512528169723e-06, "loss": 1.5431, "step": 10210 }, { "epoch": 0.011093819064861102, "grad_norm": 21.202064514160156, "learning_rate": 4.998511056664335e-06, "loss": 1.3356, "step": 10215 }, { "epoch": 0.01109924922593054, "grad_norm": 13.349669456481934, "learning_rate": 4.998509584431668e-06, "loss": 1.862, "step": 10220 }, { "epoch": 0.011104679386999976, "grad_norm": 53.18241882324219, "learning_rate": 4.998508111471724e-06, "loss": 1.9685, "step": 10225 }, { "epoch": 0.011110109548069414, "grad_norm": 39.175758361816406, "learning_rate": 4.998506637784501e-06, "loss": 2.3721, "step": 10230 }, { "epoch": 0.011115539709138852, "grad_norm": 18.43044662475586, "learning_rate": 4.998505163370001e-06, "loss": 2.0067, "step": 10235 }, { "epoch": 0.01112096987020829, "grad_norm": 20.242198944091797, "learning_rate": 4.998503688228225e-06, "loss": 1.6504, "step": 10240 }, { "epoch": 0.011126400031277729, "grad_norm": 25.856782913208008, "learning_rate": 4.998502212359172e-06, "loss": 1.9872, "step": 10245 }, { "epoch": 0.011131830192347165, "grad_norm": 14.607989311218262, "learning_rate": 4.998500735762842e-06, "loss": 1.3753, "step": 10250 }, { "epoch": 0.011137260353416603, "grad_norm": 21.103899002075195, "learning_rate": 4.998499258439238e-06, "loss": 1.679, "step": 10255 }, { "epoch": 0.01114269051448604, "grad_norm": 31.28606605529785, "learning_rate": 4.998497780388358e-06, "loss": 1.4261, "step": 10260 }, { "epoch": 0.011148120675555479, "grad_norm": 31.657875061035156, "learning_rate": 4.998496301610204e-06, "loss": 1.5413, "step": 10265 }, { "epoch": 0.011153550836624915, "grad_norm": 43.213829040527344, "learning_rate": 4.998494822104776e-06, "loss": 2.2227, "step": 10270 }, { "epoch": 0.011158980997694353, "grad_norm": 30.03956413269043, "learning_rate": 4.998493341872073e-06, "loss": 1.7831, "step": 10275 }, { "epoch": 0.011164411158763791, "grad_norm": 18.3549861907959, "learning_rate": 4.998491860912097e-06, "loss": 1.4657, "step": 10280 }, { "epoch": 0.01116984131983323, "grad_norm": 13.038738250732422, "learning_rate": 4.998490379224849e-06, "loss": 1.2464, "step": 10285 }, { "epoch": 0.011175271480902667, "grad_norm": 66.09276580810547, "learning_rate": 4.998488896810328e-06, "loss": 2.2543, "step": 10290 }, { "epoch": 0.011180701641972103, "grad_norm": 56.1054801940918, "learning_rate": 4.998487413668534e-06, "loss": 1.6292, "step": 10295 }, { "epoch": 0.011186131803041541, "grad_norm": 33.63254165649414, "learning_rate": 4.998485929799469e-06, "loss": 1.9193, "step": 10300 }, { "epoch": 0.01119156196411098, "grad_norm": 18.831462860107422, "learning_rate": 4.998484445203133e-06, "loss": 1.5756, "step": 10305 }, { "epoch": 0.011196992125180417, "grad_norm": 113.00235748291016, "learning_rate": 4.998482959879526e-06, "loss": 2.0117, "step": 10310 }, { "epoch": 0.011202422286249855, "grad_norm": 21.25092887878418, "learning_rate": 4.998481473828648e-06, "loss": 1.6285, "step": 10315 }, { "epoch": 0.011207852447319292, "grad_norm": 17.319082260131836, "learning_rate": 4.998479987050501e-06, "loss": 1.2014, "step": 10320 }, { "epoch": 0.01121328260838873, "grad_norm": 71.2801284790039, "learning_rate": 4.998478499545084e-06, "loss": 1.7594, "step": 10325 }, { "epoch": 0.011218712769458168, "grad_norm": 13.752128601074219, "learning_rate": 4.998477011312398e-06, "loss": 1.8016, "step": 10330 }, { "epoch": 0.011224142930527606, "grad_norm": 20.61770248413086, "learning_rate": 4.998475522352443e-06, "loss": 1.4355, "step": 10335 }, { "epoch": 0.011229573091597042, "grad_norm": 17.049468994140625, "learning_rate": 4.99847403266522e-06, "loss": 1.0475, "step": 10340 }, { "epoch": 0.01123500325266648, "grad_norm": 13.756834983825684, "learning_rate": 4.99847254225073e-06, "loss": 2.4043, "step": 10345 }, { "epoch": 0.011240433413735918, "grad_norm": 19.018592834472656, "learning_rate": 4.998471051108971e-06, "loss": 1.6343, "step": 10350 }, { "epoch": 0.011245863574805356, "grad_norm": 14.134991645812988, "learning_rate": 4.9984695592399465e-06, "loss": 2.2446, "step": 10355 }, { "epoch": 0.011251293735874794, "grad_norm": 41.91609573364258, "learning_rate": 4.998468066643655e-06, "loss": 1.3987, "step": 10360 }, { "epoch": 0.01125672389694423, "grad_norm": 14.663599014282227, "learning_rate": 4.998466573320097e-06, "loss": 1.7598, "step": 10365 }, { "epoch": 0.011262154058013668, "grad_norm": 21.339778900146484, "learning_rate": 4.998465079269275e-06, "loss": 1.9229, "step": 10370 }, { "epoch": 0.011267584219083106, "grad_norm": 38.00367736816406, "learning_rate": 4.9984635844911865e-06, "loss": 2.0276, "step": 10375 }, { "epoch": 0.011273014380152544, "grad_norm": 24.213056564331055, "learning_rate": 4.998462088985834e-06, "loss": 1.6759, "step": 10380 }, { "epoch": 0.011278444541221982, "grad_norm": 14.72443675994873, "learning_rate": 4.998460592753216e-06, "loss": 1.8373, "step": 10385 }, { "epoch": 0.011283874702291419, "grad_norm": 9.569783210754395, "learning_rate": 4.998459095793335e-06, "loss": 1.5504, "step": 10390 }, { "epoch": 0.011289304863360857, "grad_norm": 15.012899398803711, "learning_rate": 4.998457598106191e-06, "loss": 1.8811, "step": 10395 }, { "epoch": 0.011294735024430295, "grad_norm": 19.688310623168945, "learning_rate": 4.9984560996917834e-06, "loss": 1.9906, "step": 10400 }, { "epoch": 0.011300165185499733, "grad_norm": 52.64585876464844, "learning_rate": 4.998454600550113e-06, "loss": 1.7919, "step": 10405 }, { "epoch": 0.011305595346569169, "grad_norm": 19.93270492553711, "learning_rate": 4.998453100681181e-06, "loss": 1.2867, "step": 10410 }, { "epoch": 0.011311025507638607, "grad_norm": 16.873977661132812, "learning_rate": 4.998451600084987e-06, "loss": 1.9487, "step": 10415 }, { "epoch": 0.011316455668708045, "grad_norm": 26.56378746032715, "learning_rate": 4.998450098761532e-06, "loss": 1.934, "step": 10420 }, { "epoch": 0.011321885829777483, "grad_norm": 10.77846622467041, "learning_rate": 4.998448596710816e-06, "loss": 2.5181, "step": 10425 }, { "epoch": 0.011327315990846921, "grad_norm": 64.80899810791016, "learning_rate": 4.99844709393284e-06, "loss": 1.4454, "step": 10430 }, { "epoch": 0.011332746151916357, "grad_norm": 14.415340423583984, "learning_rate": 4.998445590427603e-06, "loss": 1.5155, "step": 10435 }, { "epoch": 0.011338176312985795, "grad_norm": 18.35490608215332, "learning_rate": 4.998444086195108e-06, "loss": 1.6988, "step": 10440 }, { "epoch": 0.011343606474055233, "grad_norm": 12.216475486755371, "learning_rate": 4.998442581235353e-06, "loss": 1.8917, "step": 10445 }, { "epoch": 0.011349036635124671, "grad_norm": 12.849945068359375, "learning_rate": 4.998441075548339e-06, "loss": 1.8272, "step": 10450 }, { "epoch": 0.01135446679619411, "grad_norm": 34.234500885009766, "learning_rate": 4.9984395691340685e-06, "loss": 1.4562, "step": 10455 }, { "epoch": 0.011359896957263546, "grad_norm": 38.77682113647461, "learning_rate": 4.998438061992539e-06, "loss": 1.8769, "step": 10460 }, { "epoch": 0.011365327118332984, "grad_norm": 16.80317497253418, "learning_rate": 4.998436554123752e-06, "loss": 2.0461, "step": 10465 }, { "epoch": 0.011370757279402422, "grad_norm": 43.01018524169922, "learning_rate": 4.998435045527708e-06, "loss": 1.5515, "step": 10470 }, { "epoch": 0.01137618744047186, "grad_norm": 54.14521026611328, "learning_rate": 4.998433536204408e-06, "loss": 2.4953, "step": 10475 }, { "epoch": 0.011381617601541296, "grad_norm": 19.6528263092041, "learning_rate": 4.998432026153852e-06, "loss": 1.8682, "step": 10480 }, { "epoch": 0.011387047762610734, "grad_norm": 21.642642974853516, "learning_rate": 4.99843051537604e-06, "loss": 2.4536, "step": 10485 }, { "epoch": 0.011392477923680172, "grad_norm": 25.543062210083008, "learning_rate": 4.998429003870974e-06, "loss": 1.2017, "step": 10490 }, { "epoch": 0.01139790808474961, "grad_norm": 18.478822708129883, "learning_rate": 4.998427491638653e-06, "loss": 1.9853, "step": 10495 }, { "epoch": 0.011403338245819048, "grad_norm": 30.91162872314453, "learning_rate": 4.998425978679078e-06, "loss": 1.8599, "step": 10500 }, { "epoch": 0.011408768406888484, "grad_norm": 17.6353759765625, "learning_rate": 4.998424464992248e-06, "loss": 1.3693, "step": 10505 }, { "epoch": 0.011414198567957922, "grad_norm": 23.627748489379883, "learning_rate": 4.998422950578166e-06, "loss": 1.6194, "step": 10510 }, { "epoch": 0.01141962872902736, "grad_norm": 16.841033935546875, "learning_rate": 4.9984214354368295e-06, "loss": 1.4157, "step": 10515 }, { "epoch": 0.011425058890096798, "grad_norm": 21.675241470336914, "learning_rate": 4.998419919568242e-06, "loss": 1.9843, "step": 10520 }, { "epoch": 0.011430489051166236, "grad_norm": 40.279273986816406, "learning_rate": 4.998418402972401e-06, "loss": 2.47, "step": 10525 }, { "epoch": 0.011435919212235673, "grad_norm": 16.163103103637695, "learning_rate": 4.99841688564931e-06, "loss": 2.5414, "step": 10530 }, { "epoch": 0.01144134937330511, "grad_norm": 21.578319549560547, "learning_rate": 4.998415367598968e-06, "loss": 1.7792, "step": 10535 }, { "epoch": 0.011446779534374549, "grad_norm": 20.018653869628906, "learning_rate": 4.998413848821374e-06, "loss": 2.0143, "step": 10540 }, { "epoch": 0.011452209695443987, "grad_norm": 16.15210723876953, "learning_rate": 4.99841232931653e-06, "loss": 1.4675, "step": 10545 }, { "epoch": 0.011457639856513423, "grad_norm": 19.948354721069336, "learning_rate": 4.9984108090844365e-06, "loss": 1.6621, "step": 10550 }, { "epoch": 0.011463070017582861, "grad_norm": 22.061735153198242, "learning_rate": 4.998409288125093e-06, "loss": 1.9736, "step": 10555 }, { "epoch": 0.011468500178652299, "grad_norm": 20.58380126953125, "learning_rate": 4.998407766438502e-06, "loss": 1.0613, "step": 10560 }, { "epoch": 0.011473930339721737, "grad_norm": 46.17391586303711, "learning_rate": 4.998406244024662e-06, "loss": 2.2104, "step": 10565 }, { "epoch": 0.011479360500791175, "grad_norm": 14.26450252532959, "learning_rate": 4.998404720883573e-06, "loss": 1.8248, "step": 10570 }, { "epoch": 0.011484790661860611, "grad_norm": 44.86953353881836, "learning_rate": 4.998403197015237e-06, "loss": 1.6264, "step": 10575 }, { "epoch": 0.01149022082293005, "grad_norm": 30.861690521240234, "learning_rate": 4.998401672419654e-06, "loss": 1.5943, "step": 10580 }, { "epoch": 0.011495650983999487, "grad_norm": 73.93685913085938, "learning_rate": 4.998400147096824e-06, "loss": 1.6174, "step": 10585 }, { "epoch": 0.011501081145068925, "grad_norm": 57.3355598449707, "learning_rate": 4.998398621046748e-06, "loss": 1.8394, "step": 10590 }, { "epoch": 0.011506511306138363, "grad_norm": 15.312525749206543, "learning_rate": 4.998397094269426e-06, "loss": 1.3609, "step": 10595 }, { "epoch": 0.0115119414672078, "grad_norm": 43.276126861572266, "learning_rate": 4.9983955667648586e-06, "loss": 1.7101, "step": 10600 }, { "epoch": 0.011517371628277238, "grad_norm": 17.008787155151367, "learning_rate": 4.998394038533046e-06, "loss": 2.1778, "step": 10605 }, { "epoch": 0.011522801789346676, "grad_norm": 17.159608840942383, "learning_rate": 4.9983925095739895e-06, "loss": 1.8938, "step": 10610 }, { "epoch": 0.011528231950416114, "grad_norm": 28.83750343322754, "learning_rate": 4.998390979887689e-06, "loss": 1.5742, "step": 10615 }, { "epoch": 0.01153366211148555, "grad_norm": 19.698013305664062, "learning_rate": 4.998389449474145e-06, "loss": 1.2291, "step": 10620 }, { "epoch": 0.011539092272554988, "grad_norm": 18.577791213989258, "learning_rate": 4.998387918333357e-06, "loss": 1.5725, "step": 10625 }, { "epoch": 0.011544522433624426, "grad_norm": 28.8248233795166, "learning_rate": 4.998386386465327e-06, "loss": 1.7833, "step": 10630 }, { "epoch": 0.011549952594693864, "grad_norm": 37.301517486572266, "learning_rate": 4.998384853870054e-06, "loss": 1.7362, "step": 10635 }, { "epoch": 0.011555382755763302, "grad_norm": 47.83832931518555, "learning_rate": 4.99838332054754e-06, "loss": 1.47, "step": 10640 }, { "epoch": 0.011560812916832738, "grad_norm": 22.580299377441406, "learning_rate": 4.9983817864977835e-06, "loss": 1.4669, "step": 10645 }, { "epoch": 0.011566243077902176, "grad_norm": 39.894351959228516, "learning_rate": 4.998380251720787e-06, "loss": 1.8478, "step": 10650 }, { "epoch": 0.011571673238971614, "grad_norm": 21.244237899780273, "learning_rate": 4.99837871621655e-06, "loss": 1.761, "step": 10655 }, { "epoch": 0.011577103400041052, "grad_norm": 29.089120864868164, "learning_rate": 4.998377179985073e-06, "loss": 1.5402, "step": 10660 }, { "epoch": 0.01158253356111049, "grad_norm": 47.423622131347656, "learning_rate": 4.998375643026356e-06, "loss": 1.6051, "step": 10665 }, { "epoch": 0.011587963722179927, "grad_norm": 17.68145179748535, "learning_rate": 4.9983741053404e-06, "loss": 1.8696, "step": 10670 }, { "epoch": 0.011593393883249365, "grad_norm": 148.37161254882812, "learning_rate": 4.9983725669272055e-06, "loss": 1.624, "step": 10675 }, { "epoch": 0.011598824044318803, "grad_norm": 41.994590759277344, "learning_rate": 4.998371027786773e-06, "loss": 1.7852, "step": 10680 }, { "epoch": 0.01160425420538824, "grad_norm": 17.447650909423828, "learning_rate": 4.998369487919102e-06, "loss": 1.4172, "step": 10685 }, { "epoch": 0.011609684366457677, "grad_norm": 15.385407447814941, "learning_rate": 4.998367947324194e-06, "loss": 1.5168, "step": 10690 }, { "epoch": 0.011615114527527115, "grad_norm": 20.49140167236328, "learning_rate": 4.99836640600205e-06, "loss": 1.6291, "step": 10695 }, { "epoch": 0.011620544688596553, "grad_norm": 27.853050231933594, "learning_rate": 4.9983648639526685e-06, "loss": 1.2408, "step": 10700 }, { "epoch": 0.011625974849665991, "grad_norm": 14.06128978729248, "learning_rate": 4.99836332117605e-06, "loss": 1.8034, "step": 10705 }, { "epoch": 0.011631405010735429, "grad_norm": 47.19857406616211, "learning_rate": 4.998361777672199e-06, "loss": 1.9442, "step": 10710 }, { "epoch": 0.011636835171804865, "grad_norm": 18.718156814575195, "learning_rate": 4.99836023344111e-06, "loss": 1.6447, "step": 10715 }, { "epoch": 0.011642265332874303, "grad_norm": 120.06076049804688, "learning_rate": 4.998358688482787e-06, "loss": 1.9285, "step": 10720 }, { "epoch": 0.011647695493943741, "grad_norm": 75.14836883544922, "learning_rate": 4.998357142797231e-06, "loss": 2.0985, "step": 10725 }, { "epoch": 0.01165312565501318, "grad_norm": 40.972862243652344, "learning_rate": 4.99835559638444e-06, "loss": 2.0679, "step": 10730 }, { "epoch": 0.011658555816082617, "grad_norm": 21.368104934692383, "learning_rate": 4.998354049244416e-06, "loss": 1.2336, "step": 10735 }, { "epoch": 0.011663985977152054, "grad_norm": 85.99891662597656, "learning_rate": 4.998352501377159e-06, "loss": 1.8071, "step": 10740 }, { "epoch": 0.011669416138221492, "grad_norm": 23.218950271606445, "learning_rate": 4.9983509527826705e-06, "loss": 1.6003, "step": 10745 }, { "epoch": 0.01167484629929093, "grad_norm": 13.666888236999512, "learning_rate": 4.998349403460949e-06, "loss": 1.8081, "step": 10750 }, { "epoch": 0.011680276460360368, "grad_norm": 26.736705780029297, "learning_rate": 4.998347853411997e-06, "loss": 1.6059, "step": 10755 }, { "epoch": 0.011685706621429804, "grad_norm": 28.94096565246582, "learning_rate": 4.998346302635812e-06, "loss": 1.4629, "step": 10760 }, { "epoch": 0.011691136782499242, "grad_norm": 52.516910552978516, "learning_rate": 4.998344751132399e-06, "loss": 1.9327, "step": 10765 }, { "epoch": 0.01169656694356868, "grad_norm": 13.758468627929688, "learning_rate": 4.998343198901754e-06, "loss": 1.5064, "step": 10770 }, { "epoch": 0.011701997104638118, "grad_norm": 71.55120086669922, "learning_rate": 4.99834164594388e-06, "loss": 1.6415, "step": 10775 }, { "epoch": 0.011707427265707556, "grad_norm": 18.891368865966797, "learning_rate": 4.9983400922587764e-06, "loss": 2.4715, "step": 10780 }, { "epoch": 0.011712857426776992, "grad_norm": 197.7040252685547, "learning_rate": 4.998338537846445e-06, "loss": 1.8949, "step": 10785 }, { "epoch": 0.01171828758784643, "grad_norm": 26.889259338378906, "learning_rate": 4.998336982706884e-06, "loss": 2.594, "step": 10790 }, { "epoch": 0.011723717748915868, "grad_norm": 23.11223602294922, "learning_rate": 4.998335426840096e-06, "loss": 1.4849, "step": 10795 }, { "epoch": 0.011729147909985306, "grad_norm": 17.35491371154785, "learning_rate": 4.99833387024608e-06, "loss": 1.9515, "step": 10800 }, { "epoch": 0.011734578071054744, "grad_norm": 13.7982816696167, "learning_rate": 4.9983323129248375e-06, "loss": 2.0808, "step": 10805 }, { "epoch": 0.01174000823212418, "grad_norm": 25.433269500732422, "learning_rate": 4.998330754876368e-06, "loss": 1.5219, "step": 10810 }, { "epoch": 0.011745438393193619, "grad_norm": 20.09881591796875, "learning_rate": 4.998329196100673e-06, "loss": 1.5893, "step": 10815 }, { "epoch": 0.011750868554263057, "grad_norm": 32.29978561401367, "learning_rate": 4.998327636597752e-06, "loss": 1.5172, "step": 10820 }, { "epoch": 0.011756298715332495, "grad_norm": 76.21290588378906, "learning_rate": 4.998326076367606e-06, "loss": 1.9373, "step": 10825 }, { "epoch": 0.011761728876401931, "grad_norm": 14.848673820495605, "learning_rate": 4.998324515410235e-06, "loss": 2.1772, "step": 10830 }, { "epoch": 0.011767159037471369, "grad_norm": 115.35011291503906, "learning_rate": 4.99832295372564e-06, "loss": 1.9723, "step": 10835 }, { "epoch": 0.011772589198540807, "grad_norm": 358.15625, "learning_rate": 4.998321391313822e-06, "loss": 1.6241, "step": 10840 }, { "epoch": 0.011778019359610245, "grad_norm": 28.53584861755371, "learning_rate": 4.998319828174779e-06, "loss": 1.6929, "step": 10845 }, { "epoch": 0.011783449520679683, "grad_norm": 18.26047134399414, "learning_rate": 4.998318264308513e-06, "loss": 0.991, "step": 10850 }, { "epoch": 0.01178887968174912, "grad_norm": 36.218894958496094, "learning_rate": 4.998316699715027e-06, "loss": 1.9333, "step": 10855 }, { "epoch": 0.011794309842818557, "grad_norm": 26.11237144470215, "learning_rate": 4.998315134394317e-06, "loss": 1.9715, "step": 10860 }, { "epoch": 0.011799740003887995, "grad_norm": 70.12516784667969, "learning_rate": 4.998313568346386e-06, "loss": 1.8159, "step": 10865 }, { "epoch": 0.011805170164957433, "grad_norm": 12.796926498413086, "learning_rate": 4.998312001571234e-06, "loss": 1.347, "step": 10870 }, { "epoch": 0.011810600326026871, "grad_norm": 53.619232177734375, "learning_rate": 4.998310434068862e-06, "loss": 2.9988, "step": 10875 }, { "epoch": 0.011816030487096308, "grad_norm": 13.262703895568848, "learning_rate": 4.9983088658392686e-06, "loss": 1.1138, "step": 10880 }, { "epoch": 0.011821460648165746, "grad_norm": 14.751485824584961, "learning_rate": 4.998307296882455e-06, "loss": 1.0312, "step": 10885 }, { "epoch": 0.011826890809235184, "grad_norm": 23.429611206054688, "learning_rate": 4.998305727198424e-06, "loss": 1.7352, "step": 10890 }, { "epoch": 0.011832320970304622, "grad_norm": 41.01213836669922, "learning_rate": 4.9983041567871735e-06, "loss": 1.8489, "step": 10895 }, { "epoch": 0.011837751131374058, "grad_norm": 25.241838455200195, "learning_rate": 4.998302585648704e-06, "loss": 2.0826, "step": 10900 }, { "epoch": 0.011843181292443496, "grad_norm": 18.946901321411133, "learning_rate": 4.998301013783018e-06, "loss": 1.5941, "step": 10905 }, { "epoch": 0.011848611453512934, "grad_norm": 25.6570987701416, "learning_rate": 4.998299441190113e-06, "loss": 1.8866, "step": 10910 }, { "epoch": 0.011854041614582372, "grad_norm": 16.4945011138916, "learning_rate": 4.998297867869993e-06, "loss": 1.5255, "step": 10915 }, { "epoch": 0.01185947177565181, "grad_norm": 69.34546661376953, "learning_rate": 4.9982962938226546e-06, "loss": 1.0558, "step": 10920 }, { "epoch": 0.011864901936721246, "grad_norm": 17.46859359741211, "learning_rate": 4.9982947190481005e-06, "loss": 1.3764, "step": 10925 }, { "epoch": 0.011870332097790684, "grad_norm": 33.34421920776367, "learning_rate": 4.9982931435463315e-06, "loss": 1.7197, "step": 10930 }, { "epoch": 0.011875762258860122, "grad_norm": 41.72828674316406, "learning_rate": 4.998291567317347e-06, "loss": 2.0587, "step": 10935 }, { "epoch": 0.01188119241992956, "grad_norm": 30.050939559936523, "learning_rate": 4.998289990361148e-06, "loss": 1.6485, "step": 10940 }, { "epoch": 0.011886622580998998, "grad_norm": 33.07966613769531, "learning_rate": 4.998288412677735e-06, "loss": 1.5146, "step": 10945 }, { "epoch": 0.011892052742068435, "grad_norm": 25.470149993896484, "learning_rate": 4.998286834267108e-06, "loss": 1.6905, "step": 10950 }, { "epoch": 0.011897482903137873, "grad_norm": 24.595218658447266, "learning_rate": 4.998285255129267e-06, "loss": 1.3129, "step": 10955 }, { "epoch": 0.01190291306420731, "grad_norm": 18.773340225219727, "learning_rate": 4.9982836752642135e-06, "loss": 1.8113, "step": 10960 }, { "epoch": 0.011908343225276749, "grad_norm": 21.742446899414062, "learning_rate": 4.998282094671949e-06, "loss": 1.4793, "step": 10965 }, { "epoch": 0.011913773386346185, "grad_norm": 22.27544403076172, "learning_rate": 4.998280513352472e-06, "loss": 1.9679, "step": 10970 }, { "epoch": 0.011919203547415623, "grad_norm": 50.496368408203125, "learning_rate": 4.998278931305782e-06, "loss": 1.6025, "step": 10975 }, { "epoch": 0.011924633708485061, "grad_norm": 13.560656547546387, "learning_rate": 4.998277348531882e-06, "loss": 1.3298, "step": 10980 }, { "epoch": 0.011930063869554499, "grad_norm": 32.36294174194336, "learning_rate": 4.998275765030772e-06, "loss": 1.7873, "step": 10985 }, { "epoch": 0.011935494030623937, "grad_norm": 17.30262565612793, "learning_rate": 4.998274180802452e-06, "loss": 2.2244, "step": 10990 }, { "epoch": 0.011940924191693373, "grad_norm": 27.391765594482422, "learning_rate": 4.998272595846922e-06, "loss": 1.5992, "step": 10995 }, { "epoch": 0.011946354352762811, "grad_norm": 22.685625076293945, "learning_rate": 4.998271010164183e-06, "loss": 1.5365, "step": 11000 }, { "epoch": 0.01195178451383225, "grad_norm": 36.83216094970703, "learning_rate": 4.998269423754235e-06, "loss": 2.4763, "step": 11005 }, { "epoch": 0.011957214674901687, "grad_norm": 25.5157527923584, "learning_rate": 4.998267836617079e-06, "loss": 1.6257, "step": 11010 }, { "epoch": 0.011962644835971125, "grad_norm": 16.920928955078125, "learning_rate": 4.998266248752715e-06, "loss": 1.8356, "step": 11015 }, { "epoch": 0.011968074997040562, "grad_norm": 19.576858520507812, "learning_rate": 4.998264660161144e-06, "loss": 1.9196, "step": 11020 }, { "epoch": 0.01197350515811, "grad_norm": 27.281949996948242, "learning_rate": 4.998263070842366e-06, "loss": 1.2739, "step": 11025 }, { "epoch": 0.011978935319179438, "grad_norm": 16.487201690673828, "learning_rate": 4.998261480796381e-06, "loss": 2.4455, "step": 11030 }, { "epoch": 0.011984365480248876, "grad_norm": 31.895971298217773, "learning_rate": 4.998259890023192e-06, "loss": 1.9109, "step": 11035 }, { "epoch": 0.011989795641318312, "grad_norm": 14.814663887023926, "learning_rate": 4.998258298522796e-06, "loss": 1.3752, "step": 11040 }, { "epoch": 0.01199522580238775, "grad_norm": 21.58547019958496, "learning_rate": 4.998256706295195e-06, "loss": 1.542, "step": 11045 }, { "epoch": 0.012000655963457188, "grad_norm": 53.31250762939453, "learning_rate": 4.99825511334039e-06, "loss": 1.4961, "step": 11050 }, { "epoch": 0.012006086124526626, "grad_norm": 35.91347885131836, "learning_rate": 4.998253519658381e-06, "loss": 1.8368, "step": 11055 }, { "epoch": 0.012011516285596064, "grad_norm": 16.83083724975586, "learning_rate": 4.9982519252491675e-06, "loss": 2.1065, "step": 11060 }, { "epoch": 0.0120169464466655, "grad_norm": 46.308258056640625, "learning_rate": 4.998250330112752e-06, "loss": 1.9634, "step": 11065 }, { "epoch": 0.012022376607734938, "grad_norm": 12.91838550567627, "learning_rate": 4.998248734249133e-06, "loss": 1.114, "step": 11070 }, { "epoch": 0.012027806768804376, "grad_norm": 16.4686222076416, "learning_rate": 4.9982471376583115e-06, "loss": 1.8825, "step": 11075 }, { "epoch": 0.012033236929873814, "grad_norm": 16.841779708862305, "learning_rate": 4.99824554034029e-06, "loss": 1.5854, "step": 11080 }, { "epoch": 0.012038667090943252, "grad_norm": 16.437204360961914, "learning_rate": 4.998243942295066e-06, "loss": 1.0767, "step": 11085 }, { "epoch": 0.012044097252012689, "grad_norm": 21.987016677856445, "learning_rate": 4.998242343522641e-06, "loss": 2.5983, "step": 11090 }, { "epoch": 0.012049527413082127, "grad_norm": 84.2411117553711, "learning_rate": 4.998240744023017e-06, "loss": 1.541, "step": 11095 }, { "epoch": 0.012054957574151565, "grad_norm": 13.783365249633789, "learning_rate": 4.9982391437961916e-06, "loss": 1.6359, "step": 11100 }, { "epoch": 0.012060387735221003, "grad_norm": 20.15755271911621, "learning_rate": 4.998237542842167e-06, "loss": 1.869, "step": 11105 }, { "epoch": 0.012065817896290439, "grad_norm": 22.39057159423828, "learning_rate": 4.9982359411609436e-06, "loss": 2.1017, "step": 11110 }, { "epoch": 0.012071248057359877, "grad_norm": 264.1150817871094, "learning_rate": 4.998234338752522e-06, "loss": 1.8271, "step": 11115 }, { "epoch": 0.012076678218429315, "grad_norm": 10.834988594055176, "learning_rate": 4.998232735616903e-06, "loss": 1.9716, "step": 11120 }, { "epoch": 0.012082108379498753, "grad_norm": 88.61224365234375, "learning_rate": 4.998231131754086e-06, "loss": 1.9063, "step": 11125 }, { "epoch": 0.012087538540568191, "grad_norm": 130.019775390625, "learning_rate": 4.998229527164071e-06, "loss": 2.2187, "step": 11130 }, { "epoch": 0.012092968701637627, "grad_norm": 37.060401916503906, "learning_rate": 4.998227921846861e-06, "loss": 1.3013, "step": 11135 }, { "epoch": 0.012098398862707065, "grad_norm": 13.245466232299805, "learning_rate": 4.998226315802453e-06, "loss": 1.2399, "step": 11140 }, { "epoch": 0.012103829023776503, "grad_norm": 29.59423828125, "learning_rate": 4.998224709030851e-06, "loss": 1.6025, "step": 11145 }, { "epoch": 0.012109259184845941, "grad_norm": 15.702520370483398, "learning_rate": 4.998223101532053e-06, "loss": 1.598, "step": 11150 }, { "epoch": 0.01211468934591538, "grad_norm": 110.50146484375, "learning_rate": 4.99822149330606e-06, "loss": 2.1816, "step": 11155 }, { "epoch": 0.012120119506984816, "grad_norm": 25.392047882080078, "learning_rate": 4.998219884352874e-06, "loss": 1.6489, "step": 11160 }, { "epoch": 0.012125549668054254, "grad_norm": 16.116653442382812, "learning_rate": 4.998218274672492e-06, "loss": 1.9008, "step": 11165 }, { "epoch": 0.012130979829123692, "grad_norm": 61.356868743896484, "learning_rate": 4.998216664264919e-06, "loss": 1.91, "step": 11170 }, { "epoch": 0.01213640999019313, "grad_norm": 38.43524932861328, "learning_rate": 4.998215053130152e-06, "loss": 1.7692, "step": 11175 }, { "epoch": 0.012141840151262566, "grad_norm": 24.410730361938477, "learning_rate": 4.9982134412681925e-06, "loss": 1.5326, "step": 11180 }, { "epoch": 0.012147270312332004, "grad_norm": 21.360029220581055, "learning_rate": 4.998211828679042e-06, "loss": 1.441, "step": 11185 }, { "epoch": 0.012152700473401442, "grad_norm": 48.41032028198242, "learning_rate": 4.9982102153627e-06, "loss": 1.9085, "step": 11190 }, { "epoch": 0.01215813063447088, "grad_norm": 42.56544494628906, "learning_rate": 4.9982086013191654e-06, "loss": 1.5187, "step": 11195 }, { "epoch": 0.012163560795540318, "grad_norm": 28.862337112426758, "learning_rate": 4.9982069865484415e-06, "loss": 2.102, "step": 11200 }, { "epoch": 0.012168990956609754, "grad_norm": 27.78300666809082, "learning_rate": 4.998205371050527e-06, "loss": 1.4543, "step": 11205 }, { "epoch": 0.012174421117679192, "grad_norm": 16.184711456298828, "learning_rate": 4.998203754825424e-06, "loss": 1.4394, "step": 11210 }, { "epoch": 0.01217985127874863, "grad_norm": 22.628137588500977, "learning_rate": 4.99820213787313e-06, "loss": 1.8345, "step": 11215 }, { "epoch": 0.012185281439818068, "grad_norm": 18.124605178833008, "learning_rate": 4.998200520193649e-06, "loss": 2.2478, "step": 11220 }, { "epoch": 0.012190711600887506, "grad_norm": 80.7077407836914, "learning_rate": 4.99819890178698e-06, "loss": 1.21, "step": 11225 }, { "epoch": 0.012196141761956943, "grad_norm": 14.517572402954102, "learning_rate": 4.9981972826531225e-06, "loss": 1.3997, "step": 11230 }, { "epoch": 0.01220157192302638, "grad_norm": 28.270092010498047, "learning_rate": 4.998195662792078e-06, "loss": 1.6573, "step": 11235 }, { "epoch": 0.012207002084095819, "grad_norm": 17.58110809326172, "learning_rate": 4.998194042203847e-06, "loss": 1.764, "step": 11240 }, { "epoch": 0.012212432245165257, "grad_norm": 34.37449645996094, "learning_rate": 4.998192420888429e-06, "loss": 1.6484, "step": 11245 }, { "epoch": 0.012217862406234693, "grad_norm": 28.85211181640625, "learning_rate": 4.998190798845826e-06, "loss": 2.0345, "step": 11250 }, { "epoch": 0.012223292567304131, "grad_norm": 27.6376895904541, "learning_rate": 4.998189176076037e-06, "loss": 1.7303, "step": 11255 }, { "epoch": 0.012228722728373569, "grad_norm": 31.00856590270996, "learning_rate": 4.9981875525790635e-06, "loss": 2.1732, "step": 11260 }, { "epoch": 0.012234152889443007, "grad_norm": 31.33513832092285, "learning_rate": 4.998185928354906e-06, "loss": 1.1609, "step": 11265 }, { "epoch": 0.012239583050512445, "grad_norm": 13.282422065734863, "learning_rate": 4.998184303403564e-06, "loss": 1.5972, "step": 11270 }, { "epoch": 0.012245013211581881, "grad_norm": 44.596954345703125, "learning_rate": 4.998182677725039e-06, "loss": 2.5118, "step": 11275 }, { "epoch": 0.01225044337265132, "grad_norm": 22.086946487426758, "learning_rate": 4.9981810513193296e-06, "loss": 1.6518, "step": 11280 }, { "epoch": 0.012255873533720757, "grad_norm": 16.036117553710938, "learning_rate": 4.998179424186439e-06, "loss": 1.6636, "step": 11285 }, { "epoch": 0.012261303694790195, "grad_norm": 21.20543670654297, "learning_rate": 4.998177796326367e-06, "loss": 1.1895, "step": 11290 }, { "epoch": 0.012266733855859633, "grad_norm": 16.08302116394043, "learning_rate": 4.998176167739112e-06, "loss": 1.1769, "step": 11295 }, { "epoch": 0.01227216401692907, "grad_norm": 18.27577018737793, "learning_rate": 4.9981745384246765e-06, "loss": 1.2485, "step": 11300 }, { "epoch": 0.012277594177998508, "grad_norm": 18.724430084228516, "learning_rate": 4.998172908383061e-06, "loss": 1.6823, "step": 11305 }, { "epoch": 0.012283024339067946, "grad_norm": 12.342781066894531, "learning_rate": 4.998171277614265e-06, "loss": 1.567, "step": 11310 }, { "epoch": 0.012288454500137384, "grad_norm": 26.64559555053711, "learning_rate": 4.998169646118289e-06, "loss": 1.3584, "step": 11315 }, { "epoch": 0.01229388466120682, "grad_norm": 58.87510299682617, "learning_rate": 4.998168013895134e-06, "loss": 1.8262, "step": 11320 }, { "epoch": 0.012299314822276258, "grad_norm": 34.15950012207031, "learning_rate": 4.9981663809448e-06, "loss": 1.4264, "step": 11325 }, { "epoch": 0.012304744983345696, "grad_norm": 23.676515579223633, "learning_rate": 4.998164747267288e-06, "loss": 1.537, "step": 11330 }, { "epoch": 0.012310175144415134, "grad_norm": 21.517471313476562, "learning_rate": 4.998163112862599e-06, "loss": 1.8718, "step": 11335 }, { "epoch": 0.012315605305484572, "grad_norm": 20.064922332763672, "learning_rate": 4.998161477730732e-06, "loss": 1.6954, "step": 11340 }, { "epoch": 0.012321035466554008, "grad_norm": 29.006309509277344, "learning_rate": 4.998159841871688e-06, "loss": 1.9997, "step": 11345 }, { "epoch": 0.012326465627623446, "grad_norm": 45.35711669921875, "learning_rate": 4.9981582052854685e-06, "loss": 1.4837, "step": 11350 }, { "epoch": 0.012331895788692884, "grad_norm": 26.289274215698242, "learning_rate": 4.998156567972072e-06, "loss": 2.1523, "step": 11355 }, { "epoch": 0.012337325949762322, "grad_norm": 23.524972915649414, "learning_rate": 4.998154929931501e-06, "loss": 1.465, "step": 11360 }, { "epoch": 0.01234275611083176, "grad_norm": 32.737552642822266, "learning_rate": 4.998153291163755e-06, "loss": 1.2907, "step": 11365 }, { "epoch": 0.012348186271901197, "grad_norm": 103.12922668457031, "learning_rate": 4.998151651668834e-06, "loss": 1.7803, "step": 11370 }, { "epoch": 0.012353616432970635, "grad_norm": 17.149023056030273, "learning_rate": 4.9981500114467395e-06, "loss": 1.7881, "step": 11375 }, { "epoch": 0.012359046594040073, "grad_norm": 27.135658264160156, "learning_rate": 4.998148370497472e-06, "loss": 1.4124, "step": 11380 }, { "epoch": 0.01236447675510951, "grad_norm": 34.8885612487793, "learning_rate": 4.998146728821031e-06, "loss": 1.4938, "step": 11385 }, { "epoch": 0.012369906916178947, "grad_norm": 22.42924690246582, "learning_rate": 4.998145086417418e-06, "loss": 2.0944, "step": 11390 }, { "epoch": 0.012375337077248385, "grad_norm": 26.981523513793945, "learning_rate": 4.998143443286632e-06, "loss": 1.5393, "step": 11395 }, { "epoch": 0.012380767238317823, "grad_norm": 17.88446807861328, "learning_rate": 4.998141799428675e-06, "loss": 1.4376, "step": 11400 }, { "epoch": 0.012386197399387261, "grad_norm": 30.302562713623047, "learning_rate": 4.998140154843547e-06, "loss": 2.0072, "step": 11405 }, { "epoch": 0.012391627560456699, "grad_norm": 39.995418548583984, "learning_rate": 4.998138509531248e-06, "loss": 1.9643, "step": 11410 }, { "epoch": 0.012397057721526135, "grad_norm": 38.25883865356445, "learning_rate": 4.99813686349178e-06, "loss": 1.2495, "step": 11415 }, { "epoch": 0.012402487882595573, "grad_norm": 19.474834442138672, "learning_rate": 4.998135216725141e-06, "loss": 1.3494, "step": 11420 }, { "epoch": 0.012407918043665011, "grad_norm": 14.255511283874512, "learning_rate": 4.998133569231334e-06, "loss": 1.6664, "step": 11425 }, { "epoch": 0.01241334820473445, "grad_norm": 26.69709587097168, "learning_rate": 4.998131921010357e-06, "loss": 2.4515, "step": 11430 }, { "epoch": 0.012418778365803887, "grad_norm": 28.00982093811035, "learning_rate": 4.998130272062213e-06, "loss": 1.5261, "step": 11435 }, { "epoch": 0.012424208526873324, "grad_norm": 69.5927963256836, "learning_rate": 4.9981286223869e-06, "loss": 1.6914, "step": 11440 }, { "epoch": 0.012429638687942762, "grad_norm": 41.20991516113281, "learning_rate": 4.998126971984422e-06, "loss": 1.9922, "step": 11445 }, { "epoch": 0.0124350688490122, "grad_norm": 46.65325927734375, "learning_rate": 4.998125320854776e-06, "loss": 1.5095, "step": 11450 }, { "epoch": 0.012440499010081638, "grad_norm": 14.125570297241211, "learning_rate": 4.998123668997963e-06, "loss": 1.8445, "step": 11455 }, { "epoch": 0.012445929171151074, "grad_norm": 34.6286506652832, "learning_rate": 4.9981220164139855e-06, "loss": 1.5849, "step": 11460 }, { "epoch": 0.012451359332220512, "grad_norm": 22.021854400634766, "learning_rate": 4.9981203631028416e-06, "loss": 1.7114, "step": 11465 }, { "epoch": 0.01245678949328995, "grad_norm": 51.168216705322266, "learning_rate": 4.998118709064533e-06, "loss": 1.6034, "step": 11470 }, { "epoch": 0.012462219654359388, "grad_norm": 42.43317794799805, "learning_rate": 4.998117054299061e-06, "loss": 1.2278, "step": 11475 }, { "epoch": 0.012467649815428826, "grad_norm": 23.8902530670166, "learning_rate": 4.998115398806424e-06, "loss": 1.3917, "step": 11480 }, { "epoch": 0.012473079976498262, "grad_norm": 16.959901809692383, "learning_rate": 4.998113742586624e-06, "loss": 1.5776, "step": 11485 }, { "epoch": 0.0124785101375677, "grad_norm": 30.506685256958008, "learning_rate": 4.998112085639662e-06, "loss": 1.3495, "step": 11490 }, { "epoch": 0.012483940298637138, "grad_norm": 19.214412689208984, "learning_rate": 4.998110427965536e-06, "loss": 1.3483, "step": 11495 }, { "epoch": 0.012489370459706576, "grad_norm": 30.928956985473633, "learning_rate": 4.998108769564249e-06, "loss": 2.0915, "step": 11500 }, { "epoch": 0.012494800620776014, "grad_norm": 37.72391891479492, "learning_rate": 4.9981071104358e-06, "loss": 1.7538, "step": 11505 }, { "epoch": 0.01250023078184545, "grad_norm": 94.93775939941406, "learning_rate": 4.998105450580191e-06, "loss": 1.301, "step": 11510 }, { "epoch": 0.012505660942914889, "grad_norm": 39.87838363647461, "learning_rate": 4.998103789997421e-06, "loss": 1.391, "step": 11515 }, { "epoch": 0.012511091103984327, "grad_norm": 39.305511474609375, "learning_rate": 4.998102128687491e-06, "loss": 1.7049, "step": 11520 }, { "epoch": 0.012516521265053765, "grad_norm": 29.09282112121582, "learning_rate": 4.9981004666504004e-06, "loss": 1.9621, "step": 11525 }, { "epoch": 0.012521951426123201, "grad_norm": 16.29018783569336, "learning_rate": 4.998098803886152e-06, "loss": 1.6723, "step": 11530 }, { "epoch": 0.012527381587192639, "grad_norm": 48.829769134521484, "learning_rate": 4.998097140394745e-06, "loss": 2.027, "step": 11535 }, { "epoch": 0.012532811748262077, "grad_norm": 26.25273323059082, "learning_rate": 4.99809547617618e-06, "loss": 1.55, "step": 11540 }, { "epoch": 0.012538241909331515, "grad_norm": 28.2545223236084, "learning_rate": 4.998093811230457e-06, "loss": 1.7719, "step": 11545 }, { "epoch": 0.012543672070400953, "grad_norm": 33.3892822265625, "learning_rate": 4.998092145557577e-06, "loss": 1.7218, "step": 11550 }, { "epoch": 0.01254910223147039, "grad_norm": 51.73381423950195, "learning_rate": 4.99809047915754e-06, "loss": 1.5591, "step": 11555 }, { "epoch": 0.012554532392539827, "grad_norm": 37.13512420654297, "learning_rate": 4.998088812030348e-06, "loss": 1.3199, "step": 11560 }, { "epoch": 0.012559962553609265, "grad_norm": 43.09085464477539, "learning_rate": 4.9980871441759995e-06, "loss": 1.674, "step": 11565 }, { "epoch": 0.012565392714678703, "grad_norm": 40.09226608276367, "learning_rate": 4.9980854755944956e-06, "loss": 1.6811, "step": 11570 }, { "epoch": 0.012570822875748141, "grad_norm": 16.61285400390625, "learning_rate": 4.998083806285838e-06, "loss": 1.8532, "step": 11575 }, { "epoch": 0.012576253036817578, "grad_norm": 20.4953556060791, "learning_rate": 4.9980821362500255e-06, "loss": 1.8444, "step": 11580 }, { "epoch": 0.012581683197887016, "grad_norm": 20.53879737854004, "learning_rate": 4.9980804654870594e-06, "loss": 1.818, "step": 11585 }, { "epoch": 0.012587113358956454, "grad_norm": 43.40963363647461, "learning_rate": 4.99807879399694e-06, "loss": 1.9459, "step": 11590 }, { "epoch": 0.012592543520025892, "grad_norm": 13.946714401245117, "learning_rate": 4.9980771217796675e-06, "loss": 1.3672, "step": 11595 }, { "epoch": 0.012597973681095328, "grad_norm": 12.452985763549805, "learning_rate": 4.9980754488352434e-06, "loss": 1.6441, "step": 11600 }, { "epoch": 0.012603403842164766, "grad_norm": 22.324718475341797, "learning_rate": 4.998073775163668e-06, "loss": 1.7519, "step": 11605 }, { "epoch": 0.012608834003234204, "grad_norm": 26.5754337310791, "learning_rate": 4.99807210076494e-06, "loss": 1.7287, "step": 11610 }, { "epoch": 0.012614264164303642, "grad_norm": 51.173126220703125, "learning_rate": 4.998070425639062e-06, "loss": 1.3205, "step": 11615 }, { "epoch": 0.01261969432537308, "grad_norm": 16.97916603088379, "learning_rate": 4.9980687497860335e-06, "loss": 1.6457, "step": 11620 }, { "epoch": 0.012625124486442516, "grad_norm": 23.608413696289062, "learning_rate": 4.998067073205855e-06, "loss": 2.4034, "step": 11625 }, { "epoch": 0.012630554647511954, "grad_norm": 40.1147575378418, "learning_rate": 4.998065395898528e-06, "loss": 1.2856, "step": 11630 }, { "epoch": 0.012635984808581392, "grad_norm": 56.980377197265625, "learning_rate": 4.9980637178640524e-06, "loss": 1.622, "step": 11635 }, { "epoch": 0.01264141496965083, "grad_norm": 15.746199607849121, "learning_rate": 4.998062039102427e-06, "loss": 1.5295, "step": 11640 }, { "epoch": 0.012646845130720268, "grad_norm": 17.068279266357422, "learning_rate": 4.9980603596136546e-06, "loss": 0.9595, "step": 11645 }, { "epoch": 0.012652275291789705, "grad_norm": 29.697086334228516, "learning_rate": 4.998058679397735e-06, "loss": 2.0366, "step": 11650 }, { "epoch": 0.012657705452859143, "grad_norm": 26.449207305908203, "learning_rate": 4.9980569984546675e-06, "loss": 1.9657, "step": 11655 }, { "epoch": 0.01266313561392858, "grad_norm": 201.59654235839844, "learning_rate": 4.998055316784455e-06, "loss": 1.5134, "step": 11660 }, { "epoch": 0.012668565774998019, "grad_norm": 19.428781509399414, "learning_rate": 4.998053634387096e-06, "loss": 1.5398, "step": 11665 }, { "epoch": 0.012673995936067455, "grad_norm": 16.285371780395508, "learning_rate": 4.998051951262592e-06, "loss": 1.2342, "step": 11670 }, { "epoch": 0.012679426097136893, "grad_norm": 18.02349281311035, "learning_rate": 4.998050267410942e-06, "loss": 1.422, "step": 11675 }, { "epoch": 0.012684856258206331, "grad_norm": 23.195899963378906, "learning_rate": 4.9980485828321486e-06, "loss": 1.5027, "step": 11680 }, { "epoch": 0.012690286419275769, "grad_norm": 25.786632537841797, "learning_rate": 4.998046897526211e-06, "loss": 1.752, "step": 11685 }, { "epoch": 0.012695716580345207, "grad_norm": 19.436866760253906, "learning_rate": 4.99804521149313e-06, "loss": 1.4002, "step": 11690 }, { "epoch": 0.012701146741414643, "grad_norm": 125.53186798095703, "learning_rate": 4.998043524732906e-06, "loss": 1.7982, "step": 11695 }, { "epoch": 0.012706576902484081, "grad_norm": 63.047935485839844, "learning_rate": 4.998041837245539e-06, "loss": 1.7022, "step": 11700 }, { "epoch": 0.01271200706355352, "grad_norm": 17.881319046020508, "learning_rate": 4.99804014903103e-06, "loss": 1.8112, "step": 11705 }, { "epoch": 0.012717437224622957, "grad_norm": 26.068267822265625, "learning_rate": 4.99803846008938e-06, "loss": 1.6564, "step": 11710 }, { "epoch": 0.012722867385692395, "grad_norm": 93.0098876953125, "learning_rate": 4.9980367704205896e-06, "loss": 2.1897, "step": 11715 }, { "epoch": 0.012728297546761832, "grad_norm": 32.55135726928711, "learning_rate": 4.998035080024658e-06, "loss": 1.3098, "step": 11720 }, { "epoch": 0.01273372770783127, "grad_norm": 17.456844329833984, "learning_rate": 4.998033388901587e-06, "loss": 1.6201, "step": 11725 }, { "epoch": 0.012739157868900708, "grad_norm": 23.201305389404297, "learning_rate": 4.9980316970513754e-06, "loss": 2.0838, "step": 11730 }, { "epoch": 0.012744588029970146, "grad_norm": 32.59766387939453, "learning_rate": 4.998030004474025e-06, "loss": 2.2951, "step": 11735 }, { "epoch": 0.012750018191039582, "grad_norm": 16.05200958251953, "learning_rate": 4.998028311169537e-06, "loss": 2.0587, "step": 11740 }, { "epoch": 0.01275544835210902, "grad_norm": 18.21875762939453, "learning_rate": 4.99802661713791e-06, "loss": 1.7716, "step": 11745 }, { "epoch": 0.012760878513178458, "grad_norm": 34.08883285522461, "learning_rate": 4.998024922379146e-06, "loss": 1.8851, "step": 11750 }, { "epoch": 0.012766308674247896, "grad_norm": 51.39767837524414, "learning_rate": 4.998023226893245e-06, "loss": 1.2452, "step": 11755 }, { "epoch": 0.012771738835317334, "grad_norm": 110.37877655029297, "learning_rate": 4.9980215306802075e-06, "loss": 1.6115, "step": 11760 }, { "epoch": 0.01277716899638677, "grad_norm": 134.12831115722656, "learning_rate": 4.998019833740033e-06, "loss": 2.7605, "step": 11765 }, { "epoch": 0.012782599157456208, "grad_norm": 34.544036865234375, "learning_rate": 4.9980181360727245e-06, "loss": 1.5889, "step": 11770 }, { "epoch": 0.012788029318525646, "grad_norm": 46.89354705810547, "learning_rate": 4.99801643767828e-06, "loss": 1.3403, "step": 11775 }, { "epoch": 0.012793459479595084, "grad_norm": 30.082021713256836, "learning_rate": 4.998014738556701e-06, "loss": 1.675, "step": 11780 }, { "epoch": 0.012798889640664522, "grad_norm": 49.44233322143555, "learning_rate": 4.998013038707987e-06, "loss": 1.8705, "step": 11785 }, { "epoch": 0.012804319801733959, "grad_norm": 33.12400436401367, "learning_rate": 4.99801133813214e-06, "loss": 1.6798, "step": 11790 }, { "epoch": 0.012809749962803397, "grad_norm": 46.883426666259766, "learning_rate": 4.998009636829161e-06, "loss": 1.8033, "step": 11795 }, { "epoch": 0.012815180123872835, "grad_norm": 37.265445709228516, "learning_rate": 4.998007934799048e-06, "loss": 1.3575, "step": 11800 }, { "epoch": 0.012820610284942273, "grad_norm": 15.957152366638184, "learning_rate": 4.9980062320418045e-06, "loss": 1.4806, "step": 11805 }, { "epoch": 0.012826040446011709, "grad_norm": 21.16278648376465, "learning_rate": 4.998004528557428e-06, "loss": 1.5217, "step": 11810 }, { "epoch": 0.012831470607081147, "grad_norm": 18.9205322265625, "learning_rate": 4.998002824345921e-06, "loss": 1.2467, "step": 11815 }, { "epoch": 0.012836900768150585, "grad_norm": 121.61156463623047, "learning_rate": 4.998001119407283e-06, "loss": 1.7256, "step": 11820 }, { "epoch": 0.012842330929220023, "grad_norm": 20.748098373413086, "learning_rate": 4.997999413741515e-06, "loss": 2.2043, "step": 11825 }, { "epoch": 0.012847761090289461, "grad_norm": 19.97130584716797, "learning_rate": 4.997997707348617e-06, "loss": 1.438, "step": 11830 }, { "epoch": 0.012853191251358897, "grad_norm": 17.65506935119629, "learning_rate": 4.997996000228591e-06, "loss": 2.2344, "step": 11835 }, { "epoch": 0.012858621412428335, "grad_norm": 55.289878845214844, "learning_rate": 4.997994292381435e-06, "loss": 1.6132, "step": 11840 }, { "epoch": 0.012864051573497773, "grad_norm": 42.47261428833008, "learning_rate": 4.997992583807152e-06, "loss": 1.7594, "step": 11845 }, { "epoch": 0.012869481734567211, "grad_norm": 31.8128719329834, "learning_rate": 4.997990874505741e-06, "loss": 1.765, "step": 11850 }, { "epoch": 0.01287491189563665, "grad_norm": 43.95989990234375, "learning_rate": 4.997989164477203e-06, "loss": 2.3266, "step": 11855 }, { "epoch": 0.012880342056706085, "grad_norm": 62.479286193847656, "learning_rate": 4.997987453721538e-06, "loss": 1.6659, "step": 11860 }, { "epoch": 0.012885772217775524, "grad_norm": 14.552276611328125, "learning_rate": 4.997985742238747e-06, "loss": 1.4299, "step": 11865 }, { "epoch": 0.012891202378844962, "grad_norm": 17.212684631347656, "learning_rate": 4.99798403002883e-06, "loss": 1.5251, "step": 11870 }, { "epoch": 0.0128966325399144, "grad_norm": 22.488256454467773, "learning_rate": 4.997982317091788e-06, "loss": 1.5677, "step": 11875 }, { "epoch": 0.012902062700983836, "grad_norm": 35.09349822998047, "learning_rate": 4.997980603427622e-06, "loss": 2.4387, "step": 11880 }, { "epoch": 0.012907492862053274, "grad_norm": 19.160808563232422, "learning_rate": 4.997978889036331e-06, "loss": 1.469, "step": 11885 }, { "epoch": 0.012912923023122712, "grad_norm": 24.332738876342773, "learning_rate": 4.997977173917917e-06, "loss": 1.6776, "step": 11890 }, { "epoch": 0.01291835318419215, "grad_norm": 14.841110229492188, "learning_rate": 4.99797545807238e-06, "loss": 2.0948, "step": 11895 }, { "epoch": 0.012923783345261588, "grad_norm": 65.31137084960938, "learning_rate": 4.9979737414997196e-06, "loss": 2.2925, "step": 11900 }, { "epoch": 0.012929213506331024, "grad_norm": 126.62967681884766, "learning_rate": 4.9979720241999375e-06, "loss": 1.4794, "step": 11905 }, { "epoch": 0.012934643667400462, "grad_norm": 13.050639152526855, "learning_rate": 4.997970306173034e-06, "loss": 2.0015, "step": 11910 }, { "epoch": 0.0129400738284699, "grad_norm": 13.965747833251953, "learning_rate": 4.9979685874190085e-06, "loss": 2.0359, "step": 11915 }, { "epoch": 0.012945503989539338, "grad_norm": 23.32206916809082, "learning_rate": 4.997966867937863e-06, "loss": 1.6472, "step": 11920 }, { "epoch": 0.012950934150608776, "grad_norm": 18.487773895263672, "learning_rate": 4.997965147729598e-06, "loss": 1.8169, "step": 11925 }, { "epoch": 0.012956364311678212, "grad_norm": 17.39305305480957, "learning_rate": 4.997963426794212e-06, "loss": 1.6623, "step": 11930 }, { "epoch": 0.01296179447274765, "grad_norm": 15.14924430847168, "learning_rate": 4.997961705131707e-06, "loss": 1.4252, "step": 11935 }, { "epoch": 0.012967224633817089, "grad_norm": 40.08971405029297, "learning_rate": 4.997959982742084e-06, "loss": 1.9528, "step": 11940 }, { "epoch": 0.012972654794886527, "grad_norm": 64.30595397949219, "learning_rate": 4.997958259625343e-06, "loss": 1.2886, "step": 11945 }, { "epoch": 0.012978084955955963, "grad_norm": 24.623104095458984, "learning_rate": 4.997956535781484e-06, "loss": 2.2056, "step": 11950 }, { "epoch": 0.0129835151170254, "grad_norm": 17.523555755615234, "learning_rate": 4.997954811210508e-06, "loss": 1.6063, "step": 11955 }, { "epoch": 0.012988945278094839, "grad_norm": 22.440242767333984, "learning_rate": 4.997953085912415e-06, "loss": 1.7749, "step": 11960 }, { "epoch": 0.012994375439164277, "grad_norm": 59.13945770263672, "learning_rate": 4.997951359887206e-06, "loss": 1.5246, "step": 11965 }, { "epoch": 0.012999805600233715, "grad_norm": 20.6509952545166, "learning_rate": 4.9979496331348815e-06, "loss": 1.3344, "step": 11970 }, { "epoch": 0.013005235761303151, "grad_norm": 40.32819366455078, "learning_rate": 4.997947905655441e-06, "loss": 1.938, "step": 11975 }, { "epoch": 0.01301066592237259, "grad_norm": 13.866988182067871, "learning_rate": 4.997946177448888e-06, "loss": 1.8259, "step": 11980 }, { "epoch": 0.013016096083442027, "grad_norm": 27.31109619140625, "learning_rate": 4.997944448515219e-06, "loss": 1.6974, "step": 11985 }, { "epoch": 0.013021526244511465, "grad_norm": 36.95524215698242, "learning_rate": 4.997942718854437e-06, "loss": 2.0918, "step": 11990 }, { "epoch": 0.013026956405580903, "grad_norm": 63.89656066894531, "learning_rate": 4.997940988466542e-06, "loss": 1.618, "step": 11995 }, { "epoch": 0.01303238656665034, "grad_norm": 41.251705169677734, "learning_rate": 4.997939257351534e-06, "loss": 1.3471, "step": 12000 }, { "epoch": 0.013037816727719777, "grad_norm": 29.091644287109375, "learning_rate": 4.997937525509415e-06, "loss": 2.1097, "step": 12005 }, { "epoch": 0.013043246888789215, "grad_norm": 19.796737670898438, "learning_rate": 4.997935792940183e-06, "loss": 1.9446, "step": 12010 }, { "epoch": 0.013048677049858654, "grad_norm": 33.09740447998047, "learning_rate": 4.99793405964384e-06, "loss": 1.439, "step": 12015 }, { "epoch": 0.01305410721092809, "grad_norm": 47.13689422607422, "learning_rate": 4.997932325620387e-06, "loss": 0.6757, "step": 12020 }, { "epoch": 0.013059537371997528, "grad_norm": 19.549434661865234, "learning_rate": 4.9979305908698235e-06, "loss": 1.9921, "step": 12025 }, { "epoch": 0.013064967533066966, "grad_norm": 22.033212661743164, "learning_rate": 4.99792885539215e-06, "loss": 1.4533, "step": 12030 }, { "epoch": 0.013070397694136404, "grad_norm": 41.65156555175781, "learning_rate": 4.997927119187368e-06, "loss": 1.7044, "step": 12035 }, { "epoch": 0.013075827855205842, "grad_norm": 22.83566665649414, "learning_rate": 4.997925382255477e-06, "loss": 2.0824, "step": 12040 }, { "epoch": 0.013081258016275278, "grad_norm": 25.135154724121094, "learning_rate": 4.997923644596479e-06, "loss": 1.3366, "step": 12045 }, { "epoch": 0.013086688177344716, "grad_norm": 22.684022903442383, "learning_rate": 4.997921906210373e-06, "loss": 1.2761, "step": 12050 }, { "epoch": 0.013092118338414154, "grad_norm": 16.399490356445312, "learning_rate": 4.99792016709716e-06, "loss": 1.3489, "step": 12055 }, { "epoch": 0.013097548499483592, "grad_norm": 22.43840789794922, "learning_rate": 4.99791842725684e-06, "loss": 1.6565, "step": 12060 }, { "epoch": 0.01310297866055303, "grad_norm": 91.77362823486328, "learning_rate": 4.9979166866894135e-06, "loss": 1.3836, "step": 12065 }, { "epoch": 0.013108408821622466, "grad_norm": 20.160701751708984, "learning_rate": 4.997914945394883e-06, "loss": 1.4695, "step": 12070 }, { "epoch": 0.013113838982691904, "grad_norm": 51.39616394042969, "learning_rate": 4.997913203373246e-06, "loss": 1.7791, "step": 12075 }, { "epoch": 0.013119269143761342, "grad_norm": 19.010108947753906, "learning_rate": 4.997911460624505e-06, "loss": 1.8726, "step": 12080 }, { "epoch": 0.01312469930483078, "grad_norm": 13.778695106506348, "learning_rate": 4.99790971714866e-06, "loss": 2.4429, "step": 12085 }, { "epoch": 0.013130129465900217, "grad_norm": 19.96917724609375, "learning_rate": 4.997907972945711e-06, "loss": 2.1465, "step": 12090 }, { "epoch": 0.013135559626969655, "grad_norm": 20.656179428100586, "learning_rate": 4.99790622801566e-06, "loss": 1.3548, "step": 12095 }, { "epoch": 0.013140989788039093, "grad_norm": 22.681264877319336, "learning_rate": 4.997904482358506e-06, "loss": 1.1498, "step": 12100 }, { "epoch": 0.01314641994910853, "grad_norm": 21.79560089111328, "learning_rate": 4.99790273597425e-06, "loss": 1.9682, "step": 12105 }, { "epoch": 0.013151850110177969, "grad_norm": 24.608274459838867, "learning_rate": 4.997900988862893e-06, "loss": 1.7355, "step": 12110 }, { "epoch": 0.013157280271247405, "grad_norm": 65.80792236328125, "learning_rate": 4.9978992410244345e-06, "loss": 2.7023, "step": 12115 }, { "epoch": 0.013162710432316843, "grad_norm": 14.813632011413574, "learning_rate": 4.997897492458875e-06, "loss": 1.3021, "step": 12120 }, { "epoch": 0.013168140593386281, "grad_norm": 14.350610733032227, "learning_rate": 4.997895743166216e-06, "loss": 1.4882, "step": 12125 }, { "epoch": 0.01317357075445572, "grad_norm": 24.028751373291016, "learning_rate": 4.997893993146458e-06, "loss": 1.7658, "step": 12130 }, { "epoch": 0.013179000915525157, "grad_norm": 24.061012268066406, "learning_rate": 4.9978922423996e-06, "loss": 1.8824, "step": 12135 }, { "epoch": 0.013184431076594593, "grad_norm": 44.96529769897461, "learning_rate": 4.9978904909256444e-06, "loss": 1.3658, "step": 12140 }, { "epoch": 0.013189861237664031, "grad_norm": 28.473316192626953, "learning_rate": 4.9978887387245914e-06, "loss": 1.3191, "step": 12145 }, { "epoch": 0.01319529139873347, "grad_norm": 39.91132354736328, "learning_rate": 4.99788698579644e-06, "loss": 1.7544, "step": 12150 }, { "epoch": 0.013200721559802907, "grad_norm": 44.972660064697266, "learning_rate": 4.997885232141192e-06, "loss": 1.3848, "step": 12155 }, { "epoch": 0.013206151720872344, "grad_norm": 329.6860046386719, "learning_rate": 4.9978834777588485e-06, "loss": 1.8476, "step": 12160 }, { "epoch": 0.013211581881941782, "grad_norm": 27.17206573486328, "learning_rate": 4.997881722649408e-06, "loss": 1.728, "step": 12165 }, { "epoch": 0.01321701204301122, "grad_norm": 114.01722717285156, "learning_rate": 4.997879966812872e-06, "loss": 1.399, "step": 12170 }, { "epoch": 0.013222442204080658, "grad_norm": 202.9072723388672, "learning_rate": 4.9978782102492415e-06, "loss": 1.7225, "step": 12175 }, { "epoch": 0.013227872365150096, "grad_norm": 65.99710083007812, "learning_rate": 4.9978764529585175e-06, "loss": 1.3968, "step": 12180 }, { "epoch": 0.013233302526219532, "grad_norm": 22.051074981689453, "learning_rate": 4.997874694940699e-06, "loss": 1.9719, "step": 12185 }, { "epoch": 0.01323873268728897, "grad_norm": 15.953997611999512, "learning_rate": 4.997872936195787e-06, "loss": 1.4405, "step": 12190 }, { "epoch": 0.013244162848358408, "grad_norm": 22.739187240600586, "learning_rate": 4.997871176723782e-06, "loss": 1.8762, "step": 12195 }, { "epoch": 0.013249593009427846, "grad_norm": 65.63195037841797, "learning_rate": 4.997869416524685e-06, "loss": 2.0734, "step": 12200 }, { "epoch": 0.013255023170497284, "grad_norm": 43.41096496582031, "learning_rate": 4.997867655598496e-06, "loss": 1.6976, "step": 12205 }, { "epoch": 0.01326045333156672, "grad_norm": 19.142093658447266, "learning_rate": 4.997865893945217e-06, "loss": 1.4551, "step": 12210 }, { "epoch": 0.013265883492636158, "grad_norm": 21.836135864257812, "learning_rate": 4.997864131564846e-06, "loss": 1.6561, "step": 12215 }, { "epoch": 0.013271313653705596, "grad_norm": 16.44571304321289, "learning_rate": 4.997862368457385e-06, "loss": 1.4829, "step": 12220 }, { "epoch": 0.013276743814775034, "grad_norm": 27.698606491088867, "learning_rate": 4.997860604622833e-06, "loss": 1.515, "step": 12225 }, { "epoch": 0.01328217397584447, "grad_norm": 12.198974609375, "learning_rate": 4.997858840061194e-06, "loss": 1.7638, "step": 12230 }, { "epoch": 0.013287604136913909, "grad_norm": 17.080806732177734, "learning_rate": 4.997857074772464e-06, "loss": 1.6851, "step": 12235 }, { "epoch": 0.013293034297983347, "grad_norm": 21.2011775970459, "learning_rate": 4.997855308756648e-06, "loss": 1.747, "step": 12240 }, { "epoch": 0.013298464459052785, "grad_norm": 98.2800064086914, "learning_rate": 4.997853542013743e-06, "loss": 1.59, "step": 12245 }, { "epoch": 0.013303894620122223, "grad_norm": 83.12832641601562, "learning_rate": 4.997851774543752e-06, "loss": 1.8197, "step": 12250 }, { "epoch": 0.013309324781191659, "grad_norm": 26.31748390197754, "learning_rate": 4.997850006346674e-06, "loss": 1.5557, "step": 12255 }, { "epoch": 0.013314754942261097, "grad_norm": 14.987446784973145, "learning_rate": 4.997848237422509e-06, "loss": 1.5458, "step": 12260 }, { "epoch": 0.013320185103330535, "grad_norm": 19.580503463745117, "learning_rate": 4.997846467771259e-06, "loss": 2.5879, "step": 12265 }, { "epoch": 0.013325615264399973, "grad_norm": 18.365598678588867, "learning_rate": 4.997844697392924e-06, "loss": 1.6968, "step": 12270 }, { "epoch": 0.013331045425469411, "grad_norm": 16.121416091918945, "learning_rate": 4.997842926287504e-06, "loss": 1.9362, "step": 12275 }, { "epoch": 0.013336475586538847, "grad_norm": 15.06814193725586, "learning_rate": 4.997841154455e-06, "loss": 1.4289, "step": 12280 }, { "epoch": 0.013341905747608285, "grad_norm": 12.333553314208984, "learning_rate": 4.997839381895412e-06, "loss": 1.5088, "step": 12285 }, { "epoch": 0.013347335908677723, "grad_norm": 58.19549560546875, "learning_rate": 4.997837608608742e-06, "loss": 1.2539, "step": 12290 }, { "epoch": 0.013352766069747161, "grad_norm": 13.137247085571289, "learning_rate": 4.997835834594988e-06, "loss": 1.7633, "step": 12295 }, { "epoch": 0.013358196230816598, "grad_norm": 13.52885913848877, "learning_rate": 4.997834059854154e-06, "loss": 1.9301, "step": 12300 }, { "epoch": 0.013363626391886036, "grad_norm": 17.473663330078125, "learning_rate": 4.997832284386237e-06, "loss": 1.3446, "step": 12305 }, { "epoch": 0.013369056552955474, "grad_norm": 35.29755401611328, "learning_rate": 4.99783050819124e-06, "loss": 2.3784, "step": 12310 }, { "epoch": 0.013374486714024912, "grad_norm": 54.13011932373047, "learning_rate": 4.997828731269162e-06, "loss": 2.4317, "step": 12315 }, { "epoch": 0.01337991687509435, "grad_norm": 12.366741180419922, "learning_rate": 4.997826953620004e-06, "loss": 1.9724, "step": 12320 }, { "epoch": 0.013385347036163786, "grad_norm": 21.33897590637207, "learning_rate": 4.997825175243767e-06, "loss": 1.4741, "step": 12325 }, { "epoch": 0.013390777197233224, "grad_norm": 34.83774948120117, "learning_rate": 4.99782339614045e-06, "loss": 1.4083, "step": 12330 }, { "epoch": 0.013396207358302662, "grad_norm": 18.508798599243164, "learning_rate": 4.997821616310056e-06, "loss": 1.6714, "step": 12335 }, { "epoch": 0.0134016375193721, "grad_norm": 173.4017333984375, "learning_rate": 4.997819835752583e-06, "loss": 2.9883, "step": 12340 }, { "epoch": 0.013407067680441536, "grad_norm": 54.266029357910156, "learning_rate": 4.9978180544680334e-06, "loss": 1.5087, "step": 12345 }, { "epoch": 0.013412497841510974, "grad_norm": 68.43119049072266, "learning_rate": 4.997816272456407e-06, "loss": 1.2966, "step": 12350 }, { "epoch": 0.013417928002580412, "grad_norm": 67.3568344116211, "learning_rate": 4.997814489717704e-06, "loss": 2.1443, "step": 12355 }, { "epoch": 0.01342335816364985, "grad_norm": 26.397480010986328, "learning_rate": 4.997812706251925e-06, "loss": 2.0593, "step": 12360 }, { "epoch": 0.013428788324719288, "grad_norm": 107.54570770263672, "learning_rate": 4.9978109220590715e-06, "loss": 1.2165, "step": 12365 }, { "epoch": 0.013434218485788725, "grad_norm": 25.031965255737305, "learning_rate": 4.997809137139142e-06, "loss": 2.1367, "step": 12370 }, { "epoch": 0.013439648646858163, "grad_norm": 24.251293182373047, "learning_rate": 4.99780735149214e-06, "loss": 1.7248, "step": 12375 }, { "epoch": 0.0134450788079276, "grad_norm": 21.83884048461914, "learning_rate": 4.997805565118063e-06, "loss": 1.5745, "step": 12380 }, { "epoch": 0.013450508968997039, "grad_norm": 21.33796501159668, "learning_rate": 4.997803778016913e-06, "loss": 1.8597, "step": 12385 }, { "epoch": 0.013455939130066477, "grad_norm": 36.12498474121094, "learning_rate": 4.997801990188691e-06, "loss": 2.1431, "step": 12390 }, { "epoch": 0.013461369291135913, "grad_norm": 18.9975643157959, "learning_rate": 4.997800201633396e-06, "loss": 2.2468, "step": 12395 }, { "epoch": 0.013466799452205351, "grad_norm": 31.482011795043945, "learning_rate": 4.9977984123510295e-06, "loss": 1.8593, "step": 12400 }, { "epoch": 0.013472229613274789, "grad_norm": 14.73244571685791, "learning_rate": 4.997796622341592e-06, "loss": 1.4523, "step": 12405 }, { "epoch": 0.013477659774344227, "grad_norm": 15.796283721923828, "learning_rate": 4.997794831605084e-06, "loss": 2.3059, "step": 12410 }, { "epoch": 0.013483089935413663, "grad_norm": 19.413150787353516, "learning_rate": 4.997793040141505e-06, "loss": 1.4848, "step": 12415 }, { "epoch": 0.013488520096483101, "grad_norm": 27.553869247436523, "learning_rate": 4.997791247950858e-06, "loss": 1.7045, "step": 12420 }, { "epoch": 0.01349395025755254, "grad_norm": 42.20549774169922, "learning_rate": 4.99778945503314e-06, "loss": 1.6634, "step": 12425 }, { "epoch": 0.013499380418621977, "grad_norm": 56.94994354248047, "learning_rate": 4.997787661388356e-06, "loss": 0.9292, "step": 12430 }, { "epoch": 0.013504810579691415, "grad_norm": 17.884685516357422, "learning_rate": 4.997785867016502e-06, "loss": 1.1628, "step": 12435 }, { "epoch": 0.013510240740760852, "grad_norm": 43.032527923583984, "learning_rate": 4.997784071917582e-06, "loss": 2.1478, "step": 12440 }, { "epoch": 0.01351567090183029, "grad_norm": 18.008541107177734, "learning_rate": 4.997782276091594e-06, "loss": 1.6151, "step": 12445 }, { "epoch": 0.013521101062899728, "grad_norm": 19.682085037231445, "learning_rate": 4.99778047953854e-06, "loss": 1.7771, "step": 12450 }, { "epoch": 0.013526531223969166, "grad_norm": 17.778301239013672, "learning_rate": 4.99777868225842e-06, "loss": 1.5746, "step": 12455 }, { "epoch": 0.013531961385038604, "grad_norm": 17.783105850219727, "learning_rate": 4.9977768842512345e-06, "loss": 1.3404, "step": 12460 }, { "epoch": 0.01353739154610804, "grad_norm": 21.083959579467773, "learning_rate": 4.997775085516985e-06, "loss": 1.385, "step": 12465 }, { "epoch": 0.013542821707177478, "grad_norm": 19.011112213134766, "learning_rate": 4.99777328605567e-06, "loss": 1.5708, "step": 12470 }, { "epoch": 0.013548251868246916, "grad_norm": 35.10181427001953, "learning_rate": 4.997771485867292e-06, "loss": 2.2358, "step": 12475 }, { "epoch": 0.013553682029316354, "grad_norm": 20.24428367614746, "learning_rate": 4.99776968495185e-06, "loss": 2.0084, "step": 12480 }, { "epoch": 0.01355911219038579, "grad_norm": 224.1102752685547, "learning_rate": 4.997767883309346e-06, "loss": 2.0183, "step": 12485 }, { "epoch": 0.013564542351455228, "grad_norm": 13.202455520629883, "learning_rate": 4.997766080939779e-06, "loss": 1.6081, "step": 12490 }, { "epoch": 0.013569972512524666, "grad_norm": 76.23111724853516, "learning_rate": 4.9977642778431516e-06, "loss": 1.6614, "step": 12495 }, { "epoch": 0.013575402673594104, "grad_norm": 11.702932357788086, "learning_rate": 4.997762474019462e-06, "loss": 1.6756, "step": 12500 }, { "epoch": 0.013580832834663542, "grad_norm": 23.89158821105957, "learning_rate": 4.997760669468712e-06, "loss": 1.7237, "step": 12505 }, { "epoch": 0.013586262995732979, "grad_norm": 98.55681610107422, "learning_rate": 4.997758864190901e-06, "loss": 1.8139, "step": 12510 }, { "epoch": 0.013591693156802417, "grad_norm": 19.291732788085938, "learning_rate": 4.997757058186031e-06, "loss": 1.4885, "step": 12515 }, { "epoch": 0.013597123317871855, "grad_norm": 23.405269622802734, "learning_rate": 4.997755251454103e-06, "loss": 0.9663, "step": 12520 }, { "epoch": 0.013602553478941293, "grad_norm": 33.694427490234375, "learning_rate": 4.997753443995115e-06, "loss": 1.6263, "step": 12525 }, { "epoch": 0.01360798364001073, "grad_norm": 19.123310089111328, "learning_rate": 4.99775163580907e-06, "loss": 1.2237, "step": 12530 }, { "epoch": 0.013613413801080167, "grad_norm": 52.43777084350586, "learning_rate": 4.997749826895967e-06, "loss": 1.8445, "step": 12535 }, { "epoch": 0.013618843962149605, "grad_norm": 24.358190536499023, "learning_rate": 4.997748017255807e-06, "loss": 2.0393, "step": 12540 }, { "epoch": 0.013624274123219043, "grad_norm": 20.27357292175293, "learning_rate": 4.997746206888591e-06, "loss": 1.8332, "step": 12545 }, { "epoch": 0.013629704284288481, "grad_norm": 108.03236389160156, "learning_rate": 4.9977443957943185e-06, "loss": 1.6452, "step": 12550 }, { "epoch": 0.013635134445357917, "grad_norm": 25.223060607910156, "learning_rate": 4.9977425839729906e-06, "loss": 1.5606, "step": 12555 }, { "epoch": 0.013640564606427355, "grad_norm": 24.40560531616211, "learning_rate": 4.997740771424608e-06, "loss": 1.9401, "step": 12560 }, { "epoch": 0.013645994767496793, "grad_norm": 36.352352142333984, "learning_rate": 4.997738958149171e-06, "loss": 1.7143, "step": 12565 }, { "epoch": 0.013651424928566231, "grad_norm": 23.303382873535156, "learning_rate": 4.99773714414668e-06, "loss": 1.7062, "step": 12570 }, { "epoch": 0.01365685508963567, "grad_norm": 17.133987426757812, "learning_rate": 4.997735329417136e-06, "loss": 2.1366, "step": 12575 }, { "epoch": 0.013662285250705106, "grad_norm": 62.78969955444336, "learning_rate": 4.99773351396054e-06, "loss": 1.5724, "step": 12580 }, { "epoch": 0.013667715411774544, "grad_norm": 23.055208206176758, "learning_rate": 4.997731697776891e-06, "loss": 1.9508, "step": 12585 }, { "epoch": 0.013673145572843982, "grad_norm": 108.10824584960938, "learning_rate": 4.99772988086619e-06, "loss": 1.8754, "step": 12590 }, { "epoch": 0.01367857573391342, "grad_norm": 186.54808044433594, "learning_rate": 4.997728063228438e-06, "loss": 2.2514, "step": 12595 }, { "epoch": 0.013684005894982858, "grad_norm": 32.24524688720703, "learning_rate": 4.997726244863635e-06, "loss": 1.3277, "step": 12600 }, { "epoch": 0.013689436056052294, "grad_norm": 18.841785430908203, "learning_rate": 4.997724425771783e-06, "loss": 2.184, "step": 12605 }, { "epoch": 0.013694866217121732, "grad_norm": 19.713876724243164, "learning_rate": 4.997722605952881e-06, "loss": 1.5531, "step": 12610 }, { "epoch": 0.01370029637819117, "grad_norm": 14.731682777404785, "learning_rate": 4.99772078540693e-06, "loss": 1.7058, "step": 12615 }, { "epoch": 0.013705726539260608, "grad_norm": 20.768041610717773, "learning_rate": 4.9977189641339305e-06, "loss": 1.4739, "step": 12620 }, { "epoch": 0.013711156700330044, "grad_norm": 13.446596145629883, "learning_rate": 4.997717142133882e-06, "loss": 1.5458, "step": 12625 }, { "epoch": 0.013716586861399482, "grad_norm": 41.60920333862305, "learning_rate": 4.997715319406787e-06, "loss": 1.7136, "step": 12630 }, { "epoch": 0.01372201702246892, "grad_norm": 28.317941665649414, "learning_rate": 4.997713495952645e-06, "loss": 1.5798, "step": 12635 }, { "epoch": 0.013727447183538358, "grad_norm": 39.098697662353516, "learning_rate": 4.9977116717714565e-06, "loss": 1.9062, "step": 12640 }, { "epoch": 0.013732877344607796, "grad_norm": 28.73493766784668, "learning_rate": 4.997709846863222e-06, "loss": 1.6201, "step": 12645 }, { "epoch": 0.013738307505677233, "grad_norm": 14.872146606445312, "learning_rate": 4.997708021227943e-06, "loss": 1.6421, "step": 12650 }, { "epoch": 0.01374373766674667, "grad_norm": 239.02261352539062, "learning_rate": 4.997706194865618e-06, "loss": 2.1, "step": 12655 }, { "epoch": 0.013749167827816109, "grad_norm": 15.017864227294922, "learning_rate": 4.99770436777625e-06, "loss": 1.4586, "step": 12660 }, { "epoch": 0.013754597988885547, "grad_norm": 28.71395492553711, "learning_rate": 4.997702539959837e-06, "loss": 2.2319, "step": 12665 }, { "epoch": 0.013760028149954985, "grad_norm": 16.00248146057129, "learning_rate": 4.997700711416381e-06, "loss": 1.178, "step": 12670 }, { "epoch": 0.013765458311024421, "grad_norm": 51.627098083496094, "learning_rate": 4.997698882145882e-06, "loss": 1.9624, "step": 12675 }, { "epoch": 0.013770888472093859, "grad_norm": 21.92610740661621, "learning_rate": 4.997697052148343e-06, "loss": 1.6233, "step": 12680 }, { "epoch": 0.013776318633163297, "grad_norm": 101.39983367919922, "learning_rate": 4.99769522142376e-06, "loss": 2.6059, "step": 12685 }, { "epoch": 0.013781748794232735, "grad_norm": 29.88629913330078, "learning_rate": 4.997693389972137e-06, "loss": 2.7231, "step": 12690 }, { "epoch": 0.013787178955302171, "grad_norm": 15.439696311950684, "learning_rate": 4.9976915577934736e-06, "loss": 1.9347, "step": 12695 }, { "epoch": 0.01379260911637161, "grad_norm": 33.594444274902344, "learning_rate": 4.99768972488777e-06, "loss": 2.3186, "step": 12700 }, { "epoch": 0.013798039277441047, "grad_norm": 30.169723510742188, "learning_rate": 4.997687891255026e-06, "loss": 1.4324, "step": 12705 }, { "epoch": 0.013803469438510485, "grad_norm": 18.17859649658203, "learning_rate": 4.997686056895244e-06, "loss": 1.3895, "step": 12710 }, { "epoch": 0.013808899599579923, "grad_norm": 15.486205101013184, "learning_rate": 4.997684221808423e-06, "loss": 2.0235, "step": 12715 }, { "epoch": 0.01381432976064936, "grad_norm": 19.302766799926758, "learning_rate": 4.997682385994565e-06, "loss": 1.4574, "step": 12720 }, { "epoch": 0.013819759921718798, "grad_norm": 22.680461883544922, "learning_rate": 4.997680549453669e-06, "loss": 2.2732, "step": 12725 }, { "epoch": 0.013825190082788236, "grad_norm": 15.25006103515625, "learning_rate": 4.997678712185737e-06, "loss": 1.5147, "step": 12730 }, { "epoch": 0.013830620243857674, "grad_norm": 13.454582214355469, "learning_rate": 4.997676874190768e-06, "loss": 1.5045, "step": 12735 }, { "epoch": 0.013836050404927112, "grad_norm": 17.973804473876953, "learning_rate": 4.997675035468763e-06, "loss": 1.3232, "step": 12740 }, { "epoch": 0.013841480565996548, "grad_norm": 26.976238250732422, "learning_rate": 4.997673196019723e-06, "loss": 1.28, "step": 12745 }, { "epoch": 0.013846910727065986, "grad_norm": 26.3631591796875, "learning_rate": 4.997671355843649e-06, "loss": 2.0792, "step": 12750 }, { "epoch": 0.013852340888135424, "grad_norm": 18.47333335876465, "learning_rate": 4.997669514940541e-06, "loss": 1.6483, "step": 12755 }, { "epoch": 0.013857771049204862, "grad_norm": 43.37543869018555, "learning_rate": 4.997667673310398e-06, "loss": 1.4472, "step": 12760 }, { "epoch": 0.013863201210274298, "grad_norm": 60.56368637084961, "learning_rate": 4.997665830953223e-06, "loss": 1.4575, "step": 12765 }, { "epoch": 0.013868631371343736, "grad_norm": 32.327762603759766, "learning_rate": 4.9976639878690155e-06, "loss": 2.1555, "step": 12770 }, { "epoch": 0.013874061532413174, "grad_norm": 30.21219825744629, "learning_rate": 4.997662144057776e-06, "loss": 1.6303, "step": 12775 }, { "epoch": 0.013879491693482612, "grad_norm": 38.36017990112305, "learning_rate": 4.9976602995195045e-06, "loss": 1.6021, "step": 12780 }, { "epoch": 0.01388492185455205, "grad_norm": 38.41847610473633, "learning_rate": 4.997658454254202e-06, "loss": 1.4745, "step": 12785 }, { "epoch": 0.013890352015621487, "grad_norm": 34.64310073852539, "learning_rate": 4.997656608261869e-06, "loss": 1.1997, "step": 12790 }, { "epoch": 0.013895782176690925, "grad_norm": 45.96076583862305, "learning_rate": 4.997654761542508e-06, "loss": 1.4399, "step": 12795 }, { "epoch": 0.013901212337760363, "grad_norm": 13.335958480834961, "learning_rate": 4.997652914096116e-06, "loss": 1.6875, "step": 12800 }, { "epoch": 0.0139066424988298, "grad_norm": 13.194109916687012, "learning_rate": 4.997651065922695e-06, "loss": 1.9175, "step": 12805 }, { "epoch": 0.013912072659899239, "grad_norm": 20.157812118530273, "learning_rate": 4.997649217022247e-06, "loss": 1.2952, "step": 12810 }, { "epoch": 0.013917502820968675, "grad_norm": 15.101015090942383, "learning_rate": 4.99764736739477e-06, "loss": 2.1715, "step": 12815 }, { "epoch": 0.013922932982038113, "grad_norm": 16.278310775756836, "learning_rate": 4.997645517040267e-06, "loss": 1.5529, "step": 12820 }, { "epoch": 0.013928363143107551, "grad_norm": 16.77041244506836, "learning_rate": 4.997643665958737e-06, "loss": 1.7347, "step": 12825 }, { "epoch": 0.013933793304176989, "grad_norm": 29.5686092376709, "learning_rate": 4.99764181415018e-06, "loss": 1.6856, "step": 12830 }, { "epoch": 0.013939223465246425, "grad_norm": 21.144058227539062, "learning_rate": 4.9976399616145985e-06, "loss": 1.115, "step": 12835 }, { "epoch": 0.013944653626315863, "grad_norm": 14.12763786315918, "learning_rate": 4.997638108351992e-06, "loss": 1.9836, "step": 12840 }, { "epoch": 0.013950083787385301, "grad_norm": 245.5764617919922, "learning_rate": 4.997636254362361e-06, "loss": 1.5434, "step": 12845 }, { "epoch": 0.01395551394845474, "grad_norm": 21.516841888427734, "learning_rate": 4.997634399645706e-06, "loss": 2.1582, "step": 12850 }, { "epoch": 0.013960944109524177, "grad_norm": 22.71213150024414, "learning_rate": 4.997632544202028e-06, "loss": 1.4755, "step": 12855 }, { "epoch": 0.013966374270593614, "grad_norm": 12.431103706359863, "learning_rate": 4.997630688031326e-06, "loss": 1.9895, "step": 12860 }, { "epoch": 0.013971804431663052, "grad_norm": 24.56557273864746, "learning_rate": 4.997628831133603e-06, "loss": 1.5617, "step": 12865 }, { "epoch": 0.01397723459273249, "grad_norm": 43.48880386352539, "learning_rate": 4.997626973508858e-06, "loss": 1.626, "step": 12870 }, { "epoch": 0.013982664753801928, "grad_norm": 79.48741149902344, "learning_rate": 4.997625115157091e-06, "loss": 1.2664, "step": 12875 }, { "epoch": 0.013988094914871366, "grad_norm": 16.128820419311523, "learning_rate": 4.997623256078305e-06, "loss": 2.0809, "step": 12880 }, { "epoch": 0.013993525075940802, "grad_norm": 54.87763595581055, "learning_rate": 4.997621396272497e-06, "loss": 1.5845, "step": 12885 }, { "epoch": 0.01399895523701024, "grad_norm": 24.2255916595459, "learning_rate": 4.99761953573967e-06, "loss": 1.7029, "step": 12890 }, { "epoch": 0.014004385398079678, "grad_norm": 69.48719787597656, "learning_rate": 4.9976176744798244e-06, "loss": 3.0374, "step": 12895 }, { "epoch": 0.014009815559149116, "grad_norm": 18.999916076660156, "learning_rate": 4.99761581249296e-06, "loss": 1.4424, "step": 12900 }, { "epoch": 0.014015245720218552, "grad_norm": 60.74335479736328, "learning_rate": 4.997613949779078e-06, "loss": 2.076, "step": 12905 }, { "epoch": 0.01402067588128799, "grad_norm": 25.637216567993164, "learning_rate": 4.997612086338177e-06, "loss": 1.1979, "step": 12910 }, { "epoch": 0.014026106042357428, "grad_norm": 37.65385437011719, "learning_rate": 4.997610222170261e-06, "loss": 1.39, "step": 12915 }, { "epoch": 0.014031536203426866, "grad_norm": 17.19044303894043, "learning_rate": 4.997608357275327e-06, "loss": 2.2602, "step": 12920 }, { "epoch": 0.014036966364496304, "grad_norm": 92.35063171386719, "learning_rate": 4.997606491653379e-06, "loss": 1.9636, "step": 12925 }, { "epoch": 0.01404239652556574, "grad_norm": 27.34238052368164, "learning_rate": 4.9976046253044156e-06, "loss": 1.5707, "step": 12930 }, { "epoch": 0.014047826686635179, "grad_norm": 42.02851486206055, "learning_rate": 4.997602758228436e-06, "loss": 0.8849, "step": 12935 }, { "epoch": 0.014053256847704617, "grad_norm": 16.33241844177246, "learning_rate": 4.997600890425443e-06, "loss": 2.8954, "step": 12940 }, { "epoch": 0.014058687008774055, "grad_norm": 40.41416931152344, "learning_rate": 4.997599021895437e-06, "loss": 2.1471, "step": 12945 }, { "epoch": 0.014064117169843493, "grad_norm": 16.98455238342285, "learning_rate": 4.997597152638417e-06, "loss": 1.3474, "step": 12950 }, { "epoch": 0.014069547330912929, "grad_norm": 17.420299530029297, "learning_rate": 4.997595282654385e-06, "loss": 1.9713, "step": 12955 }, { "epoch": 0.014074977491982367, "grad_norm": 54.5960693359375, "learning_rate": 4.997593411943341e-06, "loss": 1.503, "step": 12960 }, { "epoch": 0.014080407653051805, "grad_norm": 14.928996086120605, "learning_rate": 4.997591540505285e-06, "loss": 1.8647, "step": 12965 }, { "epoch": 0.014085837814121243, "grad_norm": 82.60499572753906, "learning_rate": 4.997589668340219e-06, "loss": 2.0536, "step": 12970 }, { "epoch": 0.01409126797519068, "grad_norm": 39.27020263671875, "learning_rate": 4.997587795448142e-06, "loss": 1.8111, "step": 12975 }, { "epoch": 0.014096698136260117, "grad_norm": 15.461400985717773, "learning_rate": 4.997585921829056e-06, "loss": 1.6421, "step": 12980 }, { "epoch": 0.014102128297329555, "grad_norm": 14.159661293029785, "learning_rate": 4.997584047482959e-06, "loss": 1.6666, "step": 12985 }, { "epoch": 0.014107558458398993, "grad_norm": 28.226200103759766, "learning_rate": 4.997582172409855e-06, "loss": 1.8985, "step": 12990 }, { "epoch": 0.014112988619468431, "grad_norm": 21.894437789916992, "learning_rate": 4.997580296609742e-06, "loss": 1.4378, "step": 12995 }, { "epoch": 0.014118418780537868, "grad_norm": 49.95966720581055, "learning_rate": 4.997578420082622e-06, "loss": 1.9035, "step": 13000 }, { "epoch": 0.014123848941607306, "grad_norm": 29.364404678344727, "learning_rate": 4.997576542828494e-06, "loss": 1.4573, "step": 13005 }, { "epoch": 0.014129279102676744, "grad_norm": 12.912454605102539, "learning_rate": 4.99757466484736e-06, "loss": 1.5719, "step": 13010 }, { "epoch": 0.014134709263746182, "grad_norm": 38.702152252197266, "learning_rate": 4.99757278613922e-06, "loss": 1.8088, "step": 13015 }, { "epoch": 0.01414013942481562, "grad_norm": 15.224455833435059, "learning_rate": 4.9975709067040745e-06, "loss": 1.7737, "step": 13020 }, { "epoch": 0.014145569585885056, "grad_norm": 18.91033935546875, "learning_rate": 4.997569026541924e-06, "loss": 1.2935, "step": 13025 }, { "epoch": 0.014150999746954494, "grad_norm": 21.168716430664062, "learning_rate": 4.997567145652769e-06, "loss": 1.8707, "step": 13030 }, { "epoch": 0.014156429908023932, "grad_norm": 42.12873077392578, "learning_rate": 4.997565264036611e-06, "loss": 1.6228, "step": 13035 }, { "epoch": 0.01416186006909337, "grad_norm": 91.44623565673828, "learning_rate": 4.997563381693449e-06, "loss": 1.7218, "step": 13040 }, { "epoch": 0.014167290230162806, "grad_norm": 18.288082122802734, "learning_rate": 4.997561498623285e-06, "loss": 1.1889, "step": 13045 }, { "epoch": 0.014172720391232244, "grad_norm": 18.506690979003906, "learning_rate": 4.9975596148261185e-06, "loss": 1.7048, "step": 13050 }, { "epoch": 0.014178150552301682, "grad_norm": 18.00705337524414, "learning_rate": 4.99755773030195e-06, "loss": 1.4402, "step": 13055 }, { "epoch": 0.01418358071337112, "grad_norm": 22.78580665588379, "learning_rate": 4.997555845050781e-06, "loss": 2.2872, "step": 13060 }, { "epoch": 0.014189010874440558, "grad_norm": 34.534202575683594, "learning_rate": 4.997553959072612e-06, "loss": 1.6046, "step": 13065 }, { "epoch": 0.014194441035509995, "grad_norm": 82.06012725830078, "learning_rate": 4.997552072367442e-06, "loss": 1.755, "step": 13070 }, { "epoch": 0.014199871196579433, "grad_norm": 20.237289428710938, "learning_rate": 4.9975501849352725e-06, "loss": 1.689, "step": 13075 }, { "epoch": 0.01420530135764887, "grad_norm": 249.4492645263672, "learning_rate": 4.997548296776105e-06, "loss": 2.1444, "step": 13080 }, { "epoch": 0.014210731518718309, "grad_norm": 36.305274963378906, "learning_rate": 4.997546407889938e-06, "loss": 1.8812, "step": 13085 }, { "epoch": 0.014216161679787747, "grad_norm": 75.66197204589844, "learning_rate": 4.997544518276775e-06, "loss": 2.0118, "step": 13090 }, { "epoch": 0.014221591840857183, "grad_norm": 16.553983688354492, "learning_rate": 4.997542627936613e-06, "loss": 1.3946, "step": 13095 }, { "epoch": 0.014227022001926621, "grad_norm": 28.803287506103516, "learning_rate": 4.997540736869455e-06, "loss": 1.3455, "step": 13100 }, { "epoch": 0.014232452162996059, "grad_norm": 18.678531646728516, "learning_rate": 4.9975388450753014e-06, "loss": 1.4466, "step": 13105 }, { "epoch": 0.014237882324065497, "grad_norm": 21.570497512817383, "learning_rate": 4.997536952554152e-06, "loss": 2.1646, "step": 13110 }, { "epoch": 0.014243312485134933, "grad_norm": 64.23193359375, "learning_rate": 4.997535059306007e-06, "loss": 2.7175, "step": 13115 }, { "epoch": 0.014248742646204371, "grad_norm": 54.14787673950195, "learning_rate": 4.997533165330868e-06, "loss": 1.7639, "step": 13120 }, { "epoch": 0.01425417280727381, "grad_norm": 108.34911346435547, "learning_rate": 4.997531270628735e-06, "loss": 2.063, "step": 13125 }, { "epoch": 0.014259602968343247, "grad_norm": 31.85701560974121, "learning_rate": 4.997529375199609e-06, "loss": 3.1592, "step": 13130 }, { "epoch": 0.014265033129412685, "grad_norm": 23.80753517150879, "learning_rate": 4.9975274790434895e-06, "loss": 1.5137, "step": 13135 }, { "epoch": 0.014270463290482122, "grad_norm": 16.89912223815918, "learning_rate": 4.997525582160379e-06, "loss": 1.171, "step": 13140 }, { "epoch": 0.01427589345155156, "grad_norm": 13.960247993469238, "learning_rate": 4.997523684550275e-06, "loss": 2.4995, "step": 13145 }, { "epoch": 0.014281323612620998, "grad_norm": 34.984825134277344, "learning_rate": 4.9975217862131805e-06, "loss": 1.5505, "step": 13150 }, { "epoch": 0.014286753773690436, "grad_norm": 26.787532806396484, "learning_rate": 4.997519887149096e-06, "loss": 1.2182, "step": 13155 }, { "epoch": 0.014292183934759874, "grad_norm": 15.259222984313965, "learning_rate": 4.9975179873580215e-06, "loss": 1.2925, "step": 13160 }, { "epoch": 0.01429761409582931, "grad_norm": 15.639178276062012, "learning_rate": 4.997516086839958e-06, "loss": 1.9351, "step": 13165 }, { "epoch": 0.014303044256898748, "grad_norm": 123.05892944335938, "learning_rate": 4.9975141855949035e-06, "loss": 1.6156, "step": 13170 }, { "epoch": 0.014308474417968186, "grad_norm": 45.35590362548828, "learning_rate": 4.997512283622862e-06, "loss": 1.4158, "step": 13175 }, { "epoch": 0.014313904579037624, "grad_norm": 22.9853515625, "learning_rate": 4.9975103809238325e-06, "loss": 1.2011, "step": 13180 }, { "epoch": 0.01431933474010706, "grad_norm": 13.115616798400879, "learning_rate": 4.997508477497817e-06, "loss": 1.0474, "step": 13185 }, { "epoch": 0.014324764901176498, "grad_norm": 27.9364070892334, "learning_rate": 4.997506573344813e-06, "loss": 1.2588, "step": 13190 }, { "epoch": 0.014330195062245936, "grad_norm": 17.157819747924805, "learning_rate": 4.997504668464823e-06, "loss": 2.4052, "step": 13195 }, { "epoch": 0.014335625223315374, "grad_norm": 20.629289627075195, "learning_rate": 4.997502762857848e-06, "loss": 1.6255, "step": 13200 }, { "epoch": 0.014341055384384812, "grad_norm": 21.23131561279297, "learning_rate": 4.997500856523888e-06, "loss": 2.1656, "step": 13205 }, { "epoch": 0.014346485545454249, "grad_norm": 50.79793167114258, "learning_rate": 4.997498949462944e-06, "loss": 1.3323, "step": 13210 }, { "epoch": 0.014351915706523687, "grad_norm": 25.469736099243164, "learning_rate": 4.997497041675015e-06, "loss": 1.4829, "step": 13215 }, { "epoch": 0.014357345867593125, "grad_norm": 39.44157409667969, "learning_rate": 4.997495133160102e-06, "loss": 1.1064, "step": 13220 }, { "epoch": 0.014362776028662563, "grad_norm": 139.60354614257812, "learning_rate": 4.997493223918207e-06, "loss": 1.3984, "step": 13225 }, { "epoch": 0.014368206189732, "grad_norm": 24.0460205078125, "learning_rate": 4.99749131394933e-06, "loss": 1.6223, "step": 13230 }, { "epoch": 0.014373636350801437, "grad_norm": 13.029147148132324, "learning_rate": 4.997489403253472e-06, "loss": 1.1491, "step": 13235 }, { "epoch": 0.014379066511870875, "grad_norm": 17.63663101196289, "learning_rate": 4.997487491830631e-06, "loss": 1.3032, "step": 13240 }, { "epoch": 0.014384496672940313, "grad_norm": 32.86801528930664, "learning_rate": 4.997485579680811e-06, "loss": 1.3533, "step": 13245 }, { "epoch": 0.014389926834009751, "grad_norm": 15.495920181274414, "learning_rate": 4.9974836668040094e-06, "loss": 1.4379, "step": 13250 }, { "epoch": 0.014395356995079187, "grad_norm": 22.41504669189453, "learning_rate": 4.9974817532002295e-06, "loss": 1.8445, "step": 13255 }, { "epoch": 0.014400787156148625, "grad_norm": 22.029258728027344, "learning_rate": 4.99747983886947e-06, "loss": 1.5768, "step": 13260 }, { "epoch": 0.014406217317218063, "grad_norm": 19.95577049255371, "learning_rate": 4.9974779238117325e-06, "loss": 1.5211, "step": 13265 }, { "epoch": 0.014411647478287501, "grad_norm": 21.349090576171875, "learning_rate": 4.997476008027017e-06, "loss": 1.2337, "step": 13270 }, { "epoch": 0.01441707763935694, "grad_norm": 30.148082733154297, "learning_rate": 4.997474091515324e-06, "loss": 1.4294, "step": 13275 }, { "epoch": 0.014422507800426376, "grad_norm": 17.12652015686035, "learning_rate": 4.9974721742766554e-06, "loss": 1.6154, "step": 13280 }, { "epoch": 0.014427937961495814, "grad_norm": 54.523345947265625, "learning_rate": 4.99747025631101e-06, "loss": 1.0301, "step": 13285 }, { "epoch": 0.014433368122565252, "grad_norm": 30.10854721069336, "learning_rate": 4.997468337618389e-06, "loss": 1.8139, "step": 13290 }, { "epoch": 0.01443879828363469, "grad_norm": 30.84034538269043, "learning_rate": 4.997466418198792e-06, "loss": 1.7797, "step": 13295 }, { "epoch": 0.014444228444704128, "grad_norm": 19.141145706176758, "learning_rate": 4.997464498052221e-06, "loss": 2.2923, "step": 13300 }, { "epoch": 0.014449658605773564, "grad_norm": 14.426185607910156, "learning_rate": 4.9974625771786775e-06, "loss": 1.5183, "step": 13305 }, { "epoch": 0.014455088766843002, "grad_norm": 38.38336181640625, "learning_rate": 4.997460655578159e-06, "loss": 1.8854, "step": 13310 }, { "epoch": 0.01446051892791244, "grad_norm": 22.259145736694336, "learning_rate": 4.9974587332506685e-06, "loss": 1.8832, "step": 13315 }, { "epoch": 0.014465949088981878, "grad_norm": 11.601249694824219, "learning_rate": 4.997456810196206e-06, "loss": 1.757, "step": 13320 }, { "epoch": 0.014471379250051314, "grad_norm": 18.846879959106445, "learning_rate": 4.997454886414771e-06, "loss": 1.4606, "step": 13325 }, { "epoch": 0.014476809411120752, "grad_norm": 33.55439376831055, "learning_rate": 4.997452961906365e-06, "loss": 1.7373, "step": 13330 }, { "epoch": 0.01448223957219019, "grad_norm": 22.611356735229492, "learning_rate": 4.997451036670989e-06, "loss": 1.4729, "step": 13335 }, { "epoch": 0.014487669733259628, "grad_norm": 16.876813888549805, "learning_rate": 4.9974491107086424e-06, "loss": 1.3307, "step": 13340 }, { "epoch": 0.014493099894329066, "grad_norm": 13.709921836853027, "learning_rate": 4.9974471840193274e-06, "loss": 1.4533, "step": 13345 }, { "epoch": 0.014498530055398503, "grad_norm": 21.742633819580078, "learning_rate": 4.997445256603042e-06, "loss": 1.3837, "step": 13350 }, { "epoch": 0.01450396021646794, "grad_norm": 22.144031524658203, "learning_rate": 4.997443328459789e-06, "loss": 1.3972, "step": 13355 }, { "epoch": 0.014509390377537379, "grad_norm": 17.83857536315918, "learning_rate": 4.9974413995895685e-06, "loss": 2.0007, "step": 13360 }, { "epoch": 0.014514820538606817, "grad_norm": 31.346406936645508, "learning_rate": 4.997439469992381e-06, "loss": 1.5228, "step": 13365 }, { "epoch": 0.014520250699676255, "grad_norm": 25.15586280822754, "learning_rate": 4.9974375396682265e-06, "loss": 1.6218, "step": 13370 }, { "epoch": 0.014525680860745691, "grad_norm": 21.128286361694336, "learning_rate": 4.997435608617106e-06, "loss": 1.4883, "step": 13375 }, { "epoch": 0.014531111021815129, "grad_norm": 23.987110137939453, "learning_rate": 4.9974336768390196e-06, "loss": 1.2017, "step": 13380 }, { "epoch": 0.014536541182884567, "grad_norm": 31.167816162109375, "learning_rate": 4.997431744333968e-06, "loss": 1.9163, "step": 13385 }, { "epoch": 0.014541971343954005, "grad_norm": 28.545888900756836, "learning_rate": 4.997429811101953e-06, "loss": 1.9844, "step": 13390 }, { "epoch": 0.014547401505023441, "grad_norm": 41.26688766479492, "learning_rate": 4.997427877142974e-06, "loss": 2.2026, "step": 13395 }, { "epoch": 0.01455283166609288, "grad_norm": 18.068647384643555, "learning_rate": 4.997425942457031e-06, "loss": 1.5749, "step": 13400 }, { "epoch": 0.014558261827162317, "grad_norm": 25.932546615600586, "learning_rate": 4.997424007044126e-06, "loss": 1.7307, "step": 13405 }, { "epoch": 0.014563691988231755, "grad_norm": 24.126873016357422, "learning_rate": 4.997422070904259e-06, "loss": 1.7874, "step": 13410 }, { "epoch": 0.014569122149301193, "grad_norm": 16.22195816040039, "learning_rate": 4.99742013403743e-06, "loss": 1.7589, "step": 13415 }, { "epoch": 0.01457455231037063, "grad_norm": 14.386886596679688, "learning_rate": 4.9974181964436404e-06, "loss": 1.6235, "step": 13420 }, { "epoch": 0.014579982471440068, "grad_norm": 48.726078033447266, "learning_rate": 4.99741625812289e-06, "loss": 1.0023, "step": 13425 }, { "epoch": 0.014585412632509506, "grad_norm": 30.453935623168945, "learning_rate": 4.99741431907518e-06, "loss": 1.5644, "step": 13430 }, { "epoch": 0.014590842793578944, "grad_norm": 20.13917350769043, "learning_rate": 4.99741237930051e-06, "loss": 1.3779, "step": 13435 }, { "epoch": 0.014596272954648382, "grad_norm": 176.37876892089844, "learning_rate": 4.997410438798882e-06, "loss": 1.5196, "step": 13440 }, { "epoch": 0.014601703115717818, "grad_norm": 19.836992263793945, "learning_rate": 4.997408497570295e-06, "loss": 1.3923, "step": 13445 }, { "epoch": 0.014607133276787256, "grad_norm": 25.968273162841797, "learning_rate": 4.997406555614751e-06, "loss": 1.7245, "step": 13450 }, { "epoch": 0.014612563437856694, "grad_norm": 69.91226959228516, "learning_rate": 4.99740461293225e-06, "loss": 1.2195, "step": 13455 }, { "epoch": 0.014617993598926132, "grad_norm": 17.23809242248535, "learning_rate": 4.997402669522793e-06, "loss": 1.7314, "step": 13460 }, { "epoch": 0.014623423759995568, "grad_norm": 118.37472534179688, "learning_rate": 4.997400725386379e-06, "loss": 1.731, "step": 13465 }, { "epoch": 0.014628853921065006, "grad_norm": 17.60842514038086, "learning_rate": 4.997398780523011e-06, "loss": 1.7137, "step": 13470 }, { "epoch": 0.014634284082134444, "grad_norm": 25.00621795654297, "learning_rate": 4.9973968349326866e-06, "loss": 1.6365, "step": 13475 }, { "epoch": 0.014639714243203882, "grad_norm": 14.626567840576172, "learning_rate": 4.997394888615409e-06, "loss": 2.0605, "step": 13480 }, { "epoch": 0.01464514440427332, "grad_norm": 13.204402923583984, "learning_rate": 4.9973929415711775e-06, "loss": 1.7599, "step": 13485 }, { "epoch": 0.014650574565342757, "grad_norm": 61.90251922607422, "learning_rate": 4.997390993799993e-06, "loss": 1.4879, "step": 13490 }, { "epoch": 0.014656004726412195, "grad_norm": 72.68951416015625, "learning_rate": 4.997389045301856e-06, "loss": 1.0642, "step": 13495 }, { "epoch": 0.014661434887481633, "grad_norm": 62.46284484863281, "learning_rate": 4.997387096076767e-06, "loss": 1.3832, "step": 13500 }, { "epoch": 0.01466686504855107, "grad_norm": 35.893672943115234, "learning_rate": 4.9973851461247255e-06, "loss": 1.6928, "step": 13505 }, { "epoch": 0.014672295209620509, "grad_norm": 21.513978958129883, "learning_rate": 4.997383195445735e-06, "loss": 1.5292, "step": 13510 }, { "epoch": 0.014677725370689945, "grad_norm": 20.358497619628906, "learning_rate": 4.997381244039793e-06, "loss": 2.4739, "step": 13515 }, { "epoch": 0.014683155531759383, "grad_norm": 27.557960510253906, "learning_rate": 4.997379291906902e-06, "loss": 1.5564, "step": 13520 }, { "epoch": 0.014688585692828821, "grad_norm": 19.415512084960938, "learning_rate": 4.997377339047062e-06, "loss": 1.3571, "step": 13525 }, { "epoch": 0.014694015853898259, "grad_norm": 14.149596214294434, "learning_rate": 4.997375385460272e-06, "loss": 1.4428, "step": 13530 }, { "epoch": 0.014699446014967695, "grad_norm": 86.40433502197266, "learning_rate": 4.997373431146536e-06, "loss": 1.2288, "step": 13535 }, { "epoch": 0.014704876176037133, "grad_norm": 47.09113693237305, "learning_rate": 4.997371476105851e-06, "loss": 1.4403, "step": 13540 }, { "epoch": 0.014710306337106571, "grad_norm": 22.8165283203125, "learning_rate": 4.99736952033822e-06, "loss": 1.6078, "step": 13545 }, { "epoch": 0.01471573649817601, "grad_norm": 26.599517822265625, "learning_rate": 4.997367563843643e-06, "loss": 1.7944, "step": 13550 }, { "epoch": 0.014721166659245447, "grad_norm": 27.320466995239258, "learning_rate": 4.99736560662212e-06, "loss": 1.7615, "step": 13555 }, { "epoch": 0.014726596820314884, "grad_norm": 30.91830825805664, "learning_rate": 4.997363648673651e-06, "loss": 1.879, "step": 13560 }, { "epoch": 0.014732026981384322, "grad_norm": 41.333927154541016, "learning_rate": 4.997361689998239e-06, "loss": 1.1948, "step": 13565 }, { "epoch": 0.01473745714245376, "grad_norm": 71.51856231689453, "learning_rate": 4.997359730595882e-06, "loss": 2.1612, "step": 13570 }, { "epoch": 0.014742887303523198, "grad_norm": 35.82046127319336, "learning_rate": 4.997357770466582e-06, "loss": 1.2036, "step": 13575 }, { "epoch": 0.014748317464592636, "grad_norm": 16.193660736083984, "learning_rate": 4.997355809610339e-06, "loss": 0.7769, "step": 13580 }, { "epoch": 0.014753747625662072, "grad_norm": 20.43475914001465, "learning_rate": 4.997353848027153e-06, "loss": 1.707, "step": 13585 }, { "epoch": 0.01475917778673151, "grad_norm": 88.25676727294922, "learning_rate": 4.997351885717027e-06, "loss": 1.2802, "step": 13590 }, { "epoch": 0.014764607947800948, "grad_norm": 18.587589263916016, "learning_rate": 4.997349922679958e-06, "loss": 1.8731, "step": 13595 }, { "epoch": 0.014770038108870386, "grad_norm": 59.10287857055664, "learning_rate": 4.997347958915949e-06, "loss": 1.8978, "step": 13600 }, { "epoch": 0.014775468269939822, "grad_norm": 62.28667068481445, "learning_rate": 4.997345994425001e-06, "loss": 1.8896, "step": 13605 }, { "epoch": 0.01478089843100926, "grad_norm": 52.940914154052734, "learning_rate": 4.997344029207113e-06, "loss": 1.2493, "step": 13610 }, { "epoch": 0.014786328592078698, "grad_norm": 42.958213806152344, "learning_rate": 4.997342063262286e-06, "loss": 1.3433, "step": 13615 }, { "epoch": 0.014791758753148136, "grad_norm": 176.98989868164062, "learning_rate": 4.99734009659052e-06, "loss": 1.4292, "step": 13620 }, { "epoch": 0.014797188914217574, "grad_norm": 61.10389709472656, "learning_rate": 4.9973381291918175e-06, "loss": 1.2179, "step": 13625 }, { "epoch": 0.01480261907528701, "grad_norm": 32.837242126464844, "learning_rate": 4.997336161066178e-06, "loss": 2.5222, "step": 13630 }, { "epoch": 0.014808049236356449, "grad_norm": 141.0140380859375, "learning_rate": 4.997334192213601e-06, "loss": 1.8465, "step": 13635 }, { "epoch": 0.014813479397425887, "grad_norm": 23.97825813293457, "learning_rate": 4.997332222634088e-06, "loss": 1.6975, "step": 13640 }, { "epoch": 0.014818909558495325, "grad_norm": 16.30675506591797, "learning_rate": 4.9973302523276404e-06, "loss": 1.6528, "step": 13645 }, { "epoch": 0.014824339719564763, "grad_norm": 21.24622917175293, "learning_rate": 4.997328281294257e-06, "loss": 1.4401, "step": 13650 }, { "epoch": 0.014829769880634199, "grad_norm": 15.999366760253906, "learning_rate": 4.99732630953394e-06, "loss": 1.5463, "step": 13655 }, { "epoch": 0.014835200041703637, "grad_norm": 41.618221282958984, "learning_rate": 4.9973243370466885e-06, "loss": 1.6718, "step": 13660 }, { "epoch": 0.014840630202773075, "grad_norm": 41.67134475708008, "learning_rate": 4.997322363832505e-06, "loss": 1.5874, "step": 13665 }, { "epoch": 0.014846060363842513, "grad_norm": 20.243593215942383, "learning_rate": 4.997320389891388e-06, "loss": 1.8796, "step": 13670 }, { "epoch": 0.01485149052491195, "grad_norm": 77.9864730834961, "learning_rate": 4.99731841522334e-06, "loss": 1.2227, "step": 13675 }, { "epoch": 0.014856920685981387, "grad_norm": 67.62828063964844, "learning_rate": 4.9973164398283605e-06, "loss": 1.8497, "step": 13680 }, { "epoch": 0.014862350847050825, "grad_norm": 50.446659088134766, "learning_rate": 4.99731446370645e-06, "loss": 1.8676, "step": 13685 }, { "epoch": 0.014867781008120263, "grad_norm": 22.70779800415039, "learning_rate": 4.997312486857608e-06, "loss": 2.3552, "step": 13690 }, { "epoch": 0.014873211169189701, "grad_norm": 21.0637264251709, "learning_rate": 4.997310509281838e-06, "loss": 1.4391, "step": 13695 }, { "epoch": 0.014878641330259137, "grad_norm": 12.393885612487793, "learning_rate": 4.997308530979138e-06, "loss": 1.9826, "step": 13700 }, { "epoch": 0.014884071491328576, "grad_norm": 21.491588592529297, "learning_rate": 4.99730655194951e-06, "loss": 1.2737, "step": 13705 }, { "epoch": 0.014889501652398014, "grad_norm": 38.02705383300781, "learning_rate": 4.997304572192954e-06, "loss": 1.5813, "step": 13710 }, { "epoch": 0.014894931813467452, "grad_norm": 18.62124252319336, "learning_rate": 4.99730259170947e-06, "loss": 1.6589, "step": 13715 }, { "epoch": 0.01490036197453689, "grad_norm": 20.181785583496094, "learning_rate": 4.99730061049906e-06, "loss": 1.5343, "step": 13720 }, { "epoch": 0.014905792135606326, "grad_norm": 22.692842483520508, "learning_rate": 4.997298628561723e-06, "loss": 1.2897, "step": 13725 }, { "epoch": 0.014911222296675764, "grad_norm": 17.282793045043945, "learning_rate": 4.997296645897461e-06, "loss": 2.346, "step": 13730 }, { "epoch": 0.014916652457745202, "grad_norm": 20.90361213684082, "learning_rate": 4.997294662506273e-06, "loss": 1.7653, "step": 13735 }, { "epoch": 0.01492208261881464, "grad_norm": 31.774112701416016, "learning_rate": 4.997292678388162e-06, "loss": 1.4437, "step": 13740 }, { "epoch": 0.014927512779884076, "grad_norm": 39.41404724121094, "learning_rate": 4.9972906935431264e-06, "loss": 2.0645, "step": 13745 }, { "epoch": 0.014932942940953514, "grad_norm": 66.00365447998047, "learning_rate": 4.9972887079711675e-06, "loss": 3.1466, "step": 13750 }, { "epoch": 0.014938373102022952, "grad_norm": 15.25656795501709, "learning_rate": 4.997286721672286e-06, "loss": 1.9668, "step": 13755 }, { "epoch": 0.01494380326309239, "grad_norm": 13.935808181762695, "learning_rate": 4.997284734646482e-06, "loss": 1.2138, "step": 13760 }, { "epoch": 0.014949233424161828, "grad_norm": 19.48055076599121, "learning_rate": 4.9972827468937564e-06, "loss": 1.7981, "step": 13765 }, { "epoch": 0.014954663585231264, "grad_norm": 22.403079986572266, "learning_rate": 4.9972807584141105e-06, "loss": 1.5132, "step": 13770 }, { "epoch": 0.014960093746300702, "grad_norm": 21.208215713500977, "learning_rate": 4.997278769207543e-06, "loss": 2.2337, "step": 13775 }, { "epoch": 0.01496552390737014, "grad_norm": 19.62400245666504, "learning_rate": 4.997276779274056e-06, "loss": 0.9177, "step": 13780 }, { "epoch": 0.014970954068439579, "grad_norm": 22.868684768676758, "learning_rate": 4.99727478861365e-06, "loss": 1.7814, "step": 13785 }, { "epoch": 0.014976384229509017, "grad_norm": 14.05359935760498, "learning_rate": 4.997272797226325e-06, "loss": 1.7895, "step": 13790 }, { "epoch": 0.014981814390578453, "grad_norm": 20.669715881347656, "learning_rate": 4.997270805112083e-06, "loss": 1.6927, "step": 13795 }, { "epoch": 0.01498724455164789, "grad_norm": 22.531402587890625, "learning_rate": 4.997268812270922e-06, "loss": 1.5862, "step": 13800 }, { "epoch": 0.014992674712717329, "grad_norm": 34.446529388427734, "learning_rate": 4.997266818702845e-06, "loss": 1.2986, "step": 13805 }, { "epoch": 0.014998104873786767, "grad_norm": 42.61188507080078, "learning_rate": 4.997264824407851e-06, "loss": 1.3276, "step": 13810 }, { "epoch": 0.015003535034856203, "grad_norm": 18.460094451904297, "learning_rate": 4.997262829385941e-06, "loss": 1.7174, "step": 13815 }, { "epoch": 0.015008965195925641, "grad_norm": 51.303855895996094, "learning_rate": 4.997260833637117e-06, "loss": 1.8498, "step": 13820 }, { "epoch": 0.01501439535699508, "grad_norm": 25.470420837402344, "learning_rate": 4.997258837161378e-06, "loss": 1.7718, "step": 13825 }, { "epoch": 0.015019825518064517, "grad_norm": 43.54543685913086, "learning_rate": 4.997256839958724e-06, "loss": 1.2569, "step": 13830 }, { "epoch": 0.015025255679133955, "grad_norm": 49.23960494995117, "learning_rate": 4.997254842029158e-06, "loss": 2.0348, "step": 13835 }, { "epoch": 0.015030685840203391, "grad_norm": 20.844905853271484, "learning_rate": 4.997252843372677e-06, "loss": 1.5352, "step": 13840 }, { "epoch": 0.01503611600127283, "grad_norm": 47.32645034790039, "learning_rate": 4.9972508439892854e-06, "loss": 1.446, "step": 13845 }, { "epoch": 0.015041546162342267, "grad_norm": 30.65675926208496, "learning_rate": 4.997248843878982e-06, "loss": 1.5047, "step": 13850 }, { "epoch": 0.015046976323411706, "grad_norm": 16.18125343322754, "learning_rate": 4.997246843041767e-06, "loss": 1.6851, "step": 13855 }, { "epoch": 0.015052406484481144, "grad_norm": 22.655784606933594, "learning_rate": 4.997244841477642e-06, "loss": 1.579, "step": 13860 }, { "epoch": 0.01505783664555058, "grad_norm": 14.04697036743164, "learning_rate": 4.997242839186606e-06, "loss": 2.0609, "step": 13865 }, { "epoch": 0.015063266806620018, "grad_norm": 71.99524688720703, "learning_rate": 4.997240836168661e-06, "loss": 1.2223, "step": 13870 }, { "epoch": 0.015068696967689456, "grad_norm": 16.251325607299805, "learning_rate": 4.9972388324238075e-06, "loss": 2.2139, "step": 13875 }, { "epoch": 0.015074127128758894, "grad_norm": 36.03681182861328, "learning_rate": 4.997236827952046e-06, "loss": 1.8202, "step": 13880 }, { "epoch": 0.01507955728982833, "grad_norm": 17.165760040283203, "learning_rate": 4.997234822753376e-06, "loss": 2.213, "step": 13885 }, { "epoch": 0.015084987450897768, "grad_norm": 15.345526695251465, "learning_rate": 4.997232816827799e-06, "loss": 1.5499, "step": 13890 }, { "epoch": 0.015090417611967206, "grad_norm": 71.46479034423828, "learning_rate": 4.997230810175316e-06, "loss": 0.9444, "step": 13895 }, { "epoch": 0.015095847773036644, "grad_norm": 13.021551132202148, "learning_rate": 4.997228802795927e-06, "loss": 1.7993, "step": 13900 }, { "epoch": 0.015101277934106082, "grad_norm": 17.04140853881836, "learning_rate": 4.997226794689633e-06, "loss": 2.3151, "step": 13905 }, { "epoch": 0.015106708095175518, "grad_norm": 17.16789436340332, "learning_rate": 4.997224785856434e-06, "loss": 2.1543, "step": 13910 }, { "epoch": 0.015112138256244956, "grad_norm": 23.84238624572754, "learning_rate": 4.997222776296331e-06, "loss": 1.399, "step": 13915 }, { "epoch": 0.015117568417314394, "grad_norm": 32.25491714477539, "learning_rate": 4.9972207660093245e-06, "loss": 1.9679, "step": 13920 }, { "epoch": 0.015122998578383832, "grad_norm": 97.97024536132812, "learning_rate": 4.997218754995414e-06, "loss": 2.2439, "step": 13925 }, { "epoch": 0.01512842873945327, "grad_norm": 33.155113220214844, "learning_rate": 4.9972167432546025e-06, "loss": 1.6209, "step": 13930 }, { "epoch": 0.015133858900522707, "grad_norm": 15.68958568572998, "learning_rate": 4.997214730786889e-06, "loss": 1.1006, "step": 13935 }, { "epoch": 0.015139289061592145, "grad_norm": 21.660171508789062, "learning_rate": 4.997212717592274e-06, "loss": 1.4345, "step": 13940 }, { "epoch": 0.015144719222661583, "grad_norm": 17.0858154296875, "learning_rate": 4.9972107036707575e-06, "loss": 1.3829, "step": 13945 }, { "epoch": 0.01515014938373102, "grad_norm": 18.885398864746094, "learning_rate": 4.997208689022341e-06, "loss": 1.1299, "step": 13950 }, { "epoch": 0.015155579544800457, "grad_norm": 22.59654998779297, "learning_rate": 4.997206673647027e-06, "loss": 1.4492, "step": 13955 }, { "epoch": 0.015161009705869895, "grad_norm": 16.372512817382812, "learning_rate": 4.997204657544812e-06, "loss": 1.5487, "step": 13960 }, { "epoch": 0.015166439866939333, "grad_norm": 44.8357048034668, "learning_rate": 4.9972026407157e-06, "loss": 1.7034, "step": 13965 }, { "epoch": 0.015171870028008771, "grad_norm": 20.828397750854492, "learning_rate": 4.997200623159691e-06, "loss": 1.2315, "step": 13970 }, { "epoch": 0.01517730018907821, "grad_norm": 19.272417068481445, "learning_rate": 4.997198604876783e-06, "loss": 1.6777, "step": 13975 }, { "epoch": 0.015182730350147645, "grad_norm": 21.046279907226562, "learning_rate": 4.997196585866979e-06, "loss": 1.8782, "step": 13980 }, { "epoch": 0.015188160511217083, "grad_norm": 53.027915954589844, "learning_rate": 4.99719456613028e-06, "loss": 1.9266, "step": 13985 }, { "epoch": 0.015193590672286521, "grad_norm": 20.919517517089844, "learning_rate": 4.997192545666685e-06, "loss": 2.3487, "step": 13990 }, { "epoch": 0.01519902083335596, "grad_norm": 51.70448303222656, "learning_rate": 4.997190524476195e-06, "loss": 1.6666, "step": 13995 }, { "epoch": 0.015204450994425397, "grad_norm": 14.000082969665527, "learning_rate": 4.997188502558811e-06, "loss": 1.596, "step": 14000 }, { "epoch": 0.015209881155494834, "grad_norm": 19.776206970214844, "learning_rate": 4.997186479914534e-06, "loss": 1.8992, "step": 14005 }, { "epoch": 0.015215311316564272, "grad_norm": 19.686328887939453, "learning_rate": 4.997184456543363e-06, "loss": 1.8984, "step": 14010 }, { "epoch": 0.01522074147763371, "grad_norm": 30.065143585205078, "learning_rate": 4.9971824324453e-06, "loss": 1.6215, "step": 14015 }, { "epoch": 0.015226171638703148, "grad_norm": 65.50743103027344, "learning_rate": 4.997180407620346e-06, "loss": 2.3227, "step": 14020 }, { "epoch": 0.015231601799772584, "grad_norm": 30.811922073364258, "learning_rate": 4.9971783820684995e-06, "loss": 1.5693, "step": 14025 }, { "epoch": 0.015237031960842022, "grad_norm": 114.22526550292969, "learning_rate": 4.9971763557897634e-06, "loss": 1.487, "step": 14030 }, { "epoch": 0.01524246212191146, "grad_norm": 25.50044822692871, "learning_rate": 4.997174328784137e-06, "loss": 1.478, "step": 14035 }, { "epoch": 0.015247892282980898, "grad_norm": 47.64854049682617, "learning_rate": 4.99717230105162e-06, "loss": 1.7638, "step": 14040 }, { "epoch": 0.015253322444050336, "grad_norm": 22.74177360534668, "learning_rate": 4.997170272592216e-06, "loss": 1.2756, "step": 14045 }, { "epoch": 0.015258752605119772, "grad_norm": 16.682973861694336, "learning_rate": 4.997168243405923e-06, "loss": 1.1732, "step": 14050 }, { "epoch": 0.01526418276618921, "grad_norm": 61.14828872680664, "learning_rate": 4.9971662134927414e-06, "loss": 1.3679, "step": 14055 }, { "epoch": 0.015269612927258648, "grad_norm": 22.53866958618164, "learning_rate": 4.997164182852674e-06, "loss": 1.5783, "step": 14060 }, { "epoch": 0.015275043088328086, "grad_norm": 94.3997573852539, "learning_rate": 4.997162151485719e-06, "loss": 1.2518, "step": 14065 }, { "epoch": 0.015280473249397524, "grad_norm": 22.564739227294922, "learning_rate": 4.99716011939188e-06, "loss": 1.5295, "step": 14070 }, { "epoch": 0.01528590341046696, "grad_norm": 44.46489715576172, "learning_rate": 4.9971580865711535e-06, "loss": 1.6473, "step": 14075 }, { "epoch": 0.015291333571536399, "grad_norm": 13.51102352142334, "learning_rate": 4.997156053023543e-06, "loss": 2.4802, "step": 14080 }, { "epoch": 0.015296763732605837, "grad_norm": 22.444509506225586, "learning_rate": 4.997154018749048e-06, "loss": 1.9534, "step": 14085 }, { "epoch": 0.015302193893675275, "grad_norm": 14.427681922912598, "learning_rate": 4.99715198374767e-06, "loss": 1.2483, "step": 14090 }, { "epoch": 0.015307624054744711, "grad_norm": 19.237558364868164, "learning_rate": 4.99714994801941e-06, "loss": 1.6424, "step": 14095 }, { "epoch": 0.015313054215814149, "grad_norm": 26.44073486328125, "learning_rate": 4.997147911564266e-06, "loss": 1.7063, "step": 14100 }, { "epoch": 0.015318484376883587, "grad_norm": 42.56153106689453, "learning_rate": 4.997145874382241e-06, "loss": 1.9157, "step": 14105 }, { "epoch": 0.015323914537953025, "grad_norm": 51.95165252685547, "learning_rate": 4.997143836473335e-06, "loss": 1.2141, "step": 14110 }, { "epoch": 0.015329344699022463, "grad_norm": 20.324256896972656, "learning_rate": 4.997141797837548e-06, "loss": 1.2899, "step": 14115 }, { "epoch": 0.0153347748600919, "grad_norm": 126.16992950439453, "learning_rate": 4.997139758474882e-06, "loss": 3.0081, "step": 14120 }, { "epoch": 0.015340205021161337, "grad_norm": 108.26888275146484, "learning_rate": 4.9971377183853346e-06, "loss": 2.4846, "step": 14125 }, { "epoch": 0.015345635182230775, "grad_norm": 17.923885345458984, "learning_rate": 4.997135677568909e-06, "loss": 1.8315, "step": 14130 }, { "epoch": 0.015351065343300213, "grad_norm": 15.026663780212402, "learning_rate": 4.997133636025607e-06, "loss": 1.1616, "step": 14135 }, { "epoch": 0.015356495504369651, "grad_norm": 19.575754165649414, "learning_rate": 4.997131593755425e-06, "loss": 2.4891, "step": 14140 }, { "epoch": 0.015361925665439088, "grad_norm": 25.82917594909668, "learning_rate": 4.997129550758368e-06, "loss": 1.7117, "step": 14145 }, { "epoch": 0.015367355826508526, "grad_norm": 35.72087860107422, "learning_rate": 4.9971275070344335e-06, "loss": 1.9433, "step": 14150 }, { "epoch": 0.015372785987577964, "grad_norm": 26.419416427612305, "learning_rate": 4.997125462583624e-06, "loss": 1.6819, "step": 14155 }, { "epoch": 0.015378216148647402, "grad_norm": 22.55197525024414, "learning_rate": 4.997123417405938e-06, "loss": 1.0295, "step": 14160 }, { "epoch": 0.015383646309716838, "grad_norm": 16.455596923828125, "learning_rate": 4.997121371501378e-06, "loss": 0.9863, "step": 14165 }, { "epoch": 0.015389076470786276, "grad_norm": 20.239715576171875, "learning_rate": 4.9971193248699445e-06, "loss": 1.7568, "step": 14170 }, { "epoch": 0.015394506631855714, "grad_norm": 133.6836700439453, "learning_rate": 4.997117277511637e-06, "loss": 1.4362, "step": 14175 }, { "epoch": 0.015399936792925152, "grad_norm": 51.18174743652344, "learning_rate": 4.997115229426457e-06, "loss": 1.0351, "step": 14180 }, { "epoch": 0.01540536695399459, "grad_norm": 31.30020523071289, "learning_rate": 4.997113180614404e-06, "loss": 1.676, "step": 14185 }, { "epoch": 0.015410797115064026, "grad_norm": 40.07598114013672, "learning_rate": 4.99711113107548e-06, "loss": 2.1057, "step": 14190 }, { "epoch": 0.015416227276133464, "grad_norm": 40.43072509765625, "learning_rate": 4.997109080809685e-06, "loss": 1.9484, "step": 14195 }, { "epoch": 0.015421657437202902, "grad_norm": 29.796289443969727, "learning_rate": 4.997107029817019e-06, "loss": 1.0991, "step": 14200 }, { "epoch": 0.01542708759827234, "grad_norm": 19.96859359741211, "learning_rate": 4.997104978097483e-06, "loss": 1.4625, "step": 14205 }, { "epoch": 0.015432517759341778, "grad_norm": 44.947845458984375, "learning_rate": 4.997102925651078e-06, "loss": 1.6351, "step": 14210 }, { "epoch": 0.015437947920411215, "grad_norm": 31.607133865356445, "learning_rate": 4.997100872477805e-06, "loss": 1.4731, "step": 14215 }, { "epoch": 0.015443378081480653, "grad_norm": 23.566987991333008, "learning_rate": 4.9970988185776635e-06, "loss": 1.3964, "step": 14220 }, { "epoch": 0.01544880824255009, "grad_norm": 29.23942756652832, "learning_rate": 4.997096763950654e-06, "loss": 1.7473, "step": 14225 }, { "epoch": 0.015454238403619529, "grad_norm": 32.05918502807617, "learning_rate": 4.997094708596778e-06, "loss": 1.2516, "step": 14230 }, { "epoch": 0.015459668564688965, "grad_norm": 41.32154846191406, "learning_rate": 4.997092652516036e-06, "loss": 1.6062, "step": 14235 }, { "epoch": 0.015465098725758403, "grad_norm": 27.02302360534668, "learning_rate": 4.9970905957084286e-06, "loss": 1.197, "step": 14240 }, { "epoch": 0.015470528886827841, "grad_norm": 27.43245506286621, "learning_rate": 4.997088538173956e-06, "loss": 2.1544, "step": 14245 }, { "epoch": 0.015475959047897279, "grad_norm": 21.00448226928711, "learning_rate": 4.997086479912618e-06, "loss": 2.1111, "step": 14250 }, { "epoch": 0.015481389208966717, "grad_norm": 16.9123477935791, "learning_rate": 4.997084420924416e-06, "loss": 1.2172, "step": 14255 }, { "epoch": 0.015486819370036153, "grad_norm": 26.459430694580078, "learning_rate": 4.9970823612093525e-06, "loss": 1.5027, "step": 14260 }, { "epoch": 0.015492249531105591, "grad_norm": 20.0551700592041, "learning_rate": 4.997080300767425e-06, "loss": 1.9746, "step": 14265 }, { "epoch": 0.01549767969217503, "grad_norm": 16.527976989746094, "learning_rate": 4.997078239598635e-06, "loss": 2.4863, "step": 14270 }, { "epoch": 0.015503109853244467, "grad_norm": 23.54715347290039, "learning_rate": 4.997076177702984e-06, "loss": 1.8617, "step": 14275 }, { "epoch": 0.015508540014313905, "grad_norm": 32.101173400878906, "learning_rate": 4.9970741150804726e-06, "loss": 1.4539, "step": 14280 }, { "epoch": 0.015513970175383342, "grad_norm": 24.321857452392578, "learning_rate": 4.997072051731101e-06, "loss": 1.5467, "step": 14285 }, { "epoch": 0.01551940033645278, "grad_norm": 30.683927536010742, "learning_rate": 4.997069987654869e-06, "loss": 1.6989, "step": 14290 }, { "epoch": 0.015524830497522218, "grad_norm": 27.609081268310547, "learning_rate": 4.997067922851778e-06, "loss": 1.1916, "step": 14295 }, { "epoch": 0.015530260658591656, "grad_norm": 32.48031234741211, "learning_rate": 4.997065857321829e-06, "loss": 1.3634, "step": 14300 }, { "epoch": 0.015535690819661092, "grad_norm": 49.78948974609375, "learning_rate": 4.997063791065022e-06, "loss": 1.4847, "step": 14305 }, { "epoch": 0.01554112098073053, "grad_norm": 41.58252716064453, "learning_rate": 4.997061724081357e-06, "loss": 1.5226, "step": 14310 }, { "epoch": 0.015546551141799968, "grad_norm": 14.87684440612793, "learning_rate": 4.997059656370836e-06, "loss": 1.8212, "step": 14315 }, { "epoch": 0.015551981302869406, "grad_norm": 18.739299774169922, "learning_rate": 4.9970575879334595e-06, "loss": 1.8231, "step": 14320 }, { "epoch": 0.015557411463938844, "grad_norm": 22.331331253051758, "learning_rate": 4.997055518769227e-06, "loss": 1.8266, "step": 14325 }, { "epoch": 0.01556284162500828, "grad_norm": 28.9488582611084, "learning_rate": 4.997053448878139e-06, "loss": 1.6895, "step": 14330 }, { "epoch": 0.015568271786077718, "grad_norm": 29.7093563079834, "learning_rate": 4.997051378260197e-06, "loss": 2.0451, "step": 14335 }, { "epoch": 0.015573701947147156, "grad_norm": 33.99785232543945, "learning_rate": 4.997049306915402e-06, "loss": 1.4616, "step": 14340 }, { "epoch": 0.015579132108216594, "grad_norm": 14.380786895751953, "learning_rate": 4.997047234843753e-06, "loss": 1.6929, "step": 14345 }, { "epoch": 0.015584562269286032, "grad_norm": 26.007259368896484, "learning_rate": 4.997045162045252e-06, "loss": 1.5093, "step": 14350 }, { "epoch": 0.015589992430355469, "grad_norm": 21.658552169799805, "learning_rate": 4.9970430885199e-06, "loss": 1.2586, "step": 14355 }, { "epoch": 0.015595422591424907, "grad_norm": 23.90587043762207, "learning_rate": 4.997041014267696e-06, "loss": 2.0855, "step": 14360 }, { "epoch": 0.015600852752494345, "grad_norm": 18.035245895385742, "learning_rate": 4.9970389392886405e-06, "loss": 1.9482, "step": 14365 }, { "epoch": 0.015606282913563783, "grad_norm": 40.6556396484375, "learning_rate": 4.997036863582736e-06, "loss": 2.4335, "step": 14370 }, { "epoch": 0.015611713074633219, "grad_norm": 25.029319763183594, "learning_rate": 4.997034787149982e-06, "loss": 1.344, "step": 14375 }, { "epoch": 0.015617143235702657, "grad_norm": 29.432777404785156, "learning_rate": 4.997032709990379e-06, "loss": 1.7032, "step": 14380 }, { "epoch": 0.015622573396772095, "grad_norm": 19.366439819335938, "learning_rate": 4.9970306321039275e-06, "loss": 1.7934, "step": 14385 }, { "epoch": 0.01562800355784153, "grad_norm": 25.310590744018555, "learning_rate": 4.997028553490629e-06, "loss": 2.12, "step": 14390 }, { "epoch": 0.01563343371891097, "grad_norm": 10.210470199584961, "learning_rate": 4.997026474150483e-06, "loss": 1.8616, "step": 14395 }, { "epoch": 0.015638863879980407, "grad_norm": 22.097002029418945, "learning_rate": 4.99702439408349e-06, "loss": 1.1488, "step": 14400 }, { "epoch": 0.015644294041049847, "grad_norm": 53.4296875, "learning_rate": 4.997022313289652e-06, "loss": 1.851, "step": 14405 }, { "epoch": 0.015649724202119283, "grad_norm": 25.38834571838379, "learning_rate": 4.997020231768969e-06, "loss": 1.9232, "step": 14410 }, { "epoch": 0.01565515436318872, "grad_norm": 25.072221755981445, "learning_rate": 4.9970181495214405e-06, "loss": 1.2629, "step": 14415 }, { "epoch": 0.01566058452425816, "grad_norm": 17.04715347290039, "learning_rate": 4.997016066547069e-06, "loss": 1.6666, "step": 14420 }, { "epoch": 0.015666014685327596, "grad_norm": 19.89698028564453, "learning_rate": 4.997013982845854e-06, "loss": 1.5606, "step": 14425 }, { "epoch": 0.015671444846397035, "grad_norm": 75.29006958007812, "learning_rate": 4.997011898417796e-06, "loss": 1.2299, "step": 14430 }, { "epoch": 0.01567687500746647, "grad_norm": 19.614477157592773, "learning_rate": 4.997009813262895e-06, "loss": 1.6196, "step": 14435 }, { "epoch": 0.015682305168535908, "grad_norm": 32.05229949951172, "learning_rate": 4.997007727381153e-06, "loss": 1.5252, "step": 14440 }, { "epoch": 0.015687735329605348, "grad_norm": 21.641433715820312, "learning_rate": 4.997005640772571e-06, "loss": 1.102, "step": 14445 }, { "epoch": 0.015693165490674784, "grad_norm": 17.112377166748047, "learning_rate": 4.997003553437147e-06, "loss": 1.2546, "step": 14450 }, { "epoch": 0.01569859565174422, "grad_norm": 37.123844146728516, "learning_rate": 4.997001465374885e-06, "loss": 2.1249, "step": 14455 }, { "epoch": 0.01570402581281366, "grad_norm": 30.69364356994629, "learning_rate": 4.996999376585783e-06, "loss": 1.5461, "step": 14460 }, { "epoch": 0.015709455973883096, "grad_norm": 30.99183464050293, "learning_rate": 4.996997287069843e-06, "loss": 1.6981, "step": 14465 }, { "epoch": 0.015714886134952536, "grad_norm": 17.681724548339844, "learning_rate": 4.996995196827064e-06, "loss": 2.346, "step": 14470 }, { "epoch": 0.015720316296021972, "grad_norm": 25.601884841918945, "learning_rate": 4.9969931058574485e-06, "loss": 1.7537, "step": 14475 }, { "epoch": 0.01572574645709141, "grad_norm": 34.280235290527344, "learning_rate": 4.996991014160996e-06, "loss": 1.4, "step": 14480 }, { "epoch": 0.01573117661816085, "grad_norm": 16.002887725830078, "learning_rate": 4.996988921737707e-06, "loss": 1.8165, "step": 14485 }, { "epoch": 0.015736606779230285, "grad_norm": 43.205997467041016, "learning_rate": 4.996986828587583e-06, "loss": 2.1386, "step": 14490 }, { "epoch": 0.015742036940299724, "grad_norm": 20.803102493286133, "learning_rate": 4.996984734710624e-06, "loss": 1.6706, "step": 14495 }, { "epoch": 0.01574746710136916, "grad_norm": 27.280088424682617, "learning_rate": 4.9969826401068315e-06, "loss": 2.0667, "step": 14500 }, { "epoch": 0.015752897262438597, "grad_norm": 21.203556060791016, "learning_rate": 4.996980544776204e-06, "loss": 2.0981, "step": 14505 }, { "epoch": 0.015758327423508037, "grad_norm": 39.4842643737793, "learning_rate": 4.996978448718744e-06, "loss": 2.7359, "step": 14510 }, { "epoch": 0.015763757584577473, "grad_norm": 114.4882583618164, "learning_rate": 4.996976351934452e-06, "loss": 1.1719, "step": 14515 }, { "epoch": 0.015769187745646913, "grad_norm": 15.65989875793457, "learning_rate": 4.996974254423328e-06, "loss": 1.3685, "step": 14520 }, { "epoch": 0.01577461790671635, "grad_norm": 30.88286018371582, "learning_rate": 4.996972156185372e-06, "loss": 1.575, "step": 14525 }, { "epoch": 0.015780048067785785, "grad_norm": 18.002248764038086, "learning_rate": 4.996970057220587e-06, "loss": 1.9974, "step": 14530 }, { "epoch": 0.015785478228855225, "grad_norm": 72.4874496459961, "learning_rate": 4.996967957528971e-06, "loss": 1.6972, "step": 14535 }, { "epoch": 0.01579090838992466, "grad_norm": 20.67418670654297, "learning_rate": 4.996965857110525e-06, "loss": 1.6901, "step": 14540 }, { "epoch": 0.0157963385509941, "grad_norm": 23.24113655090332, "learning_rate": 4.996963755965251e-06, "loss": 2.2266, "step": 14545 }, { "epoch": 0.015801768712063537, "grad_norm": 141.09304809570312, "learning_rate": 4.996961654093149e-06, "loss": 1.766, "step": 14550 }, { "epoch": 0.015807198873132974, "grad_norm": 14.426762580871582, "learning_rate": 4.996959551494219e-06, "loss": 2.3962, "step": 14555 }, { "epoch": 0.015812629034202413, "grad_norm": 28.07637596130371, "learning_rate": 4.9969574481684626e-06, "loss": 1.4386, "step": 14560 }, { "epoch": 0.01581805919527185, "grad_norm": 16.43466567993164, "learning_rate": 4.996955344115879e-06, "loss": 1.3779, "step": 14565 }, { "epoch": 0.01582348935634129, "grad_norm": 15.611655235290527, "learning_rate": 4.99695323933647e-06, "loss": 1.6912, "step": 14570 }, { "epoch": 0.015828919517410726, "grad_norm": 120.44915008544922, "learning_rate": 4.9969511338302366e-06, "loss": 1.0227, "step": 14575 }, { "epoch": 0.015834349678480162, "grad_norm": 17.386568069458008, "learning_rate": 4.996949027597178e-06, "loss": 2.058, "step": 14580 }, { "epoch": 0.0158397798395496, "grad_norm": 25.06220817565918, "learning_rate": 4.996946920637296e-06, "loss": 1.6287, "step": 14585 }, { "epoch": 0.015845210000619038, "grad_norm": 35.36405944824219, "learning_rate": 4.99694481295059e-06, "loss": 2.0254, "step": 14590 }, { "epoch": 0.015850640161688474, "grad_norm": 29.7506160736084, "learning_rate": 4.996942704537062e-06, "loss": 1.8384, "step": 14595 }, { "epoch": 0.015856070322757914, "grad_norm": 45.40391159057617, "learning_rate": 4.996940595396712e-06, "loss": 1.4208, "step": 14600 }, { "epoch": 0.01586150048382735, "grad_norm": 20.061939239501953, "learning_rate": 4.996938485529541e-06, "loss": 0.9938, "step": 14605 }, { "epoch": 0.01586693064489679, "grad_norm": 18.01753044128418, "learning_rate": 4.996936374935548e-06, "loss": 1.7774, "step": 14610 }, { "epoch": 0.015872360805966226, "grad_norm": 23.284486770629883, "learning_rate": 4.996934263614736e-06, "loss": 1.97, "step": 14615 }, { "epoch": 0.015877790967035663, "grad_norm": 18.680362701416016, "learning_rate": 4.996932151567104e-06, "loss": 2.0407, "step": 14620 }, { "epoch": 0.015883221128105102, "grad_norm": 96.10221862792969, "learning_rate": 4.996930038792652e-06, "loss": 1.7189, "step": 14625 }, { "epoch": 0.01588865128917454, "grad_norm": 141.9732208251953, "learning_rate": 4.996927925291383e-06, "loss": 1.4937, "step": 14630 }, { "epoch": 0.01589408145024398, "grad_norm": 35.85714340209961, "learning_rate": 4.9969258110632955e-06, "loss": 1.6064, "step": 14635 }, { "epoch": 0.015899511611313415, "grad_norm": 29.051227569580078, "learning_rate": 4.996923696108391e-06, "loss": 1.8502, "step": 14640 }, { "epoch": 0.01590494177238285, "grad_norm": 18.101943969726562, "learning_rate": 4.996921580426671e-06, "loss": 1.8426, "step": 14645 }, { "epoch": 0.01591037193345229, "grad_norm": 19.79663848876953, "learning_rate": 4.996919464018134e-06, "loss": 2.0148, "step": 14650 }, { "epoch": 0.015915802094521727, "grad_norm": 53.649810791015625, "learning_rate": 4.996917346882782e-06, "loss": 1.635, "step": 14655 }, { "epoch": 0.015921232255591167, "grad_norm": 153.44210815429688, "learning_rate": 4.9969152290206156e-06, "loss": 2.0661, "step": 14660 }, { "epoch": 0.015926662416660603, "grad_norm": 19.17485809326172, "learning_rate": 4.996913110431635e-06, "loss": 1.7145, "step": 14665 }, { "epoch": 0.01593209257773004, "grad_norm": 195.66883850097656, "learning_rate": 4.996910991115841e-06, "loss": 1.49, "step": 14670 }, { "epoch": 0.01593752273879948, "grad_norm": 28.718507766723633, "learning_rate": 4.996908871073234e-06, "loss": 1.2612, "step": 14675 }, { "epoch": 0.015942952899868915, "grad_norm": 19.636646270751953, "learning_rate": 4.996906750303815e-06, "loss": 1.7078, "step": 14680 }, { "epoch": 0.015948383060938355, "grad_norm": 95.11876678466797, "learning_rate": 4.996904628807585e-06, "loss": 1.1208, "step": 14685 }, { "epoch": 0.01595381322200779, "grad_norm": 13.832849502563477, "learning_rate": 4.996902506584543e-06, "loss": 1.8097, "step": 14690 }, { "epoch": 0.015959243383077228, "grad_norm": 30.163171768188477, "learning_rate": 4.996900383634692e-06, "loss": 2.0569, "step": 14695 }, { "epoch": 0.015964673544146667, "grad_norm": 28.23021125793457, "learning_rate": 4.99689825995803e-06, "loss": 1.8191, "step": 14700 }, { "epoch": 0.015970103705216104, "grad_norm": 105.04468536376953, "learning_rate": 4.996896135554559e-06, "loss": 1.3899, "step": 14705 }, { "epoch": 0.015975533866285543, "grad_norm": 22.970626831054688, "learning_rate": 4.99689401042428e-06, "loss": 1.2542, "step": 14710 }, { "epoch": 0.01598096402735498, "grad_norm": 21.428361892700195, "learning_rate": 4.996891884567193e-06, "loss": 1.45, "step": 14715 }, { "epoch": 0.015986394188424416, "grad_norm": 20.0694637298584, "learning_rate": 4.996889757983299e-06, "loss": 1.6842, "step": 14720 }, { "epoch": 0.015991824349493856, "grad_norm": 19.6214656829834, "learning_rate": 4.996887630672598e-06, "loss": 1.9539, "step": 14725 }, { "epoch": 0.015997254510563292, "grad_norm": 30.795385360717773, "learning_rate": 4.996885502635091e-06, "loss": 1.4552, "step": 14730 }, { "epoch": 0.016002684671632728, "grad_norm": 15.648787498474121, "learning_rate": 4.996883373870779e-06, "loss": 1.8677, "step": 14735 }, { "epoch": 0.016008114832702168, "grad_norm": 17.602170944213867, "learning_rate": 4.996881244379662e-06, "loss": 1.6357, "step": 14740 }, { "epoch": 0.016013544993771604, "grad_norm": 22.503860473632812, "learning_rate": 4.996879114161741e-06, "loss": 1.6728, "step": 14745 }, { "epoch": 0.016018975154841044, "grad_norm": 23.503477096557617, "learning_rate": 4.996876983217016e-06, "loss": 2.5305, "step": 14750 }, { "epoch": 0.01602440531591048, "grad_norm": 34.206092834472656, "learning_rate": 4.996874851545488e-06, "loss": 1.7927, "step": 14755 }, { "epoch": 0.016029835476979917, "grad_norm": 31.66971206665039, "learning_rate": 4.996872719147159e-06, "loss": 1.9648, "step": 14760 }, { "epoch": 0.016035265638049356, "grad_norm": 21.90875816345215, "learning_rate": 4.9968705860220264e-06, "loss": 1.4461, "step": 14765 }, { "epoch": 0.016040695799118793, "grad_norm": 14.628077507019043, "learning_rate": 4.996868452170094e-06, "loss": 1.5907, "step": 14770 }, { "epoch": 0.016046125960188232, "grad_norm": 29.38642120361328, "learning_rate": 4.996866317591361e-06, "loss": 1.8273, "step": 14775 }, { "epoch": 0.01605155612125767, "grad_norm": 23.39612579345703, "learning_rate": 4.996864182285829e-06, "loss": 1.907, "step": 14780 }, { "epoch": 0.016056986282327105, "grad_norm": 19.254127502441406, "learning_rate": 4.996862046253497e-06, "loss": 2.0503, "step": 14785 }, { "epoch": 0.016062416443396545, "grad_norm": 15.467010498046875, "learning_rate": 4.996859909494366e-06, "loss": 1.1261, "step": 14790 }, { "epoch": 0.01606784660446598, "grad_norm": 44.34664535522461, "learning_rate": 4.996857772008437e-06, "loss": 1.2208, "step": 14795 }, { "epoch": 0.01607327676553542, "grad_norm": 23.667041778564453, "learning_rate": 4.996855633795713e-06, "loss": 1.8165, "step": 14800 }, { "epoch": 0.016078706926604857, "grad_norm": 19.651029586791992, "learning_rate": 4.99685349485619e-06, "loss": 1.9323, "step": 14805 }, { "epoch": 0.016084137087674293, "grad_norm": 27.44158172607422, "learning_rate": 4.996851355189871e-06, "loss": 1.8816, "step": 14810 }, { "epoch": 0.016089567248743733, "grad_norm": 61.09477615356445, "learning_rate": 4.9968492147967575e-06, "loss": 1.3827, "step": 14815 }, { "epoch": 0.01609499740981317, "grad_norm": 56.279666900634766, "learning_rate": 4.996847073676849e-06, "loss": 1.6371, "step": 14820 }, { "epoch": 0.01610042757088261, "grad_norm": 26.78658103942871, "learning_rate": 4.996844931830146e-06, "loss": 2.1322, "step": 14825 }, { "epoch": 0.016105857731952045, "grad_norm": 18.24936294555664, "learning_rate": 4.9968427892566496e-06, "loss": 1.1989, "step": 14830 }, { "epoch": 0.01611128789302148, "grad_norm": 26.101219177246094, "learning_rate": 4.9968406459563605e-06, "loss": 1.4403, "step": 14835 }, { "epoch": 0.01611671805409092, "grad_norm": 89.35967254638672, "learning_rate": 4.996838501929279e-06, "loss": 1.489, "step": 14840 }, { "epoch": 0.016122148215160358, "grad_norm": 88.40464782714844, "learning_rate": 4.996836357175405e-06, "loss": 1.7977, "step": 14845 }, { "epoch": 0.016127578376229797, "grad_norm": 16.31072425842285, "learning_rate": 4.9968342116947405e-06, "loss": 1.0706, "step": 14850 }, { "epoch": 0.016133008537299234, "grad_norm": 23.511760711669922, "learning_rate": 4.996832065487285e-06, "loss": 1.6337, "step": 14855 }, { "epoch": 0.01613843869836867, "grad_norm": 40.746307373046875, "learning_rate": 4.99682991855304e-06, "loss": 1.4194, "step": 14860 }, { "epoch": 0.01614386885943811, "grad_norm": 54.80628204345703, "learning_rate": 4.996827770892007e-06, "loss": 2.0511, "step": 14865 }, { "epoch": 0.016149299020507546, "grad_norm": 9.881696701049805, "learning_rate": 4.996825622504185e-06, "loss": 1.5169, "step": 14870 }, { "epoch": 0.016154729181576982, "grad_norm": 46.073848724365234, "learning_rate": 4.996823473389573e-06, "loss": 1.3893, "step": 14875 }, { "epoch": 0.016160159342646422, "grad_norm": 24.00255012512207, "learning_rate": 4.996821323548176e-06, "loss": 1.4819, "step": 14880 }, { "epoch": 0.016165589503715858, "grad_norm": 16.602453231811523, "learning_rate": 4.996819172979991e-06, "loss": 1.423, "step": 14885 }, { "epoch": 0.016171019664785298, "grad_norm": 29.79899787902832, "learning_rate": 4.996817021685021e-06, "loss": 2.0694, "step": 14890 }, { "epoch": 0.016176449825854734, "grad_norm": 102.65619659423828, "learning_rate": 4.996814869663265e-06, "loss": 1.0414, "step": 14895 }, { "epoch": 0.01618187998692417, "grad_norm": 15.551023483276367, "learning_rate": 4.996812716914724e-06, "loss": 1.5379, "step": 14900 }, { "epoch": 0.01618731014799361, "grad_norm": 17.08490753173828, "learning_rate": 4.996810563439398e-06, "loss": 1.3417, "step": 14905 }, { "epoch": 0.016192740309063047, "grad_norm": 28.900279998779297, "learning_rate": 4.99680840923729e-06, "loss": 1.7567, "step": 14910 }, { "epoch": 0.016198170470132486, "grad_norm": 32.14387130737305, "learning_rate": 4.996806254308398e-06, "loss": 1.4596, "step": 14915 }, { "epoch": 0.016203600631201923, "grad_norm": 63.9833869934082, "learning_rate": 4.996804098652724e-06, "loss": 1.5669, "step": 14920 }, { "epoch": 0.01620903079227136, "grad_norm": 214.02268981933594, "learning_rate": 4.9968019422702685e-06, "loss": 1.6921, "step": 14925 }, { "epoch": 0.0162144609533408, "grad_norm": 16.200315475463867, "learning_rate": 4.9967997851610325e-06, "loss": 1.7815, "step": 14930 }, { "epoch": 0.016219891114410235, "grad_norm": 28.120220184326172, "learning_rate": 4.996797627325015e-06, "loss": 1.3244, "step": 14935 }, { "epoch": 0.016225321275479675, "grad_norm": 56.67091751098633, "learning_rate": 4.996795468762218e-06, "loss": 1.825, "step": 14940 }, { "epoch": 0.01623075143654911, "grad_norm": 39.40170669555664, "learning_rate": 4.996793309472642e-06, "loss": 1.0894, "step": 14945 }, { "epoch": 0.016236181597618547, "grad_norm": 48.34370040893555, "learning_rate": 4.9967911494562875e-06, "loss": 1.6566, "step": 14950 }, { "epoch": 0.016241611758687987, "grad_norm": 20.65774154663086, "learning_rate": 4.996788988713155e-06, "loss": 1.7676, "step": 14955 }, { "epoch": 0.016247041919757423, "grad_norm": 35.53593444824219, "learning_rate": 4.996786827243245e-06, "loss": 1.6134, "step": 14960 }, { "epoch": 0.016252472080826863, "grad_norm": 35.263973236083984, "learning_rate": 4.996784665046559e-06, "loss": 2.2479, "step": 14965 }, { "epoch": 0.0162579022418963, "grad_norm": 12.635210990905762, "learning_rate": 4.996782502123096e-06, "loss": 1.9657, "step": 14970 }, { "epoch": 0.016263332402965736, "grad_norm": 27.303707122802734, "learning_rate": 4.9967803384728585e-06, "loss": 1.3651, "step": 14975 }, { "epoch": 0.016268762564035175, "grad_norm": 23.68026351928711, "learning_rate": 4.996778174095846e-06, "loss": 1.2384, "step": 14980 }, { "epoch": 0.01627419272510461, "grad_norm": 22.68205451965332, "learning_rate": 4.996776008992059e-06, "loss": 1.7305, "step": 14985 }, { "epoch": 0.01627962288617405, "grad_norm": 27.340967178344727, "learning_rate": 4.996773843161499e-06, "loss": 1.9222, "step": 14990 }, { "epoch": 0.016285053047243488, "grad_norm": 17.23065757751465, "learning_rate": 4.996771676604166e-06, "loss": 1.7288, "step": 14995 }, { "epoch": 0.016290483208312924, "grad_norm": 36.91911315917969, "learning_rate": 4.996769509320061e-06, "loss": 1.5191, "step": 15000 }, { "epoch": 0.016295913369382364, "grad_norm": 27.86121940612793, "learning_rate": 4.996767341309183e-06, "loss": 2.0215, "step": 15005 }, { "epoch": 0.0163013435304518, "grad_norm": 12.567779541015625, "learning_rate": 4.996765172571535e-06, "loss": 1.898, "step": 15010 }, { "epoch": 0.016306773691521236, "grad_norm": 21.768381118774414, "learning_rate": 4.9967630031071165e-06, "loss": 1.4109, "step": 15015 }, { "epoch": 0.016312203852590676, "grad_norm": 42.87599182128906, "learning_rate": 4.996760832915929e-06, "loss": 1.2249, "step": 15020 }, { "epoch": 0.016317634013660112, "grad_norm": 62.22185516357422, "learning_rate": 4.996758661997972e-06, "loss": 1.4846, "step": 15025 }, { "epoch": 0.016323064174729552, "grad_norm": 25.306703567504883, "learning_rate": 4.996756490353246e-06, "loss": 1.8352, "step": 15030 }, { "epoch": 0.016328494335798988, "grad_norm": 23.559995651245117, "learning_rate": 4.996754317981752e-06, "loss": 1.6349, "step": 15035 }, { "epoch": 0.016333924496868425, "grad_norm": 18.958881378173828, "learning_rate": 4.996752144883492e-06, "loss": 1.706, "step": 15040 }, { "epoch": 0.016339354657937864, "grad_norm": 19.378923416137695, "learning_rate": 4.996749971058464e-06, "loss": 1.3552, "step": 15045 }, { "epoch": 0.0163447848190073, "grad_norm": 12.735556602478027, "learning_rate": 4.9967477965066715e-06, "loss": 1.3732, "step": 15050 }, { "epoch": 0.01635021498007674, "grad_norm": 19.040733337402344, "learning_rate": 4.996745621228113e-06, "loss": 2.2698, "step": 15055 }, { "epoch": 0.016355645141146177, "grad_norm": 15.629082679748535, "learning_rate": 4.996743445222791e-06, "loss": 1.5794, "step": 15060 }, { "epoch": 0.016361075302215613, "grad_norm": 26.452383041381836, "learning_rate": 4.996741268490703e-06, "loss": 1.4155, "step": 15065 }, { "epoch": 0.016366505463285053, "grad_norm": 27.732118606567383, "learning_rate": 4.996739091031852e-06, "loss": 1.4196, "step": 15070 }, { "epoch": 0.01637193562435449, "grad_norm": 99.21183776855469, "learning_rate": 4.996736912846239e-06, "loss": 1.4476, "step": 15075 }, { "epoch": 0.01637736578542393, "grad_norm": 21.38644790649414, "learning_rate": 4.996734733933864e-06, "loss": 1.7618, "step": 15080 }, { "epoch": 0.016382795946493365, "grad_norm": 16.717926025390625, "learning_rate": 4.996732554294727e-06, "loss": 1.4671, "step": 15085 }, { "epoch": 0.0163882261075628, "grad_norm": 26.69161033630371, "learning_rate": 4.996730373928828e-06, "loss": 1.8467, "step": 15090 }, { "epoch": 0.01639365626863224, "grad_norm": 16.208791732788086, "learning_rate": 4.996728192836171e-06, "loss": 2.4297, "step": 15095 }, { "epoch": 0.016399086429701677, "grad_norm": 22.71969223022461, "learning_rate": 4.996726011016754e-06, "loss": 1.4925, "step": 15100 }, { "epoch": 0.016404516590771117, "grad_norm": 16.648927688598633, "learning_rate": 4.996723828470576e-06, "loss": 1.8213, "step": 15105 }, { "epoch": 0.016409946751840553, "grad_norm": 32.60178756713867, "learning_rate": 4.996721645197642e-06, "loss": 1.2308, "step": 15110 }, { "epoch": 0.01641537691290999, "grad_norm": 52.29813003540039, "learning_rate": 4.99671946119795e-06, "loss": 1.9613, "step": 15115 }, { "epoch": 0.01642080707397943, "grad_norm": 20.75661849975586, "learning_rate": 4.9967172764714996e-06, "loss": 1.7937, "step": 15120 }, { "epoch": 0.016426237235048866, "grad_norm": 21.116451263427734, "learning_rate": 4.996715091018294e-06, "loss": 1.5346, "step": 15125 }, { "epoch": 0.016431667396118305, "grad_norm": 13.780348777770996, "learning_rate": 4.996712904838332e-06, "loss": 1.3713, "step": 15130 }, { "epoch": 0.01643709755718774, "grad_norm": 37.18586730957031, "learning_rate": 4.996710717931616e-06, "loss": 2.5931, "step": 15135 }, { "epoch": 0.016442527718257178, "grad_norm": 37.49594497680664, "learning_rate": 4.996708530298144e-06, "loss": 1.7441, "step": 15140 }, { "epoch": 0.016447957879326618, "grad_norm": 83.9393539428711, "learning_rate": 4.996706341937919e-06, "loss": 2.6135, "step": 15145 }, { "epoch": 0.016453388040396054, "grad_norm": 42.219078063964844, "learning_rate": 4.99670415285094e-06, "loss": 1.2661, "step": 15150 }, { "epoch": 0.01645881820146549, "grad_norm": 28.224225997924805, "learning_rate": 4.996701963037209e-06, "loss": 1.4939, "step": 15155 }, { "epoch": 0.01646424836253493, "grad_norm": 16.687028884887695, "learning_rate": 4.996699772496726e-06, "loss": 2.1719, "step": 15160 }, { "epoch": 0.016469678523604366, "grad_norm": 31.590782165527344, "learning_rate": 4.996697581229491e-06, "loss": 2.0155, "step": 15165 }, { "epoch": 0.016475108684673806, "grad_norm": 17.160741806030273, "learning_rate": 4.9966953892355066e-06, "loss": 1.1187, "step": 15170 }, { "epoch": 0.016480538845743242, "grad_norm": 29.56540870666504, "learning_rate": 4.996693196514771e-06, "loss": 1.7378, "step": 15175 }, { "epoch": 0.01648596900681268, "grad_norm": 19.613798141479492, "learning_rate": 4.996691003067286e-06, "loss": 1.6409, "step": 15180 }, { "epoch": 0.016491399167882118, "grad_norm": 12.986740112304688, "learning_rate": 4.996688808893053e-06, "loss": 1.3042, "step": 15185 }, { "epoch": 0.016496829328951555, "grad_norm": 26.224136352539062, "learning_rate": 4.9966866139920725e-06, "loss": 1.4856, "step": 15190 }, { "epoch": 0.016502259490020994, "grad_norm": 33.88182830810547, "learning_rate": 4.9966844183643436e-06, "loss": 1.7629, "step": 15195 }, { "epoch": 0.01650768965109043, "grad_norm": 39.816165924072266, "learning_rate": 4.996682222009868e-06, "loss": 1.868, "step": 15200 }, { "epoch": 0.016513119812159867, "grad_norm": 28.640338897705078, "learning_rate": 4.996680024928646e-06, "loss": 1.2745, "step": 15205 }, { "epoch": 0.016518549973229307, "grad_norm": 48.13541030883789, "learning_rate": 4.9966778271206786e-06, "loss": 1.3768, "step": 15210 }, { "epoch": 0.016523980134298743, "grad_norm": 18.864334106445312, "learning_rate": 4.996675628585966e-06, "loss": 1.7593, "step": 15215 }, { "epoch": 0.016529410295368183, "grad_norm": 34.33082580566406, "learning_rate": 4.996673429324509e-06, "loss": 2.1661, "step": 15220 }, { "epoch": 0.01653484045643762, "grad_norm": 31.153907775878906, "learning_rate": 4.996671229336308e-06, "loss": 2.0143, "step": 15225 }, { "epoch": 0.016540270617507055, "grad_norm": 25.815628051757812, "learning_rate": 4.9966690286213646e-06, "loss": 1.7213, "step": 15230 }, { "epoch": 0.016545700778576495, "grad_norm": 22.696409225463867, "learning_rate": 4.996666827179679e-06, "loss": 1.8662, "step": 15235 }, { "epoch": 0.01655113093964593, "grad_norm": 15.909729957580566, "learning_rate": 4.996664625011252e-06, "loss": 1.7309, "step": 15240 }, { "epoch": 0.01655656110071537, "grad_norm": 175.57081604003906, "learning_rate": 4.996662422116083e-06, "loss": 1.4156, "step": 15245 }, { "epoch": 0.016561991261784807, "grad_norm": 25.182056427001953, "learning_rate": 4.9966602184941745e-06, "loss": 1.6544, "step": 15250 }, { "epoch": 0.016567421422854244, "grad_norm": 19.211917877197266, "learning_rate": 4.996658014145526e-06, "loss": 1.8099, "step": 15255 }, { "epoch": 0.016572851583923683, "grad_norm": 17.05879020690918, "learning_rate": 4.996655809070138e-06, "loss": 1.3354, "step": 15260 }, { "epoch": 0.01657828174499312, "grad_norm": 18.068496704101562, "learning_rate": 4.996653603268012e-06, "loss": 1.7922, "step": 15265 }, { "epoch": 0.01658371190606256, "grad_norm": 32.9541015625, "learning_rate": 4.996651396739147e-06, "loss": 1.7443, "step": 15270 }, { "epoch": 0.016589142067131996, "grad_norm": 57.48588562011719, "learning_rate": 4.996649189483545e-06, "loss": 2.0365, "step": 15275 }, { "epoch": 0.016594572228201432, "grad_norm": 19.030166625976562, "learning_rate": 4.996646981501207e-06, "loss": 1.6913, "step": 15280 }, { "epoch": 0.01660000238927087, "grad_norm": 22.799331665039062, "learning_rate": 4.996644772792133e-06, "loss": 1.9523, "step": 15285 }, { "epoch": 0.016605432550340308, "grad_norm": 84.42897033691406, "learning_rate": 4.996642563356323e-06, "loss": 2.0869, "step": 15290 }, { "epoch": 0.016610862711409744, "grad_norm": 47.6934700012207, "learning_rate": 4.996640353193779e-06, "loss": 1.9265, "step": 15295 }, { "epoch": 0.016616292872479184, "grad_norm": 25.52988624572754, "learning_rate": 4.996638142304501e-06, "loss": 1.0026, "step": 15300 }, { "epoch": 0.01662172303354862, "grad_norm": 13.443469047546387, "learning_rate": 4.996635930688489e-06, "loss": 1.3349, "step": 15305 }, { "epoch": 0.01662715319461806, "grad_norm": 20.528669357299805, "learning_rate": 4.996633718345746e-06, "loss": 1.6257, "step": 15310 }, { "epoch": 0.016632583355687496, "grad_norm": 16.047565460205078, "learning_rate": 4.996631505276269e-06, "loss": 1.6338, "step": 15315 }, { "epoch": 0.016638013516756932, "grad_norm": 19.7324161529541, "learning_rate": 4.996629291480061e-06, "loss": 1.5352, "step": 15320 }, { "epoch": 0.016643443677826372, "grad_norm": 21.02644920349121, "learning_rate": 4.996627076957122e-06, "loss": 1.4525, "step": 15325 }, { "epoch": 0.01664887383889581, "grad_norm": 43.936790466308594, "learning_rate": 4.996624861707454e-06, "loss": 1.5858, "step": 15330 }, { "epoch": 0.01665430399996525, "grad_norm": 16.20281410217285, "learning_rate": 4.996622645731056e-06, "loss": 1.8083, "step": 15335 }, { "epoch": 0.016659734161034685, "grad_norm": 14.609383583068848, "learning_rate": 4.996620429027928e-06, "loss": 1.6461, "step": 15340 }, { "epoch": 0.01666516432210412, "grad_norm": 42.68476486206055, "learning_rate": 4.996618211598073e-06, "loss": 2.3902, "step": 15345 }, { "epoch": 0.01667059448317356, "grad_norm": 32.84621810913086, "learning_rate": 4.99661599344149e-06, "loss": 2.5562, "step": 15350 }, { "epoch": 0.016676024644242997, "grad_norm": 16.52674102783203, "learning_rate": 4.99661377455818e-06, "loss": 1.0713, "step": 15355 }, { "epoch": 0.016681454805312437, "grad_norm": 16.084087371826172, "learning_rate": 4.996611554948144e-06, "loss": 1.5358, "step": 15360 }, { "epoch": 0.016686884966381873, "grad_norm": 39.66708755493164, "learning_rate": 4.996609334611382e-06, "loss": 1.6037, "step": 15365 }, { "epoch": 0.01669231512745131, "grad_norm": 22.002742767333984, "learning_rate": 4.996607113547895e-06, "loss": 1.3902, "step": 15370 }, { "epoch": 0.01669774528852075, "grad_norm": 23.706724166870117, "learning_rate": 4.996604891757685e-06, "loss": 1.7358, "step": 15375 }, { "epoch": 0.016703175449590185, "grad_norm": 30.384702682495117, "learning_rate": 4.99660266924075e-06, "loss": 2.3118, "step": 15380 }, { "epoch": 0.016708605610659625, "grad_norm": 46.676212310791016, "learning_rate": 4.996600445997092e-06, "loss": 1.5494, "step": 15385 }, { "epoch": 0.01671403577172906, "grad_norm": 61.40350341796875, "learning_rate": 4.9965982220267115e-06, "loss": 1.8928, "step": 15390 }, { "epoch": 0.016719465932798497, "grad_norm": 18.17824363708496, "learning_rate": 4.99659599732961e-06, "loss": 1.474, "step": 15395 }, { "epoch": 0.016724896093867937, "grad_norm": 15.408659934997559, "learning_rate": 4.996593771905787e-06, "loss": 1.6741, "step": 15400 }, { "epoch": 0.016730326254937374, "grad_norm": 17.8835506439209, "learning_rate": 4.996591545755242e-06, "loss": 1.8827, "step": 15405 }, { "epoch": 0.016735756416006813, "grad_norm": 13.525188446044922, "learning_rate": 4.9965893188779795e-06, "loss": 1.3358, "step": 15410 }, { "epoch": 0.01674118657707625, "grad_norm": 18.755361557006836, "learning_rate": 4.996587091273997e-06, "loss": 2.2519, "step": 15415 }, { "epoch": 0.016746616738145686, "grad_norm": 37.82682800292969, "learning_rate": 4.996584862943296e-06, "loss": 1.8215, "step": 15420 }, { "epoch": 0.016752046899215126, "grad_norm": 31.705812454223633, "learning_rate": 4.996582633885877e-06, "loss": 1.8233, "step": 15425 }, { "epoch": 0.016757477060284562, "grad_norm": 14.969091415405273, "learning_rate": 4.996580404101741e-06, "loss": 1.8093, "step": 15430 }, { "epoch": 0.016762907221353998, "grad_norm": 13.628769874572754, "learning_rate": 4.996578173590888e-06, "loss": 1.9553, "step": 15435 }, { "epoch": 0.016768337382423438, "grad_norm": 19.516414642333984, "learning_rate": 4.996575942353319e-06, "loss": 1.305, "step": 15440 }, { "epoch": 0.016773767543492874, "grad_norm": 27.293840408325195, "learning_rate": 4.996573710389036e-06, "loss": 1.2449, "step": 15445 }, { "epoch": 0.016779197704562314, "grad_norm": 13.155572891235352, "learning_rate": 4.996571477698037e-06, "loss": 1.6502, "step": 15450 }, { "epoch": 0.01678462786563175, "grad_norm": 31.241174697875977, "learning_rate": 4.996569244280324e-06, "loss": 1.616, "step": 15455 }, { "epoch": 0.016790058026701186, "grad_norm": 19.728443145751953, "learning_rate": 4.996567010135899e-06, "loss": 1.4027, "step": 15460 }, { "epoch": 0.016795488187770626, "grad_norm": 14.513264656066895, "learning_rate": 4.99656477526476e-06, "loss": 1.4464, "step": 15465 }, { "epoch": 0.016800918348840062, "grad_norm": 19.89834213256836, "learning_rate": 4.996562539666909e-06, "loss": 0.9165, "step": 15470 }, { "epoch": 0.016806348509909502, "grad_norm": 45.617164611816406, "learning_rate": 4.996560303342347e-06, "loss": 1.8483, "step": 15475 }, { "epoch": 0.01681177867097894, "grad_norm": 32.31407165527344, "learning_rate": 4.996558066291075e-06, "loss": 1.6911, "step": 15480 }, { "epoch": 0.016817208832048375, "grad_norm": 72.17658996582031, "learning_rate": 4.996555828513092e-06, "loss": 1.8487, "step": 15485 }, { "epoch": 0.016822638993117815, "grad_norm": 18.229644775390625, "learning_rate": 4.9965535900084e-06, "loss": 1.2063, "step": 15490 }, { "epoch": 0.01682806915418725, "grad_norm": 21.430213928222656, "learning_rate": 4.996551350776999e-06, "loss": 2.0639, "step": 15495 }, { "epoch": 0.01683349931525669, "grad_norm": 20.94442367553711, "learning_rate": 4.99654911081889e-06, "loss": 1.5549, "step": 15500 }, { "epoch": 0.016838929476326127, "grad_norm": 27.383779525756836, "learning_rate": 4.996546870134074e-06, "loss": 1.7731, "step": 15505 }, { "epoch": 0.016844359637395563, "grad_norm": 82.30640411376953, "learning_rate": 4.9965446287225505e-06, "loss": 1.6615, "step": 15510 }, { "epoch": 0.016849789798465003, "grad_norm": 23.08425521850586, "learning_rate": 4.996542386584321e-06, "loss": 1.3648, "step": 15515 }, { "epoch": 0.01685521995953444, "grad_norm": 112.49699401855469, "learning_rate": 4.996540143719386e-06, "loss": 1.9796, "step": 15520 }, { "epoch": 0.01686065012060388, "grad_norm": 17.136524200439453, "learning_rate": 4.996537900127746e-06, "loss": 2.1645, "step": 15525 }, { "epoch": 0.016866080281673315, "grad_norm": 32.67265701293945, "learning_rate": 4.9965356558094024e-06, "loss": 2.1878, "step": 15530 }, { "epoch": 0.01687151044274275, "grad_norm": 32.17512130737305, "learning_rate": 4.996533410764355e-06, "loss": 1.7373, "step": 15535 }, { "epoch": 0.01687694060381219, "grad_norm": 12.083773612976074, "learning_rate": 4.996531164992604e-06, "loss": 1.2043, "step": 15540 }, { "epoch": 0.016882370764881627, "grad_norm": 62.17143630981445, "learning_rate": 4.996528918494152e-06, "loss": 1.9901, "step": 15545 }, { "epoch": 0.016887800925951067, "grad_norm": 27.929393768310547, "learning_rate": 4.9965266712689976e-06, "loss": 1.7694, "step": 15550 }, { "epoch": 0.016893231087020504, "grad_norm": 42.906951904296875, "learning_rate": 4.996524423317142e-06, "loss": 1.6857, "step": 15555 }, { "epoch": 0.01689866124808994, "grad_norm": 12.907888412475586, "learning_rate": 4.996522174638587e-06, "loss": 1.6205, "step": 15560 }, { "epoch": 0.01690409140915938, "grad_norm": 42.1142463684082, "learning_rate": 4.996519925233332e-06, "loss": 1.0505, "step": 15565 }, { "epoch": 0.016909521570228816, "grad_norm": 21.614057540893555, "learning_rate": 4.996517675101377e-06, "loss": 1.7026, "step": 15570 }, { "epoch": 0.016914951731298252, "grad_norm": 16.48335075378418, "learning_rate": 4.996515424242725e-06, "loss": 1.7785, "step": 15575 }, { "epoch": 0.016920381892367692, "grad_norm": 33.242923736572266, "learning_rate": 4.996513172657376e-06, "loss": 1.3472, "step": 15580 }, { "epoch": 0.016925812053437128, "grad_norm": 13.264674186706543, "learning_rate": 4.996510920345329e-06, "loss": 1.3825, "step": 15585 }, { "epoch": 0.016931242214506568, "grad_norm": 14.915390014648438, "learning_rate": 4.996508667306586e-06, "loss": 2.0284, "step": 15590 }, { "epoch": 0.016936672375576004, "grad_norm": 197.9081573486328, "learning_rate": 4.996506413541146e-06, "loss": 1.6714, "step": 15595 }, { "epoch": 0.01694210253664544, "grad_norm": 23.860036849975586, "learning_rate": 4.996504159049012e-06, "loss": 1.6592, "step": 15600 }, { "epoch": 0.01694753269771488, "grad_norm": 22.36371421813965, "learning_rate": 4.996501903830184e-06, "loss": 1.2851, "step": 15605 }, { "epoch": 0.016952962858784316, "grad_norm": 59.33125305175781, "learning_rate": 4.996499647884662e-06, "loss": 2.2217, "step": 15610 }, { "epoch": 0.016958393019853756, "grad_norm": 15.581674575805664, "learning_rate": 4.996497391212446e-06, "loss": 0.7796, "step": 15615 }, { "epoch": 0.016963823180923192, "grad_norm": 11.760795593261719, "learning_rate": 4.996495133813538e-06, "loss": 1.8732, "step": 15620 }, { "epoch": 0.01696925334199263, "grad_norm": 37.60288619995117, "learning_rate": 4.996492875687939e-06, "loss": 1.4848, "step": 15625 }, { "epoch": 0.01697468350306207, "grad_norm": 39.703372955322266, "learning_rate": 4.996490616835649e-06, "loss": 2.189, "step": 15630 }, { "epoch": 0.016980113664131505, "grad_norm": 16.95796775817871, "learning_rate": 4.996488357256668e-06, "loss": 1.7207, "step": 15635 }, { "epoch": 0.016985543825200945, "grad_norm": 26.941572189331055, "learning_rate": 4.996486096950997e-06, "loss": 1.211, "step": 15640 }, { "epoch": 0.01699097398627038, "grad_norm": 28.12346076965332, "learning_rate": 4.996483835918637e-06, "loss": 1.9644, "step": 15645 }, { "epoch": 0.016996404147339817, "grad_norm": 129.60313415527344, "learning_rate": 4.996481574159589e-06, "loss": 1.4191, "step": 15650 }, { "epoch": 0.017001834308409257, "grad_norm": 23.46526527404785, "learning_rate": 4.996479311673853e-06, "loss": 1.5459, "step": 15655 }, { "epoch": 0.017007264469478693, "grad_norm": 76.82270812988281, "learning_rate": 4.99647704846143e-06, "loss": 1.5903, "step": 15660 }, { "epoch": 0.017012694630548133, "grad_norm": 15.350500106811523, "learning_rate": 4.996474784522321e-06, "loss": 1.177, "step": 15665 }, { "epoch": 0.01701812479161757, "grad_norm": 33.8175048828125, "learning_rate": 4.996472519856524e-06, "loss": 1.5389, "step": 15670 }, { "epoch": 0.017023554952687005, "grad_norm": 12.650153160095215, "learning_rate": 4.996470254464044e-06, "loss": 1.0141, "step": 15675 }, { "epoch": 0.017028985113756445, "grad_norm": 56.913055419921875, "learning_rate": 4.996467988344879e-06, "loss": 1.4213, "step": 15680 }, { "epoch": 0.01703441527482588, "grad_norm": 15.61722183227539, "learning_rate": 4.9964657214990295e-06, "loss": 2.1107, "step": 15685 }, { "epoch": 0.01703984543589532, "grad_norm": 19.14300537109375, "learning_rate": 4.996463453926497e-06, "loss": 1.8053, "step": 15690 }, { "epoch": 0.017045275596964757, "grad_norm": 14.284893035888672, "learning_rate": 4.996461185627283e-06, "loss": 1.2157, "step": 15695 }, { "epoch": 0.017050705758034194, "grad_norm": 18.284900665283203, "learning_rate": 4.996458916601386e-06, "loss": 1.9766, "step": 15700 }, { "epoch": 0.017056135919103634, "grad_norm": 18.84840202331543, "learning_rate": 4.996456646848808e-06, "loss": 1.2954, "step": 15705 }, { "epoch": 0.01706156608017307, "grad_norm": 30.278240203857422, "learning_rate": 4.9964543763695495e-06, "loss": 1.1961, "step": 15710 }, { "epoch": 0.017066996241242506, "grad_norm": 17.908863067626953, "learning_rate": 4.996452105163612e-06, "loss": 2.8921, "step": 15715 }, { "epoch": 0.017072426402311946, "grad_norm": 48.19648361206055, "learning_rate": 4.996449833230994e-06, "loss": 1.6286, "step": 15720 }, { "epoch": 0.017077856563381382, "grad_norm": 49.02919006347656, "learning_rate": 4.996447560571698e-06, "loss": 1.601, "step": 15725 }, { "epoch": 0.017083286724450822, "grad_norm": 16.003278732299805, "learning_rate": 4.996445287185724e-06, "loss": 1.2007, "step": 15730 }, { "epoch": 0.017088716885520258, "grad_norm": 17.2207088470459, "learning_rate": 4.996443013073072e-06, "loss": 1.5943, "step": 15735 }, { "epoch": 0.017094147046589694, "grad_norm": 68.09904479980469, "learning_rate": 4.996440738233745e-06, "loss": 1.9487, "step": 15740 }, { "epoch": 0.017099577207659134, "grad_norm": 31.389089584350586, "learning_rate": 4.996438462667741e-06, "loss": 1.6452, "step": 15745 }, { "epoch": 0.01710500736872857, "grad_norm": 121.5096435546875, "learning_rate": 4.996436186375062e-06, "loss": 1.2961, "step": 15750 }, { "epoch": 0.01711043752979801, "grad_norm": 21.241954803466797, "learning_rate": 4.996433909355709e-06, "loss": 2.0051, "step": 15755 }, { "epoch": 0.017115867690867446, "grad_norm": 21.51947593688965, "learning_rate": 4.996431631609682e-06, "loss": 1.9889, "step": 15760 }, { "epoch": 0.017121297851936883, "grad_norm": 21.076196670532227, "learning_rate": 4.996429353136981e-06, "loss": 1.4245, "step": 15765 }, { "epoch": 0.017126728013006322, "grad_norm": 75.09902954101562, "learning_rate": 4.9964270739376074e-06, "loss": 1.2895, "step": 15770 }, { "epoch": 0.01713215817407576, "grad_norm": 13.955092430114746, "learning_rate": 4.996424794011562e-06, "loss": 1.5906, "step": 15775 }, { "epoch": 0.0171375883351452, "grad_norm": 23.354652404785156, "learning_rate": 4.9964225133588455e-06, "loss": 1.1827, "step": 15780 }, { "epoch": 0.017143018496214635, "grad_norm": 21.028884887695312, "learning_rate": 4.9964202319794584e-06, "loss": 1.2213, "step": 15785 }, { "epoch": 0.01714844865728407, "grad_norm": 35.19290542602539, "learning_rate": 4.996417949873401e-06, "loss": 1.5575, "step": 15790 }, { "epoch": 0.01715387881835351, "grad_norm": 38.723731994628906, "learning_rate": 4.996415667040675e-06, "loss": 1.4098, "step": 15795 }, { "epoch": 0.017159308979422947, "grad_norm": 35.70969772338867, "learning_rate": 4.99641338348128e-06, "loss": 1.8858, "step": 15800 }, { "epoch": 0.017164739140492387, "grad_norm": 18.921192169189453, "learning_rate": 4.996411099195216e-06, "loss": 1.2866, "step": 15805 }, { "epoch": 0.017170169301561823, "grad_norm": 171.80242919921875, "learning_rate": 4.996408814182487e-06, "loss": 1.2951, "step": 15810 }, { "epoch": 0.01717559946263126, "grad_norm": 14.870591163635254, "learning_rate": 4.99640652844309e-06, "loss": 0.9114, "step": 15815 }, { "epoch": 0.0171810296237007, "grad_norm": 71.37185668945312, "learning_rate": 4.996404241977027e-06, "loss": 2.1073, "step": 15820 }, { "epoch": 0.017186459784770135, "grad_norm": 189.8609619140625, "learning_rate": 4.996401954784299e-06, "loss": 1.1156, "step": 15825 }, { "epoch": 0.017191889945839575, "grad_norm": 114.61920166015625, "learning_rate": 4.996399666864907e-06, "loss": 1.3764, "step": 15830 }, { "epoch": 0.01719732010690901, "grad_norm": 37.697235107421875, "learning_rate": 4.996397378218849e-06, "loss": 1.7234, "step": 15835 }, { "epoch": 0.017202750267978448, "grad_norm": 54.84749984741211, "learning_rate": 4.996395088846129e-06, "loss": 1.5096, "step": 15840 }, { "epoch": 0.017208180429047887, "grad_norm": 20.636226654052734, "learning_rate": 4.996392798746746e-06, "loss": 1.319, "step": 15845 }, { "epoch": 0.017213610590117324, "grad_norm": 199.5380859375, "learning_rate": 4.996390507920702e-06, "loss": 1.5939, "step": 15850 }, { "epoch": 0.01721904075118676, "grad_norm": 17.720535278320312, "learning_rate": 4.996388216367996e-06, "loss": 1.6591, "step": 15855 }, { "epoch": 0.0172244709122562, "grad_norm": 20.038881301879883, "learning_rate": 4.99638592408863e-06, "loss": 1.5043, "step": 15860 }, { "epoch": 0.017229901073325636, "grad_norm": 16.388032913208008, "learning_rate": 4.996383631082603e-06, "loss": 1.1391, "step": 15865 }, { "epoch": 0.017235331234395076, "grad_norm": 42.82012939453125, "learning_rate": 4.9963813373499175e-06, "loss": 1.7126, "step": 15870 }, { "epoch": 0.017240761395464512, "grad_norm": 40.72372055053711, "learning_rate": 4.996379042890573e-06, "loss": 1.8279, "step": 15875 }, { "epoch": 0.01724619155653395, "grad_norm": 40.682621002197266, "learning_rate": 4.99637674770457e-06, "loss": 1.9288, "step": 15880 }, { "epoch": 0.017251621717603388, "grad_norm": 20.477468490600586, "learning_rate": 4.996374451791911e-06, "loss": 1.916, "step": 15885 }, { "epoch": 0.017257051878672824, "grad_norm": 129.5032501220703, "learning_rate": 4.996372155152595e-06, "loss": 2.2891, "step": 15890 }, { "epoch": 0.017262482039742264, "grad_norm": 26.70676612854004, "learning_rate": 4.996369857786623e-06, "loss": 1.4016, "step": 15895 }, { "epoch": 0.0172679122008117, "grad_norm": 29.149911880493164, "learning_rate": 4.996367559693995e-06, "loss": 2.0945, "step": 15900 }, { "epoch": 0.017273342361881137, "grad_norm": 18.125391006469727, "learning_rate": 4.996365260874713e-06, "loss": 1.6221, "step": 15905 }, { "epoch": 0.017278772522950576, "grad_norm": 48.75996398925781, "learning_rate": 4.996362961328777e-06, "loss": 2.6166, "step": 15910 }, { "epoch": 0.017284202684020013, "grad_norm": 14.020015716552734, "learning_rate": 4.9963606610561876e-06, "loss": 1.4989, "step": 15915 }, { "epoch": 0.017289632845089453, "grad_norm": 11.284219741821289, "learning_rate": 4.996358360056945e-06, "loss": 0.9702, "step": 15920 }, { "epoch": 0.01729506300615889, "grad_norm": 19.361125946044922, "learning_rate": 4.996356058331052e-06, "loss": 1.5303, "step": 15925 }, { "epoch": 0.017300493167228325, "grad_norm": 18.710248947143555, "learning_rate": 4.996353755878506e-06, "loss": 1.2173, "step": 15930 }, { "epoch": 0.017305923328297765, "grad_norm": 22.753755569458008, "learning_rate": 4.996351452699311e-06, "loss": 2.6414, "step": 15935 }, { "epoch": 0.0173113534893672, "grad_norm": 25.107437133789062, "learning_rate": 4.996349148793465e-06, "loss": 1.6267, "step": 15940 }, { "epoch": 0.01731678365043664, "grad_norm": 16.41731834411621, "learning_rate": 4.996346844160969e-06, "loss": 1.4292, "step": 15945 }, { "epoch": 0.017322213811506077, "grad_norm": 32.30752182006836, "learning_rate": 4.996344538801826e-06, "loss": 1.7357, "step": 15950 }, { "epoch": 0.017327643972575513, "grad_norm": 30.777204513549805, "learning_rate": 4.996342232716035e-06, "loss": 1.8796, "step": 15955 }, { "epoch": 0.017333074133644953, "grad_norm": 81.73136901855469, "learning_rate": 4.996339925903596e-06, "loss": 2.1345, "step": 15960 }, { "epoch": 0.01733850429471439, "grad_norm": 20.913795471191406, "learning_rate": 4.996337618364511e-06, "loss": 2.0346, "step": 15965 }, { "epoch": 0.01734393445578383, "grad_norm": 34.43454360961914, "learning_rate": 4.99633531009878e-06, "loss": 1.4923, "step": 15970 }, { "epoch": 0.017349364616853265, "grad_norm": 34.298912048339844, "learning_rate": 4.996333001106404e-06, "loss": 1.5474, "step": 15975 }, { "epoch": 0.0173547947779227, "grad_norm": 19.51902198791504, "learning_rate": 4.9963306913873824e-06, "loss": 1.806, "step": 15980 }, { "epoch": 0.01736022493899214, "grad_norm": 41.05169677734375, "learning_rate": 4.996328380941718e-06, "loss": 1.6342, "step": 15985 }, { "epoch": 0.017365655100061578, "grad_norm": 15.568279266357422, "learning_rate": 4.9963260697694095e-06, "loss": 1.521, "step": 15990 }, { "epoch": 0.017371085261131014, "grad_norm": 19.303592681884766, "learning_rate": 4.996323757870459e-06, "loss": 1.1484, "step": 15995 }, { "epoch": 0.017376515422200454, "grad_norm": 14.225829124450684, "learning_rate": 4.996321445244868e-06, "loss": 2.0092, "step": 16000 }, { "epoch": 0.01738194558326989, "grad_norm": 19.163921356201172, "learning_rate": 4.996319131892634e-06, "loss": 1.9025, "step": 16005 }, { "epoch": 0.01738737574433933, "grad_norm": 49.49658966064453, "learning_rate": 4.996316817813759e-06, "loss": 2.1929, "step": 16010 }, { "epoch": 0.017392805905408766, "grad_norm": 17.47484588623047, "learning_rate": 4.996314503008245e-06, "loss": 1.0119, "step": 16015 }, { "epoch": 0.017398236066478202, "grad_norm": 15.718912124633789, "learning_rate": 4.996312187476092e-06, "loss": 1.9354, "step": 16020 }, { "epoch": 0.017403666227547642, "grad_norm": 23.980680465698242, "learning_rate": 4.996309871217301e-06, "loss": 1.1441, "step": 16025 }, { "epoch": 0.01740909638861708, "grad_norm": 22.915739059448242, "learning_rate": 4.996307554231871e-06, "loss": 1.592, "step": 16030 }, { "epoch": 0.017414526549686518, "grad_norm": 24.029287338256836, "learning_rate": 4.9963052365198045e-06, "loss": 1.5075, "step": 16035 }, { "epoch": 0.017419956710755954, "grad_norm": 190.0675811767578, "learning_rate": 4.996302918081101e-06, "loss": 1.3693, "step": 16040 }, { "epoch": 0.01742538687182539, "grad_norm": 29.113000869750977, "learning_rate": 4.996300598915763e-06, "loss": 1.3442, "step": 16045 }, { "epoch": 0.01743081703289483, "grad_norm": 23.668832778930664, "learning_rate": 4.996298279023788e-06, "loss": 1.5546, "step": 16050 }, { "epoch": 0.017436247193964267, "grad_norm": 42.46885681152344, "learning_rate": 4.99629595840518e-06, "loss": 1.2195, "step": 16055 }, { "epoch": 0.017441677355033706, "grad_norm": 22.372173309326172, "learning_rate": 4.996293637059938e-06, "loss": 2.1923, "step": 16060 }, { "epoch": 0.017447107516103143, "grad_norm": 13.239251136779785, "learning_rate": 4.996291314988062e-06, "loss": 1.3382, "step": 16065 }, { "epoch": 0.01745253767717258, "grad_norm": 15.435226440429688, "learning_rate": 4.996288992189555e-06, "loss": 1.9815, "step": 16070 }, { "epoch": 0.01745796783824202, "grad_norm": 22.76461410522461, "learning_rate": 4.9962866686644154e-06, "loss": 1.4593, "step": 16075 }, { "epoch": 0.017463397999311455, "grad_norm": 25.24770736694336, "learning_rate": 4.996284344412645e-06, "loss": 1.6047, "step": 16080 }, { "epoch": 0.017468828160380895, "grad_norm": 63.30172348022461, "learning_rate": 4.996282019434243e-06, "loss": 1.0602, "step": 16085 }, { "epoch": 0.01747425832145033, "grad_norm": 48.59504318237305, "learning_rate": 4.996279693729213e-06, "loss": 1.7191, "step": 16090 }, { "epoch": 0.017479688482519767, "grad_norm": 58.85773468017578, "learning_rate": 4.996277367297553e-06, "loss": 1.8144, "step": 16095 }, { "epoch": 0.017485118643589207, "grad_norm": 17.699905395507812, "learning_rate": 4.996275040139266e-06, "loss": 1.8223, "step": 16100 }, { "epoch": 0.017490548804658643, "grad_norm": 20.285017013549805, "learning_rate": 4.99627271225435e-06, "loss": 1.4507, "step": 16105 }, { "epoch": 0.017495978965728083, "grad_norm": 25.84637451171875, "learning_rate": 4.996270383642807e-06, "loss": 1.9295, "step": 16110 }, { "epoch": 0.01750140912679752, "grad_norm": 18.365312576293945, "learning_rate": 4.996268054304637e-06, "loss": 1.3866, "step": 16115 }, { "epoch": 0.017506839287866956, "grad_norm": 16.58183479309082, "learning_rate": 4.996265724239843e-06, "loss": 1.7154, "step": 16120 }, { "epoch": 0.017512269448936395, "grad_norm": 18.7646541595459, "learning_rate": 4.996263393448423e-06, "loss": 1.9338, "step": 16125 }, { "epoch": 0.01751769961000583, "grad_norm": 12.979842185974121, "learning_rate": 4.996261061930379e-06, "loss": 1.6249, "step": 16130 }, { "epoch": 0.017523129771075268, "grad_norm": 44.49403762817383, "learning_rate": 4.996258729685711e-06, "loss": 1.5725, "step": 16135 }, { "epoch": 0.017528559932144708, "grad_norm": 34.80929946899414, "learning_rate": 4.996256396714421e-06, "loss": 1.9527, "step": 16140 }, { "epoch": 0.017533990093214144, "grad_norm": 28.64907455444336, "learning_rate": 4.996254063016508e-06, "loss": 1.04, "step": 16145 }, { "epoch": 0.017539420254283584, "grad_norm": 21.439773559570312, "learning_rate": 4.9962517285919745e-06, "loss": 1.6495, "step": 16150 }, { "epoch": 0.01754485041535302, "grad_norm": 42.560951232910156, "learning_rate": 4.9962493934408184e-06, "loss": 2.007, "step": 16155 }, { "epoch": 0.017550280576422456, "grad_norm": 16.768638610839844, "learning_rate": 4.996247057563043e-06, "loss": 1.4886, "step": 16160 }, { "epoch": 0.017555710737491896, "grad_norm": 11.987628936767578, "learning_rate": 4.996244720958648e-06, "loss": 0.996, "step": 16165 }, { "epoch": 0.017561140898561332, "grad_norm": 35.58417892456055, "learning_rate": 4.996242383627634e-06, "loss": 2.4808, "step": 16170 }, { "epoch": 0.017566571059630772, "grad_norm": 23.782066345214844, "learning_rate": 4.996240045570002e-06, "loss": 1.2905, "step": 16175 }, { "epoch": 0.01757200122070021, "grad_norm": 28.68199920654297, "learning_rate": 4.9962377067857525e-06, "loss": 1.6466, "step": 16180 }, { "epoch": 0.017577431381769645, "grad_norm": 16.92975425720215, "learning_rate": 4.9962353672748865e-06, "loss": 2.1026, "step": 16185 }, { "epoch": 0.017582861542839084, "grad_norm": 19.626379013061523, "learning_rate": 4.9962330270374035e-06, "loss": 2.3288, "step": 16190 }, { "epoch": 0.01758829170390852, "grad_norm": 55.28654098510742, "learning_rate": 4.996230686073306e-06, "loss": 1.281, "step": 16195 }, { "epoch": 0.01759372186497796, "grad_norm": 18.033246994018555, "learning_rate": 4.996228344382593e-06, "loss": 0.9575, "step": 16200 }, { "epoch": 0.017599152026047397, "grad_norm": 19.203052520751953, "learning_rate": 4.996226001965266e-06, "loss": 1.3857, "step": 16205 }, { "epoch": 0.017604582187116833, "grad_norm": 89.45415496826172, "learning_rate": 4.996223658821326e-06, "loss": 2.3211, "step": 16210 }, { "epoch": 0.017610012348186273, "grad_norm": 38.87462615966797, "learning_rate": 4.996221314950773e-06, "loss": 1.6405, "step": 16215 }, { "epoch": 0.01761544250925571, "grad_norm": 45.32347869873047, "learning_rate": 4.996218970353608e-06, "loss": 1.553, "step": 16220 }, { "epoch": 0.01762087267032515, "grad_norm": 28.428518295288086, "learning_rate": 4.996216625029832e-06, "loss": 2.0658, "step": 16225 }, { "epoch": 0.017626302831394585, "grad_norm": 34.23979187011719, "learning_rate": 4.9962142789794446e-06, "loss": 1.4429, "step": 16230 }, { "epoch": 0.01763173299246402, "grad_norm": 13.062455177307129, "learning_rate": 4.996211932202447e-06, "loss": 1.8764, "step": 16235 }, { "epoch": 0.01763716315353346, "grad_norm": 39.571590423583984, "learning_rate": 4.996209584698841e-06, "loss": 1.6839, "step": 16240 }, { "epoch": 0.017642593314602897, "grad_norm": 17.34278678894043, "learning_rate": 4.996207236468626e-06, "loss": 1.7813, "step": 16245 }, { "epoch": 0.017648023475672337, "grad_norm": 24.60250473022461, "learning_rate": 4.996204887511803e-06, "loss": 1.6052, "step": 16250 }, { "epoch": 0.017653453636741773, "grad_norm": 26.643566131591797, "learning_rate": 4.996202537828373e-06, "loss": 1.7928, "step": 16255 }, { "epoch": 0.01765888379781121, "grad_norm": 26.12314224243164, "learning_rate": 4.996200187418336e-06, "loss": 1.4366, "step": 16260 }, { "epoch": 0.01766431395888065, "grad_norm": 18.282379150390625, "learning_rate": 4.996197836281693e-06, "loss": 1.4485, "step": 16265 }, { "epoch": 0.017669744119950086, "grad_norm": 13.822900772094727, "learning_rate": 4.996195484418445e-06, "loss": 1.223, "step": 16270 }, { "epoch": 0.017675174281019522, "grad_norm": 37.370941162109375, "learning_rate": 4.9961931318285925e-06, "loss": 1.2718, "step": 16275 }, { "epoch": 0.01768060444208896, "grad_norm": 35.20306396484375, "learning_rate": 4.996190778512136e-06, "loss": 1.9872, "step": 16280 }, { "epoch": 0.017686034603158398, "grad_norm": 14.924355506896973, "learning_rate": 4.996188424469078e-06, "loss": 1.7005, "step": 16285 }, { "epoch": 0.017691464764227838, "grad_norm": 58.427860260009766, "learning_rate": 4.996186069699415e-06, "loss": 1.4792, "step": 16290 }, { "epoch": 0.017696894925297274, "grad_norm": 54.92068099975586, "learning_rate": 4.996183714203151e-06, "loss": 1.8798, "step": 16295 }, { "epoch": 0.01770232508636671, "grad_norm": 13.738118171691895, "learning_rate": 4.996181357980286e-06, "loss": 2.0999, "step": 16300 }, { "epoch": 0.01770775524743615, "grad_norm": 137.5789337158203, "learning_rate": 4.996179001030821e-06, "loss": 1.9626, "step": 16305 }, { "epoch": 0.017713185408505586, "grad_norm": 64.31997680664062, "learning_rate": 4.996176643354757e-06, "loss": 1.4731, "step": 16310 }, { "epoch": 0.017718615569575026, "grad_norm": 32.05767059326172, "learning_rate": 4.996174284952092e-06, "loss": 1.6034, "step": 16315 }, { "epoch": 0.017724045730644462, "grad_norm": 29.742462158203125, "learning_rate": 4.99617192582283e-06, "loss": 1.2363, "step": 16320 }, { "epoch": 0.0177294758917139, "grad_norm": 12.019579887390137, "learning_rate": 4.9961695659669704e-06, "loss": 1.0522, "step": 16325 }, { "epoch": 0.01773490605278334, "grad_norm": 21.53575325012207, "learning_rate": 4.996167205384513e-06, "loss": 1.6979, "step": 16330 }, { "epoch": 0.017740336213852775, "grad_norm": 23.375165939331055, "learning_rate": 4.99616484407546e-06, "loss": 1.0077, "step": 16335 }, { "epoch": 0.017745766374922214, "grad_norm": 40.18255615234375, "learning_rate": 4.996162482039811e-06, "loss": 2.8043, "step": 16340 }, { "epoch": 0.01775119653599165, "grad_norm": 26.381839752197266, "learning_rate": 4.996160119277568e-06, "loss": 2.1146, "step": 16345 }, { "epoch": 0.017756626697061087, "grad_norm": 13.257762908935547, "learning_rate": 4.996157755788729e-06, "loss": 1.397, "step": 16350 }, { "epoch": 0.017762056858130527, "grad_norm": 14.1730375289917, "learning_rate": 4.9961553915732975e-06, "loss": 1.6078, "step": 16355 }, { "epoch": 0.017767487019199963, "grad_norm": 28.85419273376465, "learning_rate": 4.996153026631273e-06, "loss": 2.0199, "step": 16360 }, { "epoch": 0.017772917180269403, "grad_norm": 86.00106048583984, "learning_rate": 4.996150660962656e-06, "loss": 1.7167, "step": 16365 }, { "epoch": 0.01777834734133884, "grad_norm": 20.36481475830078, "learning_rate": 4.996148294567449e-06, "loss": 1.1613, "step": 16370 }, { "epoch": 0.017783777502408275, "grad_norm": 67.72639465332031, "learning_rate": 4.99614592744565e-06, "loss": 1.439, "step": 16375 }, { "epoch": 0.017789207663477715, "grad_norm": 18.07756233215332, "learning_rate": 4.99614355959726e-06, "loss": 2.1281, "step": 16380 }, { "epoch": 0.01779463782454715, "grad_norm": 46.92524719238281, "learning_rate": 4.996141191022282e-06, "loss": 1.2248, "step": 16385 }, { "epoch": 0.01780006798561659, "grad_norm": 109.9154281616211, "learning_rate": 4.996138821720715e-06, "loss": 1.891, "step": 16390 }, { "epoch": 0.017805498146686027, "grad_norm": 31.09214973449707, "learning_rate": 4.99613645169256e-06, "loss": 1.7693, "step": 16395 }, { "epoch": 0.017810928307755464, "grad_norm": 17.081151962280273, "learning_rate": 4.996134080937818e-06, "loss": 1.4162, "step": 16400 }, { "epoch": 0.017816358468824903, "grad_norm": 32.22515106201172, "learning_rate": 4.996131709456489e-06, "loss": 0.9125, "step": 16405 }, { "epoch": 0.01782178862989434, "grad_norm": 22.77564239501953, "learning_rate": 4.996129337248573e-06, "loss": 1.7714, "step": 16410 }, { "epoch": 0.017827218790963776, "grad_norm": 16.193424224853516, "learning_rate": 4.996126964314073e-06, "loss": 1.6178, "step": 16415 }, { "epoch": 0.017832648952033216, "grad_norm": 95.58729553222656, "learning_rate": 4.996124590652988e-06, "loss": 1.408, "step": 16420 }, { "epoch": 0.017838079113102652, "grad_norm": 20.122264862060547, "learning_rate": 4.996122216265319e-06, "loss": 2.2443, "step": 16425 }, { "epoch": 0.01784350927417209, "grad_norm": 16.45705223083496, "learning_rate": 4.996119841151067e-06, "loss": 1.5979, "step": 16430 }, { "epoch": 0.017848939435241528, "grad_norm": 19.223979949951172, "learning_rate": 4.996117465310233e-06, "loss": 1.7921, "step": 16435 }, { "epoch": 0.017854369596310964, "grad_norm": 28.103649139404297, "learning_rate": 4.9961150887428166e-06, "loss": 1.6521, "step": 16440 }, { "epoch": 0.017859799757380404, "grad_norm": 19.798030853271484, "learning_rate": 4.9961127114488186e-06, "loss": 1.712, "step": 16445 }, { "epoch": 0.01786522991844984, "grad_norm": 29.99531364440918, "learning_rate": 4.996110333428242e-06, "loss": 1.7554, "step": 16450 }, { "epoch": 0.01787066007951928, "grad_norm": 24.05889892578125, "learning_rate": 4.996107954681083e-06, "loss": 1.1393, "step": 16455 }, { "epoch": 0.017876090240588716, "grad_norm": 30.59659767150879, "learning_rate": 4.996105575207346e-06, "loss": 1.6608, "step": 16460 }, { "epoch": 0.017881520401658153, "grad_norm": 68.5751724243164, "learning_rate": 4.996103195007031e-06, "loss": 1.3587, "step": 16465 }, { "epoch": 0.017886950562727592, "grad_norm": 19.918485641479492, "learning_rate": 4.996100814080139e-06, "loss": 2.0337, "step": 16470 }, { "epoch": 0.01789238072379703, "grad_norm": 15.560559272766113, "learning_rate": 4.996098432426669e-06, "loss": 1.5707, "step": 16475 }, { "epoch": 0.01789781088486647, "grad_norm": 17.276620864868164, "learning_rate": 4.996096050046623e-06, "loss": 1.3729, "step": 16480 }, { "epoch": 0.017903241045935905, "grad_norm": 17.043794631958008, "learning_rate": 4.996093666940002e-06, "loss": 1.8571, "step": 16485 }, { "epoch": 0.01790867120700534, "grad_norm": 12.24339771270752, "learning_rate": 4.996091283106805e-06, "loss": 1.756, "step": 16490 }, { "epoch": 0.01791410136807478, "grad_norm": 21.656442642211914, "learning_rate": 4.996088898547035e-06, "loss": 2.0432, "step": 16495 }, { "epoch": 0.017919531529144217, "grad_norm": 14.761302947998047, "learning_rate": 4.9960865132606905e-06, "loss": 1.7628, "step": 16500 }, { "epoch": 0.017924961690213657, "grad_norm": 26.800498962402344, "learning_rate": 4.996084127247773e-06, "loss": 0.8137, "step": 16505 }, { "epoch": 0.017930391851283093, "grad_norm": 17.513721466064453, "learning_rate": 4.996081740508284e-06, "loss": 1.828, "step": 16510 }, { "epoch": 0.01793582201235253, "grad_norm": 24.221839904785156, "learning_rate": 4.996079353042223e-06, "loss": 2.0983, "step": 16515 }, { "epoch": 0.01794125217342197, "grad_norm": 21.23375129699707, "learning_rate": 4.996076964849593e-06, "loss": 1.4283, "step": 16520 }, { "epoch": 0.017946682334491405, "grad_norm": 27.447607040405273, "learning_rate": 4.996074575930391e-06, "loss": 2.3857, "step": 16525 }, { "epoch": 0.01795211249556084, "grad_norm": 20.477209091186523, "learning_rate": 4.996072186284621e-06, "loss": 1.7547, "step": 16530 }, { "epoch": 0.01795754265663028, "grad_norm": 41.63151550292969, "learning_rate": 4.996069795912282e-06, "loss": 1.3762, "step": 16535 }, { "epoch": 0.017962972817699718, "grad_norm": 31.8832950592041, "learning_rate": 4.996067404813375e-06, "loss": 1.4018, "step": 16540 }, { "epoch": 0.017968402978769157, "grad_norm": 43.291839599609375, "learning_rate": 4.9960650129879005e-06, "loss": 1.5674, "step": 16545 }, { "epoch": 0.017973833139838594, "grad_norm": 26.980812072753906, "learning_rate": 4.996062620435859e-06, "loss": 1.6299, "step": 16550 }, { "epoch": 0.01797926330090803, "grad_norm": 102.94217681884766, "learning_rate": 4.996060227157253e-06, "loss": 1.8899, "step": 16555 }, { "epoch": 0.01798469346197747, "grad_norm": 36.556644439697266, "learning_rate": 4.996057833152081e-06, "loss": 1.6958, "step": 16560 }, { "epoch": 0.017990123623046906, "grad_norm": 18.273542404174805, "learning_rate": 4.996055438420344e-06, "loss": 1.5461, "step": 16565 }, { "epoch": 0.017995553784116346, "grad_norm": 15.029814720153809, "learning_rate": 4.996053042962044e-06, "loss": 1.8977, "step": 16570 }, { "epoch": 0.018000983945185782, "grad_norm": 24.741621017456055, "learning_rate": 4.996050646777181e-06, "loss": 1.7561, "step": 16575 }, { "epoch": 0.01800641410625522, "grad_norm": 28.979984283447266, "learning_rate": 4.996048249865755e-06, "loss": 1.791, "step": 16580 }, { "epoch": 0.018011844267324658, "grad_norm": 16.165306091308594, "learning_rate": 4.996045852227768e-06, "loss": 1.795, "step": 16585 }, { "epoch": 0.018017274428394094, "grad_norm": 34.182247161865234, "learning_rate": 4.99604345386322e-06, "loss": 1.5175, "step": 16590 }, { "epoch": 0.018022704589463534, "grad_norm": 15.807541847229004, "learning_rate": 4.996041054772111e-06, "loss": 1.2641, "step": 16595 }, { "epoch": 0.01802813475053297, "grad_norm": 16.475671768188477, "learning_rate": 4.9960386549544434e-06, "loss": 1.8504, "step": 16600 }, { "epoch": 0.018033564911602407, "grad_norm": 17.320619583129883, "learning_rate": 4.996036254410216e-06, "loss": 1.9025, "step": 16605 }, { "epoch": 0.018038995072671846, "grad_norm": 94.24618530273438, "learning_rate": 4.996033853139432e-06, "loss": 1.2133, "step": 16610 }, { "epoch": 0.018044425233741283, "grad_norm": 33.46694564819336, "learning_rate": 4.996031451142088e-06, "loss": 1.3864, "step": 16615 }, { "epoch": 0.018049855394810722, "grad_norm": 20.28070831298828, "learning_rate": 4.996029048418189e-06, "loss": 1.9642, "step": 16620 }, { "epoch": 0.01805528555588016, "grad_norm": 35.4303092956543, "learning_rate": 4.996026644967734e-06, "loss": 2.0841, "step": 16625 }, { "epoch": 0.018060715716949595, "grad_norm": 13.260308265686035, "learning_rate": 4.996024240790723e-06, "loss": 1.6131, "step": 16630 }, { "epoch": 0.018066145878019035, "grad_norm": 17.500696182250977, "learning_rate": 4.996021835887157e-06, "loss": 1.8312, "step": 16635 }, { "epoch": 0.01807157603908847, "grad_norm": 42.21771240234375, "learning_rate": 4.996019430257038e-06, "loss": 1.3959, "step": 16640 }, { "epoch": 0.01807700620015791, "grad_norm": 36.553775787353516, "learning_rate": 4.996017023900365e-06, "loss": 1.3414, "step": 16645 }, { "epoch": 0.018082436361227347, "grad_norm": 23.88077163696289, "learning_rate": 4.99601461681714e-06, "loss": 1.3804, "step": 16650 }, { "epoch": 0.018087866522296783, "grad_norm": 22.760509490966797, "learning_rate": 4.996012209007362e-06, "loss": 1.2636, "step": 16655 }, { "epoch": 0.018093296683366223, "grad_norm": 19.75514793395996, "learning_rate": 4.996009800471034e-06, "loss": 1.2704, "step": 16660 }, { "epoch": 0.01809872684443566, "grad_norm": 17.50397300720215, "learning_rate": 4.996007391208155e-06, "loss": 1.5605, "step": 16665 }, { "epoch": 0.018104157005505096, "grad_norm": 54.83206558227539, "learning_rate": 4.996004981218727e-06, "loss": 2.0392, "step": 16670 }, { "epoch": 0.018109587166574535, "grad_norm": 71.99544525146484, "learning_rate": 4.9960025705027495e-06, "loss": 1.6361, "step": 16675 }, { "epoch": 0.01811501732764397, "grad_norm": 15.783403396606445, "learning_rate": 4.996000159060223e-06, "loss": 1.1352, "step": 16680 }, { "epoch": 0.01812044748871341, "grad_norm": 135.14244079589844, "learning_rate": 4.995997746891149e-06, "loss": 1.9481, "step": 16685 }, { "epoch": 0.018125877649782848, "grad_norm": 20.977439880371094, "learning_rate": 4.995995333995528e-06, "loss": 1.7549, "step": 16690 }, { "epoch": 0.018131307810852284, "grad_norm": 29.121532440185547, "learning_rate": 4.995992920373362e-06, "loss": 1.927, "step": 16695 }, { "epoch": 0.018136737971921724, "grad_norm": 14.903604507446289, "learning_rate": 4.995990506024648e-06, "loss": 1.5786, "step": 16700 }, { "epoch": 0.01814216813299116, "grad_norm": 18.976924896240234, "learning_rate": 4.995988090949392e-06, "loss": 1.6045, "step": 16705 }, { "epoch": 0.0181475982940606, "grad_norm": 24.985759735107422, "learning_rate": 4.99598567514759e-06, "loss": 1.2037, "step": 16710 }, { "epoch": 0.018153028455130036, "grad_norm": 33.36278533935547, "learning_rate": 4.995983258619245e-06, "loss": 1.8912, "step": 16715 }, { "epoch": 0.018158458616199472, "grad_norm": 28.819562911987305, "learning_rate": 4.995980841364357e-06, "loss": 1.6938, "step": 16720 }, { "epoch": 0.018163888777268912, "grad_norm": 61.09877014160156, "learning_rate": 4.995978423382927e-06, "loss": 1.2718, "step": 16725 }, { "epoch": 0.01816931893833835, "grad_norm": 22.488723754882812, "learning_rate": 4.995976004674956e-06, "loss": 1.5904, "step": 16730 }, { "epoch": 0.018174749099407788, "grad_norm": 15.990873336791992, "learning_rate": 4.995973585240444e-06, "loss": 2.2241, "step": 16735 }, { "epoch": 0.018180179260477224, "grad_norm": 37.61878204345703, "learning_rate": 4.995971165079392e-06, "loss": 1.9685, "step": 16740 }, { "epoch": 0.01818560942154666, "grad_norm": 14.914793968200684, "learning_rate": 4.9959687441918015e-06, "loss": 1.5574, "step": 16745 }, { "epoch": 0.0181910395826161, "grad_norm": 19.693012237548828, "learning_rate": 4.995966322577672e-06, "loss": 1.8323, "step": 16750 }, { "epoch": 0.018196469743685537, "grad_norm": 22.29275894165039, "learning_rate": 4.995963900237004e-06, "loss": 1.4361, "step": 16755 }, { "epoch": 0.018201899904754976, "grad_norm": 44.31939697265625, "learning_rate": 4.9959614771698005e-06, "loss": 1.6945, "step": 16760 }, { "epoch": 0.018207330065824413, "grad_norm": 23.586305618286133, "learning_rate": 4.9959590533760595e-06, "loss": 1.3713, "step": 16765 }, { "epoch": 0.01821276022689385, "grad_norm": 45.2210693359375, "learning_rate": 4.995956628855783e-06, "loss": 1.8546, "step": 16770 }, { "epoch": 0.01821819038796329, "grad_norm": 63.15631866455078, "learning_rate": 4.995954203608971e-06, "loss": 1.6593, "step": 16775 }, { "epoch": 0.018223620549032725, "grad_norm": 22.70370101928711, "learning_rate": 4.9959517776356245e-06, "loss": 1.9747, "step": 16780 }, { "epoch": 0.018229050710102165, "grad_norm": 38.26826095581055, "learning_rate": 4.995949350935746e-06, "loss": 1.6529, "step": 16785 }, { "epoch": 0.0182344808711716, "grad_norm": 33.15584945678711, "learning_rate": 4.9959469235093336e-06, "loss": 1.0094, "step": 16790 }, { "epoch": 0.018239911032241037, "grad_norm": 40.05393600463867, "learning_rate": 4.995944495356388e-06, "loss": 1.5434, "step": 16795 }, { "epoch": 0.018245341193310477, "grad_norm": 25.648527145385742, "learning_rate": 4.995942066476913e-06, "loss": 1.7231, "step": 16800 }, { "epoch": 0.018250771354379913, "grad_norm": 36.40654373168945, "learning_rate": 4.995939636870906e-06, "loss": 1.3853, "step": 16805 }, { "epoch": 0.01825620151544935, "grad_norm": 22.796049118041992, "learning_rate": 4.9959372065383694e-06, "loss": 1.995, "step": 16810 }, { "epoch": 0.01826163167651879, "grad_norm": 12.148756980895996, "learning_rate": 4.995934775479303e-06, "loss": 1.3111, "step": 16815 }, { "epoch": 0.018267061837588226, "grad_norm": 14.452869415283203, "learning_rate": 4.995932343693708e-06, "loss": 2.5109, "step": 16820 }, { "epoch": 0.018272491998657665, "grad_norm": 25.258474349975586, "learning_rate": 4.995929911181587e-06, "loss": 1.1035, "step": 16825 }, { "epoch": 0.0182779221597271, "grad_norm": 43.40121078491211, "learning_rate": 4.995927477942936e-06, "loss": 0.9968, "step": 16830 }, { "epoch": 0.018283352320796538, "grad_norm": 29.9730167388916, "learning_rate": 4.99592504397776e-06, "loss": 1.656, "step": 16835 }, { "epoch": 0.018288782481865978, "grad_norm": 16.75084686279297, "learning_rate": 4.995922609286057e-06, "loss": 1.5134, "step": 16840 }, { "epoch": 0.018294212642935414, "grad_norm": 25.291208267211914, "learning_rate": 4.995920173867829e-06, "loss": 1.397, "step": 16845 }, { "epoch": 0.018299642804004854, "grad_norm": 14.375666618347168, "learning_rate": 4.995917737723078e-06, "loss": 1.855, "step": 16850 }, { "epoch": 0.01830507296507429, "grad_norm": 17.60347557067871, "learning_rate": 4.9959153008518025e-06, "loss": 1.6022, "step": 16855 }, { "epoch": 0.018310503126143726, "grad_norm": 23.195354461669922, "learning_rate": 4.9959128632540045e-06, "loss": 1.6947, "step": 16860 }, { "epoch": 0.018315933287213166, "grad_norm": 19.26018524169922, "learning_rate": 4.995910424929684e-06, "loss": 2.2351, "step": 16865 }, { "epoch": 0.018321363448282602, "grad_norm": 69.09245300292969, "learning_rate": 4.995907985878842e-06, "loss": 1.3495, "step": 16870 }, { "epoch": 0.018326793609352042, "grad_norm": 28.730356216430664, "learning_rate": 4.995905546101478e-06, "loss": 1.0957, "step": 16875 }, { "epoch": 0.01833222377042148, "grad_norm": 15.667546272277832, "learning_rate": 4.995903105597596e-06, "loss": 0.8447, "step": 16880 }, { "epoch": 0.018337653931490915, "grad_norm": 33.14296340942383, "learning_rate": 4.995900664367193e-06, "loss": 1.5103, "step": 16885 }, { "epoch": 0.018343084092560354, "grad_norm": 35.44947814941406, "learning_rate": 4.995898222410272e-06, "loss": 1.2052, "step": 16890 }, { "epoch": 0.01834851425362979, "grad_norm": 30.0620174407959, "learning_rate": 4.995895779726833e-06, "loss": 1.7451, "step": 16895 }, { "epoch": 0.01835394441469923, "grad_norm": 23.71302604675293, "learning_rate": 4.995893336316876e-06, "loss": 1.3867, "step": 16900 }, { "epoch": 0.018359374575768667, "grad_norm": 155.03639221191406, "learning_rate": 4.995890892180403e-06, "loss": 1.3918, "step": 16905 }, { "epoch": 0.018364804736838103, "grad_norm": 70.8016586303711, "learning_rate": 4.995888447317414e-06, "loss": 3.4067, "step": 16910 }, { "epoch": 0.018370234897907543, "grad_norm": 16.5734920501709, "learning_rate": 4.99588600172791e-06, "loss": 1.6714, "step": 16915 }, { "epoch": 0.01837566505897698, "grad_norm": 14.987817764282227, "learning_rate": 4.9958835554118914e-06, "loss": 1.6991, "step": 16920 }, { "epoch": 0.01838109522004642, "grad_norm": 31.52494239807129, "learning_rate": 4.9958811083693584e-06, "loss": 1.6643, "step": 16925 }, { "epoch": 0.018386525381115855, "grad_norm": 14.353504180908203, "learning_rate": 4.9958786606003126e-06, "loss": 1.9132, "step": 16930 }, { "epoch": 0.01839195554218529, "grad_norm": 26.50870132446289, "learning_rate": 4.9958762121047555e-06, "loss": 1.5553, "step": 16935 }, { "epoch": 0.01839738570325473, "grad_norm": 25.96765899658203, "learning_rate": 4.9958737628826865e-06, "loss": 2.0278, "step": 16940 }, { "epoch": 0.018402815864324167, "grad_norm": 61.72339630126953, "learning_rate": 4.995871312934106e-06, "loss": 1.0486, "step": 16945 }, { "epoch": 0.018408246025393604, "grad_norm": 93.15813446044922, "learning_rate": 4.995868862259016e-06, "loss": 1.7426, "step": 16950 }, { "epoch": 0.018413676186463043, "grad_norm": 22.122051239013672, "learning_rate": 4.9958664108574165e-06, "loss": 1.5155, "step": 16955 }, { "epoch": 0.01841910634753248, "grad_norm": 57.17677307128906, "learning_rate": 4.995863958729308e-06, "loss": 1.8262, "step": 16960 }, { "epoch": 0.01842453650860192, "grad_norm": 18.76450538635254, "learning_rate": 4.9958615058746914e-06, "loss": 1.9502, "step": 16965 }, { "epoch": 0.018429966669671356, "grad_norm": 16.61404037475586, "learning_rate": 4.995859052293568e-06, "loss": 1.715, "step": 16970 }, { "epoch": 0.018435396830740792, "grad_norm": 13.809488296508789, "learning_rate": 4.995856597985938e-06, "loss": 1.5517, "step": 16975 }, { "epoch": 0.01844082699181023, "grad_norm": 25.925315856933594, "learning_rate": 4.9958541429518005e-06, "loss": 1.6287, "step": 16980 }, { "epoch": 0.018446257152879668, "grad_norm": 54.19487762451172, "learning_rate": 4.99585168719116e-06, "loss": 1.5408, "step": 16985 }, { "epoch": 0.018451687313949108, "grad_norm": 94.2833251953125, "learning_rate": 4.995849230704014e-06, "loss": 1.5691, "step": 16990 }, { "epoch": 0.018457117475018544, "grad_norm": 42.53447341918945, "learning_rate": 4.995846773490364e-06, "loss": 1.4496, "step": 16995 }, { "epoch": 0.01846254763608798, "grad_norm": 22.427183151245117, "learning_rate": 4.995844315550211e-06, "loss": 1.3907, "step": 17000 }, { "epoch": 0.01846797779715742, "grad_norm": 51.67694091796875, "learning_rate": 4.995841856883557e-06, "loss": 1.4729, "step": 17005 }, { "epoch": 0.018473407958226856, "grad_norm": 18.8845157623291, "learning_rate": 4.995839397490399e-06, "loss": 1.517, "step": 17010 }, { "epoch": 0.018478838119296296, "grad_norm": 20.089698791503906, "learning_rate": 4.995836937370743e-06, "loss": 1.653, "step": 17015 }, { "epoch": 0.018484268280365732, "grad_norm": 16.62115478515625, "learning_rate": 4.995834476524585e-06, "loss": 1.3115, "step": 17020 }, { "epoch": 0.01848969844143517, "grad_norm": 40.59888458251953, "learning_rate": 4.995832014951928e-06, "loss": 1.9799, "step": 17025 }, { "epoch": 0.01849512860250461, "grad_norm": 27.982492446899414, "learning_rate": 4.995829552652772e-06, "loss": 1.3977, "step": 17030 }, { "epoch": 0.018500558763574045, "grad_norm": 60.60350799560547, "learning_rate": 4.995827089627119e-06, "loss": 1.8734, "step": 17035 }, { "epoch": 0.018505988924643484, "grad_norm": 29.004505157470703, "learning_rate": 4.995824625874968e-06, "loss": 1.7712, "step": 17040 }, { "epoch": 0.01851141908571292, "grad_norm": 39.0101432800293, "learning_rate": 4.99582216139632e-06, "loss": 2.4952, "step": 17045 }, { "epoch": 0.018516849246782357, "grad_norm": 19.172712326049805, "learning_rate": 4.995819696191176e-06, "loss": 1.6047, "step": 17050 }, { "epoch": 0.018522279407851797, "grad_norm": 37.37424087524414, "learning_rate": 4.995817230259537e-06, "loss": 1.9728, "step": 17055 }, { "epoch": 0.018527709568921233, "grad_norm": 34.46635818481445, "learning_rate": 4.995814763601404e-06, "loss": 1.1788, "step": 17060 }, { "epoch": 0.018533139729990673, "grad_norm": 38.907493591308594, "learning_rate": 4.995812296216778e-06, "loss": 1.86, "step": 17065 }, { "epoch": 0.01853856989106011, "grad_norm": 19.938207626342773, "learning_rate": 4.995809828105657e-06, "loss": 1.4286, "step": 17070 }, { "epoch": 0.018544000052129545, "grad_norm": 20.544858932495117, "learning_rate": 4.995807359268046e-06, "loss": 1.4992, "step": 17075 }, { "epoch": 0.018549430213198985, "grad_norm": 27.295228958129883, "learning_rate": 4.995804889703943e-06, "loss": 1.6445, "step": 17080 }, { "epoch": 0.01855486037426842, "grad_norm": 16.673320770263672, "learning_rate": 4.995802419413348e-06, "loss": 1.8687, "step": 17085 }, { "epoch": 0.018560290535337857, "grad_norm": 36.54752731323242, "learning_rate": 4.995799948396264e-06, "loss": 1.5534, "step": 17090 }, { "epoch": 0.018565720696407297, "grad_norm": 21.16550636291504, "learning_rate": 4.99579747665269e-06, "loss": 1.6068, "step": 17095 }, { "epoch": 0.018571150857476734, "grad_norm": 48.16471481323242, "learning_rate": 4.995795004182627e-06, "loss": 1.7604, "step": 17100 }, { "epoch": 0.018576581018546173, "grad_norm": 158.54623413085938, "learning_rate": 4.995792530986076e-06, "loss": 1.2082, "step": 17105 }, { "epoch": 0.01858201117961561, "grad_norm": 41.97954559326172, "learning_rate": 4.995790057063039e-06, "loss": 1.5677, "step": 17110 }, { "epoch": 0.018587441340685046, "grad_norm": 24.953746795654297, "learning_rate": 4.995787582413515e-06, "loss": 1.7016, "step": 17115 }, { "epoch": 0.018592871501754486, "grad_norm": 51.997188568115234, "learning_rate": 4.995785107037505e-06, "loss": 2.0678, "step": 17120 }, { "epoch": 0.018598301662823922, "grad_norm": 17.637792587280273, "learning_rate": 4.99578263093501e-06, "loss": 1.6625, "step": 17125 }, { "epoch": 0.01860373182389336, "grad_norm": 41.81455993652344, "learning_rate": 4.99578015410603e-06, "loss": 1.877, "step": 17130 }, { "epoch": 0.018609161984962798, "grad_norm": 22.010183334350586, "learning_rate": 4.995777676550568e-06, "loss": 2.2608, "step": 17135 }, { "epoch": 0.018614592146032234, "grad_norm": 19.521739959716797, "learning_rate": 4.995775198268621e-06, "loss": 1.6929, "step": 17140 }, { "epoch": 0.018620022307101674, "grad_norm": 23.893156051635742, "learning_rate": 4.995772719260193e-06, "loss": 1.6129, "step": 17145 }, { "epoch": 0.01862545246817111, "grad_norm": 21.883464813232422, "learning_rate": 4.995770239525283e-06, "loss": 1.7996, "step": 17150 }, { "epoch": 0.01863088262924055, "grad_norm": 18.127424240112305, "learning_rate": 4.995767759063893e-06, "loss": 1.4086, "step": 17155 }, { "epoch": 0.018636312790309986, "grad_norm": 26.74381446838379, "learning_rate": 4.9957652778760225e-06, "loss": 1.7727, "step": 17160 }, { "epoch": 0.018641742951379422, "grad_norm": 39.52963638305664, "learning_rate": 4.995762795961672e-06, "loss": 2.0289, "step": 17165 }, { "epoch": 0.018647173112448862, "grad_norm": 40.64313507080078, "learning_rate": 4.995760313320844e-06, "loss": 1.2298, "step": 17170 }, { "epoch": 0.0186526032735183, "grad_norm": 28.692049026489258, "learning_rate": 4.995757829953538e-06, "loss": 1.4795, "step": 17175 }, { "epoch": 0.01865803343458774, "grad_norm": 19.321680068969727, "learning_rate": 4.995755345859754e-06, "loss": 2.4496, "step": 17180 }, { "epoch": 0.018663463595657175, "grad_norm": 34.5648307800293, "learning_rate": 4.995752861039494e-06, "loss": 1.5773, "step": 17185 }, { "epoch": 0.01866889375672661, "grad_norm": 16.253692626953125, "learning_rate": 4.9957503754927595e-06, "loss": 1.9144, "step": 17190 }, { "epoch": 0.01867432391779605, "grad_norm": 16.126873016357422, "learning_rate": 4.995747889219549e-06, "loss": 1.2598, "step": 17195 }, { "epoch": 0.018679754078865487, "grad_norm": 24.342269897460938, "learning_rate": 4.995745402219865e-06, "loss": 1.436, "step": 17200 }, { "epoch": 0.018685184239934927, "grad_norm": 96.3156967163086, "learning_rate": 4.995742914493707e-06, "loss": 2.0576, "step": 17205 }, { "epoch": 0.018690614401004363, "grad_norm": 14.080514907836914, "learning_rate": 4.995740426041075e-06, "loss": 2.1033, "step": 17210 }, { "epoch": 0.0186960445620738, "grad_norm": 30.12204360961914, "learning_rate": 4.995737936861973e-06, "loss": 1.6665, "step": 17215 }, { "epoch": 0.01870147472314324, "grad_norm": 15.255182266235352, "learning_rate": 4.995735446956399e-06, "loss": 1.4009, "step": 17220 }, { "epoch": 0.018706904884212675, "grad_norm": 22.44013023376465, "learning_rate": 4.9957329563243535e-06, "loss": 1.6636, "step": 17225 }, { "epoch": 0.01871233504528211, "grad_norm": 35.47859573364258, "learning_rate": 4.995730464965839e-06, "loss": 1.5583, "step": 17230 }, { "epoch": 0.01871776520635155, "grad_norm": 11.168428421020508, "learning_rate": 4.995727972880855e-06, "loss": 1.6072, "step": 17235 }, { "epoch": 0.018723195367420988, "grad_norm": 20.96849250793457, "learning_rate": 4.995725480069402e-06, "loss": 1.9966, "step": 17240 }, { "epoch": 0.018728625528490427, "grad_norm": 32.186649322509766, "learning_rate": 4.995722986531482e-06, "loss": 1.4154, "step": 17245 }, { "epoch": 0.018734055689559864, "grad_norm": 17.036882400512695, "learning_rate": 4.9957204922670946e-06, "loss": 1.4466, "step": 17250 }, { "epoch": 0.0187394858506293, "grad_norm": 25.079683303833008, "learning_rate": 4.995717997276241e-06, "loss": 1.8721, "step": 17255 }, { "epoch": 0.01874491601169874, "grad_norm": 16.984943389892578, "learning_rate": 4.995715501558923e-06, "loss": 1.7466, "step": 17260 }, { "epoch": 0.018750346172768176, "grad_norm": 21.595703125, "learning_rate": 4.995713005115139e-06, "loss": 1.4412, "step": 17265 }, { "epoch": 0.018755776333837616, "grad_norm": 53.31869125366211, "learning_rate": 4.995710507944891e-06, "loss": 2.3544, "step": 17270 }, { "epoch": 0.018761206494907052, "grad_norm": 339.8849792480469, "learning_rate": 4.9957080100481804e-06, "loss": 1.5725, "step": 17275 }, { "epoch": 0.018766636655976488, "grad_norm": 15.92650318145752, "learning_rate": 4.995705511425006e-06, "loss": 2.0145, "step": 17280 }, { "epoch": 0.018772066817045928, "grad_norm": 46.47712707519531, "learning_rate": 4.9957030120753705e-06, "loss": 1.692, "step": 17285 }, { "epoch": 0.018777496978115364, "grad_norm": 12.284225463867188, "learning_rate": 4.995700511999274e-06, "loss": 1.8585, "step": 17290 }, { "epoch": 0.018782927139184804, "grad_norm": 16.603214263916016, "learning_rate": 4.995698011196717e-06, "loss": 1.6682, "step": 17295 }, { "epoch": 0.01878835730025424, "grad_norm": 150.37954711914062, "learning_rate": 4.9956955096677e-06, "loss": 1.6227, "step": 17300 }, { "epoch": 0.018793787461323676, "grad_norm": 44.208091735839844, "learning_rate": 4.995693007412223e-06, "loss": 1.6189, "step": 17305 }, { "epoch": 0.018799217622393116, "grad_norm": 30.864229202270508, "learning_rate": 4.995690504430289e-06, "loss": 2.514, "step": 17310 }, { "epoch": 0.018804647783462553, "grad_norm": 14.259963989257812, "learning_rate": 4.995688000721897e-06, "loss": 2.0349, "step": 17315 }, { "epoch": 0.018810077944531992, "grad_norm": 16.815462112426758, "learning_rate": 4.995685496287049e-06, "loss": 1.1917, "step": 17320 }, { "epoch": 0.01881550810560143, "grad_norm": 24.298049926757812, "learning_rate": 4.995682991125744e-06, "loss": 1.1985, "step": 17325 }, { "epoch": 0.018820938266670865, "grad_norm": 36.219940185546875, "learning_rate": 4.995680485237984e-06, "loss": 2.3678, "step": 17330 }, { "epoch": 0.018826368427740305, "grad_norm": 19.503173828125, "learning_rate": 4.995677978623769e-06, "loss": 2.0468, "step": 17335 }, { "epoch": 0.01883179858880974, "grad_norm": 45.02452850341797, "learning_rate": 4.9956754712831e-06, "loss": 1.7482, "step": 17340 }, { "epoch": 0.01883722874987918, "grad_norm": 19.462539672851562, "learning_rate": 4.995672963215979e-06, "loss": 2.1802, "step": 17345 }, { "epoch": 0.018842658910948617, "grad_norm": 20.911258697509766, "learning_rate": 4.995670454422404e-06, "loss": 1.2618, "step": 17350 }, { "epoch": 0.018848089072018053, "grad_norm": 14.116644859313965, "learning_rate": 4.995667944902379e-06, "loss": 1.5334, "step": 17355 }, { "epoch": 0.018853519233087493, "grad_norm": 33.77659606933594, "learning_rate": 4.995665434655902e-06, "loss": 1.2969, "step": 17360 }, { "epoch": 0.01885894939415693, "grad_norm": 45.865421295166016, "learning_rate": 4.995662923682974e-06, "loss": 1.6259, "step": 17365 }, { "epoch": 0.018864379555226365, "grad_norm": 17.028318405151367, "learning_rate": 4.995660411983598e-06, "loss": 1.7867, "step": 17370 }, { "epoch": 0.018869809716295805, "grad_norm": 22.684974670410156, "learning_rate": 4.995657899557773e-06, "loss": 2.1369, "step": 17375 }, { "epoch": 0.01887523987736524, "grad_norm": 12.7839994430542, "learning_rate": 4.995655386405499e-06, "loss": 1.3533, "step": 17380 }, { "epoch": 0.01888067003843468, "grad_norm": 18.848726272583008, "learning_rate": 4.995652872526779e-06, "loss": 1.5445, "step": 17385 }, { "epoch": 0.018886100199504118, "grad_norm": 21.010112762451172, "learning_rate": 4.995650357921611e-06, "loss": 2.0551, "step": 17390 }, { "epoch": 0.018891530360573554, "grad_norm": 96.22025299072266, "learning_rate": 4.995647842589999e-06, "loss": 1.629, "step": 17395 }, { "epoch": 0.018896960521642994, "grad_norm": 16.562393188476562, "learning_rate": 4.9956453265319405e-06, "loss": 1.6225, "step": 17400 }, { "epoch": 0.01890239068271243, "grad_norm": 130.01593017578125, "learning_rate": 4.995642809747438e-06, "loss": 1.4682, "step": 17405 }, { "epoch": 0.01890782084378187, "grad_norm": 21.990840911865234, "learning_rate": 4.995640292236491e-06, "loss": 2.4182, "step": 17410 }, { "epoch": 0.018913251004851306, "grad_norm": 30.93906021118164, "learning_rate": 4.995637773999102e-06, "loss": 1.2626, "step": 17415 }, { "epoch": 0.018918681165920742, "grad_norm": 22.861997604370117, "learning_rate": 4.99563525503527e-06, "loss": 1.3965, "step": 17420 }, { "epoch": 0.018924111326990182, "grad_norm": 74.12432861328125, "learning_rate": 4.995632735344998e-06, "loss": 1.9003, "step": 17425 }, { "epoch": 0.018929541488059618, "grad_norm": 30.24619483947754, "learning_rate": 4.995630214928284e-06, "loss": 1.0287, "step": 17430 }, { "epoch": 0.018934971649129058, "grad_norm": 26.930940628051758, "learning_rate": 4.995627693785131e-06, "loss": 1.9966, "step": 17435 }, { "epoch": 0.018940401810198494, "grad_norm": 12.423486709594727, "learning_rate": 4.995625171915538e-06, "loss": 1.4858, "step": 17440 }, { "epoch": 0.01894583197126793, "grad_norm": 16.05397605895996, "learning_rate": 4.995622649319507e-06, "loss": 1.7845, "step": 17445 }, { "epoch": 0.01895126213233737, "grad_norm": 81.04167938232422, "learning_rate": 4.9956201259970375e-06, "loss": 1.6413, "step": 17450 }, { "epoch": 0.018956692293406806, "grad_norm": 13.75011920928955, "learning_rate": 4.995617601948132e-06, "loss": 1.9764, "step": 17455 }, { "epoch": 0.018962122454476246, "grad_norm": 17.400283813476562, "learning_rate": 4.995615077172789e-06, "loss": 1.6941, "step": 17460 }, { "epoch": 0.018967552615545683, "grad_norm": 32.3177604675293, "learning_rate": 4.995612551671011e-06, "loss": 1.5326, "step": 17465 }, { "epoch": 0.01897298277661512, "grad_norm": 23.74802017211914, "learning_rate": 4.995610025442799e-06, "loss": 1.6417, "step": 17470 }, { "epoch": 0.01897841293768456, "grad_norm": 26.51461410522461, "learning_rate": 4.995607498488151e-06, "loss": 1.4339, "step": 17475 }, { "epoch": 0.018983843098753995, "grad_norm": 122.57401275634766, "learning_rate": 4.995604970807071e-06, "loss": 1.3679, "step": 17480 }, { "epoch": 0.018989273259823435, "grad_norm": 24.30470085144043, "learning_rate": 4.9956024423995576e-06, "loss": 1.4784, "step": 17485 }, { "epoch": 0.01899470342089287, "grad_norm": 28.333545684814453, "learning_rate": 4.995599913265613e-06, "loss": 1.295, "step": 17490 }, { "epoch": 0.019000133581962307, "grad_norm": 25.769941329956055, "learning_rate": 4.995597383405237e-06, "loss": 1.6894, "step": 17495 }, { "epoch": 0.019005563743031747, "grad_norm": 49.951778411865234, "learning_rate": 4.995594852818431e-06, "loss": 1.5762, "step": 17500 }, { "epoch": 0.019010993904101183, "grad_norm": 76.79054260253906, "learning_rate": 4.995592321505195e-06, "loss": 1.4613, "step": 17505 }, { "epoch": 0.01901642406517062, "grad_norm": 13.035563468933105, "learning_rate": 4.9955897894655295e-06, "loss": 2.1527, "step": 17510 }, { "epoch": 0.01902185422624006, "grad_norm": 26.099390029907227, "learning_rate": 4.995587256699435e-06, "loss": 1.5853, "step": 17515 }, { "epoch": 0.019027284387309495, "grad_norm": 97.1773910522461, "learning_rate": 4.9955847232069156e-06, "loss": 1.2485, "step": 17520 }, { "epoch": 0.019032714548378935, "grad_norm": 31.17519760131836, "learning_rate": 4.995582188987968e-06, "loss": 1.5227, "step": 17525 }, { "epoch": 0.01903814470944837, "grad_norm": 20.355987548828125, "learning_rate": 4.995579654042594e-06, "loss": 1.8956, "step": 17530 }, { "epoch": 0.019043574870517808, "grad_norm": 60.516361236572266, "learning_rate": 4.995577118370795e-06, "loss": 2.1366, "step": 17535 }, { "epoch": 0.019049005031587248, "grad_norm": 29.792287826538086, "learning_rate": 4.995574581972572e-06, "loss": 1.7435, "step": 17540 }, { "epoch": 0.019054435192656684, "grad_norm": 189.8100128173828, "learning_rate": 4.995572044847925e-06, "loss": 1.8864, "step": 17545 }, { "epoch": 0.019059865353726124, "grad_norm": 31.95005989074707, "learning_rate": 4.995569506996855e-06, "loss": 1.7074, "step": 17550 }, { "epoch": 0.01906529551479556, "grad_norm": 145.2863006591797, "learning_rate": 4.995566968419362e-06, "loss": 2.0093, "step": 17555 }, { "epoch": 0.019070725675864996, "grad_norm": 19.091751098632812, "learning_rate": 4.995564429115448e-06, "loss": 1.7392, "step": 17560 }, { "epoch": 0.019076155836934436, "grad_norm": 39.19459915161133, "learning_rate": 4.995561889085113e-06, "loss": 1.8779, "step": 17565 }, { "epoch": 0.019081585998003872, "grad_norm": 17.150409698486328, "learning_rate": 4.995559348328358e-06, "loss": 1.4654, "step": 17570 }, { "epoch": 0.019087016159073312, "grad_norm": 44.28902053833008, "learning_rate": 4.995556806845183e-06, "loss": 2.1594, "step": 17575 }, { "epoch": 0.019092446320142748, "grad_norm": 16.69728660583496, "learning_rate": 4.99555426463559e-06, "loss": 2.0536, "step": 17580 }, { "epoch": 0.019097876481212184, "grad_norm": 169.7117156982422, "learning_rate": 4.9955517216995795e-06, "loss": 1.6087, "step": 17585 }, { "epoch": 0.019103306642281624, "grad_norm": 26.30014419555664, "learning_rate": 4.995549178037151e-06, "loss": 1.3632, "step": 17590 }, { "epoch": 0.01910873680335106, "grad_norm": 25.31925392150879, "learning_rate": 4.995546633648307e-06, "loss": 1.138, "step": 17595 }, { "epoch": 0.0191141669644205, "grad_norm": 19.69489097595215, "learning_rate": 4.995544088533047e-06, "loss": 1.1443, "step": 17600 }, { "epoch": 0.019119597125489936, "grad_norm": 22.4984130859375, "learning_rate": 4.995541542691372e-06, "loss": 1.7224, "step": 17605 }, { "epoch": 0.019125027286559373, "grad_norm": 180.56585693359375, "learning_rate": 4.9955389961232825e-06, "loss": 1.5735, "step": 17610 }, { "epoch": 0.019130457447628813, "grad_norm": 22.48917007446289, "learning_rate": 4.99553644882878e-06, "loss": 0.9895, "step": 17615 }, { "epoch": 0.01913588760869825, "grad_norm": 21.189077377319336, "learning_rate": 4.995533900807865e-06, "loss": 1.6239, "step": 17620 }, { "epoch": 0.01914131776976769, "grad_norm": 18.884502410888672, "learning_rate": 4.9955313520605375e-06, "loss": 1.6487, "step": 17625 }, { "epoch": 0.019146747930837125, "grad_norm": 24.56068229675293, "learning_rate": 4.995528802586799e-06, "loss": 1.8936, "step": 17630 }, { "epoch": 0.01915217809190656, "grad_norm": 20.0670108795166, "learning_rate": 4.995526252386651e-06, "loss": 1.4912, "step": 17635 }, { "epoch": 0.019157608252976, "grad_norm": 22.786218643188477, "learning_rate": 4.995523701460092e-06, "loss": 1.4839, "step": 17640 }, { "epoch": 0.019163038414045437, "grad_norm": 30.225107192993164, "learning_rate": 4.995521149807124e-06, "loss": 1.385, "step": 17645 }, { "epoch": 0.019168468575114873, "grad_norm": 23.983604431152344, "learning_rate": 4.9955185974277485e-06, "loss": 1.3358, "step": 17650 }, { "epoch": 0.019173898736184313, "grad_norm": 19.766761779785156, "learning_rate": 4.995516044321965e-06, "loss": 1.8143, "step": 17655 }, { "epoch": 0.01917932889725375, "grad_norm": 166.088134765625, "learning_rate": 4.995513490489775e-06, "loss": 1.6163, "step": 17660 }, { "epoch": 0.01918475905832319, "grad_norm": 15.676535606384277, "learning_rate": 4.995510935931179e-06, "loss": 1.7087, "step": 17665 }, { "epoch": 0.019190189219392625, "grad_norm": 21.43535614013672, "learning_rate": 4.995508380646177e-06, "loss": 1.7013, "step": 17670 }, { "epoch": 0.01919561938046206, "grad_norm": 26.130067825317383, "learning_rate": 4.995505824634771e-06, "loss": 1.1317, "step": 17675 }, { "epoch": 0.0192010495415315, "grad_norm": 151.58062744140625, "learning_rate": 4.995503267896962e-06, "loss": 2.6866, "step": 17680 }, { "epoch": 0.019206479702600938, "grad_norm": 25.691951751708984, "learning_rate": 4.995500710432749e-06, "loss": 1.3876, "step": 17685 }, { "epoch": 0.019211909863670378, "grad_norm": 16.3123779296875, "learning_rate": 4.9954981522421345e-06, "loss": 2.6346, "step": 17690 }, { "epoch": 0.019217340024739814, "grad_norm": 21.62204933166504, "learning_rate": 4.995495593325118e-06, "loss": 1.4641, "step": 17695 }, { "epoch": 0.01922277018580925, "grad_norm": 21.266902923583984, "learning_rate": 4.9954930336817e-06, "loss": 1.5375, "step": 17700 }, { "epoch": 0.01922820034687869, "grad_norm": 37.55168151855469, "learning_rate": 4.995490473311882e-06, "loss": 1.694, "step": 17705 }, { "epoch": 0.019233630507948126, "grad_norm": 20.467008590698242, "learning_rate": 4.9954879122156665e-06, "loss": 1.7122, "step": 17710 }, { "epoch": 0.019239060669017566, "grad_norm": 32.35834884643555, "learning_rate": 4.9954853503930515e-06, "loss": 1.6933, "step": 17715 }, { "epoch": 0.019244490830087002, "grad_norm": 21.648571014404297, "learning_rate": 4.995482787844038e-06, "loss": 1.8014, "step": 17720 }, { "epoch": 0.01924992099115644, "grad_norm": 13.850282669067383, "learning_rate": 4.995480224568628e-06, "loss": 1.6569, "step": 17725 }, { "epoch": 0.019255351152225878, "grad_norm": 40.24352264404297, "learning_rate": 4.995477660566821e-06, "loss": 1.3669, "step": 17730 }, { "epoch": 0.019260781313295314, "grad_norm": 14.624213218688965, "learning_rate": 4.995475095838619e-06, "loss": 1.5486, "step": 17735 }, { "epoch": 0.019266211474364754, "grad_norm": 20.220317840576172, "learning_rate": 4.995472530384023e-06, "loss": 1.342, "step": 17740 }, { "epoch": 0.01927164163543419, "grad_norm": 21.790246963500977, "learning_rate": 4.995469964203031e-06, "loss": 1.5571, "step": 17745 }, { "epoch": 0.019277071796503627, "grad_norm": 22.95465087890625, "learning_rate": 4.995467397295647e-06, "loss": 1.42, "step": 17750 }, { "epoch": 0.019282501957573066, "grad_norm": 18.354843139648438, "learning_rate": 4.99546482966187e-06, "loss": 1.5156, "step": 17755 }, { "epoch": 0.019287932118642503, "grad_norm": 54.997581481933594, "learning_rate": 4.995462261301701e-06, "loss": 1.6042, "step": 17760 }, { "epoch": 0.019293362279711943, "grad_norm": 94.69255065917969, "learning_rate": 4.995459692215142e-06, "loss": 1.2778, "step": 17765 }, { "epoch": 0.01929879244078138, "grad_norm": 17.32315444946289, "learning_rate": 4.9954571224021915e-06, "loss": 1.8184, "step": 17770 }, { "epoch": 0.019304222601850815, "grad_norm": 16.023033142089844, "learning_rate": 4.995454551862852e-06, "loss": 1.7563, "step": 17775 }, { "epoch": 0.019309652762920255, "grad_norm": 125.30899047851562, "learning_rate": 4.995451980597123e-06, "loss": 1.5136, "step": 17780 }, { "epoch": 0.01931508292398969, "grad_norm": 17.63258171081543, "learning_rate": 4.995449408605006e-06, "loss": 1.2368, "step": 17785 }, { "epoch": 0.019320513085059127, "grad_norm": 13.3632173538208, "learning_rate": 4.995446835886502e-06, "loss": 1.8144, "step": 17790 }, { "epoch": 0.019325943246128567, "grad_norm": 11.4785737991333, "learning_rate": 4.995444262441612e-06, "loss": 1.5497, "step": 17795 }, { "epoch": 0.019331373407198003, "grad_norm": 38.25003433227539, "learning_rate": 4.995441688270335e-06, "loss": 1.8934, "step": 17800 }, { "epoch": 0.019336803568267443, "grad_norm": 119.57943725585938, "learning_rate": 4.995439113372673e-06, "loss": 1.7321, "step": 17805 }, { "epoch": 0.01934223372933688, "grad_norm": 17.435504913330078, "learning_rate": 4.995436537748627e-06, "loss": 1.7909, "step": 17810 }, { "epoch": 0.019347663890406316, "grad_norm": 23.24427032470703, "learning_rate": 4.995433961398197e-06, "loss": 1.9876, "step": 17815 }, { "epoch": 0.019353094051475755, "grad_norm": 23.20606803894043, "learning_rate": 4.9954313843213855e-06, "loss": 1.774, "step": 17820 }, { "epoch": 0.01935852421254519, "grad_norm": 55.00552749633789, "learning_rate": 4.99542880651819e-06, "loss": 1.5648, "step": 17825 }, { "epoch": 0.01936395437361463, "grad_norm": 17.71833610534668, "learning_rate": 4.9954262279886145e-06, "loss": 2.1462, "step": 17830 }, { "epoch": 0.019369384534684068, "grad_norm": 13.272788047790527, "learning_rate": 4.995423648732658e-06, "loss": 1.0157, "step": 17835 }, { "epoch": 0.019374814695753504, "grad_norm": 31.242233276367188, "learning_rate": 4.9954210687503215e-06, "loss": 1.7368, "step": 17840 }, { "epoch": 0.019380244856822944, "grad_norm": 33.648094177246094, "learning_rate": 4.995418488041606e-06, "loss": 1.2875, "step": 17845 }, { "epoch": 0.01938567501789238, "grad_norm": 66.8342056274414, "learning_rate": 4.995415906606512e-06, "loss": 1.7764, "step": 17850 }, { "epoch": 0.01939110517896182, "grad_norm": 18.310993194580078, "learning_rate": 4.9954133244450405e-06, "loss": 1.1945, "step": 17855 }, { "epoch": 0.019396535340031256, "grad_norm": 14.247681617736816, "learning_rate": 4.995410741557192e-06, "loss": 2.0729, "step": 17860 }, { "epoch": 0.019401965501100692, "grad_norm": 47.14130783081055, "learning_rate": 4.995408157942968e-06, "loss": 1.2128, "step": 17865 }, { "epoch": 0.019407395662170132, "grad_norm": 45.185848236083984, "learning_rate": 4.995405573602369e-06, "loss": 1.5193, "step": 17870 }, { "epoch": 0.01941282582323957, "grad_norm": 22.570690155029297, "learning_rate": 4.995402988535394e-06, "loss": 1.2577, "step": 17875 }, { "epoch": 0.019418255984309008, "grad_norm": 20.41792869567871, "learning_rate": 4.995400402742046e-06, "loss": 1.877, "step": 17880 }, { "epoch": 0.019423686145378444, "grad_norm": 34.850250244140625, "learning_rate": 4.995397816222324e-06, "loss": 1.5171, "step": 17885 }, { "epoch": 0.01942911630644788, "grad_norm": 14.32577133178711, "learning_rate": 4.9953952289762306e-06, "loss": 1.5561, "step": 17890 }, { "epoch": 0.01943454646751732, "grad_norm": 35.7641716003418, "learning_rate": 4.9953926410037654e-06, "loss": 1.6524, "step": 17895 }, { "epoch": 0.019439976628586757, "grad_norm": 36.42069625854492, "learning_rate": 4.995390052304929e-06, "loss": 1.6916, "step": 17900 }, { "epoch": 0.019445406789656196, "grad_norm": 27.998828887939453, "learning_rate": 4.9953874628797234e-06, "loss": 1.6259, "step": 17905 }, { "epoch": 0.019450836950725633, "grad_norm": 61.153785705566406, "learning_rate": 4.9953848727281475e-06, "loss": 1.3577, "step": 17910 }, { "epoch": 0.01945626711179507, "grad_norm": 21.306957244873047, "learning_rate": 4.995382281850204e-06, "loss": 1.8729, "step": 17915 }, { "epoch": 0.01946169727286451, "grad_norm": 18.291044235229492, "learning_rate": 4.995379690245891e-06, "loss": 1.0468, "step": 17920 }, { "epoch": 0.019467127433933945, "grad_norm": 15.355956077575684, "learning_rate": 4.995377097915213e-06, "loss": 2.1342, "step": 17925 }, { "epoch": 0.01947255759500338, "grad_norm": 19.485198974609375, "learning_rate": 4.995374504858168e-06, "loss": 1.894, "step": 17930 }, { "epoch": 0.01947798775607282, "grad_norm": 15.835463523864746, "learning_rate": 4.9953719110747565e-06, "loss": 1.3869, "step": 17935 }, { "epoch": 0.019483417917142257, "grad_norm": 24.343061447143555, "learning_rate": 4.995369316564982e-06, "loss": 1.6702, "step": 17940 }, { "epoch": 0.019488848078211697, "grad_norm": 30.01667022705078, "learning_rate": 4.9953667213288414e-06, "loss": 1.4569, "step": 17945 }, { "epoch": 0.019494278239281133, "grad_norm": 25.276044845581055, "learning_rate": 4.995364125366339e-06, "loss": 1.4906, "step": 17950 }, { "epoch": 0.01949970840035057, "grad_norm": 14.32303524017334, "learning_rate": 4.995361528677474e-06, "loss": 1.7198, "step": 17955 }, { "epoch": 0.01950513856142001, "grad_norm": 15.034417152404785, "learning_rate": 4.995358931262246e-06, "loss": 1.6307, "step": 17960 }, { "epoch": 0.019510568722489446, "grad_norm": 39.53078079223633, "learning_rate": 4.995356333120658e-06, "loss": 1.4555, "step": 17965 }, { "epoch": 0.019515998883558885, "grad_norm": 55.32291030883789, "learning_rate": 4.9953537342527095e-06, "loss": 1.2414, "step": 17970 }, { "epoch": 0.019521429044628322, "grad_norm": 17.759654998779297, "learning_rate": 4.995351134658402e-06, "loss": 1.9738, "step": 17975 }, { "epoch": 0.019526859205697758, "grad_norm": 31.442291259765625, "learning_rate": 4.995348534337735e-06, "loss": 1.2722, "step": 17980 }, { "epoch": 0.019532289366767198, "grad_norm": 18.611690521240234, "learning_rate": 4.995345933290711e-06, "loss": 1.3826, "step": 17985 }, { "epoch": 0.019537719527836634, "grad_norm": 135.08837890625, "learning_rate": 4.995343331517328e-06, "loss": 1.4198, "step": 17990 }, { "epoch": 0.019543149688906074, "grad_norm": 10.388139724731445, "learning_rate": 4.99534072901759e-06, "loss": 1.0967, "step": 17995 }, { "epoch": 0.01954857984997551, "grad_norm": 27.370134353637695, "learning_rate": 4.995338125791496e-06, "loss": 1.5845, "step": 18000 }, { "epoch": 0.019554010011044946, "grad_norm": 121.13643646240234, "learning_rate": 4.995335521839046e-06, "loss": 1.7128, "step": 18005 }, { "epoch": 0.019559440172114386, "grad_norm": 23.070775985717773, "learning_rate": 4.9953329171602435e-06, "loss": 1.975, "step": 18010 }, { "epoch": 0.019564870333183822, "grad_norm": 86.8413314819336, "learning_rate": 4.9953303117550865e-06, "loss": 1.3584, "step": 18015 }, { "epoch": 0.019570300494253262, "grad_norm": 18.18549919128418, "learning_rate": 4.9953277056235775e-06, "loss": 1.6092, "step": 18020 }, { "epoch": 0.0195757306553227, "grad_norm": 20.376066207885742, "learning_rate": 4.995325098765716e-06, "loss": 1.1945, "step": 18025 }, { "epoch": 0.019581160816392135, "grad_norm": 72.8013687133789, "learning_rate": 4.995322491181504e-06, "loss": 1.7786, "step": 18030 }, { "epoch": 0.019586590977461574, "grad_norm": 51.38368225097656, "learning_rate": 4.995319882870941e-06, "loss": 2.237, "step": 18035 }, { "epoch": 0.01959202113853101, "grad_norm": 40.45314025878906, "learning_rate": 4.995317273834027e-06, "loss": 1.5833, "step": 18040 }, { "epoch": 0.01959745129960045, "grad_norm": 16.72550392150879, "learning_rate": 4.995314664070766e-06, "loss": 1.3393, "step": 18045 }, { "epoch": 0.019602881460669887, "grad_norm": 30.772884368896484, "learning_rate": 4.995312053581156e-06, "loss": 1.4299, "step": 18050 }, { "epoch": 0.019608311621739323, "grad_norm": 18.68768310546875, "learning_rate": 4.9953094423651995e-06, "loss": 1.7708, "step": 18055 }, { "epoch": 0.019613741782808763, "grad_norm": 18.461105346679688, "learning_rate": 4.9953068304228954e-06, "loss": 1.4218, "step": 18060 }, { "epoch": 0.0196191719438782, "grad_norm": 67.7723388671875, "learning_rate": 4.995304217754246e-06, "loss": 1.5669, "step": 18065 }, { "epoch": 0.019624602104947635, "grad_norm": 27.690288543701172, "learning_rate": 4.995301604359252e-06, "loss": 1.2739, "step": 18070 }, { "epoch": 0.019630032266017075, "grad_norm": 55.40298843383789, "learning_rate": 4.995298990237913e-06, "loss": 1.839, "step": 18075 }, { "epoch": 0.01963546242708651, "grad_norm": 34.277732849121094, "learning_rate": 4.995296375390231e-06, "loss": 2.1226, "step": 18080 }, { "epoch": 0.01964089258815595, "grad_norm": 28.143898010253906, "learning_rate": 4.995293759816205e-06, "loss": 1.9574, "step": 18085 }, { "epoch": 0.019646322749225387, "grad_norm": 20.225772857666016, "learning_rate": 4.995291143515838e-06, "loss": 2.073, "step": 18090 }, { "epoch": 0.019651752910294824, "grad_norm": 19.776771545410156, "learning_rate": 4.99528852648913e-06, "loss": 1.8001, "step": 18095 }, { "epoch": 0.019657183071364263, "grad_norm": 24.661842346191406, "learning_rate": 4.99528590873608e-06, "loss": 2.056, "step": 18100 }, { "epoch": 0.0196626132324337, "grad_norm": 12.587522506713867, "learning_rate": 4.995283290256691e-06, "loss": 1.4736, "step": 18105 }, { "epoch": 0.01966804339350314, "grad_norm": 18.332700729370117, "learning_rate": 4.9952806710509636e-06, "loss": 1.8692, "step": 18110 }, { "epoch": 0.019673473554572576, "grad_norm": 32.67190933227539, "learning_rate": 4.995278051118897e-06, "loss": 2.0094, "step": 18115 }, { "epoch": 0.019678903715642012, "grad_norm": 24.53140640258789, "learning_rate": 4.995275430460493e-06, "loss": 2.0285, "step": 18120 }, { "epoch": 0.019684333876711452, "grad_norm": 16.018753051757812, "learning_rate": 4.995272809075753e-06, "loss": 1.7107, "step": 18125 }, { "epoch": 0.019689764037780888, "grad_norm": 14.848108291625977, "learning_rate": 4.995270186964677e-06, "loss": 1.3812, "step": 18130 }, { "epoch": 0.019695194198850328, "grad_norm": 16.499481201171875, "learning_rate": 4.995267564127265e-06, "loss": 1.9619, "step": 18135 }, { "epoch": 0.019700624359919764, "grad_norm": 55.87539291381836, "learning_rate": 4.99526494056352e-06, "loss": 2.392, "step": 18140 }, { "epoch": 0.0197060545209892, "grad_norm": 40.56390380859375, "learning_rate": 4.9952623162734395e-06, "loss": 1.8517, "step": 18145 }, { "epoch": 0.01971148468205864, "grad_norm": 23.128543853759766, "learning_rate": 4.995259691257027e-06, "loss": 1.3121, "step": 18150 }, { "epoch": 0.019716914843128076, "grad_norm": 19.33599853515625, "learning_rate": 4.995257065514283e-06, "loss": 2.0172, "step": 18155 }, { "epoch": 0.019722345004197516, "grad_norm": 31.591609954833984, "learning_rate": 4.995254439045207e-06, "loss": 2.3187, "step": 18160 }, { "epoch": 0.019727775165266952, "grad_norm": 15.72990608215332, "learning_rate": 4.9952518118498e-06, "loss": 1.6863, "step": 18165 }, { "epoch": 0.01973320532633639, "grad_norm": 12.956934928894043, "learning_rate": 4.995249183928064e-06, "loss": 1.8324, "step": 18170 }, { "epoch": 0.01973863548740583, "grad_norm": 31.637897491455078, "learning_rate": 4.995246555279999e-06, "loss": 1.4518, "step": 18175 }, { "epoch": 0.019744065648475265, "grad_norm": 57.03280258178711, "learning_rate": 4.995243925905605e-06, "loss": 1.29, "step": 18180 }, { "epoch": 0.019749495809544704, "grad_norm": 46.18941116333008, "learning_rate": 4.995241295804883e-06, "loss": 1.4174, "step": 18185 }, { "epoch": 0.01975492597061414, "grad_norm": 17.720550537109375, "learning_rate": 4.995238664977835e-06, "loss": 1.4613, "step": 18190 }, { "epoch": 0.019760356131683577, "grad_norm": 26.912572860717773, "learning_rate": 4.995236033424461e-06, "loss": 0.8399, "step": 18195 }, { "epoch": 0.019765786292753017, "grad_norm": 31.837873458862305, "learning_rate": 4.9952334011447626e-06, "loss": 1.4349, "step": 18200 }, { "epoch": 0.019771216453822453, "grad_norm": 19.58806610107422, "learning_rate": 4.995230768138738e-06, "loss": 1.2125, "step": 18205 }, { "epoch": 0.01977664661489189, "grad_norm": 15.36837387084961, "learning_rate": 4.99522813440639e-06, "loss": 1.9208, "step": 18210 }, { "epoch": 0.01978207677596133, "grad_norm": 20.09476089477539, "learning_rate": 4.99522549994772e-06, "loss": 1.8501, "step": 18215 }, { "epoch": 0.019787506937030765, "grad_norm": 13.479281425476074, "learning_rate": 4.995222864762728e-06, "loss": 1.8708, "step": 18220 }, { "epoch": 0.019792937098100205, "grad_norm": 32.9139518737793, "learning_rate": 4.995220228851413e-06, "loss": 1.4841, "step": 18225 }, { "epoch": 0.01979836725916964, "grad_norm": 38.90848159790039, "learning_rate": 4.995217592213779e-06, "loss": 1.4893, "step": 18230 }, { "epoch": 0.019803797420239078, "grad_norm": 16.594619750976562, "learning_rate": 4.995214954849824e-06, "loss": 1.9976, "step": 18235 }, { "epoch": 0.019809227581308517, "grad_norm": 31.924219131469727, "learning_rate": 4.99521231675955e-06, "loss": 1.3113, "step": 18240 }, { "epoch": 0.019814657742377954, "grad_norm": 41.09795379638672, "learning_rate": 4.995209677942958e-06, "loss": 1.0241, "step": 18245 }, { "epoch": 0.019820087903447393, "grad_norm": 23.364734649658203, "learning_rate": 4.995207038400048e-06, "loss": 1.4546, "step": 18250 }, { "epoch": 0.01982551806451683, "grad_norm": 20.87929344177246, "learning_rate": 4.995204398130823e-06, "loss": 1.3271, "step": 18255 }, { "epoch": 0.019830948225586266, "grad_norm": 29.31467628479004, "learning_rate": 4.995201757135279e-06, "loss": 1.1953, "step": 18260 }, { "epoch": 0.019836378386655706, "grad_norm": 161.50311279296875, "learning_rate": 4.995199115413422e-06, "loss": 2.1785, "step": 18265 }, { "epoch": 0.019841808547725142, "grad_norm": 28.08157730102539, "learning_rate": 4.99519647296525e-06, "loss": 1.5928, "step": 18270 }, { "epoch": 0.019847238708794582, "grad_norm": 32.0324592590332, "learning_rate": 4.995193829790764e-06, "loss": 2.0158, "step": 18275 }, { "epoch": 0.019852668869864018, "grad_norm": 103.06784057617188, "learning_rate": 4.9951911858899645e-06, "loss": 1.1637, "step": 18280 }, { "epoch": 0.019858099030933454, "grad_norm": 20.495153427124023, "learning_rate": 4.9951885412628545e-06, "loss": 1.2481, "step": 18285 }, { "epoch": 0.019863529192002894, "grad_norm": 42.38594436645508, "learning_rate": 4.9951858959094316e-06, "loss": 1.6629, "step": 18290 }, { "epoch": 0.01986895935307233, "grad_norm": 69.86489868164062, "learning_rate": 4.995183249829698e-06, "loss": 1.5403, "step": 18295 }, { "epoch": 0.01987438951414177, "grad_norm": 35.336612701416016, "learning_rate": 4.995180603023655e-06, "loss": 1.521, "step": 18300 }, { "epoch": 0.019879819675211206, "grad_norm": 20.970155715942383, "learning_rate": 4.9951779554913035e-06, "loss": 1.9637, "step": 18305 }, { "epoch": 0.019885249836280643, "grad_norm": 21.715314865112305, "learning_rate": 4.995175307232642e-06, "loss": 1.6591, "step": 18310 }, { "epoch": 0.019890679997350082, "grad_norm": 13.988344192504883, "learning_rate": 4.995172658247675e-06, "loss": 1.7831, "step": 18315 }, { "epoch": 0.01989611015841952, "grad_norm": 29.347204208374023, "learning_rate": 4.995170008536399e-06, "loss": 1.2121, "step": 18320 }, { "epoch": 0.01990154031948896, "grad_norm": 28.79248046875, "learning_rate": 4.9951673580988195e-06, "loss": 1.8087, "step": 18325 }, { "epoch": 0.019906970480558395, "grad_norm": 27.21586799621582, "learning_rate": 4.995164706934933e-06, "loss": 1.4817, "step": 18330 }, { "epoch": 0.01991240064162783, "grad_norm": 18.92418098449707, "learning_rate": 4.995162055044742e-06, "loss": 1.637, "step": 18335 }, { "epoch": 0.01991783080269727, "grad_norm": 22.18349266052246, "learning_rate": 4.995159402428248e-06, "loss": 1.9246, "step": 18340 }, { "epoch": 0.019923260963766707, "grad_norm": 16.658035278320312, "learning_rate": 4.99515674908545e-06, "loss": 1.5812, "step": 18345 }, { "epoch": 0.019928691124836143, "grad_norm": 21.639127731323242, "learning_rate": 4.995154095016351e-06, "loss": 1.1461, "step": 18350 }, { "epoch": 0.019934121285905583, "grad_norm": 17.624250411987305, "learning_rate": 4.99515144022095e-06, "loss": 1.6263, "step": 18355 }, { "epoch": 0.01993955144697502, "grad_norm": 14.964234352111816, "learning_rate": 4.995148784699248e-06, "loss": 1.4505, "step": 18360 }, { "epoch": 0.01994498160804446, "grad_norm": 23.46994972229004, "learning_rate": 4.995146128451248e-06, "loss": 1.744, "step": 18365 }, { "epoch": 0.019950411769113895, "grad_norm": 15.240792274475098, "learning_rate": 4.995143471476947e-06, "loss": 1.2605, "step": 18370 }, { "epoch": 0.01995584193018333, "grad_norm": 20.813438415527344, "learning_rate": 4.9951408137763484e-06, "loss": 1.7418, "step": 18375 }, { "epoch": 0.01996127209125277, "grad_norm": 182.9668426513672, "learning_rate": 4.995138155349452e-06, "loss": 1.5489, "step": 18380 }, { "epoch": 0.019966702252322208, "grad_norm": 20.944080352783203, "learning_rate": 4.995135496196259e-06, "loss": 1.8257, "step": 18385 }, { "epoch": 0.019972132413391647, "grad_norm": 91.3568115234375, "learning_rate": 4.995132836316771e-06, "loss": 1.5156, "step": 18390 }, { "epoch": 0.019977562574461084, "grad_norm": 166.82080078125, "learning_rate": 4.995130175710986e-06, "loss": 2.9277, "step": 18395 }, { "epoch": 0.01998299273553052, "grad_norm": 34.47665023803711, "learning_rate": 4.995127514378908e-06, "loss": 1.5985, "step": 18400 }, { "epoch": 0.01998842289659996, "grad_norm": 31.342660903930664, "learning_rate": 4.995124852320536e-06, "loss": 2.1373, "step": 18405 }, { "epoch": 0.019993853057669396, "grad_norm": 16.303056716918945, "learning_rate": 4.99512218953587e-06, "loss": 1.7153, "step": 18410 }, { "epoch": 0.019999283218738836, "grad_norm": 66.85179901123047, "learning_rate": 4.995119526024913e-06, "loss": 1.9784, "step": 18415 }, { "epoch": 0.020004713379808272, "grad_norm": 13.145451545715332, "learning_rate": 4.995116861787664e-06, "loss": 1.5212, "step": 18420 }, { "epoch": 0.02001014354087771, "grad_norm": 20.59392738342285, "learning_rate": 4.995114196824126e-06, "loss": 1.3938, "step": 18425 }, { "epoch": 0.020015573701947148, "grad_norm": 18.77417755126953, "learning_rate": 4.9951115311342965e-06, "loss": 1.3794, "step": 18430 }, { "epoch": 0.020021003863016584, "grad_norm": 25.294801712036133, "learning_rate": 4.995108864718179e-06, "loss": 1.311, "step": 18435 }, { "epoch": 0.020026434024086024, "grad_norm": 14.478038787841797, "learning_rate": 4.9951061975757725e-06, "loss": 1.5477, "step": 18440 }, { "epoch": 0.02003186418515546, "grad_norm": 38.43084716796875, "learning_rate": 4.99510352970708e-06, "loss": 2.3836, "step": 18445 }, { "epoch": 0.020037294346224897, "grad_norm": 23.372173309326172, "learning_rate": 4.995100861112099e-06, "loss": 1.8347, "step": 18450 }, { "epoch": 0.020042724507294336, "grad_norm": 21.916135787963867, "learning_rate": 4.995098191790833e-06, "loss": 1.5576, "step": 18455 }, { "epoch": 0.020048154668363773, "grad_norm": 72.42576599121094, "learning_rate": 4.9950955217432816e-06, "loss": 2.3202, "step": 18460 }, { "epoch": 0.02005358482943321, "grad_norm": 25.8453426361084, "learning_rate": 4.995092850969446e-06, "loss": 1.3113, "step": 18465 }, { "epoch": 0.02005901499050265, "grad_norm": 21.914934158325195, "learning_rate": 4.9950901794693264e-06, "loss": 1.4932, "step": 18470 }, { "epoch": 0.020064445151572085, "grad_norm": 17.010942459106445, "learning_rate": 4.995087507242924e-06, "loss": 1.9534, "step": 18475 }, { "epoch": 0.020069875312641525, "grad_norm": 23.104198455810547, "learning_rate": 4.995084834290241e-06, "loss": 1.4629, "step": 18480 }, { "epoch": 0.02007530547371096, "grad_norm": 36.15534591674805, "learning_rate": 4.995082160611275e-06, "loss": 2.0943, "step": 18485 }, { "epoch": 0.020080735634780397, "grad_norm": 25.75395965576172, "learning_rate": 4.99507948620603e-06, "loss": 1.1824, "step": 18490 }, { "epoch": 0.020086165795849837, "grad_norm": 23.67867088317871, "learning_rate": 4.995076811074504e-06, "loss": 1.8978, "step": 18495 }, { "epoch": 0.020091595956919273, "grad_norm": 25.862972259521484, "learning_rate": 4.9950741352166995e-06, "loss": 1.6291, "step": 18500 }, { "epoch": 0.020097026117988713, "grad_norm": 30.746902465820312, "learning_rate": 4.995071458632618e-06, "loss": 1.5124, "step": 18505 }, { "epoch": 0.02010245627905815, "grad_norm": 32.622867584228516, "learning_rate": 4.995068781322258e-06, "loss": 1.7278, "step": 18510 }, { "epoch": 0.020107886440127586, "grad_norm": 81.85467529296875, "learning_rate": 4.995066103285621e-06, "loss": 2.1779, "step": 18515 }, { "epoch": 0.020113316601197025, "grad_norm": 16.90744400024414, "learning_rate": 4.99506342452271e-06, "loss": 1.9306, "step": 18520 }, { "epoch": 0.02011874676226646, "grad_norm": 19.952674865722656, "learning_rate": 4.995060745033523e-06, "loss": 1.5889, "step": 18525 }, { "epoch": 0.0201241769233359, "grad_norm": 22.069026947021484, "learning_rate": 4.995058064818061e-06, "loss": 1.6251, "step": 18530 }, { "epoch": 0.020129607084405338, "grad_norm": 14.854122161865234, "learning_rate": 4.995055383876327e-06, "loss": 2.2514, "step": 18535 }, { "epoch": 0.020135037245474774, "grad_norm": 27.71482276916504, "learning_rate": 4.99505270220832e-06, "loss": 1.957, "step": 18540 }, { "epoch": 0.020140467406544214, "grad_norm": 14.829834938049316, "learning_rate": 4.995050019814041e-06, "loss": 1.6768, "step": 18545 }, { "epoch": 0.02014589756761365, "grad_norm": 45.507606506347656, "learning_rate": 4.99504733669349e-06, "loss": 1.5885, "step": 18550 }, { "epoch": 0.02015132772868309, "grad_norm": 9.883825302124023, "learning_rate": 4.995044652846669e-06, "loss": 1.6222, "step": 18555 }, { "epoch": 0.020156757889752526, "grad_norm": 15.898567199707031, "learning_rate": 4.995041968273579e-06, "loss": 1.8435, "step": 18560 }, { "epoch": 0.020162188050821962, "grad_norm": 17.611793518066406, "learning_rate": 4.99503928297422e-06, "loss": 1.6353, "step": 18565 }, { "epoch": 0.020167618211891402, "grad_norm": 13.613929748535156, "learning_rate": 4.995036596948593e-06, "loss": 1.3611, "step": 18570 }, { "epoch": 0.02017304837296084, "grad_norm": 190.93914794921875, "learning_rate": 4.9950339101967e-06, "loss": 1.4825, "step": 18575 }, { "epoch": 0.020178478534030278, "grad_norm": 27.985918045043945, "learning_rate": 4.9950312227185385e-06, "loss": 1.7732, "step": 18580 }, { "epoch": 0.020183908695099714, "grad_norm": 24.284578323364258, "learning_rate": 4.9950285345141126e-06, "loss": 1.5509, "step": 18585 }, { "epoch": 0.02018933885616915, "grad_norm": 33.26997756958008, "learning_rate": 4.995025845583421e-06, "loss": 1.7838, "step": 18590 }, { "epoch": 0.02019476901723859, "grad_norm": 17.39118766784668, "learning_rate": 4.995023155926466e-06, "loss": 1.4884, "step": 18595 }, { "epoch": 0.020200199178308027, "grad_norm": 52.25371551513672, "learning_rate": 4.995020465543248e-06, "loss": 1.7909, "step": 18600 }, { "epoch": 0.020205629339377463, "grad_norm": 31.526321411132812, "learning_rate": 4.995017774433767e-06, "loss": 2.2804, "step": 18605 }, { "epoch": 0.020211059500446903, "grad_norm": 40.971683502197266, "learning_rate": 4.995015082598025e-06, "loss": 2.0715, "step": 18610 }, { "epoch": 0.02021648966151634, "grad_norm": 18.708520889282227, "learning_rate": 4.995012390036022e-06, "loss": 1.3768, "step": 18615 }, { "epoch": 0.02022191982258578, "grad_norm": 19.270183563232422, "learning_rate": 4.995009696747758e-06, "loss": 1.4118, "step": 18620 }, { "epoch": 0.020227349983655215, "grad_norm": 21.712825775146484, "learning_rate": 4.995007002733234e-06, "loss": 1.2962, "step": 18625 }, { "epoch": 0.02023278014472465, "grad_norm": 39.50220489501953, "learning_rate": 4.9950043079924525e-06, "loss": 1.7264, "step": 18630 }, { "epoch": 0.02023821030579409, "grad_norm": 39.62070846557617, "learning_rate": 4.995001612525414e-06, "loss": 1.6967, "step": 18635 }, { "epoch": 0.020243640466863527, "grad_norm": 19.51809310913086, "learning_rate": 4.994998916332117e-06, "loss": 0.9253, "step": 18640 }, { "epoch": 0.020249070627932967, "grad_norm": 26.063642501831055, "learning_rate": 4.994996219412565e-06, "loss": 1.9208, "step": 18645 }, { "epoch": 0.020254500789002403, "grad_norm": 17.5734806060791, "learning_rate": 4.994993521766756e-06, "loss": 1.9125, "step": 18650 }, { "epoch": 0.02025993095007184, "grad_norm": 32.094051361083984, "learning_rate": 4.994990823394694e-06, "loss": 1.5334, "step": 18655 }, { "epoch": 0.02026536111114128, "grad_norm": 59.89437484741211, "learning_rate": 4.994988124296377e-06, "loss": 1.8988, "step": 18660 }, { "epoch": 0.020270791272210716, "grad_norm": 51.64692306518555, "learning_rate": 4.994985424471808e-06, "loss": 1.759, "step": 18665 }, { "epoch": 0.020276221433280155, "grad_norm": 21.966611862182617, "learning_rate": 4.9949827239209854e-06, "loss": 1.8042, "step": 18670 }, { "epoch": 0.02028165159434959, "grad_norm": 85.62294006347656, "learning_rate": 4.994980022643913e-06, "loss": 1.7387, "step": 18675 }, { "epoch": 0.020287081755419028, "grad_norm": 43.50691223144531, "learning_rate": 4.9949773206405885e-06, "loss": 2.6904, "step": 18680 }, { "epoch": 0.020292511916488468, "grad_norm": 110.97557067871094, "learning_rate": 4.994974617911015e-06, "loss": 1.8619, "step": 18685 }, { "epoch": 0.020297942077557904, "grad_norm": 75.79154968261719, "learning_rate": 4.994971914455191e-06, "loss": 1.9537, "step": 18690 }, { "epoch": 0.020303372238627344, "grad_norm": 22.317623138427734, "learning_rate": 4.994969210273119e-06, "loss": 1.5312, "step": 18695 }, { "epoch": 0.02030880239969678, "grad_norm": 76.63953399658203, "learning_rate": 4.9949665053648e-06, "loss": 1.2337, "step": 18700 }, { "epoch": 0.020314232560766216, "grad_norm": 30.037006378173828, "learning_rate": 4.994963799730234e-06, "loss": 1.3899, "step": 18705 }, { "epoch": 0.020319662721835656, "grad_norm": 27.976205825805664, "learning_rate": 4.994961093369422e-06, "loss": 1.8257, "step": 18710 }, { "epoch": 0.020325092882905092, "grad_norm": 19.280654907226562, "learning_rate": 4.9949583862823644e-06, "loss": 1.5965, "step": 18715 }, { "epoch": 0.020330523043974532, "grad_norm": 9.638097763061523, "learning_rate": 4.994955678469063e-06, "loss": 1.6465, "step": 18720 }, { "epoch": 0.02033595320504397, "grad_norm": 23.056926727294922, "learning_rate": 4.994952969929518e-06, "loss": 1.5666, "step": 18725 }, { "epoch": 0.020341383366113405, "grad_norm": 25.41585350036621, "learning_rate": 4.99495026066373e-06, "loss": 1.3572, "step": 18730 }, { "epoch": 0.020346813527182844, "grad_norm": 39.1169319152832, "learning_rate": 4.9949475506717e-06, "loss": 1.3209, "step": 18735 }, { "epoch": 0.02035224368825228, "grad_norm": 21.737468719482422, "learning_rate": 4.994944839953429e-06, "loss": 1.7818, "step": 18740 }, { "epoch": 0.020357673849321717, "grad_norm": 102.89612579345703, "learning_rate": 4.994942128508917e-06, "loss": 1.4373, "step": 18745 }, { "epoch": 0.020363104010391157, "grad_norm": 30.870758056640625, "learning_rate": 4.994939416338164e-06, "loss": 1.574, "step": 18750 }, { "epoch": 0.020368534171460593, "grad_norm": 21.600053787231445, "learning_rate": 4.994936703441175e-06, "loss": 1.4836, "step": 18755 }, { "epoch": 0.020373964332530033, "grad_norm": 23.238235473632812, "learning_rate": 4.9949339898179464e-06, "loss": 1.525, "step": 18760 }, { "epoch": 0.02037939449359947, "grad_norm": 16.779333114624023, "learning_rate": 4.99493127546848e-06, "loss": 1.977, "step": 18765 }, { "epoch": 0.020384824654668905, "grad_norm": 14.932086944580078, "learning_rate": 4.994928560392778e-06, "loss": 2.0313, "step": 18770 }, { "epoch": 0.020390254815738345, "grad_norm": 25.260997772216797, "learning_rate": 4.994925844590839e-06, "loss": 1.5614, "step": 18775 }, { "epoch": 0.02039568497680778, "grad_norm": 48.55820083618164, "learning_rate": 4.9949231280626655e-06, "loss": 1.9455, "step": 18780 }, { "epoch": 0.02040111513787722, "grad_norm": 15.28946590423584, "learning_rate": 4.994920410808259e-06, "loss": 1.3096, "step": 18785 }, { "epoch": 0.020406545298946657, "grad_norm": 19.06997299194336, "learning_rate": 4.994917692827618e-06, "loss": 1.4981, "step": 18790 }, { "epoch": 0.020411975460016094, "grad_norm": 14.326855659484863, "learning_rate": 4.994914974120745e-06, "loss": 1.2081, "step": 18795 }, { "epoch": 0.020417405621085533, "grad_norm": 30.872312545776367, "learning_rate": 4.994912254687639e-06, "loss": 1.6162, "step": 18800 }, { "epoch": 0.02042283578215497, "grad_norm": 43.94981384277344, "learning_rate": 4.994909534528304e-06, "loss": 1.1019, "step": 18805 }, { "epoch": 0.02042826594322441, "grad_norm": 15.799054145812988, "learning_rate": 4.994906813642737e-06, "loss": 1.501, "step": 18810 }, { "epoch": 0.020433696104293846, "grad_norm": 56.3419075012207, "learning_rate": 4.994904092030941e-06, "loss": 2.0119, "step": 18815 }, { "epoch": 0.020439126265363282, "grad_norm": 21.3057804107666, "learning_rate": 4.9949013696929174e-06, "loss": 1.8575, "step": 18820 }, { "epoch": 0.02044455642643272, "grad_norm": 23.449993133544922, "learning_rate": 4.994898646628665e-06, "loss": 1.3955, "step": 18825 }, { "epoch": 0.020449986587502158, "grad_norm": 33.948368072509766, "learning_rate": 4.994895922838185e-06, "loss": 1.5098, "step": 18830 }, { "epoch": 0.020455416748571598, "grad_norm": 18.143125534057617, "learning_rate": 4.994893198321481e-06, "loss": 1.396, "step": 18835 }, { "epoch": 0.020460846909641034, "grad_norm": 20.084545135498047, "learning_rate": 4.9948904730785495e-06, "loss": 1.4676, "step": 18840 }, { "epoch": 0.02046627707071047, "grad_norm": 19.496112823486328, "learning_rate": 4.994887747109393e-06, "loss": 1.3311, "step": 18845 }, { "epoch": 0.02047170723177991, "grad_norm": 40.185340881347656, "learning_rate": 4.994885020414014e-06, "loss": 1.5597, "step": 18850 }, { "epoch": 0.020477137392849346, "grad_norm": 15.082489013671875, "learning_rate": 4.9948822929924126e-06, "loss": 1.9472, "step": 18855 }, { "epoch": 0.020482567553918786, "grad_norm": 139.81503295898438, "learning_rate": 4.994879564844587e-06, "loss": 2.0071, "step": 18860 }, { "epoch": 0.020487997714988222, "grad_norm": 43.86052322387695, "learning_rate": 4.9948768359705415e-06, "loss": 1.5831, "step": 18865 }, { "epoch": 0.02049342787605766, "grad_norm": 59.785587310791016, "learning_rate": 4.994874106370275e-06, "loss": 1.3776, "step": 18870 }, { "epoch": 0.0204988580371271, "grad_norm": 25.082469940185547, "learning_rate": 4.994871376043787e-06, "loss": 1.3704, "step": 18875 }, { "epoch": 0.020504288198196535, "grad_norm": 25.865116119384766, "learning_rate": 4.994868644991081e-06, "loss": 1.7744, "step": 18880 }, { "epoch": 0.02050971835926597, "grad_norm": 16.12051010131836, "learning_rate": 4.994865913212157e-06, "loss": 1.4923, "step": 18885 }, { "epoch": 0.02051514852033541, "grad_norm": 54.89503860473633, "learning_rate": 4.994863180707016e-06, "loss": 1.7819, "step": 18890 }, { "epoch": 0.020520578681404847, "grad_norm": 13.323091506958008, "learning_rate": 4.994860447475657e-06, "loss": 1.5539, "step": 18895 }, { "epoch": 0.020526008842474287, "grad_norm": 37.8212776184082, "learning_rate": 4.994857713518082e-06, "loss": 2.2703, "step": 18900 }, { "epoch": 0.020531439003543723, "grad_norm": 15.637961387634277, "learning_rate": 4.994854978834293e-06, "loss": 1.6361, "step": 18905 }, { "epoch": 0.02053686916461316, "grad_norm": 35.922908782958984, "learning_rate": 4.99485224342429e-06, "loss": 1.7991, "step": 18910 }, { "epoch": 0.0205422993256826, "grad_norm": 14.709684371948242, "learning_rate": 4.994849507288072e-06, "loss": 1.2571, "step": 18915 }, { "epoch": 0.020547729486752035, "grad_norm": 15.57337474822998, "learning_rate": 4.994846770425642e-06, "loss": 1.2806, "step": 18920 }, { "epoch": 0.020553159647821475, "grad_norm": 35.973567962646484, "learning_rate": 4.9948440328369995e-06, "loss": 2.0435, "step": 18925 }, { "epoch": 0.02055858980889091, "grad_norm": 38.06816482543945, "learning_rate": 4.994841294522147e-06, "loss": 1.5912, "step": 18930 }, { "epoch": 0.020564019969960348, "grad_norm": 25.851896286010742, "learning_rate": 4.994838555481083e-06, "loss": 1.8609, "step": 18935 }, { "epoch": 0.020569450131029787, "grad_norm": 15.63569450378418, "learning_rate": 4.99483581571381e-06, "loss": 1.6299, "step": 18940 }, { "epoch": 0.020574880292099224, "grad_norm": 21.037487030029297, "learning_rate": 4.994833075220329e-06, "loss": 2.6646, "step": 18945 }, { "epoch": 0.020580310453168663, "grad_norm": 23.87211036682129, "learning_rate": 4.994830334000639e-06, "loss": 1.8853, "step": 18950 }, { "epoch": 0.0205857406142381, "grad_norm": 14.327765464782715, "learning_rate": 4.994827592054741e-06, "loss": 1.3467, "step": 18955 }, { "epoch": 0.020591170775307536, "grad_norm": 19.771926879882812, "learning_rate": 4.994824849382639e-06, "loss": 1.4336, "step": 18960 }, { "epoch": 0.020596600936376976, "grad_norm": 21.66570472717285, "learning_rate": 4.99482210598433e-06, "loss": 1.6054, "step": 18965 }, { "epoch": 0.020602031097446412, "grad_norm": 43.90258026123047, "learning_rate": 4.994819361859816e-06, "loss": 1.7599, "step": 18970 }, { "epoch": 0.02060746125851585, "grad_norm": 60.28157424926758, "learning_rate": 4.994816617009099e-06, "loss": 2.025, "step": 18975 }, { "epoch": 0.020612891419585288, "grad_norm": 107.24319458007812, "learning_rate": 4.994813871432178e-06, "loss": 1.2933, "step": 18980 }, { "epoch": 0.020618321580654724, "grad_norm": 67.26813507080078, "learning_rate": 4.994811125129055e-06, "loss": 0.9753, "step": 18985 }, { "epoch": 0.020623751741724164, "grad_norm": 22.020204544067383, "learning_rate": 4.99480837809973e-06, "loss": 1.2151, "step": 18990 }, { "epoch": 0.0206291819027936, "grad_norm": 14.748276710510254, "learning_rate": 4.994805630344205e-06, "loss": 1.3977, "step": 18995 }, { "epoch": 0.02063461206386304, "grad_norm": 104.43624877929688, "learning_rate": 4.99480288186248e-06, "loss": 1.5365, "step": 19000 }, { "epoch": 0.020640042224932476, "grad_norm": 23.761089324951172, "learning_rate": 4.994800132654556e-06, "loss": 2.0105, "step": 19005 }, { "epoch": 0.020645472386001913, "grad_norm": 17.148107528686523, "learning_rate": 4.994797382720433e-06, "loss": 1.2639, "step": 19010 }, { "epoch": 0.020650902547071352, "grad_norm": 109.17085266113281, "learning_rate": 4.994794632060112e-06, "loss": 1.9235, "step": 19015 }, { "epoch": 0.02065633270814079, "grad_norm": 35.1529655456543, "learning_rate": 4.994791880673595e-06, "loss": 1.2175, "step": 19020 }, { "epoch": 0.020661762869210225, "grad_norm": 82.69837951660156, "learning_rate": 4.994789128560882e-06, "loss": 1.6556, "step": 19025 }, { "epoch": 0.020667193030279665, "grad_norm": 18.198917388916016, "learning_rate": 4.994786375721974e-06, "loss": 1.6121, "step": 19030 }, { "epoch": 0.0206726231913491, "grad_norm": 36.86338806152344, "learning_rate": 4.9947836221568715e-06, "loss": 1.553, "step": 19035 }, { "epoch": 0.02067805335241854, "grad_norm": 131.7992706298828, "learning_rate": 4.9947808678655755e-06, "loss": 2.4186, "step": 19040 }, { "epoch": 0.020683483513487977, "grad_norm": 164.66258239746094, "learning_rate": 4.994778112848086e-06, "loss": 1.5856, "step": 19045 }, { "epoch": 0.020688913674557413, "grad_norm": 55.845333099365234, "learning_rate": 4.994775357104406e-06, "loss": 1.9529, "step": 19050 }, { "epoch": 0.020694343835626853, "grad_norm": 22.32333755493164, "learning_rate": 4.994772600634533e-06, "loss": 1.1737, "step": 19055 }, { "epoch": 0.02069977399669629, "grad_norm": 20.411664962768555, "learning_rate": 4.994769843438471e-06, "loss": 1.3791, "step": 19060 }, { "epoch": 0.02070520415776573, "grad_norm": 62.092987060546875, "learning_rate": 4.994767085516219e-06, "loss": 1.7776, "step": 19065 }, { "epoch": 0.020710634318835165, "grad_norm": 74.29585266113281, "learning_rate": 4.994764326867778e-06, "loss": 1.6942, "step": 19070 }, { "epoch": 0.0207160644799046, "grad_norm": 42.49941635131836, "learning_rate": 4.994761567493149e-06, "loss": 1.7681, "step": 19075 }, { "epoch": 0.02072149464097404, "grad_norm": 66.60586547851562, "learning_rate": 4.994758807392334e-06, "loss": 2.0133, "step": 19080 }, { "epoch": 0.020726924802043478, "grad_norm": 17.887563705444336, "learning_rate": 4.994756046565332e-06, "loss": 1.2683, "step": 19085 }, { "epoch": 0.020732354963112917, "grad_norm": 19.388355255126953, "learning_rate": 4.994753285012144e-06, "loss": 1.4484, "step": 19090 }, { "epoch": 0.020737785124182354, "grad_norm": 16.461292266845703, "learning_rate": 4.994750522732772e-06, "loss": 1.752, "step": 19095 }, { "epoch": 0.02074321528525179, "grad_norm": 12.617445945739746, "learning_rate": 4.994747759727215e-06, "loss": 1.8035, "step": 19100 }, { "epoch": 0.02074864544632123, "grad_norm": 14.200645446777344, "learning_rate": 4.994744995995476e-06, "loss": 2.0577, "step": 19105 }, { "epoch": 0.020754075607390666, "grad_norm": 112.68350219726562, "learning_rate": 4.994742231537555e-06, "loss": 1.9805, "step": 19110 }, { "epoch": 0.020759505768460106, "grad_norm": 27.769433975219727, "learning_rate": 4.994739466353451e-06, "loss": 1.4793, "step": 19115 }, { "epoch": 0.020764935929529542, "grad_norm": 21.503000259399414, "learning_rate": 4.994736700443168e-06, "loss": 1.2778, "step": 19120 }, { "epoch": 0.020770366090598978, "grad_norm": 14.121273040771484, "learning_rate": 4.994733933806704e-06, "loss": 1.4501, "step": 19125 }, { "epoch": 0.020775796251668418, "grad_norm": 26.210712432861328, "learning_rate": 4.994731166444061e-06, "loss": 1.9492, "step": 19130 }, { "epoch": 0.020781226412737854, "grad_norm": 19.707447052001953, "learning_rate": 4.994728398355239e-06, "loss": 1.4282, "step": 19135 }, { "epoch": 0.020786656573807294, "grad_norm": 19.417484283447266, "learning_rate": 4.994725629540241e-06, "loss": 1.1003, "step": 19140 }, { "epoch": 0.02079208673487673, "grad_norm": 16.804410934448242, "learning_rate": 4.994722859999066e-06, "loss": 1.9645, "step": 19145 }, { "epoch": 0.020797516895946166, "grad_norm": 32.72727584838867, "learning_rate": 4.994720089731715e-06, "loss": 1.6762, "step": 19150 }, { "epoch": 0.020802947057015606, "grad_norm": 57.62007141113281, "learning_rate": 4.9947173187381885e-06, "loss": 1.3346, "step": 19155 }, { "epoch": 0.020808377218085043, "grad_norm": 24.098814010620117, "learning_rate": 4.9947145470184875e-06, "loss": 1.5826, "step": 19160 }, { "epoch": 0.02081380737915448, "grad_norm": 13.191418647766113, "learning_rate": 4.994711774572614e-06, "loss": 1.8455, "step": 19165 }, { "epoch": 0.02081923754022392, "grad_norm": 22.62103271484375, "learning_rate": 4.9947090014005675e-06, "loss": 1.403, "step": 19170 }, { "epoch": 0.020824667701293355, "grad_norm": 94.77223205566406, "learning_rate": 4.9947062275023495e-06, "loss": 1.4425, "step": 19175 }, { "epoch": 0.020830097862362795, "grad_norm": 17.44072151184082, "learning_rate": 4.99470345287796e-06, "loss": 1.7073, "step": 19180 }, { "epoch": 0.02083552802343223, "grad_norm": 17.10236167907715, "learning_rate": 4.9947006775274e-06, "loss": 1.4913, "step": 19185 }, { "epoch": 0.020840958184501667, "grad_norm": 16.864784240722656, "learning_rate": 4.994697901450671e-06, "loss": 1.527, "step": 19190 }, { "epoch": 0.020846388345571107, "grad_norm": 36.1942024230957, "learning_rate": 4.994695124647774e-06, "loss": 2.0035, "step": 19195 }, { "epoch": 0.020851818506640543, "grad_norm": 22.037437438964844, "learning_rate": 4.994692347118708e-06, "loss": 1.1562, "step": 19200 }, { "epoch": 0.020857248667709983, "grad_norm": 19.509479522705078, "learning_rate": 4.994689568863476e-06, "loss": 1.7596, "step": 19205 }, { "epoch": 0.02086267882877942, "grad_norm": 17.259201049804688, "learning_rate": 4.994686789882077e-06, "loss": 2.1472, "step": 19210 }, { "epoch": 0.020868108989848855, "grad_norm": 15.85853385925293, "learning_rate": 4.994684010174513e-06, "loss": 1.4708, "step": 19215 }, { "epoch": 0.020873539150918295, "grad_norm": 15.595564842224121, "learning_rate": 4.994681229740784e-06, "loss": 1.8178, "step": 19220 }, { "epoch": 0.02087896931198773, "grad_norm": 31.742467880249023, "learning_rate": 4.994678448580892e-06, "loss": 1.689, "step": 19225 }, { "epoch": 0.02088439947305717, "grad_norm": 70.41069030761719, "learning_rate": 4.9946756666948375e-06, "loss": 0.9796, "step": 19230 }, { "epoch": 0.020889829634126608, "grad_norm": 21.14167594909668, "learning_rate": 4.99467288408262e-06, "loss": 1.3929, "step": 19235 }, { "epoch": 0.020895259795196044, "grad_norm": 25.498458862304688, "learning_rate": 4.9946701007442414e-06, "loss": 1.5304, "step": 19240 }, { "epoch": 0.020900689956265484, "grad_norm": 14.89990234375, "learning_rate": 4.994667316679702e-06, "loss": 1.9966, "step": 19245 }, { "epoch": 0.02090612011733492, "grad_norm": 11.27430248260498, "learning_rate": 4.994664531889003e-06, "loss": 0.9729, "step": 19250 }, { "epoch": 0.02091155027840436, "grad_norm": 15.067880630493164, "learning_rate": 4.9946617463721445e-06, "loss": 2.211, "step": 19255 }, { "epoch": 0.020916980439473796, "grad_norm": 17.675920486450195, "learning_rate": 4.994658960129129e-06, "loss": 1.61, "step": 19260 }, { "epoch": 0.020922410600543232, "grad_norm": 18.320589065551758, "learning_rate": 4.994656173159956e-06, "loss": 1.2031, "step": 19265 }, { "epoch": 0.020927840761612672, "grad_norm": 59.83646774291992, "learning_rate": 4.994653385464626e-06, "loss": 1.5431, "step": 19270 }, { "epoch": 0.020933270922682108, "grad_norm": 15.02082347869873, "learning_rate": 4.994650597043141e-06, "loss": 1.7756, "step": 19275 }, { "epoch": 0.020938701083751548, "grad_norm": 116.13727569580078, "learning_rate": 4.994647807895501e-06, "loss": 1.5918, "step": 19280 }, { "epoch": 0.020944131244820984, "grad_norm": 25.789207458496094, "learning_rate": 4.994645018021707e-06, "loss": 1.3407, "step": 19285 }, { "epoch": 0.02094956140589042, "grad_norm": 26.21515464782715, "learning_rate": 4.99464222742176e-06, "loss": 1.546, "step": 19290 }, { "epoch": 0.02095499156695986, "grad_norm": 19.117128372192383, "learning_rate": 4.9946394360956596e-06, "loss": 1.977, "step": 19295 }, { "epoch": 0.020960421728029296, "grad_norm": 23.300100326538086, "learning_rate": 4.994636644043409e-06, "loss": 1.8909, "step": 19300 }, { "epoch": 0.020965851889098733, "grad_norm": 40.459922790527344, "learning_rate": 4.994633851265007e-06, "loss": 1.736, "step": 19305 }, { "epoch": 0.020971282050168173, "grad_norm": 35.61161422729492, "learning_rate": 4.994631057760454e-06, "loss": 1.1041, "step": 19310 }, { "epoch": 0.02097671221123761, "grad_norm": 36.30259323120117, "learning_rate": 4.994628263529754e-06, "loss": 1.3416, "step": 19315 }, { "epoch": 0.02098214237230705, "grad_norm": 25.420351028442383, "learning_rate": 4.994625468572904e-06, "loss": 1.81, "step": 19320 }, { "epoch": 0.020987572533376485, "grad_norm": 44.22880172729492, "learning_rate": 4.994622672889907e-06, "loss": 1.6655, "step": 19325 }, { "epoch": 0.02099300269444592, "grad_norm": 20.889549255371094, "learning_rate": 4.994619876480763e-06, "loss": 1.3325, "step": 19330 }, { "epoch": 0.02099843285551536, "grad_norm": 19.12393569946289, "learning_rate": 4.994617079345474e-06, "loss": 2.0827, "step": 19335 }, { "epoch": 0.021003863016584797, "grad_norm": 34.84128189086914, "learning_rate": 4.9946142814840396e-06, "loss": 1.2989, "step": 19340 }, { "epoch": 0.021009293177654237, "grad_norm": 13.99052619934082, "learning_rate": 4.99461148289646e-06, "loss": 1.6147, "step": 19345 }, { "epoch": 0.021014723338723673, "grad_norm": 30.576982498168945, "learning_rate": 4.994608683582738e-06, "loss": 1.29, "step": 19350 }, { "epoch": 0.02102015349979311, "grad_norm": 48.83424377441406, "learning_rate": 4.994605883542873e-06, "loss": 1.6949, "step": 19355 }, { "epoch": 0.02102558366086255, "grad_norm": 19.200830459594727, "learning_rate": 4.994603082776865e-06, "loss": 1.7305, "step": 19360 }, { "epoch": 0.021031013821931985, "grad_norm": 15.320321083068848, "learning_rate": 4.994600281284718e-06, "loss": 1.2798, "step": 19365 }, { "epoch": 0.021036443983001425, "grad_norm": 15.593677520751953, "learning_rate": 4.99459747906643e-06, "loss": 1.2597, "step": 19370 }, { "epoch": 0.02104187414407086, "grad_norm": 13.029932022094727, "learning_rate": 4.9945946761220026e-06, "loss": 1.4879, "step": 19375 }, { "epoch": 0.021047304305140298, "grad_norm": 24.032867431640625, "learning_rate": 4.994591872451436e-06, "loss": 1.8215, "step": 19380 }, { "epoch": 0.021052734466209738, "grad_norm": 13.527658462524414, "learning_rate": 4.994589068054733e-06, "loss": 1.3496, "step": 19385 }, { "epoch": 0.021058164627279174, "grad_norm": 20.086687088012695, "learning_rate": 4.994586262931892e-06, "loss": 1.4507, "step": 19390 }, { "epoch": 0.021063594788348614, "grad_norm": 223.63583374023438, "learning_rate": 4.994583457082915e-06, "loss": 1.3595, "step": 19395 }, { "epoch": 0.02106902494941805, "grad_norm": 21.610767364501953, "learning_rate": 4.994580650507803e-06, "loss": 1.6515, "step": 19400 }, { "epoch": 0.021074455110487486, "grad_norm": 19.421432495117188, "learning_rate": 4.994577843206557e-06, "loss": 1.3635, "step": 19405 }, { "epoch": 0.021079885271556926, "grad_norm": 42.72462844848633, "learning_rate": 4.994575035179177e-06, "loss": 2.3245, "step": 19410 }, { "epoch": 0.021085315432626362, "grad_norm": 20.18406105041504, "learning_rate": 4.994572226425664e-06, "loss": 1.7736, "step": 19415 }, { "epoch": 0.021090745593695802, "grad_norm": 47.19614028930664, "learning_rate": 4.994569416946019e-06, "loss": 2.1731, "step": 19420 }, { "epoch": 0.021096175754765238, "grad_norm": 22.05243492126465, "learning_rate": 4.994566606740243e-06, "loss": 1.58, "step": 19425 }, { "epoch": 0.021101605915834674, "grad_norm": 34.39042282104492, "learning_rate": 4.994563795808336e-06, "loss": 1.3264, "step": 19430 }, { "epoch": 0.021107036076904114, "grad_norm": 14.642498970031738, "learning_rate": 4.9945609841503e-06, "loss": 1.1447, "step": 19435 }, { "epoch": 0.02111246623797355, "grad_norm": 19.8865909576416, "learning_rate": 4.994558171766135e-06, "loss": 1.8922, "step": 19440 }, { "epoch": 0.021117896399042987, "grad_norm": 96.11297607421875, "learning_rate": 4.994555358655843e-06, "loss": 1.6456, "step": 19445 }, { "epoch": 0.021123326560112426, "grad_norm": 27.37070083618164, "learning_rate": 4.9945525448194224e-06, "loss": 1.846, "step": 19450 }, { "epoch": 0.021128756721181863, "grad_norm": 25.877098083496094, "learning_rate": 4.994549730256876e-06, "loss": 2.1405, "step": 19455 }, { "epoch": 0.021134186882251303, "grad_norm": 31.100971221923828, "learning_rate": 4.994546914968204e-06, "loss": 1.9354, "step": 19460 }, { "epoch": 0.02113961704332074, "grad_norm": 24.306537628173828, "learning_rate": 4.9945440989534075e-06, "loss": 1.6106, "step": 19465 }, { "epoch": 0.021145047204390175, "grad_norm": 20.68866539001465, "learning_rate": 4.994541282212488e-06, "loss": 1.5425, "step": 19470 }, { "epoch": 0.021150477365459615, "grad_norm": 16.2805233001709, "learning_rate": 4.994538464745445e-06, "loss": 1.8243, "step": 19475 }, { "epoch": 0.02115590752652905, "grad_norm": 27.966588973999023, "learning_rate": 4.994535646552279e-06, "loss": 1.6751, "step": 19480 }, { "epoch": 0.02116133768759849, "grad_norm": 29.665746688842773, "learning_rate": 4.994532827632993e-06, "loss": 2.2244, "step": 19485 }, { "epoch": 0.021166767848667927, "grad_norm": 17.311023712158203, "learning_rate": 4.994530007987585e-06, "loss": 1.3381, "step": 19490 }, { "epoch": 0.021172198009737363, "grad_norm": 16.65382957458496, "learning_rate": 4.994527187616059e-06, "loss": 1.7148, "step": 19495 }, { "epoch": 0.021177628170806803, "grad_norm": 24.235837936401367, "learning_rate": 4.994524366518412e-06, "loss": 1.7081, "step": 19500 }, { "epoch": 0.02118305833187624, "grad_norm": 24.635868072509766, "learning_rate": 4.994521544694648e-06, "loss": 1.3643, "step": 19505 }, { "epoch": 0.02118848849294568, "grad_norm": 15.699983596801758, "learning_rate": 4.9945187221447665e-06, "loss": 1.4268, "step": 19510 }, { "epoch": 0.021193918654015115, "grad_norm": 56.014404296875, "learning_rate": 4.9945158988687696e-06, "loss": 1.7941, "step": 19515 }, { "epoch": 0.021199348815084552, "grad_norm": 16.45744514465332, "learning_rate": 4.994513074866656e-06, "loss": 1.75, "step": 19520 }, { "epoch": 0.02120477897615399, "grad_norm": 20.931114196777344, "learning_rate": 4.994510250138427e-06, "loss": 1.63, "step": 19525 }, { "epoch": 0.021210209137223428, "grad_norm": 21.482433319091797, "learning_rate": 4.994507424684085e-06, "loss": 1.8269, "step": 19530 }, { "epoch": 0.021215639298292868, "grad_norm": 16.502086639404297, "learning_rate": 4.99450459850363e-06, "loss": 1.452, "step": 19535 }, { "epoch": 0.021221069459362304, "grad_norm": 12.921049118041992, "learning_rate": 4.9945017715970625e-06, "loss": 2.3235, "step": 19540 }, { "epoch": 0.02122649962043174, "grad_norm": 13.42961597442627, "learning_rate": 4.994498943964383e-06, "loss": 2.0825, "step": 19545 }, { "epoch": 0.02123192978150118, "grad_norm": 59.79586410522461, "learning_rate": 4.994496115605594e-06, "loss": 2.1909, "step": 19550 }, { "epoch": 0.021237359942570616, "grad_norm": 29.60788345336914, "learning_rate": 4.994493286520694e-06, "loss": 2.3257, "step": 19555 }, { "epoch": 0.021242790103640056, "grad_norm": 55.64584732055664, "learning_rate": 4.994490456709684e-06, "loss": 1.5296, "step": 19560 }, { "epoch": 0.021248220264709492, "grad_norm": 22.763521194458008, "learning_rate": 4.994487626172568e-06, "loss": 1.2835, "step": 19565 }, { "epoch": 0.02125365042577893, "grad_norm": 21.541275024414062, "learning_rate": 4.994484794909343e-06, "loss": 2.0222, "step": 19570 }, { "epoch": 0.021259080586848368, "grad_norm": 21.873573303222656, "learning_rate": 4.994481962920012e-06, "loss": 1.6107, "step": 19575 }, { "epoch": 0.021264510747917804, "grad_norm": 26.164390563964844, "learning_rate": 4.994479130204576e-06, "loss": 1.8744, "step": 19580 }, { "epoch": 0.02126994090898724, "grad_norm": 22.265296936035156, "learning_rate": 4.994476296763034e-06, "loss": 1.9934, "step": 19585 }, { "epoch": 0.02127537107005668, "grad_norm": 20.370912551879883, "learning_rate": 4.994473462595389e-06, "loss": 1.5153, "step": 19590 }, { "epoch": 0.021280801231126117, "grad_norm": 58.27020263671875, "learning_rate": 4.994470627701639e-06, "loss": 2.1143, "step": 19595 }, { "epoch": 0.021286231392195556, "grad_norm": 17.937143325805664, "learning_rate": 4.994467792081788e-06, "loss": 2.1786, "step": 19600 }, { "epoch": 0.021291661553264993, "grad_norm": 29.732391357421875, "learning_rate": 4.994464955735835e-06, "loss": 2.3116, "step": 19605 }, { "epoch": 0.02129709171433443, "grad_norm": 45.148502349853516, "learning_rate": 4.994462118663781e-06, "loss": 1.9516, "step": 19610 }, { "epoch": 0.02130252187540387, "grad_norm": 29.125940322875977, "learning_rate": 4.994459280865627e-06, "loss": 1.0379, "step": 19615 }, { "epoch": 0.021307952036473305, "grad_norm": 17.36399269104004, "learning_rate": 4.994456442341374e-06, "loss": 1.6671, "step": 19620 }, { "epoch": 0.021313382197542745, "grad_norm": 17.3786678314209, "learning_rate": 4.994453603091023e-06, "loss": 1.8087, "step": 19625 }, { "epoch": 0.02131881235861218, "grad_norm": 15.486355781555176, "learning_rate": 4.994450763114574e-06, "loss": 1.7789, "step": 19630 }, { "epoch": 0.021324242519681617, "grad_norm": 15.915514945983887, "learning_rate": 4.994447922412029e-06, "loss": 2.1071, "step": 19635 }, { "epoch": 0.021329672680751057, "grad_norm": 72.47582244873047, "learning_rate": 4.9944450809833875e-06, "loss": 1.6037, "step": 19640 }, { "epoch": 0.021335102841820493, "grad_norm": 57.85175323486328, "learning_rate": 4.994442238828652e-06, "loss": 1.9278, "step": 19645 }, { "epoch": 0.021340533002889933, "grad_norm": 90.88634490966797, "learning_rate": 4.994439395947821e-06, "loss": 2.0793, "step": 19650 }, { "epoch": 0.02134596316395937, "grad_norm": 42.547996520996094, "learning_rate": 4.9944365523408965e-06, "loss": 1.9999, "step": 19655 }, { "epoch": 0.021351393325028806, "grad_norm": 109.89546203613281, "learning_rate": 4.994433708007881e-06, "loss": 1.5729, "step": 19660 }, { "epoch": 0.021356823486098245, "grad_norm": 72.00994110107422, "learning_rate": 4.994430862948773e-06, "loss": 1.9219, "step": 19665 }, { "epoch": 0.021362253647167682, "grad_norm": 18.887758255004883, "learning_rate": 4.994428017163573e-06, "loss": 1.6703, "step": 19670 }, { "epoch": 0.02136768380823712, "grad_norm": 15.69442081451416, "learning_rate": 4.9944251706522844e-06, "loss": 1.3911, "step": 19675 }, { "epoch": 0.021373113969306558, "grad_norm": 34.22526168823242, "learning_rate": 4.994422323414906e-06, "loss": 1.576, "step": 19680 }, { "epoch": 0.021378544130375994, "grad_norm": 19.33778953552246, "learning_rate": 4.994419475451439e-06, "loss": 2.1815, "step": 19685 }, { "epoch": 0.021383974291445434, "grad_norm": 24.03241539001465, "learning_rate": 4.994416626761886e-06, "loss": 1.8785, "step": 19690 }, { "epoch": 0.02138940445251487, "grad_norm": 20.449481964111328, "learning_rate": 4.9944137773462445e-06, "loss": 1.9084, "step": 19695 }, { "epoch": 0.02139483461358431, "grad_norm": 30.461233139038086, "learning_rate": 4.994410927204518e-06, "loss": 1.8492, "step": 19700 }, { "epoch": 0.021400264774653746, "grad_norm": 80.42002868652344, "learning_rate": 4.994408076336707e-06, "loss": 1.777, "step": 19705 }, { "epoch": 0.021405694935723182, "grad_norm": 24.77373504638672, "learning_rate": 4.99440522474281e-06, "loss": 1.8946, "step": 19710 }, { "epoch": 0.021411125096792622, "grad_norm": 105.27932739257812, "learning_rate": 4.994402372422831e-06, "loss": 2.7124, "step": 19715 }, { "epoch": 0.02141655525786206, "grad_norm": 22.850852966308594, "learning_rate": 4.994399519376768e-06, "loss": 1.5326, "step": 19720 }, { "epoch": 0.021421985418931495, "grad_norm": 37.777687072753906, "learning_rate": 4.994396665604625e-06, "loss": 2.1948, "step": 19725 }, { "epoch": 0.021427415580000934, "grad_norm": 34.4624137878418, "learning_rate": 4.9943938111064e-06, "loss": 1.3944, "step": 19730 }, { "epoch": 0.02143284574107037, "grad_norm": 35.24955368041992, "learning_rate": 4.994390955882095e-06, "loss": 2.2585, "step": 19735 }, { "epoch": 0.02143827590213981, "grad_norm": 16.38161849975586, "learning_rate": 4.994388099931711e-06, "loss": 1.1886, "step": 19740 }, { "epoch": 0.021443706063209247, "grad_norm": 40.7375373840332, "learning_rate": 4.994385243255249e-06, "loss": 1.6557, "step": 19745 }, { "epoch": 0.021449136224278683, "grad_norm": 16.461158752441406, "learning_rate": 4.994382385852709e-06, "loss": 2.1805, "step": 19750 }, { "epoch": 0.021454566385348123, "grad_norm": 45.78654861450195, "learning_rate": 4.994379527724091e-06, "loss": 2.4771, "step": 19755 }, { "epoch": 0.02145999654641756, "grad_norm": 37.78250503540039, "learning_rate": 4.994376668869399e-06, "loss": 1.6086, "step": 19760 }, { "epoch": 0.021465426707487, "grad_norm": 23.691194534301758, "learning_rate": 4.994373809288631e-06, "loss": 1.1615, "step": 19765 }, { "epoch": 0.021470856868556435, "grad_norm": 46.619140625, "learning_rate": 4.994370948981789e-06, "loss": 1.8664, "step": 19770 }, { "epoch": 0.02147628702962587, "grad_norm": 17.858388900756836, "learning_rate": 4.994368087948873e-06, "loss": 0.9524, "step": 19775 }, { "epoch": 0.02148171719069531, "grad_norm": 103.25812530517578, "learning_rate": 4.994365226189885e-06, "loss": 2.0031, "step": 19780 }, { "epoch": 0.021487147351764747, "grad_norm": 17.045642852783203, "learning_rate": 4.994362363704825e-06, "loss": 1.3132, "step": 19785 }, { "epoch": 0.021492577512834187, "grad_norm": 13.565329551696777, "learning_rate": 4.994359500493694e-06, "loss": 2.0357, "step": 19790 }, { "epoch": 0.021498007673903623, "grad_norm": 24.598493576049805, "learning_rate": 4.994356636556494e-06, "loss": 1.5412, "step": 19795 }, { "epoch": 0.02150343783497306, "grad_norm": 23.743324279785156, "learning_rate": 4.9943537718932235e-06, "loss": 1.5136, "step": 19800 }, { "epoch": 0.0215088679960425, "grad_norm": 19.52215003967285, "learning_rate": 4.994350906503884e-06, "loss": 2.0136, "step": 19805 }, { "epoch": 0.021514298157111936, "grad_norm": 37.3823127746582, "learning_rate": 4.9943480403884785e-06, "loss": 1.7566, "step": 19810 }, { "epoch": 0.021519728318181375, "grad_norm": 16.810466766357422, "learning_rate": 4.994345173547005e-06, "loss": 1.3969, "step": 19815 }, { "epoch": 0.021525158479250812, "grad_norm": 14.146956443786621, "learning_rate": 4.994342305979466e-06, "loss": 2.009, "step": 19820 }, { "epoch": 0.021530588640320248, "grad_norm": 23.25777244567871, "learning_rate": 4.9943394376858625e-06, "loss": 1.5698, "step": 19825 }, { "epoch": 0.021536018801389688, "grad_norm": 22.581279754638672, "learning_rate": 4.994336568666193e-06, "loss": 1.9547, "step": 19830 }, { "epoch": 0.021541448962459124, "grad_norm": 42.24113082885742, "learning_rate": 4.994333698920462e-06, "loss": 1.6532, "step": 19835 }, { "epoch": 0.021546879123528564, "grad_norm": 19.758560180664062, "learning_rate": 4.994330828448668e-06, "loss": 1.531, "step": 19840 }, { "epoch": 0.021552309284598, "grad_norm": 25.252756118774414, "learning_rate": 4.994327957250812e-06, "loss": 1.7702, "step": 19845 }, { "epoch": 0.021557739445667436, "grad_norm": 25.830141067504883, "learning_rate": 4.994325085326895e-06, "loss": 1.4347, "step": 19850 }, { "epoch": 0.021563169606736876, "grad_norm": 19.77467918395996, "learning_rate": 4.994322212676918e-06, "loss": 1.1862, "step": 19855 }, { "epoch": 0.021568599767806312, "grad_norm": 18.888568878173828, "learning_rate": 4.994319339300882e-06, "loss": 1.6675, "step": 19860 }, { "epoch": 0.02157402992887575, "grad_norm": 23.436237335205078, "learning_rate": 4.9943164651987864e-06, "loss": 1.5859, "step": 19865 }, { "epoch": 0.02157946008994519, "grad_norm": 86.88626098632812, "learning_rate": 4.994313590370634e-06, "loss": 1.729, "step": 19870 }, { "epoch": 0.021584890251014625, "grad_norm": 39.52510452270508, "learning_rate": 4.994310714816426e-06, "loss": 2.0947, "step": 19875 }, { "epoch": 0.021590320412084064, "grad_norm": 10.880677223205566, "learning_rate": 4.9943078385361605e-06, "loss": 1.731, "step": 19880 }, { "epoch": 0.0215957505731535, "grad_norm": 19.76401138305664, "learning_rate": 4.994304961529841e-06, "loss": 2.2426, "step": 19885 }, { "epoch": 0.021601180734222937, "grad_norm": 18.34990692138672, "learning_rate": 4.994302083797467e-06, "loss": 1.2844, "step": 19890 }, { "epoch": 0.021606610895292377, "grad_norm": 14.348336219787598, "learning_rate": 4.994299205339038e-06, "loss": 2.0384, "step": 19895 }, { "epoch": 0.021612041056361813, "grad_norm": 12.985107421875, "learning_rate": 4.994296326154558e-06, "loss": 1.5557, "step": 19900 }, { "epoch": 0.021617471217431253, "grad_norm": 31.825117111206055, "learning_rate": 4.994293446244026e-06, "loss": 1.4025, "step": 19905 }, { "epoch": 0.02162290137850069, "grad_norm": 23.263813018798828, "learning_rate": 4.994290565607444e-06, "loss": 2.0558, "step": 19910 }, { "epoch": 0.021628331539570125, "grad_norm": 16.785799026489258, "learning_rate": 4.994287684244811e-06, "loss": 2.1457, "step": 19915 }, { "epoch": 0.021633761700639565, "grad_norm": 19.413166046142578, "learning_rate": 4.994284802156129e-06, "loss": 1.5944, "step": 19920 }, { "epoch": 0.021639191861709, "grad_norm": 184.63905334472656, "learning_rate": 4.994281919341399e-06, "loss": 1.7634, "step": 19925 }, { "epoch": 0.02164462202277844, "grad_norm": 39.0566291809082, "learning_rate": 4.994279035800621e-06, "loss": 1.6506, "step": 19930 }, { "epoch": 0.021650052183847877, "grad_norm": 24.198760986328125, "learning_rate": 4.994276151533797e-06, "loss": 1.688, "step": 19935 }, { "epoch": 0.021655482344917314, "grad_norm": 26.17591667175293, "learning_rate": 4.994273266540926e-06, "loss": 1.963, "step": 19940 }, { "epoch": 0.021660912505986753, "grad_norm": 16.960674285888672, "learning_rate": 4.99427038082201e-06, "loss": 1.1558, "step": 19945 }, { "epoch": 0.02166634266705619, "grad_norm": 25.465017318725586, "learning_rate": 4.9942674943770514e-06, "loss": 1.6626, "step": 19950 }, { "epoch": 0.02167177282812563, "grad_norm": 37.009849548339844, "learning_rate": 4.994264607206048e-06, "loss": 1.737, "step": 19955 }, { "epoch": 0.021677202989195066, "grad_norm": 18.2247314453125, "learning_rate": 4.994261719309003e-06, "loss": 1.7419, "step": 19960 }, { "epoch": 0.021682633150264502, "grad_norm": 77.32970428466797, "learning_rate": 4.994258830685916e-06, "loss": 1.363, "step": 19965 }, { "epoch": 0.021688063311333942, "grad_norm": 19.451313018798828, "learning_rate": 4.994255941336789e-06, "loss": 1.357, "step": 19970 }, { "epoch": 0.021693493472403378, "grad_norm": 34.39308166503906, "learning_rate": 4.99425305126162e-06, "loss": 1.2718, "step": 19975 }, { "epoch": 0.021698923633472818, "grad_norm": 25.253955841064453, "learning_rate": 4.994250160460414e-06, "loss": 1.8488, "step": 19980 }, { "epoch": 0.021704353794542254, "grad_norm": 16.063762664794922, "learning_rate": 4.994247268933169e-06, "loss": 1.5764, "step": 19985 }, { "epoch": 0.02170978395561169, "grad_norm": 51.89494705200195, "learning_rate": 4.994244376679887e-06, "loss": 1.8809, "step": 19990 }, { "epoch": 0.02171521411668113, "grad_norm": 23.77412986755371, "learning_rate": 4.994241483700567e-06, "loss": 2.0211, "step": 19995 }, { "epoch": 0.021720644277750566, "grad_norm": 26.988513946533203, "learning_rate": 4.9942385899952126e-06, "loss": 2.0148, "step": 20000 }, { "epoch": 0.021726074438820003, "grad_norm": 16.11789321899414, "learning_rate": 4.994235695563822e-06, "loss": 1.3348, "step": 20005 }, { "epoch": 0.021731504599889442, "grad_norm": 16.090688705444336, "learning_rate": 4.994232800406399e-06, "loss": 1.7858, "step": 20010 }, { "epoch": 0.02173693476095888, "grad_norm": 13.639013290405273, "learning_rate": 4.994229904522942e-06, "loss": 1.702, "step": 20015 }, { "epoch": 0.02174236492202832, "grad_norm": 40.79269027709961, "learning_rate": 4.994227007913453e-06, "loss": 2.2181, "step": 20020 }, { "epoch": 0.021747795083097755, "grad_norm": 17.804758071899414, "learning_rate": 4.994224110577931e-06, "loss": 1.7049, "step": 20025 }, { "epoch": 0.02175322524416719, "grad_norm": 18.607730865478516, "learning_rate": 4.99422121251638e-06, "loss": 1.7642, "step": 20030 }, { "epoch": 0.02175865540523663, "grad_norm": 14.433475494384766, "learning_rate": 4.994218313728798e-06, "loss": 1.5663, "step": 20035 }, { "epoch": 0.021764085566306067, "grad_norm": 18.99093246459961, "learning_rate": 4.994215414215188e-06, "loss": 1.9289, "step": 20040 }, { "epoch": 0.021769515727375507, "grad_norm": 30.147403717041016, "learning_rate": 4.994212513975549e-06, "loss": 1.8892, "step": 20045 }, { "epoch": 0.021774945888444943, "grad_norm": 37.59225082397461, "learning_rate": 4.994209613009884e-06, "loss": 1.8927, "step": 20050 }, { "epoch": 0.02178037604951438, "grad_norm": 44.176307678222656, "learning_rate": 4.994206711318191e-06, "loss": 1.9203, "step": 20055 }, { "epoch": 0.02178580621058382, "grad_norm": 44.84906768798828, "learning_rate": 4.994203808900473e-06, "loss": 1.4588, "step": 20060 }, { "epoch": 0.021791236371653255, "grad_norm": 17.026199340820312, "learning_rate": 4.9942009057567305e-06, "loss": 1.798, "step": 20065 }, { "epoch": 0.021796666532722695, "grad_norm": 43.88307189941406, "learning_rate": 4.994198001886963e-06, "loss": 1.1537, "step": 20070 }, { "epoch": 0.02180209669379213, "grad_norm": 14.07160758972168, "learning_rate": 4.994195097291173e-06, "loss": 1.4885, "step": 20075 }, { "epoch": 0.021807526854861568, "grad_norm": 23.12123680114746, "learning_rate": 4.994192191969361e-06, "loss": 1.1754, "step": 20080 }, { "epoch": 0.021812957015931007, "grad_norm": 13.759331703186035, "learning_rate": 4.994189285921528e-06, "loss": 1.5731, "step": 20085 }, { "epoch": 0.021818387177000444, "grad_norm": 51.603233337402344, "learning_rate": 4.9941863791476745e-06, "loss": 1.1185, "step": 20090 }, { "epoch": 0.021823817338069883, "grad_norm": 19.091772079467773, "learning_rate": 4.9941834716478e-06, "loss": 1.4437, "step": 20095 }, { "epoch": 0.02182924749913932, "grad_norm": 12.075485229492188, "learning_rate": 4.994180563421907e-06, "loss": 1.5612, "step": 20100 }, { "epoch": 0.021834677660208756, "grad_norm": 37.25170135498047, "learning_rate": 4.994177654469997e-06, "loss": 1.9881, "step": 20105 }, { "epoch": 0.021840107821278196, "grad_norm": 23.745393753051758, "learning_rate": 4.994174744792069e-06, "loss": 1.9986, "step": 20110 }, { "epoch": 0.021845537982347632, "grad_norm": 15.37241268157959, "learning_rate": 4.994171834388125e-06, "loss": 1.3277, "step": 20115 }, { "epoch": 0.021850968143417072, "grad_norm": 15.785223007202148, "learning_rate": 4.994168923258166e-06, "loss": 1.544, "step": 20120 }, { "epoch": 0.021856398304486508, "grad_norm": 15.653594970703125, "learning_rate": 4.994166011402192e-06, "loss": 1.3488, "step": 20125 }, { "epoch": 0.021861828465555944, "grad_norm": 45.74237823486328, "learning_rate": 4.994163098820204e-06, "loss": 1.4584, "step": 20130 }, { "epoch": 0.021867258626625384, "grad_norm": 21.165998458862305, "learning_rate": 4.994160185512203e-06, "loss": 1.975, "step": 20135 }, { "epoch": 0.02187268878769482, "grad_norm": 36.73088073730469, "learning_rate": 4.994157271478189e-06, "loss": 1.6478, "step": 20140 }, { "epoch": 0.021878118948764257, "grad_norm": 13.419097900390625, "learning_rate": 4.994154356718166e-06, "loss": 1.2594, "step": 20145 }, { "epoch": 0.021883549109833696, "grad_norm": 23.110416412353516, "learning_rate": 4.994151441232131e-06, "loss": 2.1975, "step": 20150 }, { "epoch": 0.021888979270903133, "grad_norm": 16.68861198425293, "learning_rate": 4.9941485250200865e-06, "loss": 1.3428, "step": 20155 }, { "epoch": 0.021894409431972572, "grad_norm": 20.60615348815918, "learning_rate": 4.994145608082034e-06, "loss": 1.6668, "step": 20160 }, { "epoch": 0.02189983959304201, "grad_norm": 21.66018295288086, "learning_rate": 4.994142690417973e-06, "loss": 1.4351, "step": 20165 }, { "epoch": 0.021905269754111445, "grad_norm": 17.346935272216797, "learning_rate": 4.9941397720279055e-06, "loss": 1.0579, "step": 20170 }, { "epoch": 0.021910699915180885, "grad_norm": 15.98559284210205, "learning_rate": 4.994136852911831e-06, "loss": 1.6989, "step": 20175 }, { "epoch": 0.02191613007625032, "grad_norm": 26.473344802856445, "learning_rate": 4.994133933069753e-06, "loss": 2.0813, "step": 20180 }, { "epoch": 0.02192156023731976, "grad_norm": 11.487579345703125, "learning_rate": 4.994131012501669e-06, "loss": 1.5276, "step": 20185 }, { "epoch": 0.021926990398389197, "grad_norm": 20.275230407714844, "learning_rate": 4.994128091207582e-06, "loss": 1.55, "step": 20190 }, { "epoch": 0.021932420559458633, "grad_norm": 27.2379150390625, "learning_rate": 4.994125169187492e-06, "loss": 1.4063, "step": 20195 }, { "epoch": 0.021937850720528073, "grad_norm": 24.535625457763672, "learning_rate": 4.9941222464414005e-06, "loss": 2.2148, "step": 20200 }, { "epoch": 0.02194328088159751, "grad_norm": 22.60789680480957, "learning_rate": 4.994119322969308e-06, "loss": 1.4357, "step": 20205 }, { "epoch": 0.02194871104266695, "grad_norm": 17.83708953857422, "learning_rate": 4.994116398771215e-06, "loss": 1.3975, "step": 20210 }, { "epoch": 0.021954141203736385, "grad_norm": 36.029266357421875, "learning_rate": 4.994113473847122e-06, "loss": 2.0254, "step": 20215 }, { "epoch": 0.02195957136480582, "grad_norm": 32.332515716552734, "learning_rate": 4.994110548197032e-06, "loss": 1.4819, "step": 20220 }, { "epoch": 0.02196500152587526, "grad_norm": 30.86577033996582, "learning_rate": 4.994107621820942e-06, "loss": 1.919, "step": 20225 }, { "epoch": 0.021970431686944698, "grad_norm": 18.19399070739746, "learning_rate": 4.994104694718858e-06, "loss": 1.2867, "step": 20230 }, { "epoch": 0.021975861848014137, "grad_norm": 15.435524940490723, "learning_rate": 4.994101766890776e-06, "loss": 1.0479, "step": 20235 }, { "epoch": 0.021981292009083574, "grad_norm": 15.081009864807129, "learning_rate": 4.9940988383367e-06, "loss": 1.8918, "step": 20240 }, { "epoch": 0.02198672217015301, "grad_norm": 32.69580841064453, "learning_rate": 4.99409590905663e-06, "loss": 1.8547, "step": 20245 }, { "epoch": 0.02199215233122245, "grad_norm": 24.41537857055664, "learning_rate": 4.994092979050566e-06, "loss": 1.5542, "step": 20250 }, { "epoch": 0.021997582492291886, "grad_norm": 30.5537166595459, "learning_rate": 4.99409004831851e-06, "loss": 2.2925, "step": 20255 }, { "epoch": 0.022003012653361326, "grad_norm": 18.487056732177734, "learning_rate": 4.994087116860462e-06, "loss": 1.5643, "step": 20260 }, { "epoch": 0.022008442814430762, "grad_norm": 31.945768356323242, "learning_rate": 4.994084184676423e-06, "loss": 0.9966, "step": 20265 }, { "epoch": 0.0220138729755002, "grad_norm": 15.879034996032715, "learning_rate": 4.994081251766395e-06, "loss": 2.0125, "step": 20270 }, { "epoch": 0.022019303136569638, "grad_norm": 76.0359115600586, "learning_rate": 4.994078318130377e-06, "loss": 1.5859, "step": 20275 }, { "epoch": 0.022024733297639074, "grad_norm": 21.191801071166992, "learning_rate": 4.99407538376837e-06, "loss": 1.9785, "step": 20280 }, { "epoch": 0.02203016345870851, "grad_norm": 13.7738676071167, "learning_rate": 4.994072448680377e-06, "loss": 1.7901, "step": 20285 }, { "epoch": 0.02203559361977795, "grad_norm": 40.36952209472656, "learning_rate": 4.994069512866397e-06, "loss": 1.5354, "step": 20290 }, { "epoch": 0.022041023780847387, "grad_norm": 19.133623123168945, "learning_rate": 4.994066576326431e-06, "loss": 2.1398, "step": 20295 }, { "epoch": 0.022046453941916826, "grad_norm": 26.690837860107422, "learning_rate": 4.99406363906048e-06, "loss": 1.3533, "step": 20300 }, { "epoch": 0.022051884102986263, "grad_norm": 17.603038787841797, "learning_rate": 4.994060701068546e-06, "loss": 1.8079, "step": 20305 }, { "epoch": 0.0220573142640557, "grad_norm": 21.37961196899414, "learning_rate": 4.9940577623506285e-06, "loss": 1.5122, "step": 20310 }, { "epoch": 0.02206274442512514, "grad_norm": 36.8828010559082, "learning_rate": 4.9940548229067284e-06, "loss": 1.7941, "step": 20315 }, { "epoch": 0.022068174586194575, "grad_norm": 31.648311614990234, "learning_rate": 4.9940518827368466e-06, "loss": 1.2141, "step": 20320 }, { "epoch": 0.022073604747264015, "grad_norm": 17.081390380859375, "learning_rate": 4.9940489418409855e-06, "loss": 1.3874, "step": 20325 }, { "epoch": 0.02207903490833345, "grad_norm": 18.6125545501709, "learning_rate": 4.9940460002191435e-06, "loss": 1.7244, "step": 20330 }, { "epoch": 0.022084465069402887, "grad_norm": 69.52564239501953, "learning_rate": 4.994043057871323e-06, "loss": 1.8951, "step": 20335 }, { "epoch": 0.022089895230472327, "grad_norm": 19.753421783447266, "learning_rate": 4.994040114797525e-06, "loss": 1.9707, "step": 20340 }, { "epoch": 0.022095325391541763, "grad_norm": 24.177812576293945, "learning_rate": 4.994037170997749e-06, "loss": 1.7038, "step": 20345 }, { "epoch": 0.022100755552611203, "grad_norm": 16.5686092376709, "learning_rate": 4.9940342264719975e-06, "loss": 2.034, "step": 20350 }, { "epoch": 0.02210618571368064, "grad_norm": 17.73090934753418, "learning_rate": 4.9940312812202695e-06, "loss": 1.9408, "step": 20355 }, { "epoch": 0.022111615874750076, "grad_norm": 25.913925170898438, "learning_rate": 4.994028335242568e-06, "loss": 1.6546, "step": 20360 }, { "epoch": 0.022117046035819515, "grad_norm": 11.92155933380127, "learning_rate": 4.994025388538893e-06, "loss": 1.3603, "step": 20365 }, { "epoch": 0.02212247619688895, "grad_norm": 41.606178283691406, "learning_rate": 4.994022441109244e-06, "loss": 1.3024, "step": 20370 }, { "epoch": 0.02212790635795839, "grad_norm": 16.504270553588867, "learning_rate": 4.9940194929536235e-06, "loss": 2.0639, "step": 20375 }, { "epoch": 0.022133336519027828, "grad_norm": 27.18010139465332, "learning_rate": 4.994016544072032e-06, "loss": 1.7545, "step": 20380 }, { "epoch": 0.022138766680097264, "grad_norm": 28.939790725708008, "learning_rate": 4.9940135944644704e-06, "loss": 2.1524, "step": 20385 }, { "epoch": 0.022144196841166704, "grad_norm": 62.13677215576172, "learning_rate": 4.994010644130939e-06, "loss": 1.526, "step": 20390 }, { "epoch": 0.02214962700223614, "grad_norm": 14.764123916625977, "learning_rate": 4.99400769307144e-06, "loss": 1.3721, "step": 20395 }, { "epoch": 0.02215505716330558, "grad_norm": 15.1353120803833, "learning_rate": 4.994004741285972e-06, "loss": 1.4186, "step": 20400 }, { "epoch": 0.022160487324375016, "grad_norm": 81.07228088378906, "learning_rate": 4.994001788774538e-06, "loss": 0.8682, "step": 20405 }, { "epoch": 0.022165917485444452, "grad_norm": 22.70361328125, "learning_rate": 4.9939988355371375e-06, "loss": 1.5289, "step": 20410 }, { "epoch": 0.022171347646513892, "grad_norm": 52.41940689086914, "learning_rate": 4.993995881573772e-06, "loss": 1.2586, "step": 20415 }, { "epoch": 0.02217677780758333, "grad_norm": 18.13900375366211, "learning_rate": 4.993992926884442e-06, "loss": 1.1528, "step": 20420 }, { "epoch": 0.022182207968652765, "grad_norm": 13.271919250488281, "learning_rate": 4.99398997146915e-06, "loss": 1.3697, "step": 20425 }, { "epoch": 0.022187638129722204, "grad_norm": 21.793010711669922, "learning_rate": 4.993987015327894e-06, "loss": 1.2679, "step": 20430 }, { "epoch": 0.02219306829079164, "grad_norm": 32.156803131103516, "learning_rate": 4.993984058460677e-06, "loss": 1.1671, "step": 20435 }, { "epoch": 0.02219849845186108, "grad_norm": 32.61456298828125, "learning_rate": 4.9939811008674985e-06, "loss": 1.3804, "step": 20440 }, { "epoch": 0.022203928612930517, "grad_norm": 78.04997253417969, "learning_rate": 4.993978142548361e-06, "loss": 2.3516, "step": 20445 }, { "epoch": 0.022209358773999953, "grad_norm": 14.796621322631836, "learning_rate": 4.993975183503263e-06, "loss": 1.7615, "step": 20450 }, { "epoch": 0.022214788935069393, "grad_norm": 24.86359214782715, "learning_rate": 4.993972223732209e-06, "loss": 1.8336, "step": 20455 }, { "epoch": 0.02222021909613883, "grad_norm": 14.044770240783691, "learning_rate": 4.993969263235196e-06, "loss": 1.0791, "step": 20460 }, { "epoch": 0.02222564925720827, "grad_norm": 15.003090858459473, "learning_rate": 4.993966302012227e-06, "loss": 1.6885, "step": 20465 }, { "epoch": 0.022231079418277705, "grad_norm": 24.978273391723633, "learning_rate": 4.993963340063302e-06, "loss": 1.7357, "step": 20470 }, { "epoch": 0.02223650957934714, "grad_norm": 13.087569236755371, "learning_rate": 4.993960377388422e-06, "loss": 1.2127, "step": 20475 }, { "epoch": 0.02224193974041658, "grad_norm": 25.967575073242188, "learning_rate": 4.993957413987589e-06, "loss": 1.4838, "step": 20480 }, { "epoch": 0.022247369901486017, "grad_norm": 14.602943420410156, "learning_rate": 4.9939544498608025e-06, "loss": 1.7041, "step": 20485 }, { "epoch": 0.022252800062555457, "grad_norm": 20.99726104736328, "learning_rate": 4.993951485008063e-06, "loss": 1.3804, "step": 20490 }, { "epoch": 0.022258230223624893, "grad_norm": 12.833792686462402, "learning_rate": 4.993948519429374e-06, "loss": 1.4245, "step": 20495 }, { "epoch": 0.02226366038469433, "grad_norm": 265.6318054199219, "learning_rate": 4.9939455531247325e-06, "loss": 1.8202, "step": 20500 }, { "epoch": 0.02226909054576377, "grad_norm": 16.95115089416504, "learning_rate": 4.9939425860941425e-06, "loss": 1.175, "step": 20505 }, { "epoch": 0.022274520706833206, "grad_norm": 77.2870864868164, "learning_rate": 4.993939618337603e-06, "loss": 1.247, "step": 20510 }, { "epoch": 0.022279950867902645, "grad_norm": 15.30884075164795, "learning_rate": 4.993936649855117e-06, "loss": 1.3603, "step": 20515 }, { "epoch": 0.02228538102897208, "grad_norm": 17.871593475341797, "learning_rate": 4.993933680646683e-06, "loss": 1.5347, "step": 20520 }, { "epoch": 0.022290811190041518, "grad_norm": 13.842467308044434, "learning_rate": 4.9939307107123034e-06, "loss": 1.6779, "step": 20525 }, { "epoch": 0.022296241351110958, "grad_norm": 32.01366424560547, "learning_rate": 4.993927740051978e-06, "loss": 1.4123, "step": 20530 }, { "epoch": 0.022301671512180394, "grad_norm": 15.399415969848633, "learning_rate": 4.9939247686657076e-06, "loss": 1.6674, "step": 20535 }, { "epoch": 0.02230710167324983, "grad_norm": 21.099937438964844, "learning_rate": 4.993921796553495e-06, "loss": 0.9873, "step": 20540 }, { "epoch": 0.02231253183431927, "grad_norm": 23.859264373779297, "learning_rate": 4.993918823715339e-06, "loss": 2.2828, "step": 20545 }, { "epoch": 0.022317961995388706, "grad_norm": 16.677349090576172, "learning_rate": 4.9939158501512405e-06, "loss": 1.8655, "step": 20550 }, { "epoch": 0.022323392156458146, "grad_norm": 41.478111267089844, "learning_rate": 4.993912875861202e-06, "loss": 1.3329, "step": 20555 }, { "epoch": 0.022328822317527582, "grad_norm": 24.20793342590332, "learning_rate": 4.993909900845223e-06, "loss": 1.2421, "step": 20560 }, { "epoch": 0.02233425247859702, "grad_norm": 20.980220794677734, "learning_rate": 4.993906925103306e-06, "loss": 1.9175, "step": 20565 }, { "epoch": 0.02233968263966646, "grad_norm": 22.037118911743164, "learning_rate": 4.993903948635449e-06, "loss": 1.8522, "step": 20570 }, { "epoch": 0.022345112800735895, "grad_norm": 33.43545150756836, "learning_rate": 4.993900971441656e-06, "loss": 2.2189, "step": 20575 }, { "epoch": 0.022350542961805334, "grad_norm": 43.6757926940918, "learning_rate": 4.993897993521924e-06, "loss": 2.626, "step": 20580 }, { "epoch": 0.02235597312287477, "grad_norm": 17.10997772216797, "learning_rate": 4.993895014876259e-06, "loss": 1.437, "step": 20585 }, { "epoch": 0.022361403283944207, "grad_norm": 15.233718872070312, "learning_rate": 4.993892035504657e-06, "loss": 1.6499, "step": 20590 }, { "epoch": 0.022366833445013647, "grad_norm": 15.728096008300781, "learning_rate": 4.9938890554071225e-06, "loss": 1.34, "step": 20595 }, { "epoch": 0.022372263606083083, "grad_norm": 16.532512664794922, "learning_rate": 4.993886074583654e-06, "loss": 1.0291, "step": 20600 }, { "epoch": 0.022377693767152523, "grad_norm": 22.550416946411133, "learning_rate": 4.993883093034253e-06, "loss": 1.698, "step": 20605 }, { "epoch": 0.02238312392822196, "grad_norm": 27.364404678344727, "learning_rate": 4.993880110758921e-06, "loss": 2.2017, "step": 20610 }, { "epoch": 0.022388554089291395, "grad_norm": 20.419437408447266, "learning_rate": 4.9938771277576586e-06, "loss": 1.8858, "step": 20615 }, { "epoch": 0.022393984250360835, "grad_norm": 52.91302490234375, "learning_rate": 4.993874144030466e-06, "loss": 1.2576, "step": 20620 }, { "epoch": 0.02239941441143027, "grad_norm": 27.20047378540039, "learning_rate": 4.993871159577345e-06, "loss": 1.5765, "step": 20625 }, { "epoch": 0.02240484457249971, "grad_norm": 33.486900329589844, "learning_rate": 4.993868174398296e-06, "loss": 1.6892, "step": 20630 }, { "epoch": 0.022410274733569147, "grad_norm": 40.60007858276367, "learning_rate": 4.99386518849332e-06, "loss": 1.8116, "step": 20635 }, { "epoch": 0.022415704894638584, "grad_norm": 31.031295776367188, "learning_rate": 4.993862201862417e-06, "loss": 1.8429, "step": 20640 }, { "epoch": 0.022421135055708023, "grad_norm": 100.33097076416016, "learning_rate": 4.99385921450559e-06, "loss": 1.5625, "step": 20645 }, { "epoch": 0.02242656521677746, "grad_norm": 55.682403564453125, "learning_rate": 4.993856226422837e-06, "loss": 1.7302, "step": 20650 }, { "epoch": 0.0224319953778469, "grad_norm": 26.836959838867188, "learning_rate": 4.993853237614162e-06, "loss": 1.6989, "step": 20655 }, { "epoch": 0.022437425538916336, "grad_norm": 24.09429931640625, "learning_rate": 4.993850248079563e-06, "loss": 1.7464, "step": 20660 }, { "epoch": 0.022442855699985772, "grad_norm": 15.762079238891602, "learning_rate": 4.993847257819043e-06, "loss": 1.7967, "step": 20665 }, { "epoch": 0.02244828586105521, "grad_norm": 24.07015037536621, "learning_rate": 4.993844266832602e-06, "loss": 1.4465, "step": 20670 }, { "epoch": 0.022453716022124648, "grad_norm": 16.73897361755371, "learning_rate": 4.99384127512024e-06, "loss": 1.5168, "step": 20675 }, { "epoch": 0.022459146183194084, "grad_norm": 18.325029373168945, "learning_rate": 4.993838282681959e-06, "loss": 1.3909, "step": 20680 }, { "epoch": 0.022464576344263524, "grad_norm": 15.089925765991211, "learning_rate": 4.99383528951776e-06, "loss": 1.0942, "step": 20685 }, { "epoch": 0.02247000650533296, "grad_norm": 68.61408996582031, "learning_rate": 4.993832295627643e-06, "loss": 1.8922, "step": 20690 }, { "epoch": 0.0224754366664024, "grad_norm": 56.47751998901367, "learning_rate": 4.99382930101161e-06, "loss": 1.5605, "step": 20695 }, { "epoch": 0.022480866827471836, "grad_norm": 35.566890716552734, "learning_rate": 4.993826305669661e-06, "loss": 1.3574, "step": 20700 }, { "epoch": 0.022486296988541273, "grad_norm": 18.76156234741211, "learning_rate": 4.9938233096017975e-06, "loss": 2.1445, "step": 20705 }, { "epoch": 0.022491727149610712, "grad_norm": 22.542564392089844, "learning_rate": 4.993820312808019e-06, "loss": 2.4113, "step": 20710 }, { "epoch": 0.02249715731068015, "grad_norm": 23.49126434326172, "learning_rate": 4.993817315288328e-06, "loss": 1.5686, "step": 20715 }, { "epoch": 0.02250258747174959, "grad_norm": 26.04693603515625, "learning_rate": 4.993814317042725e-06, "loss": 1.3397, "step": 20720 }, { "epoch": 0.022508017632819025, "grad_norm": 63.763587951660156, "learning_rate": 4.99381131807121e-06, "loss": 1.2752, "step": 20725 }, { "epoch": 0.02251344779388846, "grad_norm": 36.916961669921875, "learning_rate": 4.993808318373784e-06, "loss": 1.1649, "step": 20730 }, { "epoch": 0.0225188779549579, "grad_norm": 21.28053855895996, "learning_rate": 4.99380531795045e-06, "loss": 2.0598, "step": 20735 }, { "epoch": 0.022524308116027337, "grad_norm": 44.485599517822266, "learning_rate": 4.993802316801206e-06, "loss": 2.1267, "step": 20740 }, { "epoch": 0.022529738277096777, "grad_norm": 45.01779556274414, "learning_rate": 4.993799314926054e-06, "loss": 1.9025, "step": 20745 }, { "epoch": 0.022535168438166213, "grad_norm": 24.388225555419922, "learning_rate": 4.993796312324995e-06, "loss": 1.4928, "step": 20750 }, { "epoch": 0.02254059859923565, "grad_norm": 29.980051040649414, "learning_rate": 4.99379330899803e-06, "loss": 1.986, "step": 20755 }, { "epoch": 0.02254602876030509, "grad_norm": 141.17369079589844, "learning_rate": 4.99379030494516e-06, "loss": 1.0629, "step": 20760 }, { "epoch": 0.022551458921374525, "grad_norm": 25.55675506591797, "learning_rate": 4.993787300166386e-06, "loss": 1.7557, "step": 20765 }, { "epoch": 0.022556889082443965, "grad_norm": 71.94026184082031, "learning_rate": 4.993784294661708e-06, "loss": 1.1584, "step": 20770 }, { "epoch": 0.0225623192435134, "grad_norm": 17.542451858520508, "learning_rate": 4.9937812884311265e-06, "loss": 1.7095, "step": 20775 }, { "epoch": 0.022567749404582838, "grad_norm": 23.512664794921875, "learning_rate": 4.993778281474644e-06, "loss": 1.6384, "step": 20780 }, { "epoch": 0.022573179565652277, "grad_norm": 44.92743682861328, "learning_rate": 4.99377527379226e-06, "loss": 1.167, "step": 20785 }, { "epoch": 0.022578609726721714, "grad_norm": 15.043946266174316, "learning_rate": 4.993772265383977e-06, "loss": 1.6775, "step": 20790 }, { "epoch": 0.022584039887791153, "grad_norm": 13.932615280151367, "learning_rate": 4.993769256249794e-06, "loss": 2.1666, "step": 20795 }, { "epoch": 0.02258947004886059, "grad_norm": 57.76084518432617, "learning_rate": 4.993766246389713e-06, "loss": 1.3122, "step": 20800 }, { "epoch": 0.022594900209930026, "grad_norm": 13.802202224731445, "learning_rate": 4.993763235803734e-06, "loss": 2.1176, "step": 20805 }, { "epoch": 0.022600330370999466, "grad_norm": 11.213041305541992, "learning_rate": 4.993760224491859e-06, "loss": 1.9951, "step": 20810 }, { "epoch": 0.022605760532068902, "grad_norm": 41.38987731933594, "learning_rate": 4.993757212454089e-06, "loss": 1.2274, "step": 20815 }, { "epoch": 0.022611190693138338, "grad_norm": 13.499621391296387, "learning_rate": 4.993754199690424e-06, "loss": 1.7334, "step": 20820 }, { "epoch": 0.022616620854207778, "grad_norm": 255.66053771972656, "learning_rate": 4.993751186200865e-06, "loss": 1.7898, "step": 20825 }, { "epoch": 0.022622051015277214, "grad_norm": 18.26567268371582, "learning_rate": 4.9937481719854115e-06, "loss": 1.8722, "step": 20830 }, { "epoch": 0.022627481176346654, "grad_norm": 33.003944396972656, "learning_rate": 4.993745157044067e-06, "loss": 1.2415, "step": 20835 }, { "epoch": 0.02263291133741609, "grad_norm": 14.673226356506348, "learning_rate": 4.993742141376832e-06, "loss": 2.0667, "step": 20840 }, { "epoch": 0.022638341498485526, "grad_norm": 26.661399841308594, "learning_rate": 4.993739124983706e-06, "loss": 1.3495, "step": 20845 }, { "epoch": 0.022643771659554966, "grad_norm": 16.284717559814453, "learning_rate": 4.99373610786469e-06, "loss": 1.4896, "step": 20850 }, { "epoch": 0.022649201820624403, "grad_norm": 33.94048309326172, "learning_rate": 4.9937330900197855e-06, "loss": 1.5691, "step": 20855 }, { "epoch": 0.022654631981693842, "grad_norm": 18.643571853637695, "learning_rate": 4.993730071448993e-06, "loss": 1.9381, "step": 20860 }, { "epoch": 0.02266006214276328, "grad_norm": 50.8658561706543, "learning_rate": 4.993727052152315e-06, "loss": 1.7678, "step": 20865 }, { "epoch": 0.022665492303832715, "grad_norm": 19.02729606628418, "learning_rate": 4.9937240321297494e-06, "loss": 1.7611, "step": 20870 }, { "epoch": 0.022670922464902155, "grad_norm": 35.62952423095703, "learning_rate": 4.9937210113813e-06, "loss": 2.3337, "step": 20875 }, { "epoch": 0.02267635262597159, "grad_norm": 19.819332122802734, "learning_rate": 4.993717989906966e-06, "loss": 1.6905, "step": 20880 }, { "epoch": 0.02268178278704103, "grad_norm": 17.50533103942871, "learning_rate": 4.993714967706747e-06, "loss": 1.4955, "step": 20885 }, { "epoch": 0.022687212948110467, "grad_norm": 22.863658905029297, "learning_rate": 4.993711944780648e-06, "loss": 1.3077, "step": 20890 }, { "epoch": 0.022692643109179903, "grad_norm": 32.6266975402832, "learning_rate": 4.993708921128666e-06, "loss": 1.8469, "step": 20895 }, { "epoch": 0.022698073270249343, "grad_norm": 25.002172470092773, "learning_rate": 4.993705896750804e-06, "loss": 1.4463, "step": 20900 }, { "epoch": 0.02270350343131878, "grad_norm": 20.38374900817871, "learning_rate": 4.993702871647061e-06, "loss": 1.4018, "step": 20905 }, { "epoch": 0.02270893359238822, "grad_norm": 27.063100814819336, "learning_rate": 4.99369984581744e-06, "loss": 1.7188, "step": 20910 }, { "epoch": 0.022714363753457655, "grad_norm": 41.73640823364258, "learning_rate": 4.9936968192619404e-06, "loss": 2.2114, "step": 20915 }, { "epoch": 0.02271979391452709, "grad_norm": 35.53876876831055, "learning_rate": 4.993693791980564e-06, "loss": 1.5254, "step": 20920 }, { "epoch": 0.02272522407559653, "grad_norm": 13.213571548461914, "learning_rate": 4.993690763973311e-06, "loss": 1.4292, "step": 20925 }, { "epoch": 0.022730654236665968, "grad_norm": 53.99330520629883, "learning_rate": 4.993687735240182e-06, "loss": 1.8032, "step": 20930 }, { "epoch": 0.022736084397735407, "grad_norm": 42.15294647216797, "learning_rate": 4.9936847057811796e-06, "loss": 1.893, "step": 20935 }, { "epoch": 0.022741514558804844, "grad_norm": 17.828359603881836, "learning_rate": 4.9936816755963026e-06, "loss": 1.3106, "step": 20940 }, { "epoch": 0.02274694471987428, "grad_norm": 70.28306579589844, "learning_rate": 4.993678644685553e-06, "loss": 1.4214, "step": 20945 }, { "epoch": 0.02275237488094372, "grad_norm": 27.8616886138916, "learning_rate": 4.993675613048932e-06, "loss": 1.7373, "step": 20950 }, { "epoch": 0.022757805042013156, "grad_norm": 46.09930419921875, "learning_rate": 4.993672580686439e-06, "loss": 1.8605, "step": 20955 }, { "epoch": 0.022763235203082592, "grad_norm": 26.85961151123047, "learning_rate": 4.993669547598077e-06, "loss": 2.1522, "step": 20960 }, { "epoch": 0.022768665364152032, "grad_norm": 36.175350189208984, "learning_rate": 4.993666513783846e-06, "loss": 1.3502, "step": 20965 }, { "epoch": 0.022774095525221468, "grad_norm": 17.083919525146484, "learning_rate": 4.993663479243746e-06, "loss": 1.0464, "step": 20970 }, { "epoch": 0.022779525686290908, "grad_norm": 12.855502128601074, "learning_rate": 4.993660443977778e-06, "loss": 1.2307, "step": 20975 }, { "epoch": 0.022784955847360344, "grad_norm": 24.123462677001953, "learning_rate": 4.993657407985943e-06, "loss": 1.7398, "step": 20980 }, { "epoch": 0.02279038600842978, "grad_norm": 76.86215209960938, "learning_rate": 4.993654371268244e-06, "loss": 1.4186, "step": 20985 }, { "epoch": 0.02279581616949922, "grad_norm": 14.964653015136719, "learning_rate": 4.993651333824679e-06, "loss": 1.0739, "step": 20990 }, { "epoch": 0.022801246330568656, "grad_norm": 99.67698669433594, "learning_rate": 4.99364829565525e-06, "loss": 1.5785, "step": 20995 }, { "epoch": 0.022806676491638096, "grad_norm": 18.994136810302734, "learning_rate": 4.993645256759958e-06, "loss": 1.9578, "step": 21000 }, { "epoch": 0.022812106652707533, "grad_norm": 30.883256912231445, "learning_rate": 4.993642217138804e-06, "loss": 2.6111, "step": 21005 }, { "epoch": 0.02281753681377697, "grad_norm": 16.74103355407715, "learning_rate": 4.993639176791789e-06, "loss": 1.7713, "step": 21010 }, { "epoch": 0.02282296697484641, "grad_norm": 23.193391799926758, "learning_rate": 4.9936361357189135e-06, "loss": 1.6526, "step": 21015 }, { "epoch": 0.022828397135915845, "grad_norm": 19.070905685424805, "learning_rate": 4.993633093920178e-06, "loss": 1.7141, "step": 21020 }, { "epoch": 0.022833827296985285, "grad_norm": 16.078369140625, "learning_rate": 4.993630051395584e-06, "loss": 2.0532, "step": 21025 }, { "epoch": 0.02283925745805472, "grad_norm": 78.57373046875, "learning_rate": 4.993627008145132e-06, "loss": 1.3864, "step": 21030 }, { "epoch": 0.022844687619124157, "grad_norm": 81.82662963867188, "learning_rate": 4.993623964168823e-06, "loss": 1.4218, "step": 21035 }, { "epoch": 0.022850117780193597, "grad_norm": 56.33750534057617, "learning_rate": 4.9936209194666585e-06, "loss": 1.4577, "step": 21040 }, { "epoch": 0.022855547941263033, "grad_norm": 13.653002738952637, "learning_rate": 4.9936178740386384e-06, "loss": 1.6814, "step": 21045 }, { "epoch": 0.022860978102332473, "grad_norm": 19.538122177124023, "learning_rate": 4.993614827884765e-06, "loss": 1.8372, "step": 21050 }, { "epoch": 0.02286640826340191, "grad_norm": 173.50970458984375, "learning_rate": 4.9936117810050375e-06, "loss": 1.6649, "step": 21055 }, { "epoch": 0.022871838424471345, "grad_norm": 97.86211395263672, "learning_rate": 4.9936087333994575e-06, "loss": 2.0515, "step": 21060 }, { "epoch": 0.022877268585540785, "grad_norm": 15.634221076965332, "learning_rate": 4.993605685068027e-06, "loss": 1.4868, "step": 21065 }, { "epoch": 0.02288269874661022, "grad_norm": 16.499860763549805, "learning_rate": 4.993602636010745e-06, "loss": 1.33, "step": 21070 }, { "epoch": 0.02288812890767966, "grad_norm": 17.96260643005371, "learning_rate": 4.993599586227613e-06, "loss": 1.6304, "step": 21075 }, { "epoch": 0.022893559068749098, "grad_norm": 17.949567794799805, "learning_rate": 4.993596535718632e-06, "loss": 2.2938, "step": 21080 }, { "epoch": 0.022898989229818534, "grad_norm": 88.27405548095703, "learning_rate": 4.993593484483804e-06, "loss": 1.504, "step": 21085 }, { "epoch": 0.022904419390887974, "grad_norm": 71.36280822753906, "learning_rate": 4.9935904325231275e-06, "loss": 1.5423, "step": 21090 }, { "epoch": 0.02290984955195741, "grad_norm": 22.732955932617188, "learning_rate": 4.993587379836606e-06, "loss": 1.849, "step": 21095 }, { "epoch": 0.022915279713026846, "grad_norm": 43.8634033203125, "learning_rate": 4.993584326424238e-06, "loss": 1.6536, "step": 21100 }, { "epoch": 0.022920709874096286, "grad_norm": 21.243104934692383, "learning_rate": 4.993581272286027e-06, "loss": 2.0167, "step": 21105 }, { "epoch": 0.022926140035165722, "grad_norm": 113.28362274169922, "learning_rate": 4.993578217421971e-06, "loss": 1.5463, "step": 21110 }, { "epoch": 0.022931570196235162, "grad_norm": 17.866971969604492, "learning_rate": 4.993575161832073e-06, "loss": 1.3064, "step": 21115 }, { "epoch": 0.022937000357304598, "grad_norm": 18.763599395751953, "learning_rate": 4.993572105516333e-06, "loss": 1.7075, "step": 21120 }, { "epoch": 0.022942430518374034, "grad_norm": 25.583974838256836, "learning_rate": 4.993569048474752e-06, "loss": 1.8615, "step": 21125 }, { "epoch": 0.022947860679443474, "grad_norm": 33.8676872253418, "learning_rate": 4.993565990707331e-06, "loss": 1.0591, "step": 21130 }, { "epoch": 0.02295329084051291, "grad_norm": 14.338035583496094, "learning_rate": 4.993562932214071e-06, "loss": 1.584, "step": 21135 }, { "epoch": 0.02295872100158235, "grad_norm": 52.0946044921875, "learning_rate": 4.993559872994973e-06, "loss": 2.5906, "step": 21140 }, { "epoch": 0.022964151162651786, "grad_norm": 26.08655548095703, "learning_rate": 4.9935568130500375e-06, "loss": 2.5542, "step": 21145 }, { "epoch": 0.022969581323721223, "grad_norm": 49.683349609375, "learning_rate": 4.993553752379266e-06, "loss": 2.0997, "step": 21150 }, { "epoch": 0.022975011484790663, "grad_norm": 12.847677230834961, "learning_rate": 4.993550690982659e-06, "loss": 1.6415, "step": 21155 }, { "epoch": 0.0229804416458601, "grad_norm": 18.281993865966797, "learning_rate": 4.993547628860216e-06, "loss": 1.7476, "step": 21160 }, { "epoch": 0.02298587180692954, "grad_norm": 70.255126953125, "learning_rate": 4.9935445660119404e-06, "loss": 1.3879, "step": 21165 }, { "epoch": 0.022991301967998975, "grad_norm": 14.982494354248047, "learning_rate": 4.993541502437832e-06, "loss": 1.4657, "step": 21170 }, { "epoch": 0.02299673212906841, "grad_norm": 12.277390480041504, "learning_rate": 4.993538438137891e-06, "loss": 1.3356, "step": 21175 }, { "epoch": 0.02300216229013785, "grad_norm": 56.83434295654297, "learning_rate": 4.993535373112119e-06, "loss": 1.2045, "step": 21180 }, { "epoch": 0.023007592451207287, "grad_norm": 25.132736206054688, "learning_rate": 4.993532307360517e-06, "loss": 1.5872, "step": 21185 }, { "epoch": 0.023013022612276727, "grad_norm": 33.729915618896484, "learning_rate": 4.993529240883085e-06, "loss": 1.6633, "step": 21190 }, { "epoch": 0.023018452773346163, "grad_norm": 24.259431838989258, "learning_rate": 4.993526173679826e-06, "loss": 1.8401, "step": 21195 }, { "epoch": 0.0230238829344156, "grad_norm": 60.22037887573242, "learning_rate": 4.993523105750738e-06, "loss": 1.9346, "step": 21200 }, { "epoch": 0.02302931309548504, "grad_norm": 23.878536224365234, "learning_rate": 4.9935200370958234e-06, "loss": 1.5719, "step": 21205 }, { "epoch": 0.023034743256554475, "grad_norm": 122.81961822509766, "learning_rate": 4.993516967715084e-06, "loss": 1.8458, "step": 21210 }, { "epoch": 0.023040173417623915, "grad_norm": 10.42257308959961, "learning_rate": 4.99351389760852e-06, "loss": 1.2745, "step": 21215 }, { "epoch": 0.02304560357869335, "grad_norm": 72.57475280761719, "learning_rate": 4.993510826776131e-06, "loss": 1.5601, "step": 21220 }, { "epoch": 0.023051033739762788, "grad_norm": 166.44151306152344, "learning_rate": 4.9935077552179195e-06, "loss": 1.9792, "step": 21225 }, { "epoch": 0.023056463900832228, "grad_norm": 15.192915916442871, "learning_rate": 4.9935046829338856e-06, "loss": 1.4501, "step": 21230 }, { "epoch": 0.023061894061901664, "grad_norm": 32.639644622802734, "learning_rate": 4.993501609924031e-06, "loss": 1.9876, "step": 21235 }, { "epoch": 0.0230673242229711, "grad_norm": 28.653358459472656, "learning_rate": 4.993498536188355e-06, "loss": 1.9907, "step": 21240 }, { "epoch": 0.02307275438404054, "grad_norm": 26.878643035888672, "learning_rate": 4.99349546172686e-06, "loss": 2.3492, "step": 21245 }, { "epoch": 0.023078184545109976, "grad_norm": 55.85746383666992, "learning_rate": 4.993492386539547e-06, "loss": 1.5747, "step": 21250 }, { "epoch": 0.023083614706179416, "grad_norm": 11.266759872436523, "learning_rate": 4.993489310626416e-06, "loss": 1.1855, "step": 21255 }, { "epoch": 0.023089044867248852, "grad_norm": 47.56296157836914, "learning_rate": 4.993486233987468e-06, "loss": 1.8574, "step": 21260 }, { "epoch": 0.02309447502831829, "grad_norm": 30.001625061035156, "learning_rate": 4.993483156622703e-06, "loss": 1.9149, "step": 21265 }, { "epoch": 0.023099905189387728, "grad_norm": 33.711769104003906, "learning_rate": 4.993480078532124e-06, "loss": 1.6149, "step": 21270 }, { "epoch": 0.023105335350457164, "grad_norm": 21.514402389526367, "learning_rate": 4.993476999715731e-06, "loss": 1.6044, "step": 21275 }, { "epoch": 0.023110765511526604, "grad_norm": 73.05706787109375, "learning_rate": 4.993473920173525e-06, "loss": 1.4521, "step": 21280 }, { "epoch": 0.02311619567259604, "grad_norm": 16.838054656982422, "learning_rate": 4.9934708399055065e-06, "loss": 1.2563, "step": 21285 }, { "epoch": 0.023121625833665477, "grad_norm": 26.9096736907959, "learning_rate": 4.993467758911677e-06, "loss": 2.1002, "step": 21290 }, { "epoch": 0.023127055994734916, "grad_norm": 34.695098876953125, "learning_rate": 4.993464677192036e-06, "loss": 1.5972, "step": 21295 }, { "epoch": 0.023132486155804353, "grad_norm": 59.17270278930664, "learning_rate": 4.993461594746586e-06, "loss": 1.631, "step": 21300 }, { "epoch": 0.023137916316873793, "grad_norm": 14.30577564239502, "learning_rate": 4.9934585115753265e-06, "loss": 1.7364, "step": 21305 }, { "epoch": 0.02314334647794323, "grad_norm": 13.020549774169922, "learning_rate": 4.99345542767826e-06, "loss": 1.7316, "step": 21310 }, { "epoch": 0.023148776639012665, "grad_norm": 19.916423797607422, "learning_rate": 4.993452343055386e-06, "loss": 1.4353, "step": 21315 }, { "epoch": 0.023154206800082105, "grad_norm": 35.561763763427734, "learning_rate": 4.9934492577067065e-06, "loss": 1.6697, "step": 21320 }, { "epoch": 0.02315963696115154, "grad_norm": 66.18185424804688, "learning_rate": 4.993446171632221e-06, "loss": 1.5173, "step": 21325 }, { "epoch": 0.02316506712222098, "grad_norm": 18.747804641723633, "learning_rate": 4.993443084831932e-06, "loss": 1.5312, "step": 21330 }, { "epoch": 0.023170497283290417, "grad_norm": 23.31655502319336, "learning_rate": 4.99343999730584e-06, "loss": 1.6848, "step": 21335 }, { "epoch": 0.023175927444359853, "grad_norm": 13.206652641296387, "learning_rate": 4.993436909053945e-06, "loss": 1.8264, "step": 21340 }, { "epoch": 0.023181357605429293, "grad_norm": 15.745899200439453, "learning_rate": 4.993433820076249e-06, "loss": 1.2814, "step": 21345 }, { "epoch": 0.02318678776649873, "grad_norm": 57.98302459716797, "learning_rate": 4.993430730372751e-06, "loss": 1.8907, "step": 21350 }, { "epoch": 0.02319221792756817, "grad_norm": 16.31968116760254, "learning_rate": 4.9934276399434544e-06, "loss": 1.2147, "step": 21355 }, { "epoch": 0.023197648088637605, "grad_norm": 51.78936767578125, "learning_rate": 4.993424548788359e-06, "loss": 1.3131, "step": 21360 }, { "epoch": 0.023203078249707042, "grad_norm": 25.635311126708984, "learning_rate": 4.993421456907466e-06, "loss": 1.2704, "step": 21365 }, { "epoch": 0.02320850841077648, "grad_norm": 94.41297149658203, "learning_rate": 4.993418364300775e-06, "loss": 1.5054, "step": 21370 }, { "epoch": 0.023213938571845918, "grad_norm": 16.620529174804688, "learning_rate": 4.993415270968288e-06, "loss": 1.7196, "step": 21375 }, { "epoch": 0.023219368732915354, "grad_norm": 34.80724334716797, "learning_rate": 4.993412176910006e-06, "loss": 1.3351, "step": 21380 }, { "epoch": 0.023224798893984794, "grad_norm": 22.6812686920166, "learning_rate": 4.993409082125929e-06, "loss": 1.2285, "step": 21385 }, { "epoch": 0.02323022905505423, "grad_norm": 19.25381851196289, "learning_rate": 4.99340598661606e-06, "loss": 1.531, "step": 21390 }, { "epoch": 0.02323565921612367, "grad_norm": 19.448986053466797, "learning_rate": 4.993402890380397e-06, "loss": 1.6681, "step": 21395 }, { "epoch": 0.023241089377193106, "grad_norm": 54.960445404052734, "learning_rate": 4.993399793418942e-06, "loss": 1.9091, "step": 21400 }, { "epoch": 0.023246519538262542, "grad_norm": 16.791494369506836, "learning_rate": 4.993396695731698e-06, "loss": 1.7094, "step": 21405 }, { "epoch": 0.023251949699331982, "grad_norm": 61.24124526977539, "learning_rate": 4.993393597318663e-06, "loss": 0.8485, "step": 21410 }, { "epoch": 0.02325737986040142, "grad_norm": 12.152717590332031, "learning_rate": 4.99339049817984e-06, "loss": 1.5516, "step": 21415 }, { "epoch": 0.023262810021470858, "grad_norm": 35.69513702392578, "learning_rate": 4.993387398315228e-06, "loss": 1.7957, "step": 21420 }, { "epoch": 0.023268240182540294, "grad_norm": 19.722244262695312, "learning_rate": 4.993384297724829e-06, "loss": 1.3066, "step": 21425 }, { "epoch": 0.02327367034360973, "grad_norm": 19.693449020385742, "learning_rate": 4.993381196408645e-06, "loss": 1.8576, "step": 21430 }, { "epoch": 0.02327910050467917, "grad_norm": 15.28137493133545, "learning_rate": 4.9933780943666745e-06, "loss": 1.6202, "step": 21435 }, { "epoch": 0.023284530665748607, "grad_norm": 24.779966354370117, "learning_rate": 4.993374991598919e-06, "loss": 2.246, "step": 21440 }, { "epoch": 0.023289960826818046, "grad_norm": 50.59046936035156, "learning_rate": 4.993371888105381e-06, "loss": 1.4113, "step": 21445 }, { "epoch": 0.023295390987887483, "grad_norm": 65.43311309814453, "learning_rate": 4.99336878388606e-06, "loss": 1.2365, "step": 21450 }, { "epoch": 0.02330082114895692, "grad_norm": 26.599458694458008, "learning_rate": 4.993365678940958e-06, "loss": 1.1659, "step": 21455 }, { "epoch": 0.02330625131002636, "grad_norm": 17.35736846923828, "learning_rate": 4.993362573270074e-06, "loss": 1.6178, "step": 21460 }, { "epoch": 0.023311681471095795, "grad_norm": 75.9873046875, "learning_rate": 4.993359466873411e-06, "loss": 1.739, "step": 21465 }, { "epoch": 0.023317111632165235, "grad_norm": 27.967300415039062, "learning_rate": 4.9933563597509685e-06, "loss": 1.7322, "step": 21470 }, { "epoch": 0.02332254179323467, "grad_norm": 12.8014554977417, "learning_rate": 4.9933532519027485e-06, "loss": 2.0226, "step": 21475 }, { "epoch": 0.023327971954304107, "grad_norm": 27.821369171142578, "learning_rate": 4.99335014332875e-06, "loss": 1.3737, "step": 21480 }, { "epoch": 0.023333402115373547, "grad_norm": 28.261096954345703, "learning_rate": 4.993347034028976e-06, "loss": 1.4099, "step": 21485 }, { "epoch": 0.023338832276442983, "grad_norm": 17.388856887817383, "learning_rate": 4.993343924003427e-06, "loss": 1.0762, "step": 21490 }, { "epoch": 0.023344262437512423, "grad_norm": 17.267353057861328, "learning_rate": 4.993340813252103e-06, "loss": 1.6371, "step": 21495 }, { "epoch": 0.02334969259858186, "grad_norm": 75.41110229492188, "learning_rate": 4.993337701775006e-06, "loss": 1.9492, "step": 21500 }, { "epoch": 0.023355122759651296, "grad_norm": 15.672773361206055, "learning_rate": 4.993334589572136e-06, "loss": 2.1031, "step": 21505 } ], "logging_steps": 5, "max_steps": 920783, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1955, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.901676020629504e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }