diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,28707 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 1275, + "global_step": 20392, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00019615535504119262, + "eval_loss": 13.359358787536621, + "eval_runtime": 7.6451, + "eval_samples_per_second": 27.338, + "eval_steps_per_second": 13.734, + "step": 1 + }, + { + "epoch": 0.000980776775205963, + "grad_norm": 261.527587890625, + "learning_rate": 5.000000000000001e-07, + "loss": 13.4315, + "step": 5 + }, + { + "epoch": 0.001961553550411926, + "grad_norm": 281.323486328125, + "learning_rate": 1.0000000000000002e-06, + "loss": 13.1245, + "step": 10 + }, + { + "epoch": 0.0029423303256178894, + "grad_norm": 294.9424133300781, + "learning_rate": 1.5e-06, + "loss": 13.3844, + "step": 15 + }, + { + "epoch": 0.003923107100823852, + "grad_norm": 314.61883544921875, + "learning_rate": 2.0000000000000003e-06, + "loss": 12.6558, + "step": 20 + }, + { + "epoch": 0.004903883876029816, + "grad_norm": 301.4609375, + "learning_rate": 2.5e-06, + "loss": 13.2002, + "step": 25 + }, + { + "epoch": 0.005884660651235779, + "grad_norm": 265.4776306152344, + "learning_rate": 3e-06, + "loss": 13.0372, + "step": 30 + }, + { + "epoch": 0.006865437426441742, + "grad_norm": 231.08770751953125, + "learning_rate": 3.5e-06, + "loss": 13.0588, + "step": 35 + }, + { + "epoch": 0.007846214201647704, + "grad_norm": 243.46893310546875, + "learning_rate": 4.000000000000001e-06, + "loss": 12.8002, + "step": 40 + }, + { + "epoch": 0.008826990976853669, + "grad_norm": 272.3201599121094, + "learning_rate": 4.5e-06, + "loss": 12.2782, + "step": 45 + }, + { + "epoch": 0.009807767752059632, + "grad_norm": 224.776611328125, + "learning_rate": 5e-06, + "loss": 11.1628, + "step": 50 + }, + { + "epoch": 0.010788544527265595, + "grad_norm": 220.5045928955078, + "learning_rate": 5.500000000000001e-06, + "loss": 10.4563, + "step": 55 + }, + { + "epoch": 0.011769321302471557, + "grad_norm": 219.2426300048828, + "learning_rate": 6e-06, + "loss": 9.908, + "step": 60 + }, + { + "epoch": 0.01275009807767752, + "grad_norm": 119.27024841308594, + "learning_rate": 6.5000000000000004e-06, + "loss": 10.026, + "step": 65 + }, + { + "epoch": 0.013730874852883483, + "grad_norm": 91.62039947509766, + "learning_rate": 7e-06, + "loss": 9.1984, + "step": 70 + }, + { + "epoch": 0.014711651628089448, + "grad_norm": 83.55156707763672, + "learning_rate": 7.500000000000001e-06, + "loss": 8.457, + "step": 75 + }, + { + "epoch": 0.01569242840329541, + "grad_norm": 67.35411071777344, + "learning_rate": 8.000000000000001e-06, + "loss": 8.3446, + "step": 80 + }, + { + "epoch": 0.016673205178501373, + "grad_norm": 50.44355010986328, + "learning_rate": 8.5e-06, + "loss": 8.1199, + "step": 85 + }, + { + "epoch": 0.017653981953707338, + "grad_norm": 45.920475006103516, + "learning_rate": 9e-06, + "loss": 7.9275, + "step": 90 + }, + { + "epoch": 0.0186347587289133, + "grad_norm": 51.882083892822266, + "learning_rate": 9.5e-06, + "loss": 7.7225, + "step": 95 + }, + { + "epoch": 0.019615535504119264, + "grad_norm": 36.69080352783203, + "learning_rate": 1e-05, + "loss": 7.2467, + "step": 100 + }, + { + "epoch": 0.020596312279325225, + "grad_norm": 55.873043060302734, + "learning_rate": 9.999998501937153e-06, + "loss": 7.2832, + "step": 105 + }, + { + "epoch": 0.02157708905453119, + "grad_norm": 28.33873176574707, + "learning_rate": 9.999994007749506e-06, + "loss": 7.3216, + "step": 110 + }, + { + "epoch": 0.02255786582973715, + "grad_norm": 33.89677047729492, + "learning_rate": 9.99998651743975e-06, + "loss": 7.2139, + "step": 115 + }, + { + "epoch": 0.023538642604943115, + "grad_norm": 33.673927307128906, + "learning_rate": 9.99997603101238e-06, + "loss": 6.7812, + "step": 120 + }, + { + "epoch": 0.02451941938014908, + "grad_norm": 35.81504821777344, + "learning_rate": 9.999962548473674e-06, + "loss": 7.1054, + "step": 125 + }, + { + "epoch": 0.02550019615535504, + "grad_norm": 36.798221588134766, + "learning_rate": 9.999946069831713e-06, + "loss": 7.4411, + "step": 130 + }, + { + "epoch": 0.026480972930561005, + "grad_norm": 35.6644172668457, + "learning_rate": 9.999926595096373e-06, + "loss": 7.3205, + "step": 135 + }, + { + "epoch": 0.027461749705766966, + "grad_norm": 20.2699031829834, + "learning_rate": 9.999904124279322e-06, + "loss": 6.5323, + "step": 140 + }, + { + "epoch": 0.02844252648097293, + "grad_norm": 34.727783203125, + "learning_rate": 9.999878657394024e-06, + "loss": 6.7351, + "step": 145 + }, + { + "epoch": 0.029423303256178895, + "grad_norm": 21.760740280151367, + "learning_rate": 9.999850194455741e-06, + "loss": 6.4626, + "step": 150 + }, + { + "epoch": 0.030404080031384857, + "grad_norm": 25.09556770324707, + "learning_rate": 9.99981873548153e-06, + "loss": 6.5722, + "step": 155 + }, + { + "epoch": 0.03138485680659082, + "grad_norm": 19.870086669921875, + "learning_rate": 9.999784280490239e-06, + "loss": 6.2656, + "step": 160 + }, + { + "epoch": 0.03236563358179678, + "grad_norm": 28.291837692260742, + "learning_rate": 9.999746829502516e-06, + "loss": 6.4729, + "step": 165 + }, + { + "epoch": 0.03334641035700275, + "grad_norm": 27.245208740234375, + "learning_rate": 9.9997063825408e-06, + "loss": 6.3256, + "step": 170 + }, + { + "epoch": 0.03432718713220871, + "grad_norm": 25.775121688842773, + "learning_rate": 9.999662939629335e-06, + "loss": 6.6193, + "step": 175 + }, + { + "epoch": 0.035307963907414676, + "grad_norm": 24.102081298828125, + "learning_rate": 9.999616500794144e-06, + "loss": 6.2076, + "step": 180 + }, + { + "epoch": 0.036288740682620634, + "grad_norm": 34.60786437988281, + "learning_rate": 9.99956706606306e-06, + "loss": 7.2322, + "step": 185 + }, + { + "epoch": 0.0372695174578266, + "grad_norm": 21.22093391418457, + "learning_rate": 9.999514635465706e-06, + "loss": 5.9367, + "step": 190 + }, + { + "epoch": 0.03825029423303256, + "grad_norm": 19.872236251831055, + "learning_rate": 9.999459209033495e-06, + "loss": 6.1443, + "step": 195 + }, + { + "epoch": 0.03923107100823853, + "grad_norm": 29.4232234954834, + "learning_rate": 9.999400786799644e-06, + "loss": 6.2759, + "step": 200 + }, + { + "epoch": 0.040211847783444485, + "grad_norm": 30.81177520751953, + "learning_rate": 9.99933936879916e-06, + "loss": 6.5985, + "step": 205 + }, + { + "epoch": 0.04119262455865045, + "grad_norm": 24.815486907958984, + "learning_rate": 9.999274955068845e-06, + "loss": 5.9052, + "step": 210 + }, + { + "epoch": 0.042173401333856414, + "grad_norm": 52.55622482299805, + "learning_rate": 9.9992075456473e-06, + "loss": 6.3684, + "step": 215 + }, + { + "epoch": 0.04315417810906238, + "grad_norm": 24.462528228759766, + "learning_rate": 9.999137140574914e-06, + "loss": 6.2602, + "step": 220 + }, + { + "epoch": 0.04413495488426834, + "grad_norm": 17.4281005859375, + "learning_rate": 9.99906373989388e-06, + "loss": 6.1871, + "step": 225 + }, + { + "epoch": 0.0451157316594743, + "grad_norm": 28.467485427856445, + "learning_rate": 9.998987343648182e-06, + "loss": 6.0398, + "step": 230 + }, + { + "epoch": 0.046096508434680265, + "grad_norm": 28.41315269470215, + "learning_rate": 9.998907951883592e-06, + "loss": 6.3472, + "step": 235 + }, + { + "epoch": 0.04707728520988623, + "grad_norm": 20.66746711730957, + "learning_rate": 9.998825564647689e-06, + "loss": 6.3148, + "step": 240 + }, + { + "epoch": 0.048058061985092194, + "grad_norm": 19.29747200012207, + "learning_rate": 9.998740181989842e-06, + "loss": 6.0387, + "step": 245 + }, + { + "epoch": 0.04903883876029816, + "grad_norm": 29.47091293334961, + "learning_rate": 9.998651803961212e-06, + "loss": 6.3439, + "step": 250 + }, + { + "epoch": 0.05001961553550412, + "grad_norm": 47.9112548828125, + "learning_rate": 9.998560430614759e-06, + "loss": 6.3999, + "step": 255 + }, + { + "epoch": 0.05100039231071008, + "grad_norm": 26.031343460083008, + "learning_rate": 9.998466062005234e-06, + "loss": 6.4955, + "step": 260 + }, + { + "epoch": 0.051981169085916046, + "grad_norm": 28.60124969482422, + "learning_rate": 9.998368698189187e-06, + "loss": 6.1393, + "step": 265 + }, + { + "epoch": 0.05296194586112201, + "grad_norm": 22.979835510253906, + "learning_rate": 9.998268339224958e-06, + "loss": 6.1923, + "step": 270 + }, + { + "epoch": 0.053942722636327975, + "grad_norm": 26.163986206054688, + "learning_rate": 9.99816498517269e-06, + "loss": 6.4099, + "step": 275 + }, + { + "epoch": 0.05492349941153393, + "grad_norm": 30.09125518798828, + "learning_rate": 9.998058636094312e-06, + "loss": 6.1997, + "step": 280 + }, + { + "epoch": 0.0559042761867399, + "grad_norm": 33.007118225097656, + "learning_rate": 9.99794929205355e-06, + "loss": 6.3884, + "step": 285 + }, + { + "epoch": 0.05688505296194586, + "grad_norm": 16.16930389404297, + "learning_rate": 9.997836953115927e-06, + "loss": 5.9929, + "step": 290 + }, + { + "epoch": 0.057865829737151826, + "grad_norm": 15.623757362365723, + "learning_rate": 9.99772161934876e-06, + "loss": 6.0234, + "step": 295 + }, + { + "epoch": 0.05884660651235779, + "grad_norm": 21.748035430908203, + "learning_rate": 9.997603290821158e-06, + "loss": 5.7948, + "step": 300 + }, + { + "epoch": 0.05982738328756375, + "grad_norm": 15.957292556762695, + "learning_rate": 9.997481967604028e-06, + "loss": 5.8803, + "step": 305 + }, + { + "epoch": 0.06080816006276971, + "grad_norm": 16.29456329345703, + "learning_rate": 9.99735764977007e-06, + "loss": 6.095, + "step": 310 + }, + { + "epoch": 0.06178893683797568, + "grad_norm": 13.741049766540527, + "learning_rate": 9.997230337393777e-06, + "loss": 6.4022, + "step": 315 + }, + { + "epoch": 0.06276971361318164, + "grad_norm": 26.837913513183594, + "learning_rate": 9.99710003055144e-06, + "loss": 6.4859, + "step": 320 + }, + { + "epoch": 0.0637504903883876, + "grad_norm": 14.79397964477539, + "learning_rate": 9.99696672932114e-06, + "loss": 5.7493, + "step": 325 + }, + { + "epoch": 0.06473126716359356, + "grad_norm": 14.920068740844727, + "learning_rate": 9.996830433782754e-06, + "loss": 5.8187, + "step": 330 + }, + { + "epoch": 0.06571204393879954, + "grad_norm": 18.98463249206543, + "learning_rate": 9.996691144017957e-06, + "loss": 5.9097, + "step": 335 + }, + { + "epoch": 0.0666928207140055, + "grad_norm": 27.435945510864258, + "learning_rate": 9.99654886011021e-06, + "loss": 6.0536, + "step": 340 + }, + { + "epoch": 0.06767359748921145, + "grad_norm": 13.197905540466309, + "learning_rate": 9.99640358214478e-06, + "loss": 5.9436, + "step": 345 + }, + { + "epoch": 0.06865437426441742, + "grad_norm": 23.347511291503906, + "learning_rate": 9.996255310208715e-06, + "loss": 6.0234, + "step": 350 + }, + { + "epoch": 0.06963515103962338, + "grad_norm": 40.36224365234375, + "learning_rate": 9.996104044390866e-06, + "loss": 5.834, + "step": 355 + }, + { + "epoch": 0.07061592781482935, + "grad_norm": 18.484323501586914, + "learning_rate": 9.995949784781873e-06, + "loss": 6.143, + "step": 360 + }, + { + "epoch": 0.07159670459003531, + "grad_norm": 27.709238052368164, + "learning_rate": 9.995792531474175e-06, + "loss": 6.0427, + "step": 365 + }, + { + "epoch": 0.07257748136524127, + "grad_norm": 12.617931365966797, + "learning_rate": 9.995632284562002e-06, + "loss": 5.8578, + "step": 370 + }, + { + "epoch": 0.07355825814044724, + "grad_norm": 19.61956214904785, + "learning_rate": 9.995469044141377e-06, + "loss": 5.8423, + "step": 375 + }, + { + "epoch": 0.0745390349156532, + "grad_norm": 12.127665519714355, + "learning_rate": 9.995302810310116e-06, + "loss": 6.1397, + "step": 380 + }, + { + "epoch": 0.07551981169085915, + "grad_norm": 23.63418197631836, + "learning_rate": 9.995133583167833e-06, + "loss": 5.9291, + "step": 385 + }, + { + "epoch": 0.07650058846606513, + "grad_norm": 19.848413467407227, + "learning_rate": 9.994961362815934e-06, + "loss": 5.8147, + "step": 390 + }, + { + "epoch": 0.07748136524127108, + "grad_norm": 24.281089782714844, + "learning_rate": 9.994786149357614e-06, + "loss": 6.1719, + "step": 395 + }, + { + "epoch": 0.07846214201647705, + "grad_norm": 26.30230712890625, + "learning_rate": 9.99460794289787e-06, + "loss": 6.0022, + "step": 400 + }, + { + "epoch": 0.07944291879168301, + "grad_norm": 20.717947006225586, + "learning_rate": 9.994426743543483e-06, + "loss": 6.1346, + "step": 405 + }, + { + "epoch": 0.08042369556688897, + "grad_norm": 13.985010147094727, + "learning_rate": 9.994242551403036e-06, + "loss": 5.7892, + "step": 410 + }, + { + "epoch": 0.08140447234209494, + "grad_norm": 26.171998977661133, + "learning_rate": 9.9940553665869e-06, + "loss": 6.1193, + "step": 415 + }, + { + "epoch": 0.0823852491173009, + "grad_norm": 19.03148078918457, + "learning_rate": 9.993865189207242e-06, + "loss": 5.8301, + "step": 420 + }, + { + "epoch": 0.08336602589250687, + "grad_norm": 18.318777084350586, + "learning_rate": 9.993672019378017e-06, + "loss": 5.7323, + "step": 425 + }, + { + "epoch": 0.08434680266771283, + "grad_norm": 15.280516624450684, + "learning_rate": 9.993475857214983e-06, + "loss": 6.4028, + "step": 430 + }, + { + "epoch": 0.08532757944291879, + "grad_norm": 13.854470252990723, + "learning_rate": 9.993276702835682e-06, + "loss": 5.9028, + "step": 435 + }, + { + "epoch": 0.08630835621812476, + "grad_norm": 19.096981048583984, + "learning_rate": 9.99307455635945e-06, + "loss": 6.3108, + "step": 440 + }, + { + "epoch": 0.08728913299333071, + "grad_norm": 20.702112197875977, + "learning_rate": 9.992869417907426e-06, + "loss": 6.047, + "step": 445 + }, + { + "epoch": 0.08826990976853669, + "grad_norm": 14.787785530090332, + "learning_rate": 9.992661287602526e-06, + "loss": 5.783, + "step": 450 + }, + { + "epoch": 0.08925068654374264, + "grad_norm": 26.985492706298828, + "learning_rate": 9.99245016556947e-06, + "loss": 6.0479, + "step": 455 + }, + { + "epoch": 0.0902314633189486, + "grad_norm": 20.202232360839844, + "learning_rate": 9.992236051934769e-06, + "loss": 5.8905, + "step": 460 + }, + { + "epoch": 0.09121224009415457, + "grad_norm": 28.611963272094727, + "learning_rate": 9.992018946826723e-06, + "loss": 5.669, + "step": 465 + }, + { + "epoch": 0.09219301686936053, + "grad_norm": 15.48359203338623, + "learning_rate": 9.99179885037543e-06, + "loss": 5.8497, + "step": 470 + }, + { + "epoch": 0.0931737936445665, + "grad_norm": 30.19912338256836, + "learning_rate": 9.991575762712773e-06, + "loss": 5.8428, + "step": 475 + }, + { + "epoch": 0.09415457041977246, + "grad_norm": 28.23841094970703, + "learning_rate": 9.991349683972435e-06, + "loss": 5.8517, + "step": 480 + }, + { + "epoch": 0.09513534719497842, + "grad_norm": 36.505924224853516, + "learning_rate": 9.991120614289887e-06, + "loss": 5.5872, + "step": 485 + }, + { + "epoch": 0.09611612397018439, + "grad_norm": 27.388525009155273, + "learning_rate": 9.99088855380239e-06, + "loss": 5.6681, + "step": 490 + }, + { + "epoch": 0.09709690074539035, + "grad_norm": 12.868749618530273, + "learning_rate": 9.990653502649008e-06, + "loss": 5.5927, + "step": 495 + }, + { + "epoch": 0.09807767752059632, + "grad_norm": 21.467880249023438, + "learning_rate": 9.990415460970584e-06, + "loss": 5.6988, + "step": 500 + }, + { + "epoch": 0.09905845429580228, + "grad_norm": 27.02879524230957, + "learning_rate": 9.99017442890976e-06, + "loss": 5.9116, + "step": 505 + }, + { + "epoch": 0.10003923107100823, + "grad_norm": 20.046236038208008, + "learning_rate": 9.98993040661097e-06, + "loss": 5.8835, + "step": 510 + }, + { + "epoch": 0.1010200078462142, + "grad_norm": 18.563119888305664, + "learning_rate": 9.989683394220435e-06, + "loss": 5.9447, + "step": 515 + }, + { + "epoch": 0.10200078462142016, + "grad_norm": 19.473636627197266, + "learning_rate": 9.989433391886171e-06, + "loss": 5.7367, + "step": 520 + }, + { + "epoch": 0.10298156139662613, + "grad_norm": 24.5170955657959, + "learning_rate": 9.98918039975799e-06, + "loss": 5.5517, + "step": 525 + }, + { + "epoch": 0.10396233817183209, + "grad_norm": 23.462825775146484, + "learning_rate": 9.988924417987489e-06, + "loss": 6.7528, + "step": 530 + }, + { + "epoch": 0.10494311494703805, + "grad_norm": 30.959993362426758, + "learning_rate": 9.988665446728057e-06, + "loss": 5.4933, + "step": 535 + }, + { + "epoch": 0.10592389172224402, + "grad_norm": 24.289413452148438, + "learning_rate": 9.988403486134877e-06, + "loss": 6.1583, + "step": 540 + }, + { + "epoch": 0.10690466849744998, + "grad_norm": 14.085652351379395, + "learning_rate": 9.988138536364922e-06, + "loss": 5.9304, + "step": 545 + }, + { + "epoch": 0.10788544527265595, + "grad_norm": 21.895252227783203, + "learning_rate": 9.987870597576961e-06, + "loss": 5.9257, + "step": 550 + }, + { + "epoch": 0.10886622204786191, + "grad_norm": 13.12173080444336, + "learning_rate": 9.987599669931543e-06, + "loss": 5.5943, + "step": 555 + }, + { + "epoch": 0.10984699882306787, + "grad_norm": 21.497610092163086, + "learning_rate": 9.987325753591019e-06, + "loss": 5.4399, + "step": 560 + }, + { + "epoch": 0.11082777559827384, + "grad_norm": 19.55420684814453, + "learning_rate": 9.987048848719524e-06, + "loss": 5.7695, + "step": 565 + }, + { + "epoch": 0.1118085523734798, + "grad_norm": 20.084917068481445, + "learning_rate": 9.986768955482988e-06, + "loss": 5.6785, + "step": 570 + }, + { + "epoch": 0.11278932914868577, + "grad_norm": 22.55635643005371, + "learning_rate": 9.986486074049131e-06, + "loss": 5.9588, + "step": 575 + }, + { + "epoch": 0.11377010592389172, + "grad_norm": 19.915632247924805, + "learning_rate": 9.98620020458746e-06, + "loss": 5.7189, + "step": 580 + }, + { + "epoch": 0.11475088269909768, + "grad_norm": 28.99961280822754, + "learning_rate": 9.985911347269277e-06, + "loss": 5.7044, + "step": 585 + }, + { + "epoch": 0.11573165947430365, + "grad_norm": 15.701618194580078, + "learning_rate": 9.985619502267671e-06, + "loss": 5.4876, + "step": 590 + }, + { + "epoch": 0.11671243624950961, + "grad_norm": 11.117427825927734, + "learning_rate": 9.985324669757526e-06, + "loss": 5.7046, + "step": 595 + }, + { + "epoch": 0.11769321302471558, + "grad_norm": 22.775066375732422, + "learning_rate": 9.985026849915508e-06, + "loss": 6.3677, + "step": 600 + }, + { + "epoch": 0.11867398979992154, + "grad_norm": 20.601396560668945, + "learning_rate": 9.984726042920085e-06, + "loss": 5.8385, + "step": 605 + }, + { + "epoch": 0.1196547665751275, + "grad_norm": 11.978520393371582, + "learning_rate": 9.984422248951502e-06, + "loss": 5.6919, + "step": 610 + }, + { + "epoch": 0.12063554335033347, + "grad_norm": 16.846281051635742, + "learning_rate": 9.984115468191803e-06, + "loss": 5.3983, + "step": 615 + }, + { + "epoch": 0.12161632012553943, + "grad_norm": 15.13311767578125, + "learning_rate": 9.983805700824816e-06, + "loss": 5.3126, + "step": 620 + }, + { + "epoch": 0.12259709690074538, + "grad_norm": 18.353530883789062, + "learning_rate": 9.983492947036164e-06, + "loss": 5.775, + "step": 625 + }, + { + "epoch": 0.12357787367595136, + "grad_norm": 28.933935165405273, + "learning_rate": 9.983177207013256e-06, + "loss": 5.9725, + "step": 630 + }, + { + "epoch": 0.12455865045115731, + "grad_norm": 12.9036226272583, + "learning_rate": 9.982858480945295e-06, + "loss": 5.6471, + "step": 635 + }, + { + "epoch": 0.12553942722636327, + "grad_norm": 20.80467414855957, + "learning_rate": 9.982536769023262e-06, + "loss": 5.5484, + "step": 640 + }, + { + "epoch": 0.12652020400156924, + "grad_norm": 16.309709548950195, + "learning_rate": 9.982212071439943e-06, + "loss": 5.9737, + "step": 645 + }, + { + "epoch": 0.1275009807767752, + "grad_norm": 16.632150650024414, + "learning_rate": 9.9818843883899e-06, + "loss": 5.4386, + "step": 650 + }, + { + "epoch": 0.12848175755198116, + "grad_norm": 13.748077392578125, + "learning_rate": 9.981553720069487e-06, + "loss": 5.5232, + "step": 655 + }, + { + "epoch": 0.12946253432718713, + "grad_norm": 11.935937881469727, + "learning_rate": 9.981220066676855e-06, + "loss": 5.3825, + "step": 660 + }, + { + "epoch": 0.1304433111023931, + "grad_norm": 21.656436920166016, + "learning_rate": 9.980883428411934e-06, + "loss": 5.8827, + "step": 665 + }, + { + "epoch": 0.13142408787759907, + "grad_norm": 14.381396293640137, + "learning_rate": 9.980543805476447e-06, + "loss": 6.2153, + "step": 670 + }, + { + "epoch": 0.13240486465280502, + "grad_norm": 18.785852432250977, + "learning_rate": 9.980201198073902e-06, + "loss": 5.6124, + "step": 675 + }, + { + "epoch": 0.133385641428011, + "grad_norm": 15.40206527709961, + "learning_rate": 9.9798556064096e-06, + "loss": 5.2739, + "step": 680 + }, + { + "epoch": 0.13436641820321696, + "grad_norm": 20.313282012939453, + "learning_rate": 9.97950703069063e-06, + "loss": 5.6705, + "step": 685 + }, + { + "epoch": 0.1353471949784229, + "grad_norm": 13.421894073486328, + "learning_rate": 9.979155471125866e-06, + "loss": 5.9609, + "step": 690 + }, + { + "epoch": 0.13632797175362887, + "grad_norm": 19.68906021118164, + "learning_rate": 9.97880092792597e-06, + "loss": 5.3471, + "step": 695 + }, + { + "epoch": 0.13730874852883485, + "grad_norm": 17.42752456665039, + "learning_rate": 9.978443401303392e-06, + "loss": 5.6414, + "step": 700 + }, + { + "epoch": 0.1382895253040408, + "grad_norm": 19.268634796142578, + "learning_rate": 9.978082891472376e-06, + "loss": 5.6018, + "step": 705 + }, + { + "epoch": 0.13927030207924676, + "grad_norm": 14.104660987854004, + "learning_rate": 9.977719398648945e-06, + "loss": 5.4409, + "step": 710 + }, + { + "epoch": 0.14025107885445273, + "grad_norm": 14.374234199523926, + "learning_rate": 9.977352923050913e-06, + "loss": 5.8209, + "step": 715 + }, + { + "epoch": 0.1412318556296587, + "grad_norm": 23.02276611328125, + "learning_rate": 9.976983464897882e-06, + "loss": 5.7185, + "step": 720 + }, + { + "epoch": 0.14221263240486465, + "grad_norm": 16.408750534057617, + "learning_rate": 9.976611024411241e-06, + "loss": 5.7395, + "step": 725 + }, + { + "epoch": 0.14319340918007062, + "grad_norm": 15.385457038879395, + "learning_rate": 9.976235601814163e-06, + "loss": 5.8248, + "step": 730 + }, + { + "epoch": 0.1441741859552766, + "grad_norm": 13.1869478225708, + "learning_rate": 9.975857197331617e-06, + "loss": 5.7622, + "step": 735 + }, + { + "epoch": 0.14515496273048253, + "grad_norm": 18.29127311706543, + "learning_rate": 9.975475811190346e-06, + "loss": 5.6788, + "step": 740 + }, + { + "epoch": 0.1461357395056885, + "grad_norm": 18.088855743408203, + "learning_rate": 9.975091443618889e-06, + "loss": 5.4656, + "step": 745 + }, + { + "epoch": 0.14711651628089448, + "grad_norm": 30.90943145751953, + "learning_rate": 9.974704094847568e-06, + "loss": 5.525, + "step": 750 + }, + { + "epoch": 0.14809729305610042, + "grad_norm": 16.267723083496094, + "learning_rate": 9.974313765108492e-06, + "loss": 5.3356, + "step": 755 + }, + { + "epoch": 0.1490780698313064, + "grad_norm": 15.598844528198242, + "learning_rate": 9.973920454635559e-06, + "loss": 5.5963, + "step": 760 + }, + { + "epoch": 0.15005884660651236, + "grad_norm": 27.659122467041016, + "learning_rate": 9.973524163664447e-06, + "loss": 5.4735, + "step": 765 + }, + { + "epoch": 0.1510396233817183, + "grad_norm": 30.47996711730957, + "learning_rate": 9.973124892432626e-06, + "loss": 5.6989, + "step": 770 + }, + { + "epoch": 0.15202040015692428, + "grad_norm": 18.81101417541504, + "learning_rate": 9.972722641179347e-06, + "loss": 5.5134, + "step": 775 + }, + { + "epoch": 0.15300117693213025, + "grad_norm": 30.00667381286621, + "learning_rate": 9.972317410145651e-06, + "loss": 5.5737, + "step": 780 + }, + { + "epoch": 0.15398195370733622, + "grad_norm": 16.57672119140625, + "learning_rate": 9.97190919957436e-06, + "loss": 5.369, + "step": 785 + }, + { + "epoch": 0.15496273048254217, + "grad_norm": 25.58556365966797, + "learning_rate": 9.971498009710088e-06, + "loss": 5.7396, + "step": 790 + }, + { + "epoch": 0.15594350725774814, + "grad_norm": 30.630939483642578, + "learning_rate": 9.971083840799229e-06, + "loss": 5.415, + "step": 795 + }, + { + "epoch": 0.1569242840329541, + "grad_norm": 12.743330001831055, + "learning_rate": 9.97066669308996e-06, + "loss": 5.5816, + "step": 800 + }, + { + "epoch": 0.15790506080816005, + "grad_norm": 15.443756103515625, + "learning_rate": 9.970246566832252e-06, + "loss": 5.4506, + "step": 805 + }, + { + "epoch": 0.15888583758336602, + "grad_norm": 22.54521942138672, + "learning_rate": 9.96982346227785e-06, + "loss": 5.6406, + "step": 810 + }, + { + "epoch": 0.159866614358572, + "grad_norm": 16.574411392211914, + "learning_rate": 9.969397379680293e-06, + "loss": 5.4527, + "step": 815 + }, + { + "epoch": 0.16084739113377794, + "grad_norm": 24.11166000366211, + "learning_rate": 9.968968319294897e-06, + "loss": 5.4452, + "step": 820 + }, + { + "epoch": 0.1618281679089839, + "grad_norm": 19.735063552856445, + "learning_rate": 9.96853628137877e-06, + "loss": 5.3691, + "step": 825 + }, + { + "epoch": 0.16280894468418988, + "grad_norm": 17.768238067626953, + "learning_rate": 9.968101266190795e-06, + "loss": 5.3472, + "step": 830 + }, + { + "epoch": 0.16378972145939585, + "grad_norm": 23.892215728759766, + "learning_rate": 9.967663273991646e-06, + "loss": 5.707, + "step": 835 + }, + { + "epoch": 0.1647704982346018, + "grad_norm": 31.293302536010742, + "learning_rate": 9.96722230504378e-06, + "loss": 5.8675, + "step": 840 + }, + { + "epoch": 0.16575127500980777, + "grad_norm": 35.70466995239258, + "learning_rate": 9.966778359611435e-06, + "loss": 6.0157, + "step": 845 + }, + { + "epoch": 0.16673205178501374, + "grad_norm": 20.035058975219727, + "learning_rate": 9.966331437960636e-06, + "loss": 5.6011, + "step": 850 + }, + { + "epoch": 0.16771282856021968, + "grad_norm": 18.689212799072266, + "learning_rate": 9.96588154035919e-06, + "loss": 5.2369, + "step": 855 + }, + { + "epoch": 0.16869360533542566, + "grad_norm": 71.4951171875, + "learning_rate": 9.965428667076687e-06, + "loss": 6.5524, + "step": 860 + }, + { + "epoch": 0.16967438211063163, + "grad_norm": 19.083498001098633, + "learning_rate": 9.964972818384496e-06, + "loss": 5.6899, + "step": 865 + }, + { + "epoch": 0.17065515888583757, + "grad_norm": 26.55590057373047, + "learning_rate": 9.964513994555778e-06, + "loss": 5.4484, + "step": 870 + }, + { + "epoch": 0.17163593566104354, + "grad_norm": 18.4299259185791, + "learning_rate": 9.964052195865468e-06, + "loss": 5.6257, + "step": 875 + }, + { + "epoch": 0.17261671243624951, + "grad_norm": 16.13317108154297, + "learning_rate": 9.96358742259029e-06, + "loss": 5.5983, + "step": 880 + }, + { + "epoch": 0.17359748921145549, + "grad_norm": 16.984779357910156, + "learning_rate": 9.963119675008748e-06, + "loss": 5.48, + "step": 885 + }, + { + "epoch": 0.17457826598666143, + "grad_norm": 22.902658462524414, + "learning_rate": 9.962648953401125e-06, + "loss": 5.9483, + "step": 890 + }, + { + "epoch": 0.1755590427618674, + "grad_norm": 24.295028686523438, + "learning_rate": 9.962175258049493e-06, + "loss": 5.7974, + "step": 895 + }, + { + "epoch": 0.17653981953707337, + "grad_norm": 20.873647689819336, + "learning_rate": 9.9616985892377e-06, + "loss": 5.5721, + "step": 900 + }, + { + "epoch": 0.17752059631227932, + "grad_norm": 15.696707725524902, + "learning_rate": 9.961218947251378e-06, + "loss": 5.4619, + "step": 905 + }, + { + "epoch": 0.1785013730874853, + "grad_norm": 20.687946319580078, + "learning_rate": 9.96073633237794e-06, + "loss": 5.8472, + "step": 910 + }, + { + "epoch": 0.17948214986269126, + "grad_norm": 14.133719444274902, + "learning_rate": 9.960250744906583e-06, + "loss": 5.417, + "step": 915 + }, + { + "epoch": 0.1804629266378972, + "grad_norm": 15.220897674560547, + "learning_rate": 9.959762185128283e-06, + "loss": 5.3647, + "step": 920 + }, + { + "epoch": 0.18144370341310317, + "grad_norm": 30.21053695678711, + "learning_rate": 9.959270653335795e-06, + "loss": 5.7774, + "step": 925 + }, + { + "epoch": 0.18242448018830915, + "grad_norm": 21.00077247619629, + "learning_rate": 9.958776149823658e-06, + "loss": 5.4467, + "step": 930 + }, + { + "epoch": 0.18340525696351512, + "grad_norm": 25.028913497924805, + "learning_rate": 9.958278674888194e-06, + "loss": 5.5723, + "step": 935 + }, + { + "epoch": 0.18438603373872106, + "grad_norm": 12.45209789276123, + "learning_rate": 9.957778228827499e-06, + "loss": 5.5205, + "step": 940 + }, + { + "epoch": 0.18536681051392703, + "grad_norm": 12.58310604095459, + "learning_rate": 9.957274811941452e-06, + "loss": 5.285, + "step": 945 + }, + { + "epoch": 0.186347587289133, + "grad_norm": 15.49960708618164, + "learning_rate": 9.956768424531717e-06, + "loss": 5.5491, + "step": 950 + }, + { + "epoch": 0.18732836406433895, + "grad_norm": 14.371564865112305, + "learning_rate": 9.956259066901733e-06, + "loss": 5.6843, + "step": 955 + }, + { + "epoch": 0.18830914083954492, + "grad_norm": 20.336835861206055, + "learning_rate": 9.955746739356716e-06, + "loss": 5.7197, + "step": 960 + }, + { + "epoch": 0.1892899176147509, + "grad_norm": 14.215054512023926, + "learning_rate": 9.95523144220367e-06, + "loss": 5.7196, + "step": 965 + }, + { + "epoch": 0.19027069438995683, + "grad_norm": 20.546175003051758, + "learning_rate": 9.954713175751373e-06, + "loss": 5.407, + "step": 970 + }, + { + "epoch": 0.1912514711651628, + "grad_norm": 14.670819282531738, + "learning_rate": 9.954191940310381e-06, + "loss": 5.6484, + "step": 975 + }, + { + "epoch": 0.19223224794036878, + "grad_norm": 46.397247314453125, + "learning_rate": 9.953667736193034e-06, + "loss": 5.6878, + "step": 980 + }, + { + "epoch": 0.19321302471557472, + "grad_norm": 10.847311973571777, + "learning_rate": 9.953140563713448e-06, + "loss": 5.4969, + "step": 985 + }, + { + "epoch": 0.1941938014907807, + "grad_norm": 17.326196670532227, + "learning_rate": 9.952610423187516e-06, + "loss": 5.6137, + "step": 990 + }, + { + "epoch": 0.19517457826598666, + "grad_norm": 18.91810417175293, + "learning_rate": 9.952077314932916e-06, + "loss": 5.3132, + "step": 995 + }, + { + "epoch": 0.19615535504119264, + "grad_norm": 15.190394401550293, + "learning_rate": 9.951541239269093e-06, + "loss": 5.4908, + "step": 1000 + }, + { + "epoch": 0.19713613181639858, + "grad_norm": 16.841787338256836, + "learning_rate": 9.951002196517284e-06, + "loss": 5.5744, + "step": 1005 + }, + { + "epoch": 0.19811690859160455, + "grad_norm": 17.683765411376953, + "learning_rate": 9.950460187000492e-06, + "loss": 5.2091, + "step": 1010 + }, + { + "epoch": 0.19909768536681052, + "grad_norm": 17.12097930908203, + "learning_rate": 9.949915211043504e-06, + "loss": 5.4289, + "step": 1015 + }, + { + "epoch": 0.20007846214201647, + "grad_norm": 22.892080307006836, + "learning_rate": 9.949367268972885e-06, + "loss": 5.2564, + "step": 1020 + }, + { + "epoch": 0.20105923891722244, + "grad_norm": 15.766369819641113, + "learning_rate": 9.948816361116973e-06, + "loss": 5.3534, + "step": 1025 + }, + { + "epoch": 0.2020400156924284, + "grad_norm": 24.48428726196289, + "learning_rate": 9.94826248780589e-06, + "loss": 6.1259, + "step": 1030 + }, + { + "epoch": 0.20302079246763435, + "grad_norm": 17.7995548248291, + "learning_rate": 9.947705649371526e-06, + "loss": 5.585, + "step": 1035 + }, + { + "epoch": 0.20400156924284032, + "grad_norm": 24.083513259887695, + "learning_rate": 9.947145846147555e-06, + "loss": 5.7126, + "step": 1040 + }, + { + "epoch": 0.2049823460180463, + "grad_norm": 12.388947486877441, + "learning_rate": 9.946583078469426e-06, + "loss": 5.5251, + "step": 1045 + }, + { + "epoch": 0.20596312279325227, + "grad_norm": 12.7921724319458, + "learning_rate": 9.946017346674362e-06, + "loss": 5.4943, + "step": 1050 + }, + { + "epoch": 0.2069438995684582, + "grad_norm": 28.395301818847656, + "learning_rate": 9.945448651101365e-06, + "loss": 5.7806, + "step": 1055 + }, + { + "epoch": 0.20792467634366418, + "grad_norm": 35.039634704589844, + "learning_rate": 9.944876992091208e-06, + "loss": 5.3988, + "step": 1060 + }, + { + "epoch": 0.20890545311887015, + "grad_norm": 12.610820770263672, + "learning_rate": 9.944302369986447e-06, + "loss": 5.5946, + "step": 1065 + }, + { + "epoch": 0.2098862298940761, + "grad_norm": 13.666259765625, + "learning_rate": 9.943724785131412e-06, + "loss": 5.5101, + "step": 1070 + }, + { + "epoch": 0.21086700666928207, + "grad_norm": 19.81101417541504, + "learning_rate": 9.943144237872202e-06, + "loss": 5.4725, + "step": 1075 + }, + { + "epoch": 0.21184778344448804, + "grad_norm": 17.405027389526367, + "learning_rate": 9.942560728556696e-06, + "loss": 5.4333, + "step": 1080 + }, + { + "epoch": 0.21282856021969399, + "grad_norm": 18.48592758178711, + "learning_rate": 9.94197425753455e-06, + "loss": 5.4893, + "step": 1085 + }, + { + "epoch": 0.21380933699489996, + "grad_norm": 26.090227127075195, + "learning_rate": 9.94138482515719e-06, + "loss": 6.0036, + "step": 1090 + }, + { + "epoch": 0.21479011377010593, + "grad_norm": 18.069581985473633, + "learning_rate": 9.94079243177782e-06, + "loss": 5.3082, + "step": 1095 + }, + { + "epoch": 0.2157708905453119, + "grad_norm": 10.112467765808105, + "learning_rate": 9.940197077751416e-06, + "loss": 5.5434, + "step": 1100 + }, + { + "epoch": 0.21675166732051784, + "grad_norm": 20.426862716674805, + "learning_rate": 9.93959876343473e-06, + "loss": 5.5726, + "step": 1105 + }, + { + "epoch": 0.21773244409572381, + "grad_norm": 14.866218566894531, + "learning_rate": 9.938997489186287e-06, + "loss": 5.7631, + "step": 1110 + }, + { + "epoch": 0.2187132208709298, + "grad_norm": 13.593292236328125, + "learning_rate": 9.938393255366383e-06, + "loss": 5.1905, + "step": 1115 + }, + { + "epoch": 0.21969399764613573, + "grad_norm": 15.151540756225586, + "learning_rate": 9.937786062337095e-06, + "loss": 5.6169, + "step": 1120 + }, + { + "epoch": 0.2206747744213417, + "grad_norm": 16.6584529876709, + "learning_rate": 9.937175910462264e-06, + "loss": 5.1824, + "step": 1125 + }, + { + "epoch": 0.22165555119654767, + "grad_norm": 36.43128967285156, + "learning_rate": 9.936562800107512e-06, + "loss": 5.8551, + "step": 1130 + }, + { + "epoch": 0.22263632797175362, + "grad_norm": 20.047487258911133, + "learning_rate": 9.935946731640226e-06, + "loss": 5.3579, + "step": 1135 + }, + { + "epoch": 0.2236171047469596, + "grad_norm": 9.661016464233398, + "learning_rate": 9.935327705429572e-06, + "loss": 5.4897, + "step": 1140 + }, + { + "epoch": 0.22459788152216556, + "grad_norm": 17.872982025146484, + "learning_rate": 9.934705721846487e-06, + "loss": 5.4356, + "step": 1145 + }, + { + "epoch": 0.22557865829737153, + "grad_norm": 14.063901901245117, + "learning_rate": 9.934080781263677e-06, + "loss": 4.9382, + "step": 1150 + }, + { + "epoch": 0.22655943507257748, + "grad_norm": 19.134902954101562, + "learning_rate": 9.933452884055625e-06, + "loss": 5.4342, + "step": 1155 + }, + { + "epoch": 0.22754021184778345, + "grad_norm": 9.560270309448242, + "learning_rate": 9.932822030598578e-06, + "loss": 5.466, + "step": 1160 + }, + { + "epoch": 0.22852098862298942, + "grad_norm": 21.985918045043945, + "learning_rate": 9.932188221270564e-06, + "loss": 5.2657, + "step": 1165 + }, + { + "epoch": 0.22950176539819536, + "grad_norm": 15.274785995483398, + "learning_rate": 9.931551456451377e-06, + "loss": 5.4104, + "step": 1170 + }, + { + "epoch": 0.23048254217340133, + "grad_norm": 28.460533142089844, + "learning_rate": 9.93091173652258e-06, + "loss": 5.2061, + "step": 1175 + }, + { + "epoch": 0.2314633189486073, + "grad_norm": 8.903681755065918, + "learning_rate": 9.93026906186751e-06, + "loss": 5.3667, + "step": 1180 + }, + { + "epoch": 0.23244409572381325, + "grad_norm": 31.3291015625, + "learning_rate": 9.929623432871277e-06, + "loss": 5.7421, + "step": 1185 + }, + { + "epoch": 0.23342487249901922, + "grad_norm": 12.238224983215332, + "learning_rate": 9.928974849920752e-06, + "loss": 5.2949, + "step": 1190 + }, + { + "epoch": 0.2344056492742252, + "grad_norm": 16.755048751831055, + "learning_rate": 9.928323313404587e-06, + "loss": 5.24, + "step": 1195 + }, + { + "epoch": 0.23538642604943116, + "grad_norm": 17.94761848449707, + "learning_rate": 9.927668823713197e-06, + "loss": 5.5436, + "step": 1200 + }, + { + "epoch": 0.2363672028246371, + "grad_norm": 14.653851509094238, + "learning_rate": 9.927011381238769e-06, + "loss": 5.1157, + "step": 1205 + }, + { + "epoch": 0.23734797959984308, + "grad_norm": 23.329797744750977, + "learning_rate": 9.926350986375261e-06, + "loss": 5.6824, + "step": 1210 + }, + { + "epoch": 0.23832875637504905, + "grad_norm": 12.664395332336426, + "learning_rate": 9.925687639518395e-06, + "loss": 5.4402, + "step": 1215 + }, + { + "epoch": 0.239309533150255, + "grad_norm": 23.01675796508789, + "learning_rate": 9.925021341065668e-06, + "loss": 5.5041, + "step": 1220 + }, + { + "epoch": 0.24029030992546097, + "grad_norm": 23.604764938354492, + "learning_rate": 9.924352091416342e-06, + "loss": 5.3516, + "step": 1225 + }, + { + "epoch": 0.24127108670066694, + "grad_norm": 10.142374992370605, + "learning_rate": 9.923679890971447e-06, + "loss": 5.0815, + "step": 1230 + }, + { + "epoch": 0.24225186347587288, + "grad_norm": 46.93704605102539, + "learning_rate": 9.923004740133783e-06, + "loss": 5.5097, + "step": 1235 + }, + { + "epoch": 0.24323264025107885, + "grad_norm": 18.356081008911133, + "learning_rate": 9.922326639307918e-06, + "loss": 5.367, + "step": 1240 + }, + { + "epoch": 0.24421341702628482, + "grad_norm": 17.588830947875977, + "learning_rate": 9.921645588900187e-06, + "loss": 5.6602, + "step": 1245 + }, + { + "epoch": 0.24519419380149077, + "grad_norm": 17.02576446533203, + "learning_rate": 9.92096158931869e-06, + "loss": 5.2146, + "step": 1250 + }, + { + "epoch": 0.24617497057669674, + "grad_norm": 20.34739875793457, + "learning_rate": 9.9202746409733e-06, + "loss": 5.3317, + "step": 1255 + }, + { + "epoch": 0.2471557473519027, + "grad_norm": 11.448872566223145, + "learning_rate": 9.919584744275652e-06, + "loss": 5.5831, + "step": 1260 + }, + { + "epoch": 0.24813652412710868, + "grad_norm": 14.755648612976074, + "learning_rate": 9.918891899639151e-06, + "loss": 5.3672, + "step": 1265 + }, + { + "epoch": 0.24911730090231463, + "grad_norm": 10.774361610412598, + "learning_rate": 9.918196107478966e-06, + "loss": 4.9776, + "step": 1270 + }, + { + "epoch": 0.25009807767752057, + "grad_norm": 13.798211097717285, + "learning_rate": 9.917497368212032e-06, + "loss": 5.3684, + "step": 1275 + }, + { + "epoch": 0.25009807767752057, + "eval_loss": 5.386422157287598, + "eval_runtime": 8.0843, + "eval_samples_per_second": 25.852, + "eval_steps_per_second": 12.988, + "step": 1275 + }, + { + "epoch": 0.25107885445272654, + "grad_norm": 14.56896686553955, + "learning_rate": 9.916795682257052e-06, + "loss": 5.5152, + "step": 1280 + }, + { + "epoch": 0.2520596312279325, + "grad_norm": 20.55042839050293, + "learning_rate": 9.916091050034496e-06, + "loss": 4.9962, + "step": 1285 + }, + { + "epoch": 0.2530404080031385, + "grad_norm": 16.28716278076172, + "learning_rate": 9.915383471966594e-06, + "loss": 5.566, + "step": 1290 + }, + { + "epoch": 0.25402118477834446, + "grad_norm": 44.03998565673828, + "learning_rate": 9.914672948477347e-06, + "loss": 5.226, + "step": 1295 + }, + { + "epoch": 0.2550019615535504, + "grad_norm": 10.094902992248535, + "learning_rate": 9.913959479992517e-06, + "loss": 5.1731, + "step": 1300 + }, + { + "epoch": 0.2559827383287564, + "grad_norm": 12.320008277893066, + "learning_rate": 9.913243066939631e-06, + "loss": 5.8408, + "step": 1305 + }, + { + "epoch": 0.2569635151039623, + "grad_norm": 12.670159339904785, + "learning_rate": 9.912523709747985e-06, + "loss": 5.0899, + "step": 1310 + }, + { + "epoch": 0.2579442918791683, + "grad_norm": 10.27662181854248, + "learning_rate": 9.911801408848634e-06, + "loss": 5.1799, + "step": 1315 + }, + { + "epoch": 0.25892506865437426, + "grad_norm": 17.295764923095703, + "learning_rate": 9.911076164674401e-06, + "loss": 5.104, + "step": 1320 + }, + { + "epoch": 0.25990584542958023, + "grad_norm": 21.576318740844727, + "learning_rate": 9.910347977659867e-06, + "loss": 5.6279, + "step": 1325 + }, + { + "epoch": 0.2608866222047862, + "grad_norm": 16.189517974853516, + "learning_rate": 9.909616848241382e-06, + "loss": 5.2356, + "step": 1330 + }, + { + "epoch": 0.26186739897999217, + "grad_norm": 15.81222152709961, + "learning_rate": 9.908882776857057e-06, + "loss": 5.5731, + "step": 1335 + }, + { + "epoch": 0.26284817575519814, + "grad_norm": 31.023778915405273, + "learning_rate": 9.908145763946766e-06, + "loss": 5.2728, + "step": 1340 + }, + { + "epoch": 0.26382895253040406, + "grad_norm": 16.400127410888672, + "learning_rate": 9.907405809952147e-06, + "loss": 5.775, + "step": 1345 + }, + { + "epoch": 0.26480972930561003, + "grad_norm": 30.60183334350586, + "learning_rate": 9.906662915316595e-06, + "loss": 5.4081, + "step": 1350 + }, + { + "epoch": 0.265790506080816, + "grad_norm": 12.121785163879395, + "learning_rate": 9.905917080485275e-06, + "loss": 5.498, + "step": 1355 + }, + { + "epoch": 0.266771282856022, + "grad_norm": 14.963290214538574, + "learning_rate": 9.905168305905109e-06, + "loss": 5.0885, + "step": 1360 + }, + { + "epoch": 0.26775205963122795, + "grad_norm": 19.471620559692383, + "learning_rate": 9.90441659202478e-06, + "loss": 5.2138, + "step": 1365 + }, + { + "epoch": 0.2687328364064339, + "grad_norm": 15.10096549987793, + "learning_rate": 9.903661939294737e-06, + "loss": 5.4012, + "step": 1370 + }, + { + "epoch": 0.26971361318163983, + "grad_norm": 14.973530769348145, + "learning_rate": 9.902904348167185e-06, + "loss": 5.329, + "step": 1375 + }, + { + "epoch": 0.2706943899568458, + "grad_norm": 12.646509170532227, + "learning_rate": 9.90214381909609e-06, + "loss": 5.2064, + "step": 1380 + }, + { + "epoch": 0.2716751667320518, + "grad_norm": 10.052018165588379, + "learning_rate": 9.901380352537183e-06, + "loss": 5.2056, + "step": 1385 + }, + { + "epoch": 0.27265594350725775, + "grad_norm": 16.167695999145508, + "learning_rate": 9.90061394894795e-06, + "loss": 5.4832, + "step": 1390 + }, + { + "epoch": 0.2736367202824637, + "grad_norm": 19.09334945678711, + "learning_rate": 9.899844608787641e-06, + "loss": 5.2711, + "step": 1395 + }, + { + "epoch": 0.2746174970576697, + "grad_norm": 17.699079513549805, + "learning_rate": 9.899072332517263e-06, + "loss": 5.3082, + "step": 1400 + }, + { + "epoch": 0.27559827383287566, + "grad_norm": 11.575671195983887, + "learning_rate": 9.898297120599585e-06, + "loss": 5.1952, + "step": 1405 + }, + { + "epoch": 0.2765790506080816, + "grad_norm": 14.866594314575195, + "learning_rate": 9.897518973499131e-06, + "loss": 5.3446, + "step": 1410 + }, + { + "epoch": 0.27755982738328755, + "grad_norm": 26.142955780029297, + "learning_rate": 9.89673789168219e-06, + "loss": 5.2227, + "step": 1415 + }, + { + "epoch": 0.2785406041584935, + "grad_norm": 34.1876220703125, + "learning_rate": 9.8959538756168e-06, + "loss": 5.5915, + "step": 1420 + }, + { + "epoch": 0.2795213809336995, + "grad_norm": 17.60607147216797, + "learning_rate": 9.89516692577277e-06, + "loss": 5.6551, + "step": 1425 + }, + { + "epoch": 0.28050215770890546, + "grad_norm": 15.993223190307617, + "learning_rate": 9.894377042621654e-06, + "loss": 5.2441, + "step": 1430 + }, + { + "epoch": 0.28148293448411144, + "grad_norm": 16.62636375427246, + "learning_rate": 9.893584226636773e-06, + "loss": 5.2074, + "step": 1435 + }, + { + "epoch": 0.2824637112593174, + "grad_norm": 23.400930404663086, + "learning_rate": 9.892788478293203e-06, + "loss": 5.3432, + "step": 1440 + }, + { + "epoch": 0.2834444880345233, + "grad_norm": 11.958523750305176, + "learning_rate": 9.891989798067774e-06, + "loss": 5.1582, + "step": 1445 + }, + { + "epoch": 0.2844252648097293, + "grad_norm": 13.033811569213867, + "learning_rate": 9.891188186439077e-06, + "loss": 5.2325, + "step": 1450 + }, + { + "epoch": 0.28540604158493527, + "grad_norm": 13.869037628173828, + "learning_rate": 9.890383643887458e-06, + "loss": 5.15, + "step": 1455 + }, + { + "epoch": 0.28638681836014124, + "grad_norm": 16.992834091186523, + "learning_rate": 9.889576170895016e-06, + "loss": 5.3062, + "step": 1460 + }, + { + "epoch": 0.2873675951353472, + "grad_norm": 15.610152244567871, + "learning_rate": 9.888765767945613e-06, + "loss": 5.4888, + "step": 1465 + }, + { + "epoch": 0.2883483719105532, + "grad_norm": 24.985816955566406, + "learning_rate": 9.887952435524863e-06, + "loss": 5.5596, + "step": 1470 + }, + { + "epoch": 0.2893291486857591, + "grad_norm": 17.40192222595215, + "learning_rate": 9.887136174120132e-06, + "loss": 5.1121, + "step": 1475 + }, + { + "epoch": 0.29030992546096507, + "grad_norm": 15.199414253234863, + "learning_rate": 9.886316984220546e-06, + "loss": 4.9102, + "step": 1480 + }, + { + "epoch": 0.29129070223617104, + "grad_norm": 17.887176513671875, + "learning_rate": 9.885494866316985e-06, + "loss": 5.2968, + "step": 1485 + }, + { + "epoch": 0.292271479011377, + "grad_norm": 9.527190208435059, + "learning_rate": 9.884669820902081e-06, + "loss": 5.0361, + "step": 1490 + }, + { + "epoch": 0.293252255786583, + "grad_norm": 11.876401901245117, + "learning_rate": 9.883841848470222e-06, + "loss": 5.2537, + "step": 1495 + }, + { + "epoch": 0.29423303256178895, + "grad_norm": 16.37000274658203, + "learning_rate": 9.883010949517553e-06, + "loss": 5.2722, + "step": 1500 + }, + { + "epoch": 0.2952138093369949, + "grad_norm": 15.113862991333008, + "learning_rate": 9.882177124541965e-06, + "loss": 5.3566, + "step": 1505 + }, + { + "epoch": 0.29619458611220084, + "grad_norm": 14.421209335327148, + "learning_rate": 9.881340374043111e-06, + "loss": 5.1384, + "step": 1510 + }, + { + "epoch": 0.2971753628874068, + "grad_norm": 24.657230377197266, + "learning_rate": 9.880500698522391e-06, + "loss": 5.6017, + "step": 1515 + }, + { + "epoch": 0.2981561396626128, + "grad_norm": 12.178624153137207, + "learning_rate": 9.879658098482959e-06, + "loss": 5.016, + "step": 1520 + }, + { + "epoch": 0.29913691643781876, + "grad_norm": 23.32276153564453, + "learning_rate": 9.878812574429722e-06, + "loss": 5.704, + "step": 1525 + }, + { + "epoch": 0.3001176932130247, + "grad_norm": 21.334623336791992, + "learning_rate": 9.877964126869341e-06, + "loss": 4.914, + "step": 1530 + }, + { + "epoch": 0.3010984699882307, + "grad_norm": 16.11152458190918, + "learning_rate": 9.877112756310225e-06, + "loss": 5.5664, + "step": 1535 + }, + { + "epoch": 0.3020792467634366, + "grad_norm": 22.593204498291016, + "learning_rate": 9.87625846326254e-06, + "loss": 5.2208, + "step": 1540 + }, + { + "epoch": 0.3030600235386426, + "grad_norm": 23.615087509155273, + "learning_rate": 9.875401248238197e-06, + "loss": 5.3042, + "step": 1545 + }, + { + "epoch": 0.30404080031384856, + "grad_norm": 14.477107048034668, + "learning_rate": 9.874541111750861e-06, + "loss": 5.3559, + "step": 1550 + }, + { + "epoch": 0.30502157708905453, + "grad_norm": 14.735102653503418, + "learning_rate": 9.873678054315949e-06, + "loss": 5.4065, + "step": 1555 + }, + { + "epoch": 0.3060023538642605, + "grad_norm": 11.800651550292969, + "learning_rate": 9.872812076450625e-06, + "loss": 5.4283, + "step": 1560 + }, + { + "epoch": 0.3069831306394665, + "grad_norm": 24.840158462524414, + "learning_rate": 9.871943178673806e-06, + "loss": 5.298, + "step": 1565 + }, + { + "epoch": 0.30796390741467244, + "grad_norm": 12.651166915893555, + "learning_rate": 9.871071361506156e-06, + "loss": 5.1233, + "step": 1570 + }, + { + "epoch": 0.30894468418987836, + "grad_norm": 30.376554489135742, + "learning_rate": 9.87019662547009e-06, + "loss": 5.5695, + "step": 1575 + }, + { + "epoch": 0.30992546096508433, + "grad_norm": 15.623506546020508, + "learning_rate": 9.869318971089774e-06, + "loss": 5.3929, + "step": 1580 + }, + { + "epoch": 0.3109062377402903, + "grad_norm": 31.46923828125, + "learning_rate": 9.868438398891118e-06, + "loss": 5.3255, + "step": 1585 + }, + { + "epoch": 0.3118870145154963, + "grad_norm": 15.704526901245117, + "learning_rate": 9.867554909401785e-06, + "loss": 5.1772, + "step": 1590 + }, + { + "epoch": 0.31286779129070225, + "grad_norm": 18.602909088134766, + "learning_rate": 9.866668503151182e-06, + "loss": 4.9734, + "step": 1595 + }, + { + "epoch": 0.3138485680659082, + "grad_norm": 20.587440490722656, + "learning_rate": 9.865779180670468e-06, + "loss": 5.0147, + "step": 1600 + }, + { + "epoch": 0.3148293448411142, + "grad_norm": 24.15857696533203, + "learning_rate": 9.864886942492543e-06, + "loss": 5.2671, + "step": 1605 + }, + { + "epoch": 0.3158101216163201, + "grad_norm": 10.229106903076172, + "learning_rate": 9.863991789152065e-06, + "loss": 5.3938, + "step": 1610 + }, + { + "epoch": 0.3167908983915261, + "grad_norm": 19.83039665222168, + "learning_rate": 9.86309372118543e-06, + "loss": 4.9836, + "step": 1615 + }, + { + "epoch": 0.31777167516673205, + "grad_norm": 24.620084762573242, + "learning_rate": 9.86219273913078e-06, + "loss": 5.1878, + "step": 1620 + }, + { + "epoch": 0.318752451941938, + "grad_norm": 10.462682723999023, + "learning_rate": 9.86128884352801e-06, + "loss": 5.8849, + "step": 1625 + }, + { + "epoch": 0.319733228717144, + "grad_norm": 16.412769317626953, + "learning_rate": 9.860382034918754e-06, + "loss": 5.3453, + "step": 1630 + }, + { + "epoch": 0.32071400549234996, + "grad_norm": 27.44397735595703, + "learning_rate": 9.859472313846396e-06, + "loss": 5.226, + "step": 1635 + }, + { + "epoch": 0.3216947822675559, + "grad_norm": 22.453519821166992, + "learning_rate": 9.858559680856064e-06, + "loss": 5.4033, + "step": 1640 + }, + { + "epoch": 0.32267555904276185, + "grad_norm": 15.075695991516113, + "learning_rate": 9.857644136494629e-06, + "loss": 5.2115, + "step": 1645 + }, + { + "epoch": 0.3236563358179678, + "grad_norm": 12.842907905578613, + "learning_rate": 9.85672568131071e-06, + "loss": 5.0739, + "step": 1650 + }, + { + "epoch": 0.3246371125931738, + "grad_norm": 15.345038414001465, + "learning_rate": 9.855804315854667e-06, + "loss": 5.1001, + "step": 1655 + }, + { + "epoch": 0.32561788936837976, + "grad_norm": 12.392874717712402, + "learning_rate": 9.854880040678608e-06, + "loss": 4.9565, + "step": 1660 + }, + { + "epoch": 0.32659866614358574, + "grad_norm": 14.970841407775879, + "learning_rate": 9.853952856336377e-06, + "loss": 5.2334, + "step": 1665 + }, + { + "epoch": 0.3275794429187917, + "grad_norm": 23.218185424804688, + "learning_rate": 9.853022763383572e-06, + "loss": 5.2103, + "step": 1670 + }, + { + "epoch": 0.3285602196939976, + "grad_norm": 16.600784301757812, + "learning_rate": 9.852089762377525e-06, + "loss": 5.359, + "step": 1675 + }, + { + "epoch": 0.3295409964692036, + "grad_norm": 31.103673934936523, + "learning_rate": 9.851153853877314e-06, + "loss": 5.3389, + "step": 1680 + }, + { + "epoch": 0.33052177324440957, + "grad_norm": 19.520828247070312, + "learning_rate": 9.850215038443756e-06, + "loss": 5.2418, + "step": 1685 + }, + { + "epoch": 0.33150255001961554, + "grad_norm": 31.281417846679688, + "learning_rate": 9.849273316639418e-06, + "loss": 5.5463, + "step": 1690 + }, + { + "epoch": 0.3324833267948215, + "grad_norm": 14.541434288024902, + "learning_rate": 9.8483286890286e-06, + "loss": 5.1112, + "step": 1695 + }, + { + "epoch": 0.3334641035700275, + "grad_norm": 20.683828353881836, + "learning_rate": 9.847381156177349e-06, + "loss": 5.7917, + "step": 1700 + }, + { + "epoch": 0.3344448803452334, + "grad_norm": 11.94363784790039, + "learning_rate": 9.846430718653449e-06, + "loss": 5.3054, + "step": 1705 + }, + { + "epoch": 0.33542565712043937, + "grad_norm": 14.712422370910645, + "learning_rate": 9.845477377026426e-06, + "loss": 5.3564, + "step": 1710 + }, + { + "epoch": 0.33640643389564534, + "grad_norm": 20.38669204711914, + "learning_rate": 9.844521131867546e-06, + "loss": 5.4337, + "step": 1715 + }, + { + "epoch": 0.3373872106708513, + "grad_norm": 16.519319534301758, + "learning_rate": 9.843561983749816e-06, + "loss": 5.4502, + "step": 1720 + }, + { + "epoch": 0.3383679874460573, + "grad_norm": 12.27083683013916, + "learning_rate": 9.84259993324798e-06, + "loss": 5.3244, + "step": 1725 + }, + { + "epoch": 0.33934876422126325, + "grad_norm": 15.599812507629395, + "learning_rate": 9.841634980938526e-06, + "loss": 5.4292, + "step": 1730 + }, + { + "epoch": 0.3403295409964692, + "grad_norm": 11.654434204101562, + "learning_rate": 9.840667127399675e-06, + "loss": 5.1737, + "step": 1735 + }, + { + "epoch": 0.34131031777167514, + "grad_norm": 22.475311279296875, + "learning_rate": 9.83969637321139e-06, + "loss": 5.4079, + "step": 1740 + }, + { + "epoch": 0.3422910945468811, + "grad_norm": 10.999152183532715, + "learning_rate": 9.838722718955372e-06, + "loss": 5.4605, + "step": 1745 + }, + { + "epoch": 0.3432718713220871, + "grad_norm": 19.069194793701172, + "learning_rate": 9.837746165215057e-06, + "loss": 5.6759, + "step": 1750 + }, + { + "epoch": 0.34425264809729306, + "grad_norm": 15.041163444519043, + "learning_rate": 9.836766712575622e-06, + "loss": 5.8595, + "step": 1755 + }, + { + "epoch": 0.34523342487249903, + "grad_norm": 12.235005378723145, + "learning_rate": 9.83578436162398e-06, + "loss": 5.4856, + "step": 1760 + }, + { + "epoch": 0.346214201647705, + "grad_norm": 17.395387649536133, + "learning_rate": 9.83479911294878e-06, + "loss": 5.2461, + "step": 1765 + }, + { + "epoch": 0.34719497842291097, + "grad_norm": 11.41357135772705, + "learning_rate": 9.833810967140408e-06, + "loss": 5.3194, + "step": 1770 + }, + { + "epoch": 0.3481757551981169, + "grad_norm": 11.756146430969238, + "learning_rate": 9.832819924790986e-06, + "loss": 5.3896, + "step": 1775 + }, + { + "epoch": 0.34915653197332286, + "grad_norm": 19.872554779052734, + "learning_rate": 9.83182598649437e-06, + "loss": 5.355, + "step": 1780 + }, + { + "epoch": 0.35013730874852883, + "grad_norm": 24.415315628051758, + "learning_rate": 9.830829152846154e-06, + "loss": 5.2137, + "step": 1785 + }, + { + "epoch": 0.3511180855237348, + "grad_norm": 19.011884689331055, + "learning_rate": 9.829829424443666e-06, + "loss": 5.478, + "step": 1790 + }, + { + "epoch": 0.3520988622989408, + "grad_norm": 12.593403816223145, + "learning_rate": 9.828826801885967e-06, + "loss": 5.7249, + "step": 1795 + }, + { + "epoch": 0.35307963907414674, + "grad_norm": 14.620553016662598, + "learning_rate": 9.827821285773855e-06, + "loss": 5.3739, + "step": 1800 + }, + { + "epoch": 0.35406041584935266, + "grad_norm": 26.63345718383789, + "learning_rate": 9.826812876709861e-06, + "loss": 5.2293, + "step": 1805 + }, + { + "epoch": 0.35504119262455863, + "grad_norm": 19.860340118408203, + "learning_rate": 9.825801575298248e-06, + "loss": 5.567, + "step": 1810 + }, + { + "epoch": 0.3560219693997646, + "grad_norm": 18.279808044433594, + "learning_rate": 9.824787382145013e-06, + "loss": 5.248, + "step": 1815 + }, + { + "epoch": 0.3570027461749706, + "grad_norm": 18.21912384033203, + "learning_rate": 9.82377029785789e-06, + "loss": 5.4724, + "step": 1820 + }, + { + "epoch": 0.35798352295017655, + "grad_norm": 18.464963912963867, + "learning_rate": 9.822750323046333e-06, + "loss": 5.1548, + "step": 1825 + }, + { + "epoch": 0.3589642997253825, + "grad_norm": 16.67640495300293, + "learning_rate": 9.821727458321544e-06, + "loss": 5.2362, + "step": 1830 + }, + { + "epoch": 0.3599450765005885, + "grad_norm": 23.646631240844727, + "learning_rate": 9.820701704296447e-06, + "loss": 5.2346, + "step": 1835 + }, + { + "epoch": 0.3609258532757944, + "grad_norm": 24.845317840576172, + "learning_rate": 9.819673061585698e-06, + "loss": 5.4409, + "step": 1840 + }, + { + "epoch": 0.3619066300510004, + "grad_norm": 28.531173706054688, + "learning_rate": 9.818641530805688e-06, + "loss": 5.3217, + "step": 1845 + }, + { + "epoch": 0.36288740682620635, + "grad_norm": 22.022838592529297, + "learning_rate": 9.817607112574534e-06, + "loss": 5.3782, + "step": 1850 + }, + { + "epoch": 0.3638681836014123, + "grad_norm": 17.82651138305664, + "learning_rate": 9.816569807512088e-06, + "loss": 5.1621, + "step": 1855 + }, + { + "epoch": 0.3648489603766183, + "grad_norm": 12.07430362701416, + "learning_rate": 9.815529616239927e-06, + "loss": 5.5542, + "step": 1860 + }, + { + "epoch": 0.36582973715182426, + "grad_norm": 12.271188735961914, + "learning_rate": 9.81448653938136e-06, + "loss": 5.1587, + "step": 1865 + }, + { + "epoch": 0.36681051392703024, + "grad_norm": 13.226649284362793, + "learning_rate": 9.813440577561429e-06, + "loss": 5.3729, + "step": 1870 + }, + { + "epoch": 0.36779129070223615, + "grad_norm": 14.851487159729004, + "learning_rate": 9.812391731406893e-06, + "loss": 5.1667, + "step": 1875 + }, + { + "epoch": 0.3687720674774421, + "grad_norm": 23.42156410217285, + "learning_rate": 9.811340001546252e-06, + "loss": 5.4899, + "step": 1880 + }, + { + "epoch": 0.3697528442526481, + "grad_norm": 11.76750659942627, + "learning_rate": 9.81028538860973e-06, + "loss": 5.0813, + "step": 1885 + }, + { + "epoch": 0.37073362102785407, + "grad_norm": 34.78882598876953, + "learning_rate": 9.809227893229273e-06, + "loss": 5.2967, + "step": 1890 + }, + { + "epoch": 0.37171439780306004, + "grad_norm": 19.89539909362793, + "learning_rate": 9.808167516038562e-06, + "loss": 5.2345, + "step": 1895 + }, + { + "epoch": 0.372695174578266, + "grad_norm": 16.6094913482666, + "learning_rate": 9.807104257673003e-06, + "loss": 4.9801, + "step": 1900 + }, + { + "epoch": 0.3736759513534719, + "grad_norm": 22.73749351501465, + "learning_rate": 9.806038118769724e-06, + "loss": 5.5207, + "step": 1905 + }, + { + "epoch": 0.3746567281286779, + "grad_norm": 10.95235538482666, + "learning_rate": 9.804969099967583e-06, + "loss": 5.0407, + "step": 1910 + }, + { + "epoch": 0.37563750490388387, + "grad_norm": 22.019926071166992, + "learning_rate": 9.803897201907164e-06, + "loss": 5.0178, + "step": 1915 + }, + { + "epoch": 0.37661828167908984, + "grad_norm": 29.805604934692383, + "learning_rate": 9.802822425230776e-06, + "loss": 5.3743, + "step": 1920 + }, + { + "epoch": 0.3775990584542958, + "grad_norm": 16.524503707885742, + "learning_rate": 9.801744770582449e-06, + "loss": 4.6636, + "step": 1925 + }, + { + "epoch": 0.3785798352295018, + "grad_norm": 21.630876541137695, + "learning_rate": 9.800664238607942e-06, + "loss": 5.465, + "step": 1930 + }, + { + "epoch": 0.37956061200470775, + "grad_norm": 22.99994468688965, + "learning_rate": 9.799580829954739e-06, + "loss": 4.9949, + "step": 1935 + }, + { + "epoch": 0.38054138877991367, + "grad_norm": 21.744600296020508, + "learning_rate": 9.798494545272044e-06, + "loss": 5.2195, + "step": 1940 + }, + { + "epoch": 0.38152216555511964, + "grad_norm": 19.225811004638672, + "learning_rate": 9.797405385210787e-06, + "loss": 5.3427, + "step": 1945 + }, + { + "epoch": 0.3825029423303256, + "grad_norm": 16.982257843017578, + "learning_rate": 9.796313350423619e-06, + "loss": 5.5768, + "step": 1950 + }, + { + "epoch": 0.3834837191055316, + "grad_norm": 13.56407642364502, + "learning_rate": 9.795218441564914e-06, + "loss": 5.4152, + "step": 1955 + }, + { + "epoch": 0.38446449588073756, + "grad_norm": 20.912071228027344, + "learning_rate": 9.79412065929077e-06, + "loss": 5.3516, + "step": 1960 + }, + { + "epoch": 0.3854452726559435, + "grad_norm": 18.138721466064453, + "learning_rate": 9.793020004259008e-06, + "loss": 5.4028, + "step": 1965 + }, + { + "epoch": 0.38642604943114944, + "grad_norm": 18.006498336791992, + "learning_rate": 9.791916477129165e-06, + "loss": 5.0267, + "step": 1970 + }, + { + "epoch": 0.3874068262063554, + "grad_norm": 12.000133514404297, + "learning_rate": 9.790810078562503e-06, + "loss": 5.2237, + "step": 1975 + }, + { + "epoch": 0.3883876029815614, + "grad_norm": 21.019866943359375, + "learning_rate": 9.789700809222005e-06, + "loss": 5.5981, + "step": 1980 + }, + { + "epoch": 0.38936837975676736, + "grad_norm": 12.761972427368164, + "learning_rate": 9.78858866977237e-06, + "loss": 5.0066, + "step": 1985 + }, + { + "epoch": 0.39034915653197333, + "grad_norm": 14.214351654052734, + "learning_rate": 9.787473660880022e-06, + "loss": 5.9138, + "step": 1990 + }, + { + "epoch": 0.3913299333071793, + "grad_norm": 15.80001163482666, + "learning_rate": 9.786355783213104e-06, + "loss": 5.6329, + "step": 1995 + }, + { + "epoch": 0.39231071008238527, + "grad_norm": 34.411319732666016, + "learning_rate": 9.785235037441473e-06, + "loss": 5.4683, + "step": 2000 + }, + { + "epoch": 0.3932914868575912, + "grad_norm": 21.931644439697266, + "learning_rate": 9.784111424236713e-06, + "loss": 5.0713, + "step": 2005 + }, + { + "epoch": 0.39427226363279716, + "grad_norm": 15.656501770019531, + "learning_rate": 9.782984944272115e-06, + "loss": 5.2415, + "step": 2010 + }, + { + "epoch": 0.39525304040800313, + "grad_norm": 18.855010986328125, + "learning_rate": 9.781855598222698e-06, + "loss": 5.1184, + "step": 2015 + }, + { + "epoch": 0.3962338171832091, + "grad_norm": 21.040325164794922, + "learning_rate": 9.780723386765194e-06, + "loss": 5.2587, + "step": 2020 + }, + { + "epoch": 0.3972145939584151, + "grad_norm": 21.39097023010254, + "learning_rate": 9.779588310578051e-06, + "loss": 5.2524, + "step": 2025 + }, + { + "epoch": 0.39819537073362105, + "grad_norm": 9.114205360412598, + "learning_rate": 9.778450370341439e-06, + "loss": 5.4023, + "step": 2030 + }, + { + "epoch": 0.399176147508827, + "grad_norm": 19.155128479003906, + "learning_rate": 9.777309566737236e-06, + "loss": 5.11, + "step": 2035 + }, + { + "epoch": 0.40015692428403293, + "grad_norm": 19.29751205444336, + "learning_rate": 9.776165900449044e-06, + "loss": 4.9788, + "step": 2040 + }, + { + "epoch": 0.4011377010592389, + "grad_norm": 25.808748245239258, + "learning_rate": 9.775019372162173e-06, + "loss": 5.4822, + "step": 2045 + }, + { + "epoch": 0.4021184778344449, + "grad_norm": 18.989933013916016, + "learning_rate": 9.773869982563653e-06, + "loss": 5.6281, + "step": 2050 + }, + { + "epoch": 0.40309925460965085, + "grad_norm": 26.764808654785156, + "learning_rate": 9.77271773234223e-06, + "loss": 5.2508, + "step": 2055 + }, + { + "epoch": 0.4040800313848568, + "grad_norm": 24.62432289123535, + "learning_rate": 9.771562622188355e-06, + "loss": 5.4812, + "step": 2060 + }, + { + "epoch": 0.4050608081600628, + "grad_norm": 13.985261917114258, + "learning_rate": 9.770404652794206e-06, + "loss": 5.2385, + "step": 2065 + }, + { + "epoch": 0.4060415849352687, + "grad_norm": 31.23125457763672, + "learning_rate": 9.769243824853661e-06, + "loss": 5.1801, + "step": 2070 + }, + { + "epoch": 0.4070223617104747, + "grad_norm": 25.266263961791992, + "learning_rate": 9.768080139062321e-06, + "loss": 5.2785, + "step": 2075 + }, + { + "epoch": 0.40800313848568065, + "grad_norm": 23.64206886291504, + "learning_rate": 9.766913596117497e-06, + "loss": 5.5125, + "step": 2080 + }, + { + "epoch": 0.4089839152608866, + "grad_norm": 20.972145080566406, + "learning_rate": 9.765744196718207e-06, + "loss": 5.0472, + "step": 2085 + }, + { + "epoch": 0.4099646920360926, + "grad_norm": 22.69337272644043, + "learning_rate": 9.764571941565189e-06, + "loss": 4.8892, + "step": 2090 + }, + { + "epoch": 0.41094546881129856, + "grad_norm": 20.279399871826172, + "learning_rate": 9.763396831360884e-06, + "loss": 5.4165, + "step": 2095 + }, + { + "epoch": 0.41192624558650454, + "grad_norm": 22.26048469543457, + "learning_rate": 9.76221886680945e-06, + "loss": 5.0247, + "step": 2100 + }, + { + "epoch": 0.41290702236171045, + "grad_norm": 23.47907066345215, + "learning_rate": 9.76103804861675e-06, + "loss": 5.4522, + "step": 2105 + }, + { + "epoch": 0.4138877991369164, + "grad_norm": 27.87772560119629, + "learning_rate": 9.75985437749036e-06, + "loss": 5.1454, + "step": 2110 + }, + { + "epoch": 0.4148685759121224, + "grad_norm": 18.217052459716797, + "learning_rate": 9.758667854139572e-06, + "loss": 5.4936, + "step": 2115 + }, + { + "epoch": 0.41584935268732837, + "grad_norm": 18.460439682006836, + "learning_rate": 9.757478479275373e-06, + "loss": 5.0919, + "step": 2120 + }, + { + "epoch": 0.41683012946253434, + "grad_norm": 17.97971534729004, + "learning_rate": 9.75628625361047e-06, + "loss": 5.1305, + "step": 2125 + }, + { + "epoch": 0.4178109062377403, + "grad_norm": 35.104087829589844, + "learning_rate": 9.755091177859273e-06, + "loss": 5.4199, + "step": 2130 + }, + { + "epoch": 0.4187916830129463, + "grad_norm": 13.479544639587402, + "learning_rate": 9.753893252737903e-06, + "loss": 5.1701, + "step": 2135 + }, + { + "epoch": 0.4197724597881522, + "grad_norm": 17.23796844482422, + "learning_rate": 9.752692478964186e-06, + "loss": 5.186, + "step": 2140 + }, + { + "epoch": 0.42075323656335817, + "grad_norm": 68.8131332397461, + "learning_rate": 9.751488857257657e-06, + "loss": 5.2711, + "step": 2145 + }, + { + "epoch": 0.42173401333856414, + "grad_norm": 16.84771156311035, + "learning_rate": 9.750282388339554e-06, + "loss": 5.0436, + "step": 2150 + }, + { + "epoch": 0.4227147901137701, + "grad_norm": 16.67029571533203, + "learning_rate": 9.749073072932824e-06, + "loss": 5.4642, + "step": 2155 + }, + { + "epoch": 0.4236955668889761, + "grad_norm": 13.9814453125, + "learning_rate": 9.747860911762122e-06, + "loss": 5.7409, + "step": 2160 + }, + { + "epoch": 0.42467634366418205, + "grad_norm": 28.299707412719727, + "learning_rate": 9.746645905553802e-06, + "loss": 5.4507, + "step": 2165 + }, + { + "epoch": 0.42565712043938797, + "grad_norm": 27.976619720458984, + "learning_rate": 9.745428055035928e-06, + "loss": 5.0855, + "step": 2170 + }, + { + "epoch": 0.42663789721459394, + "grad_norm": 22.406450271606445, + "learning_rate": 9.744207360938267e-06, + "loss": 5.2556, + "step": 2175 + }, + { + "epoch": 0.4276186739897999, + "grad_norm": 19.76823616027832, + "learning_rate": 9.742983823992289e-06, + "loss": 4.8967, + "step": 2180 + }, + { + "epoch": 0.4285994507650059, + "grad_norm": 19.014881134033203, + "learning_rate": 9.741757444931169e-06, + "loss": 4.972, + "step": 2185 + }, + { + "epoch": 0.42958022754021186, + "grad_norm": 23.06075668334961, + "learning_rate": 9.74052822448978e-06, + "loss": 5.1589, + "step": 2190 + }, + { + "epoch": 0.43056100431541783, + "grad_norm": 14.573113441467285, + "learning_rate": 9.739296163404708e-06, + "loss": 5.2036, + "step": 2195 + }, + { + "epoch": 0.4315417810906238, + "grad_norm": 17.803037643432617, + "learning_rate": 9.738061262414232e-06, + "loss": 5.0363, + "step": 2200 + }, + { + "epoch": 0.4325225578658297, + "grad_norm": 36.00239944458008, + "learning_rate": 9.736823522258334e-06, + "loss": 5.421, + "step": 2205 + }, + { + "epoch": 0.4335033346410357, + "grad_norm": 13.375822067260742, + "learning_rate": 9.735582943678701e-06, + "loss": 5.034, + "step": 2210 + }, + { + "epoch": 0.43448411141624166, + "grad_norm": 13.493207931518555, + "learning_rate": 9.73433952741872e-06, + "loss": 5.356, + "step": 2215 + }, + { + "epoch": 0.43546488819144763, + "grad_norm": 26.28885269165039, + "learning_rate": 9.733093274223474e-06, + "loss": 5.2359, + "step": 2220 + }, + { + "epoch": 0.4364456649666536, + "grad_norm": 25.277320861816406, + "learning_rate": 9.731844184839751e-06, + "loss": 5.4302, + "step": 2225 + }, + { + "epoch": 0.4374264417418596, + "grad_norm": 26.082422256469727, + "learning_rate": 9.73059226001604e-06, + "loss": 5.0238, + "step": 2230 + }, + { + "epoch": 0.4384072185170655, + "grad_norm": 64.33253479003906, + "learning_rate": 9.729337500502519e-06, + "loss": 5.3079, + "step": 2235 + }, + { + "epoch": 0.43938799529227146, + "grad_norm": 28.68381118774414, + "learning_rate": 9.728079907051076e-06, + "loss": 5.1955, + "step": 2240 + }, + { + "epoch": 0.44036877206747743, + "grad_norm": 12.982084274291992, + "learning_rate": 9.726819480415292e-06, + "loss": 4.99, + "step": 2245 + }, + { + "epoch": 0.4413495488426834, + "grad_norm": 27.528892517089844, + "learning_rate": 9.725556221350448e-06, + "loss": 5.2779, + "step": 2250 + }, + { + "epoch": 0.4423303256178894, + "grad_norm": 14.044219970703125, + "learning_rate": 9.724290130613518e-06, + "loss": 4.8775, + "step": 2255 + }, + { + "epoch": 0.44331110239309535, + "grad_norm": 17.11577796936035, + "learning_rate": 9.723021208963174e-06, + "loss": 5.2466, + "step": 2260 + }, + { + "epoch": 0.4442918791683013, + "grad_norm": 19.760128021240234, + "learning_rate": 9.72174945715979e-06, + "loss": 5.31, + "step": 2265 + }, + { + "epoch": 0.44527265594350723, + "grad_norm": 12.064308166503906, + "learning_rate": 9.72047487596543e-06, + "loss": 5.0058, + "step": 2270 + }, + { + "epoch": 0.4462534327187132, + "grad_norm": 21.768482208251953, + "learning_rate": 9.71919746614385e-06, + "loss": 5.2065, + "step": 2275 + }, + { + "epoch": 0.4472342094939192, + "grad_norm": 25.759071350097656, + "learning_rate": 9.717917228460516e-06, + "loss": 5.234, + "step": 2280 + }, + { + "epoch": 0.44821498626912515, + "grad_norm": 12.627178192138672, + "learning_rate": 9.71663416368257e-06, + "loss": 5.1581, + "step": 2285 + }, + { + "epoch": 0.4491957630443311, + "grad_norm": 15.976608276367188, + "learning_rate": 9.715348272578861e-06, + "loss": 5.026, + "step": 2290 + }, + { + "epoch": 0.4501765398195371, + "grad_norm": 16.61644172668457, + "learning_rate": 9.714059555919928e-06, + "loss": 4.9655, + "step": 2295 + }, + { + "epoch": 0.45115731659474306, + "grad_norm": 13.492209434509277, + "learning_rate": 9.712768014477997e-06, + "loss": 5.3452, + "step": 2300 + }, + { + "epoch": 0.452138093369949, + "grad_norm": 18.46678924560547, + "learning_rate": 9.711473649027e-06, + "loss": 4.9953, + "step": 2305 + }, + { + "epoch": 0.45311887014515495, + "grad_norm": 23.91395378112793, + "learning_rate": 9.710176460342546e-06, + "loss": 5.1415, + "step": 2310 + }, + { + "epoch": 0.4540996469203609, + "grad_norm": 40.245357513427734, + "learning_rate": 9.708876449201945e-06, + "loss": 4.8526, + "step": 2315 + }, + { + "epoch": 0.4550804236955669, + "grad_norm": 19.441259384155273, + "learning_rate": 9.7075736163842e-06, + "loss": 4.808, + "step": 2320 + }, + { + "epoch": 0.45606120047077287, + "grad_norm": 19.449247360229492, + "learning_rate": 9.706267962669999e-06, + "loss": 5.0985, + "step": 2325 + }, + { + "epoch": 0.45704197724597884, + "grad_norm": 19.058900833129883, + "learning_rate": 9.70495948884172e-06, + "loss": 4.9696, + "step": 2330 + }, + { + "epoch": 0.45802275402118475, + "grad_norm": 12.461446762084961, + "learning_rate": 9.703648195683438e-06, + "loss": 5.4223, + "step": 2335 + }, + { + "epoch": 0.4590035307963907, + "grad_norm": 19.470163345336914, + "learning_rate": 9.70233408398091e-06, + "loss": 5.3928, + "step": 2340 + }, + { + "epoch": 0.4599843075715967, + "grad_norm": 36.08694076538086, + "learning_rate": 9.701017154521584e-06, + "loss": 5.2037, + "step": 2345 + }, + { + "epoch": 0.46096508434680267, + "grad_norm": 23.371965408325195, + "learning_rate": 9.699697408094597e-06, + "loss": 5.1928, + "step": 2350 + }, + { + "epoch": 0.46194586112200864, + "grad_norm": 24.838356018066406, + "learning_rate": 9.698374845490779e-06, + "loss": 5.0539, + "step": 2355 + }, + { + "epoch": 0.4629266378972146, + "grad_norm": 11.548550605773926, + "learning_rate": 9.697049467502637e-06, + "loss": 5.2077, + "step": 2360 + }, + { + "epoch": 0.4639074146724206, + "grad_norm": 14.973907470703125, + "learning_rate": 9.695721274924374e-06, + "loss": 5.4514, + "step": 2365 + }, + { + "epoch": 0.4648881914476265, + "grad_norm": 16.513147354125977, + "learning_rate": 9.694390268551875e-06, + "loss": 5.3534, + "step": 2370 + }, + { + "epoch": 0.46586896822283247, + "grad_norm": 12.842937469482422, + "learning_rate": 9.693056449182714e-06, + "loss": 5.1984, + "step": 2375 + }, + { + "epoch": 0.46684974499803844, + "grad_norm": 21.4840030670166, + "learning_rate": 9.691719817616148e-06, + "loss": 5.0132, + "step": 2380 + }, + { + "epoch": 0.4678305217732444, + "grad_norm": 35.71097946166992, + "learning_rate": 9.690380374653121e-06, + "loss": 5.0982, + "step": 2385 + }, + { + "epoch": 0.4688112985484504, + "grad_norm": 32.27992630004883, + "learning_rate": 9.689038121096259e-06, + "loss": 5.1919, + "step": 2390 + }, + { + "epoch": 0.46979207532365636, + "grad_norm": 11.394453048706055, + "learning_rate": 9.687693057749876e-06, + "loss": 4.8216, + "step": 2395 + }, + { + "epoch": 0.4707728520988623, + "grad_norm": 15.756589889526367, + "learning_rate": 9.686345185419968e-06, + "loss": 5.0804, + "step": 2400 + }, + { + "epoch": 0.47175362887406824, + "grad_norm": 11.9701509475708, + "learning_rate": 9.684994504914212e-06, + "loss": 4.9623, + "step": 2405 + }, + { + "epoch": 0.4727344056492742, + "grad_norm": 30.16839599609375, + "learning_rate": 9.683641017041971e-06, + "loss": 5.1943, + "step": 2410 + }, + { + "epoch": 0.4737151824244802, + "grad_norm": 27.328493118286133, + "learning_rate": 9.68228472261429e-06, + "loss": 5.3544, + "step": 2415 + }, + { + "epoch": 0.47469595919968616, + "grad_norm": 12.850371360778809, + "learning_rate": 9.680925622443893e-06, + "loss": 5.3483, + "step": 2420 + }, + { + "epoch": 0.47567673597489213, + "grad_norm": 13.248522758483887, + "learning_rate": 9.679563717345186e-06, + "loss": 5.5595, + "step": 2425 + }, + { + "epoch": 0.4766575127500981, + "grad_norm": 16.820730209350586, + "learning_rate": 9.67819900813426e-06, + "loss": 5.2644, + "step": 2430 + }, + { + "epoch": 0.477638289525304, + "grad_norm": 19.460813522338867, + "learning_rate": 9.676831495628881e-06, + "loss": 5.2494, + "step": 2435 + }, + { + "epoch": 0.47861906630051, + "grad_norm": 19.54216766357422, + "learning_rate": 9.675461180648498e-06, + "loss": 5.3748, + "step": 2440 + }, + { + "epoch": 0.47959984307571596, + "grad_norm": 23.862497329711914, + "learning_rate": 9.674088064014235e-06, + "loss": 5.0946, + "step": 2445 + }, + { + "epoch": 0.48058061985092193, + "grad_norm": 10.924250602722168, + "learning_rate": 9.672712146548903e-06, + "loss": 5.156, + "step": 2450 + }, + { + "epoch": 0.4815613966261279, + "grad_norm": 24.398984909057617, + "learning_rate": 9.671333429076983e-06, + "loss": 5.2259, + "step": 2455 + }, + { + "epoch": 0.4825421734013339, + "grad_norm": 12.020353317260742, + "learning_rate": 9.669951912424638e-06, + "loss": 5.2717, + "step": 2460 + }, + { + "epoch": 0.48352295017653985, + "grad_norm": 22.294231414794922, + "learning_rate": 9.668567597419708e-06, + "loss": 5.2593, + "step": 2465 + }, + { + "epoch": 0.48450372695174576, + "grad_norm": 21.997915267944336, + "learning_rate": 9.667180484891707e-06, + "loss": 5.154, + "step": 2470 + }, + { + "epoch": 0.48548450372695173, + "grad_norm": 18.885684967041016, + "learning_rate": 9.66579057567183e-06, + "loss": 5.0931, + "step": 2475 + }, + { + "epoch": 0.4864652805021577, + "grad_norm": 25.34767723083496, + "learning_rate": 9.664397870592945e-06, + "loss": 4.9524, + "step": 2480 + }, + { + "epoch": 0.4874460572773637, + "grad_norm": 15.552264213562012, + "learning_rate": 9.663002370489596e-06, + "loss": 4.9879, + "step": 2485 + }, + { + "epoch": 0.48842683405256965, + "grad_norm": 20.804332733154297, + "learning_rate": 9.661604076198003e-06, + "loss": 5.2101, + "step": 2490 + }, + { + "epoch": 0.4894076108277756, + "grad_norm": 20.767370223999023, + "learning_rate": 9.660202988556057e-06, + "loss": 4.8171, + "step": 2495 + }, + { + "epoch": 0.49038838760298153, + "grad_norm": 25.34084701538086, + "learning_rate": 9.658799108403324e-06, + "loss": 5.2654, + "step": 2500 + }, + { + "epoch": 0.4913691643781875, + "grad_norm": 13.980265617370605, + "learning_rate": 9.657392436581049e-06, + "loss": 4.9889, + "step": 2505 + }, + { + "epoch": 0.4923499411533935, + "grad_norm": 19.053634643554688, + "learning_rate": 9.655982973932141e-06, + "loss": 5.6044, + "step": 2510 + }, + { + "epoch": 0.49333071792859945, + "grad_norm": 26.305130004882812, + "learning_rate": 9.654570721301186e-06, + "loss": 5.1973, + "step": 2515 + }, + { + "epoch": 0.4943114947038054, + "grad_norm": 17.097795486450195, + "learning_rate": 9.653155679534441e-06, + "loss": 4.9489, + "step": 2520 + }, + { + "epoch": 0.4952922714790114, + "grad_norm": 18.81172752380371, + "learning_rate": 9.651737849479838e-06, + "loss": 5.3182, + "step": 2525 + }, + { + "epoch": 0.49627304825421736, + "grad_norm": 20.398967742919922, + "learning_rate": 9.650317231986972e-06, + "loss": 5.15, + "step": 2530 + }, + { + "epoch": 0.4972538250294233, + "grad_norm": 14.472307205200195, + "learning_rate": 9.648893827907115e-06, + "loss": 5.1667, + "step": 2535 + }, + { + "epoch": 0.49823460180462925, + "grad_norm": 18.856861114501953, + "learning_rate": 9.647467638093206e-06, + "loss": 4.9925, + "step": 2540 + }, + { + "epoch": 0.4992153785798352, + "grad_norm": 22.74046516418457, + "learning_rate": 9.646038663399854e-06, + "loss": 5.1208, + "step": 2545 + }, + { + "epoch": 0.5001961553550411, + "grad_norm": 15.445943832397461, + "learning_rate": 9.644606904683335e-06, + "loss": 5.0133, + "step": 2550 + }, + { + "epoch": 0.5001961553550411, + "eval_loss": 5.15736198425293, + "eval_runtime": 7.6721, + "eval_samples_per_second": 27.242, + "eval_steps_per_second": 13.686, + "step": 2550 + }, + { + "epoch": 0.5011769321302472, + "grad_norm": 18.68064308166504, + "learning_rate": 9.643172362801599e-06, + "loss": 4.8855, + "step": 2555 + }, + { + "epoch": 0.5021577089054531, + "grad_norm": 18.33717155456543, + "learning_rate": 9.641735038614255e-06, + "loss": 5.4683, + "step": 2560 + }, + { + "epoch": 0.5031384856806591, + "grad_norm": 19.931522369384766, + "learning_rate": 9.640294932982585e-06, + "loss": 5.3343, + "step": 2565 + }, + { + "epoch": 0.504119262455865, + "grad_norm": 23.878559112548828, + "learning_rate": 9.63885204676954e-06, + "loss": 4.7968, + "step": 2570 + }, + { + "epoch": 0.505100039231071, + "grad_norm": 19.06217384338379, + "learning_rate": 9.637406380839728e-06, + "loss": 5.2138, + "step": 2575 + }, + { + "epoch": 0.506080816006277, + "grad_norm": 14.29570484161377, + "learning_rate": 9.635957936059432e-06, + "loss": 5.4729, + "step": 2580 + }, + { + "epoch": 0.5070615927814829, + "grad_norm": 19.553762435913086, + "learning_rate": 9.634506713296596e-06, + "loss": 5.4094, + "step": 2585 + }, + { + "epoch": 0.5080423695566889, + "grad_norm": 15.330031394958496, + "learning_rate": 9.633052713420828e-06, + "loss": 5.1174, + "step": 2590 + }, + { + "epoch": 0.5090231463318948, + "grad_norm": 21.965177536010742, + "learning_rate": 9.631595937303402e-06, + "loss": 5.2644, + "step": 2595 + }, + { + "epoch": 0.5100039231071009, + "grad_norm": 19.100711822509766, + "learning_rate": 9.630136385817258e-06, + "loss": 5.1775, + "step": 2600 + }, + { + "epoch": 0.5109846998823068, + "grad_norm": 14.69621467590332, + "learning_rate": 9.62867405983699e-06, + "loss": 5.0009, + "step": 2605 + }, + { + "epoch": 0.5119654766575128, + "grad_norm": 19.2616024017334, + "learning_rate": 9.627208960238864e-06, + "loss": 4.8041, + "step": 2610 + }, + { + "epoch": 0.5129462534327187, + "grad_norm": 13.647692680358887, + "learning_rate": 9.625741087900802e-06, + "loss": 4.7495, + "step": 2615 + }, + { + "epoch": 0.5139270302079246, + "grad_norm": 14.416105270385742, + "learning_rate": 9.624270443702395e-06, + "loss": 5.3507, + "step": 2620 + }, + { + "epoch": 0.5149078069831307, + "grad_norm": 12.422703742980957, + "learning_rate": 9.622797028524885e-06, + "loss": 4.8836, + "step": 2625 + }, + { + "epoch": 0.5158885837583366, + "grad_norm": 15.866639137268066, + "learning_rate": 9.621320843251183e-06, + "loss": 5.352, + "step": 2630 + }, + { + "epoch": 0.5168693605335426, + "grad_norm": 53.905094146728516, + "learning_rate": 9.619841888765853e-06, + "loss": 5.4811, + "step": 2635 + }, + { + "epoch": 0.5178501373087485, + "grad_norm": 18.568187713623047, + "learning_rate": 9.618360165955125e-06, + "loss": 5.2916, + "step": 2640 + }, + { + "epoch": 0.5188309140839545, + "grad_norm": 31.13910484313965, + "learning_rate": 9.61687567570688e-06, + "loss": 5.3021, + "step": 2645 + }, + { + "epoch": 0.5198116908591605, + "grad_norm": 21.233203887939453, + "learning_rate": 9.615388418910668e-06, + "loss": 5.2845, + "step": 2650 + }, + { + "epoch": 0.5207924676343664, + "grad_norm": 30.170978546142578, + "learning_rate": 9.613898396457687e-06, + "loss": 5.1301, + "step": 2655 + }, + { + "epoch": 0.5217732444095724, + "grad_norm": 23.659400939941406, + "learning_rate": 9.612405609240795e-06, + "loss": 5.217, + "step": 2660 + }, + { + "epoch": 0.5227540211847783, + "grad_norm": 12.635769844055176, + "learning_rate": 9.61091005815451e-06, + "loss": 5.0976, + "step": 2665 + }, + { + "epoch": 0.5237347979599843, + "grad_norm": 23.049379348754883, + "learning_rate": 9.609411744095002e-06, + "loss": 5.3677, + "step": 2670 + }, + { + "epoch": 0.5247155747351903, + "grad_norm": 16.504423141479492, + "learning_rate": 9.607910667960098e-06, + "loss": 5.0437, + "step": 2675 + }, + { + "epoch": 0.5256963515103963, + "grad_norm": 12.01205825805664, + "learning_rate": 9.606406830649283e-06, + "loss": 4.9374, + "step": 2680 + }, + { + "epoch": 0.5266771282856022, + "grad_norm": 16.772647857666016, + "learning_rate": 9.604900233063696e-06, + "loss": 5.0273, + "step": 2685 + }, + { + "epoch": 0.5276579050608081, + "grad_norm": 28.808847427368164, + "learning_rate": 9.603390876106123e-06, + "loss": 5.305, + "step": 2690 + }, + { + "epoch": 0.5286386818360141, + "grad_norm": 13.548622131347656, + "learning_rate": 9.60187876068101e-06, + "loss": 4.9093, + "step": 2695 + }, + { + "epoch": 0.5296194586112201, + "grad_norm": 20.574769973754883, + "learning_rate": 9.600363887694455e-06, + "loss": 5.088, + "step": 2700 + }, + { + "epoch": 0.5306002353864261, + "grad_norm": 16.430707931518555, + "learning_rate": 9.598846258054208e-06, + "loss": 5.3671, + "step": 2705 + }, + { + "epoch": 0.531581012161632, + "grad_norm": 24.375839233398438, + "learning_rate": 9.597325872669672e-06, + "loss": 5.5251, + "step": 2710 + }, + { + "epoch": 0.5325617889368379, + "grad_norm": 19.051464080810547, + "learning_rate": 9.5958027324519e-06, + "loss": 4.948, + "step": 2715 + }, + { + "epoch": 0.533542565712044, + "grad_norm": 24.722259521484375, + "learning_rate": 9.594276838313593e-06, + "loss": 4.8675, + "step": 2720 + }, + { + "epoch": 0.5345233424872499, + "grad_norm": 13.493644714355469, + "learning_rate": 9.592748191169107e-06, + "loss": 5.3292, + "step": 2725 + }, + { + "epoch": 0.5355041192624559, + "grad_norm": 12.233308792114258, + "learning_rate": 9.59121679193445e-06, + "loss": 5.1377, + "step": 2730 + }, + { + "epoch": 0.5364848960376618, + "grad_norm": 15.433117866516113, + "learning_rate": 9.589682641527269e-06, + "loss": 5.1438, + "step": 2735 + }, + { + "epoch": 0.5374656728128678, + "grad_norm": 30.711441040039062, + "learning_rate": 9.588145740866866e-06, + "loss": 5.4655, + "step": 2740 + }, + { + "epoch": 0.5384464495880738, + "grad_norm": 21.23065948486328, + "learning_rate": 9.586606090874193e-06, + "loss": 5.5321, + "step": 2745 + }, + { + "epoch": 0.5394272263632797, + "grad_norm": 14.286324501037598, + "learning_rate": 9.585063692471845e-06, + "loss": 5.2226, + "step": 2750 + }, + { + "epoch": 0.5404080031384857, + "grad_norm": 21.153053283691406, + "learning_rate": 9.583518546584069e-06, + "loss": 5.2436, + "step": 2755 + }, + { + "epoch": 0.5413887799136916, + "grad_norm": 16.570341110229492, + "learning_rate": 9.581970654136752e-06, + "loss": 5.1266, + "step": 2760 + }, + { + "epoch": 0.5423695566888976, + "grad_norm": 17.8549747467041, + "learning_rate": 9.580420016057431e-06, + "loss": 5.1313, + "step": 2765 + }, + { + "epoch": 0.5433503334641036, + "grad_norm": 37.07738494873047, + "learning_rate": 9.578866633275289e-06, + "loss": 5.1527, + "step": 2770 + }, + { + "epoch": 0.5443311102393096, + "grad_norm": 19.371015548706055, + "learning_rate": 9.577310506721148e-06, + "loss": 4.9898, + "step": 2775 + }, + { + "epoch": 0.5453118870145155, + "grad_norm": 17.40994644165039, + "learning_rate": 9.575751637327481e-06, + "loss": 5.1577, + "step": 2780 + }, + { + "epoch": 0.5462926637897214, + "grad_norm": 13.387382507324219, + "learning_rate": 9.574190026028404e-06, + "loss": 5.1941, + "step": 2785 + }, + { + "epoch": 0.5472734405649274, + "grad_norm": 21.211164474487305, + "learning_rate": 9.57262567375967e-06, + "loss": 4.9418, + "step": 2790 + }, + { + "epoch": 0.5482542173401334, + "grad_norm": 14.441864967346191, + "learning_rate": 9.57105858145868e-06, + "loss": 5.2723, + "step": 2795 + }, + { + "epoch": 0.5492349941153394, + "grad_norm": 23.936847686767578, + "learning_rate": 9.569488750064472e-06, + "loss": 5.5534, + "step": 2800 + }, + { + "epoch": 0.5502157708905453, + "grad_norm": 14.22829818725586, + "learning_rate": 9.567916180517733e-06, + "loss": 5.104, + "step": 2805 + }, + { + "epoch": 0.5511965476657513, + "grad_norm": 19.610498428344727, + "learning_rate": 9.566340873760784e-06, + "loss": 5.8822, + "step": 2810 + }, + { + "epoch": 0.5521773244409572, + "grad_norm": 32.81593322753906, + "learning_rate": 9.564762830737586e-06, + "loss": 5.0032, + "step": 2815 + }, + { + "epoch": 0.5531581012161632, + "grad_norm": 15.706247329711914, + "learning_rate": 9.563182052393747e-06, + "loss": 5.1115, + "step": 2820 + }, + { + "epoch": 0.5541388779913692, + "grad_norm": 24.40583610534668, + "learning_rate": 9.561598539676507e-06, + "loss": 5.0677, + "step": 2825 + }, + { + "epoch": 0.5551196547665751, + "grad_norm": 13.26285171508789, + "learning_rate": 9.560012293534746e-06, + "loss": 5.0625, + "step": 2830 + }, + { + "epoch": 0.5561004315417811, + "grad_norm": 12.539552688598633, + "learning_rate": 9.558423314918982e-06, + "loss": 5.0136, + "step": 2835 + }, + { + "epoch": 0.557081208316987, + "grad_norm": 22.337364196777344, + "learning_rate": 9.556831604781373e-06, + "loss": 4.7906, + "step": 2840 + }, + { + "epoch": 0.5580619850921931, + "grad_norm": 14.324231147766113, + "learning_rate": 9.55523716407571e-06, + "loss": 4.7484, + "step": 2845 + }, + { + "epoch": 0.559042761867399, + "grad_norm": 16.521678924560547, + "learning_rate": 9.553639993757422e-06, + "loss": 4.8595, + "step": 2850 + }, + { + "epoch": 0.5600235386426049, + "grad_norm": 17.187822341918945, + "learning_rate": 9.552040094783575e-06, + "loss": 4.8251, + "step": 2855 + }, + { + "epoch": 0.5610043154178109, + "grad_norm": 18.47371482849121, + "learning_rate": 9.550437468112867e-06, + "loss": 5.4023, + "step": 2860 + }, + { + "epoch": 0.5619850921930168, + "grad_norm": 22.58086585998535, + "learning_rate": 9.548832114705634e-06, + "loss": 5.22, + "step": 2865 + }, + { + "epoch": 0.5629658689682229, + "grad_norm": 12.957576751708984, + "learning_rate": 9.547224035523841e-06, + "loss": 5.3866, + "step": 2870 + }, + { + "epoch": 0.5639466457434288, + "grad_norm": 23.065975189208984, + "learning_rate": 9.545613231531094e-06, + "loss": 5.0757, + "step": 2875 + }, + { + "epoch": 0.5649274225186348, + "grad_norm": 11.80753231048584, + "learning_rate": 9.543999703692624e-06, + "loss": 5.2471, + "step": 2880 + }, + { + "epoch": 0.5659081992938407, + "grad_norm": 14.523046493530273, + "learning_rate": 9.5423834529753e-06, + "loss": 5.001, + "step": 2885 + }, + { + "epoch": 0.5668889760690466, + "grad_norm": 28.944412231445312, + "learning_rate": 9.540764480347616e-06, + "loss": 4.9322, + "step": 2890 + }, + { + "epoch": 0.5678697528442527, + "grad_norm": 18.60620880126953, + "learning_rate": 9.539142786779702e-06, + "loss": 5.0668, + "step": 2895 + }, + { + "epoch": 0.5688505296194586, + "grad_norm": 16.527719497680664, + "learning_rate": 9.537518373243322e-06, + "loss": 5.2394, + "step": 2900 + }, + { + "epoch": 0.5698313063946646, + "grad_norm": 16.115703582763672, + "learning_rate": 9.535891240711861e-06, + "loss": 5.0097, + "step": 2905 + }, + { + "epoch": 0.5708120831698705, + "grad_norm": 11.602025032043457, + "learning_rate": 9.53426139016034e-06, + "loss": 5.4735, + "step": 2910 + }, + { + "epoch": 0.5717928599450764, + "grad_norm": 14.616000175476074, + "learning_rate": 9.532628822565405e-06, + "loss": 5.0512, + "step": 2915 + }, + { + "epoch": 0.5727736367202825, + "grad_norm": 27.371612548828125, + "learning_rate": 9.530993538905332e-06, + "loss": 4.9507, + "step": 2920 + }, + { + "epoch": 0.5737544134954884, + "grad_norm": 16.049571990966797, + "learning_rate": 9.529355540160025e-06, + "loss": 5.9872, + "step": 2925 + }, + { + "epoch": 0.5747351902706944, + "grad_norm": 43.3659782409668, + "learning_rate": 9.527714827311012e-06, + "loss": 5.375, + "step": 2930 + }, + { + "epoch": 0.5757159670459003, + "grad_norm": 42.861656188964844, + "learning_rate": 9.526071401341452e-06, + "loss": 4.8584, + "step": 2935 + }, + { + "epoch": 0.5766967438211064, + "grad_norm": 36.75173568725586, + "learning_rate": 9.524425263236124e-06, + "loss": 5.3912, + "step": 2940 + }, + { + "epoch": 0.5776775205963123, + "grad_norm": 18.924116134643555, + "learning_rate": 9.522776413981438e-06, + "loss": 4.8026, + "step": 2945 + }, + { + "epoch": 0.5786582973715182, + "grad_norm": 9.715642929077148, + "learning_rate": 9.521124854565425e-06, + "loss": 5.0732, + "step": 2950 + }, + { + "epoch": 0.5796390741467242, + "grad_norm": 10.902597427368164, + "learning_rate": 9.51947058597774e-06, + "loss": 5.165, + "step": 2955 + }, + { + "epoch": 0.5806198509219301, + "grad_norm": 15.857320785522461, + "learning_rate": 9.517813609209665e-06, + "loss": 5.3345, + "step": 2960 + }, + { + "epoch": 0.5816006276971362, + "grad_norm": 18.43710708618164, + "learning_rate": 9.5161539252541e-06, + "loss": 5.3621, + "step": 2965 + }, + { + "epoch": 0.5825814044723421, + "grad_norm": 27.262819290161133, + "learning_rate": 9.51449153510557e-06, + "loss": 5.2153, + "step": 2970 + }, + { + "epoch": 0.5835621812475481, + "grad_norm": 19.492929458618164, + "learning_rate": 9.51282643976022e-06, + "loss": 5.345, + "step": 2975 + }, + { + "epoch": 0.584542958022754, + "grad_norm": 22.923349380493164, + "learning_rate": 9.511158640215818e-06, + "loss": 4.9704, + "step": 2980 + }, + { + "epoch": 0.5855237347979599, + "grad_norm": 12.464946746826172, + "learning_rate": 9.509488137471751e-06, + "loss": 4.9893, + "step": 2985 + }, + { + "epoch": 0.586504511573166, + "grad_norm": 24.526554107666016, + "learning_rate": 9.507814932529027e-06, + "loss": 4.9967, + "step": 2990 + }, + { + "epoch": 0.5874852883483719, + "grad_norm": 24.627796173095703, + "learning_rate": 9.50613902639027e-06, + "loss": 5.2092, + "step": 2995 + }, + { + "epoch": 0.5884660651235779, + "grad_norm": 27.909130096435547, + "learning_rate": 9.50446042005973e-06, + "loss": 5.2925, + "step": 3000 + }, + { + "epoch": 0.5894468418987838, + "grad_norm": 35.04037857055664, + "learning_rate": 9.502779114543263e-06, + "loss": 5.05, + "step": 3005 + }, + { + "epoch": 0.5904276186739899, + "grad_norm": 10.635119438171387, + "learning_rate": 9.501095110848357e-06, + "loss": 4.9822, + "step": 3010 + }, + { + "epoch": 0.5914083954491958, + "grad_norm": 11.250473976135254, + "learning_rate": 9.499408409984104e-06, + "loss": 4.9801, + "step": 3015 + }, + { + "epoch": 0.5923891722244017, + "grad_norm": 19.92534828186035, + "learning_rate": 9.49771901296122e-06, + "loss": 5.2654, + "step": 3020 + }, + { + "epoch": 0.5933699489996077, + "grad_norm": 9.814534187316895, + "learning_rate": 9.496026920792034e-06, + "loss": 5.1213, + "step": 3025 + }, + { + "epoch": 0.5943507257748136, + "grad_norm": 25.265356063842773, + "learning_rate": 9.49433213449049e-06, + "loss": 5.3733, + "step": 3030 + }, + { + "epoch": 0.5953315025500197, + "grad_norm": 13.742667198181152, + "learning_rate": 9.492634655072143e-06, + "loss": 5.1799, + "step": 3035 + }, + { + "epoch": 0.5963122793252256, + "grad_norm": 42.507598876953125, + "learning_rate": 9.490934483554173e-06, + "loss": 5.215, + "step": 3040 + }, + { + "epoch": 0.5972930561004316, + "grad_norm": 32.569759368896484, + "learning_rate": 9.48923162095536e-06, + "loss": 5.2368, + "step": 3045 + }, + { + "epoch": 0.5982738328756375, + "grad_norm": 33.264808654785156, + "learning_rate": 9.487526068296102e-06, + "loss": 4.9331, + "step": 3050 + }, + { + "epoch": 0.5992546096508434, + "grad_norm": 27.735538482666016, + "learning_rate": 9.485817826598411e-06, + "loss": 5.4973, + "step": 3055 + }, + { + "epoch": 0.6002353864260495, + "grad_norm": 26.621362686157227, + "learning_rate": 9.48410689688591e-06, + "loss": 5.3181, + "step": 3060 + }, + { + "epoch": 0.6012161632012554, + "grad_norm": 20.41889762878418, + "learning_rate": 9.482393280183827e-06, + "loss": 5.168, + "step": 3065 + }, + { + "epoch": 0.6021969399764614, + "grad_norm": 12.211244583129883, + "learning_rate": 9.480676977519005e-06, + "loss": 4.9126, + "step": 3070 + }, + { + "epoch": 0.6031777167516673, + "grad_norm": 26.63025665283203, + "learning_rate": 9.478957989919897e-06, + "loss": 5.3281, + "step": 3075 + }, + { + "epoch": 0.6041584935268732, + "grad_norm": 40.21369171142578, + "learning_rate": 9.477236318416564e-06, + "loss": 5.6442, + "step": 3080 + }, + { + "epoch": 0.6051392703020793, + "grad_norm": 18.292333602905273, + "learning_rate": 9.475511964040674e-06, + "loss": 4.8534, + "step": 3085 + }, + { + "epoch": 0.6061200470772852, + "grad_norm": 13.079044342041016, + "learning_rate": 9.473784927825503e-06, + "loss": 5.0455, + "step": 3090 + }, + { + "epoch": 0.6071008238524912, + "grad_norm": 16.62065315246582, + "learning_rate": 9.472055210805935e-06, + "loss": 4.9956, + "step": 3095 + }, + { + "epoch": 0.6080816006276971, + "grad_norm": 23.266950607299805, + "learning_rate": 9.47032281401846e-06, + "loss": 4.7663, + "step": 3100 + }, + { + "epoch": 0.6090623774029031, + "grad_norm": 22.578330993652344, + "learning_rate": 9.468587738501176e-06, + "loss": 5.2476, + "step": 3105 + }, + { + "epoch": 0.6100431541781091, + "grad_norm": 22.685009002685547, + "learning_rate": 9.46684998529378e-06, + "loss": 5.2655, + "step": 3110 + }, + { + "epoch": 0.611023930953315, + "grad_norm": 19.493541717529297, + "learning_rate": 9.46510955543758e-06, + "loss": 5.34, + "step": 3115 + }, + { + "epoch": 0.612004707728521, + "grad_norm": 13.723419189453125, + "learning_rate": 9.463366449975483e-06, + "loss": 5.2569, + "step": 3120 + }, + { + "epoch": 0.6129854845037269, + "grad_norm": 16.924633026123047, + "learning_rate": 9.461620669952003e-06, + "loss": 5.1248, + "step": 3125 + }, + { + "epoch": 0.613966261278933, + "grad_norm": 28.247379302978516, + "learning_rate": 9.459872216413255e-06, + "loss": 5.4098, + "step": 3130 + }, + { + "epoch": 0.6149470380541389, + "grad_norm": 9.836380958557129, + "learning_rate": 9.458121090406958e-06, + "loss": 5.165, + "step": 3135 + }, + { + "epoch": 0.6159278148293449, + "grad_norm": 15.825077056884766, + "learning_rate": 9.45636729298243e-06, + "loss": 5.1338, + "step": 3140 + }, + { + "epoch": 0.6169085916045508, + "grad_norm": 23.045560836791992, + "learning_rate": 9.454610825190586e-06, + "loss": 5.0819, + "step": 3145 + }, + { + "epoch": 0.6178893683797567, + "grad_norm": 16.358854293823242, + "learning_rate": 9.452851688083953e-06, + "loss": 4.7103, + "step": 3150 + }, + { + "epoch": 0.6188701451549627, + "grad_norm": 25.5876522064209, + "learning_rate": 9.451089882716644e-06, + "loss": 5.0649, + "step": 3155 + }, + { + "epoch": 0.6198509219301687, + "grad_norm": 14.046899795532227, + "learning_rate": 9.449325410144383e-06, + "loss": 4.7795, + "step": 3160 + }, + { + "epoch": 0.6208316987053747, + "grad_norm": 16.689111709594727, + "learning_rate": 9.44755827142448e-06, + "loss": 4.8892, + "step": 3165 + }, + { + "epoch": 0.6218124754805806, + "grad_norm": 14.133878707885742, + "learning_rate": 9.445788467615852e-06, + "loss": 5.3107, + "step": 3170 + }, + { + "epoch": 0.6227932522557866, + "grad_norm": 20.716943740844727, + "learning_rate": 9.444015999779013e-06, + "loss": 5.5117, + "step": 3175 + }, + { + "epoch": 0.6237740290309925, + "grad_norm": 14.682984352111816, + "learning_rate": 9.442240868976064e-06, + "loss": 4.8668, + "step": 3180 + }, + { + "epoch": 0.6247548058061985, + "grad_norm": 28.73581886291504, + "learning_rate": 9.440463076270713e-06, + "loss": 4.9583, + "step": 3185 + }, + { + "epoch": 0.6257355825814045, + "grad_norm": 11.658886909484863, + "learning_rate": 9.438682622728256e-06, + "loss": 5.074, + "step": 3190 + }, + { + "epoch": 0.6267163593566104, + "grad_norm": 14.399426460266113, + "learning_rate": 9.436899509415586e-06, + "loss": 5.2788, + "step": 3195 + }, + { + "epoch": 0.6276971361318164, + "grad_norm": 12.162981033325195, + "learning_rate": 9.435113737401188e-06, + "loss": 5.0695, + "step": 3200 + }, + { + "epoch": 0.6286779129070224, + "grad_norm": 28.62458038330078, + "learning_rate": 9.433325307755144e-06, + "loss": 5.0648, + "step": 3205 + }, + { + "epoch": 0.6296586896822284, + "grad_norm": 45.26708984375, + "learning_rate": 9.431534221549124e-06, + "loss": 5.003, + "step": 3210 + }, + { + "epoch": 0.6306394664574343, + "grad_norm": 13.835540771484375, + "learning_rate": 9.42974047985639e-06, + "loss": 5.1126, + "step": 3215 + }, + { + "epoch": 0.6316202432326402, + "grad_norm": 22.305397033691406, + "learning_rate": 9.427944083751803e-06, + "loss": 5.1913, + "step": 3220 + }, + { + "epoch": 0.6326010200078462, + "grad_norm": 26.718841552734375, + "learning_rate": 9.426145034311805e-06, + "loss": 5.2575, + "step": 3225 + }, + { + "epoch": 0.6335817967830522, + "grad_norm": 15.588373184204102, + "learning_rate": 9.424343332614432e-06, + "loss": 4.7884, + "step": 3230 + }, + { + "epoch": 0.6345625735582582, + "grad_norm": 23.680042266845703, + "learning_rate": 9.422538979739307e-06, + "loss": 5.1369, + "step": 3235 + }, + { + "epoch": 0.6355433503334641, + "grad_norm": 21.163063049316406, + "learning_rate": 9.420731976767647e-06, + "loss": 4.9218, + "step": 3240 + }, + { + "epoch": 0.63652412710867, + "grad_norm": 24.679636001586914, + "learning_rate": 9.418922324782252e-06, + "loss": 5.0978, + "step": 3245 + }, + { + "epoch": 0.637504903883876, + "grad_norm": 22.380672454833984, + "learning_rate": 9.41711002486751e-06, + "loss": 4.9464, + "step": 3250 + }, + { + "epoch": 0.638485680659082, + "grad_norm": 37.539794921875, + "learning_rate": 9.415295078109398e-06, + "loss": 4.9001, + "step": 3255 + }, + { + "epoch": 0.639466457434288, + "grad_norm": 21.865497589111328, + "learning_rate": 9.413477485595479e-06, + "loss": 4.9388, + "step": 3260 + }, + { + "epoch": 0.6404472342094939, + "grad_norm": 19.220638275146484, + "learning_rate": 9.411657248414898e-06, + "loss": 5.1998, + "step": 3265 + }, + { + "epoch": 0.6414280109846999, + "grad_norm": 27.21924591064453, + "learning_rate": 9.409834367658387e-06, + "loss": 5.1988, + "step": 3270 + }, + { + "epoch": 0.6424087877599058, + "grad_norm": 15.7559175491333, + "learning_rate": 9.408008844418262e-06, + "loss": 4.9148, + "step": 3275 + }, + { + "epoch": 0.6433895645351118, + "grad_norm": 13.318585395812988, + "learning_rate": 9.406180679788423e-06, + "loss": 5.2602, + "step": 3280 + }, + { + "epoch": 0.6443703413103178, + "grad_norm": 12.137907028198242, + "learning_rate": 9.404349874864354e-06, + "loss": 4.8963, + "step": 3285 + }, + { + "epoch": 0.6453511180855237, + "grad_norm": 16.94658660888672, + "learning_rate": 9.402516430743115e-06, + "loss": 5.0545, + "step": 3290 + }, + { + "epoch": 0.6463318948607297, + "grad_norm": 21.77410316467285, + "learning_rate": 9.400680348523356e-06, + "loss": 5.4027, + "step": 3295 + }, + { + "epoch": 0.6473126716359356, + "grad_norm": 26.694643020629883, + "learning_rate": 9.398841629305303e-06, + "loss": 4.9592, + "step": 3300 + }, + { + "epoch": 0.6482934484111417, + "grad_norm": 17.593299865722656, + "learning_rate": 9.397000274190759e-06, + "loss": 4.8191, + "step": 3305 + }, + { + "epoch": 0.6492742251863476, + "grad_norm": 12.933694839477539, + "learning_rate": 9.395156284283113e-06, + "loss": 5.2141, + "step": 3310 + }, + { + "epoch": 0.6502550019615535, + "grad_norm": 11.42352294921875, + "learning_rate": 9.39330966068733e-06, + "loss": 5.0004, + "step": 3315 + }, + { + "epoch": 0.6512357787367595, + "grad_norm": 17.624004364013672, + "learning_rate": 9.391460404509954e-06, + "loss": 5.2441, + "step": 3320 + }, + { + "epoch": 0.6522165555119654, + "grad_norm": 12.520964622497559, + "learning_rate": 9.389608516859106e-06, + "loss": 4.8506, + "step": 3325 + }, + { + "epoch": 0.6531973322871715, + "grad_norm": 29.8953800201416, + "learning_rate": 9.387753998844482e-06, + "loss": 5.4652, + "step": 3330 + }, + { + "epoch": 0.6541781090623774, + "grad_norm": 24.0643367767334, + "learning_rate": 9.385896851577357e-06, + "loss": 4.9041, + "step": 3335 + }, + { + "epoch": 0.6551588858375834, + "grad_norm": 18.141315460205078, + "learning_rate": 9.384037076170578e-06, + "loss": 5.2624, + "step": 3340 + }, + { + "epoch": 0.6561396626127893, + "grad_norm": 15.43309211730957, + "learning_rate": 9.382174673738573e-06, + "loss": 5.0105, + "step": 3345 + }, + { + "epoch": 0.6571204393879952, + "grad_norm": 15.214509010314941, + "learning_rate": 9.380309645397337e-06, + "loss": 5.0126, + "step": 3350 + }, + { + "epoch": 0.6581012161632013, + "grad_norm": 10.24284553527832, + "learning_rate": 9.378441992264444e-06, + "loss": 5.022, + "step": 3355 + }, + { + "epoch": 0.6590819929384072, + "grad_norm": 20.641220092773438, + "learning_rate": 9.376571715459037e-06, + "loss": 5.0157, + "step": 3360 + }, + { + "epoch": 0.6600627697136132, + "grad_norm": 16.595033645629883, + "learning_rate": 9.374698816101836e-06, + "loss": 5.3137, + "step": 3365 + }, + { + "epoch": 0.6610435464888191, + "grad_norm": 15.82652759552002, + "learning_rate": 9.372823295315126e-06, + "loss": 5.2748, + "step": 3370 + }, + { + "epoch": 0.6620243232640252, + "grad_norm": 18.090869903564453, + "learning_rate": 9.370945154222767e-06, + "loss": 4.8269, + "step": 3375 + }, + { + "epoch": 0.6630051000392311, + "grad_norm": 15.272143363952637, + "learning_rate": 9.369064393950189e-06, + "loss": 5.1632, + "step": 3380 + }, + { + "epoch": 0.663985876814437, + "grad_norm": 12.312703132629395, + "learning_rate": 9.367181015624392e-06, + "loss": 5.3285, + "step": 3385 + }, + { + "epoch": 0.664966653589643, + "grad_norm": 16.462718963623047, + "learning_rate": 9.36529502037394e-06, + "loss": 5.1033, + "step": 3390 + }, + { + "epoch": 0.6659474303648489, + "grad_norm": 12.415363311767578, + "learning_rate": 9.363406409328972e-06, + "loss": 4.752, + "step": 3395 + }, + { + "epoch": 0.666928207140055, + "grad_norm": 16.04773712158203, + "learning_rate": 9.361515183621191e-06, + "loss": 5.1875, + "step": 3400 + }, + { + "epoch": 0.6679089839152609, + "grad_norm": 13.996196746826172, + "learning_rate": 9.359621344383867e-06, + "loss": 5.1647, + "step": 3405 + }, + { + "epoch": 0.6688897606904668, + "grad_norm": 13.507939338684082, + "learning_rate": 9.357724892751834e-06, + "loss": 4.6931, + "step": 3410 + }, + { + "epoch": 0.6698705374656728, + "grad_norm": 27.69146728515625, + "learning_rate": 9.355825829861495e-06, + "loss": 5.2703, + "step": 3415 + }, + { + "epoch": 0.6708513142408787, + "grad_norm": 25.83177375793457, + "learning_rate": 9.353924156850816e-06, + "loss": 5.6674, + "step": 3420 + }, + { + "epoch": 0.6718320910160848, + "grad_norm": 25.738636016845703, + "learning_rate": 9.352019874859326e-06, + "loss": 4.8759, + "step": 3425 + }, + { + "epoch": 0.6728128677912907, + "grad_norm": 8.705573081970215, + "learning_rate": 9.350112985028121e-06, + "loss": 4.8531, + "step": 3430 + }, + { + "epoch": 0.6737936445664967, + "grad_norm": 15.935606956481934, + "learning_rate": 9.348203488499858e-06, + "loss": 5.1539, + "step": 3435 + }, + { + "epoch": 0.6747744213417026, + "grad_norm": 27.665693283081055, + "learning_rate": 9.34629138641875e-06, + "loss": 5.118, + "step": 3440 + }, + { + "epoch": 0.6757551981169085, + "grad_norm": 19.099084854125977, + "learning_rate": 9.344376679930585e-06, + "loss": 5.0635, + "step": 3445 + }, + { + "epoch": 0.6767359748921146, + "grad_norm": 15.758007049560547, + "learning_rate": 9.342459370182695e-06, + "loss": 4.9812, + "step": 3450 + }, + { + "epoch": 0.6777167516673205, + "grad_norm": 13.712491035461426, + "learning_rate": 9.340539458323985e-06, + "loss": 5.1033, + "step": 3455 + }, + { + "epoch": 0.6786975284425265, + "grad_norm": 21.73082733154297, + "learning_rate": 9.338616945504913e-06, + "loss": 4.9175, + "step": 3460 + }, + { + "epoch": 0.6796783052177324, + "grad_norm": 12.152356147766113, + "learning_rate": 9.336691832877496e-06, + "loss": 4.9905, + "step": 3465 + }, + { + "epoch": 0.6806590819929385, + "grad_norm": 27.250158309936523, + "learning_rate": 9.334764121595312e-06, + "loss": 4.7406, + "step": 3470 + }, + { + "epoch": 0.6816398587681444, + "grad_norm": 19.077829360961914, + "learning_rate": 9.332833812813494e-06, + "loss": 4.907, + "step": 3475 + }, + { + "epoch": 0.6826206355433503, + "grad_norm": 34.85506057739258, + "learning_rate": 9.330900907688728e-06, + "loss": 5.0646, + "step": 3480 + }, + { + "epoch": 0.6836014123185563, + "grad_norm": 19.06825828552246, + "learning_rate": 9.328965407379265e-06, + "loss": 5.1541, + "step": 3485 + }, + { + "epoch": 0.6845821890937622, + "grad_norm": 14.873254776000977, + "learning_rate": 9.327027313044901e-06, + "loss": 4.9209, + "step": 3490 + }, + { + "epoch": 0.6855629658689683, + "grad_norm": 20.76241683959961, + "learning_rate": 9.325086625846993e-06, + "loss": 5.1064, + "step": 3495 + }, + { + "epoch": 0.6865437426441742, + "grad_norm": 18.24216651916504, + "learning_rate": 9.323143346948449e-06, + "loss": 4.974, + "step": 3500 + }, + { + "epoch": 0.6875245194193802, + "grad_norm": 17.618261337280273, + "learning_rate": 9.32119747751373e-06, + "loss": 4.8254, + "step": 3505 + }, + { + "epoch": 0.6885052961945861, + "grad_norm": 31.671674728393555, + "learning_rate": 9.31924901870885e-06, + "loss": 4.8518, + "step": 3510 + }, + { + "epoch": 0.689486072969792, + "grad_norm": 20.532033920288086, + "learning_rate": 9.317297971701376e-06, + "loss": 5.1034, + "step": 3515 + }, + { + "epoch": 0.6904668497449981, + "grad_norm": 14.455893516540527, + "learning_rate": 9.315344337660422e-06, + "loss": 5.1695, + "step": 3520 + }, + { + "epoch": 0.691447626520204, + "grad_norm": 15.23082160949707, + "learning_rate": 9.313388117756655e-06, + "loss": 5.1999, + "step": 3525 + }, + { + "epoch": 0.69242840329541, + "grad_norm": 17.498613357543945, + "learning_rate": 9.311429313162293e-06, + "loss": 5.4637, + "step": 3530 + }, + { + "epoch": 0.6934091800706159, + "grad_norm": 14.049081802368164, + "learning_rate": 9.309467925051101e-06, + "loss": 5.0001, + "step": 3535 + }, + { + "epoch": 0.6943899568458219, + "grad_norm": 24.643108367919922, + "learning_rate": 9.30750395459839e-06, + "loss": 5.2113, + "step": 3540 + }, + { + "epoch": 0.6953707336210279, + "grad_norm": 16.181472778320312, + "learning_rate": 9.305537402981023e-06, + "loss": 4.8661, + "step": 3545 + }, + { + "epoch": 0.6963515103962338, + "grad_norm": 32.990962982177734, + "learning_rate": 9.303568271377404e-06, + "loss": 5.2794, + "step": 3550 + }, + { + "epoch": 0.6973322871714398, + "grad_norm": 19.17635726928711, + "learning_rate": 9.301596560967488e-06, + "loss": 4.8633, + "step": 3555 + }, + { + "epoch": 0.6983130639466457, + "grad_norm": 16.803030014038086, + "learning_rate": 9.299622272932772e-06, + "loss": 4.8278, + "step": 3560 + }, + { + "epoch": 0.6992938407218517, + "grad_norm": 11.135583877563477, + "learning_rate": 9.297645408456301e-06, + "loss": 4.9934, + "step": 3565 + }, + { + "epoch": 0.7002746174970577, + "grad_norm": 21.217369079589844, + "learning_rate": 9.295665968722663e-06, + "loss": 5.3993, + "step": 3570 + }, + { + "epoch": 0.7012553942722637, + "grad_norm": 22.149660110473633, + "learning_rate": 9.293683954917984e-06, + "loss": 5.4629, + "step": 3575 + }, + { + "epoch": 0.7022361710474696, + "grad_norm": 30.002939224243164, + "learning_rate": 9.29169936822994e-06, + "loss": 5.0342, + "step": 3580 + }, + { + "epoch": 0.7032169478226755, + "grad_norm": 15.087618827819824, + "learning_rate": 9.289712209847745e-06, + "loss": 4.9437, + "step": 3585 + }, + { + "epoch": 0.7041977245978815, + "grad_norm": 19.03937530517578, + "learning_rate": 9.287722480962151e-06, + "loss": 4.9014, + "step": 3590 + }, + { + "epoch": 0.7051785013730875, + "grad_norm": 16.533588409423828, + "learning_rate": 9.285730182765456e-06, + "loss": 5.0342, + "step": 3595 + }, + { + "epoch": 0.7061592781482935, + "grad_norm": 34.22774887084961, + "learning_rate": 9.283735316451497e-06, + "loss": 4.8334, + "step": 3600 + }, + { + "epoch": 0.7071400549234994, + "grad_norm": 20.17010498046875, + "learning_rate": 9.281737883215644e-06, + "loss": 5.2429, + "step": 3605 + }, + { + "epoch": 0.7081208316987053, + "grad_norm": 18.994447708129883, + "learning_rate": 9.279737884254812e-06, + "loss": 5.218, + "step": 3610 + }, + { + "epoch": 0.7091016084739113, + "grad_norm": 17.285263061523438, + "learning_rate": 9.277735320767449e-06, + "loss": 5.0856, + "step": 3615 + }, + { + "epoch": 0.7100823852491173, + "grad_norm": 61.87449645996094, + "learning_rate": 9.275730193953542e-06, + "loss": 5.0641, + "step": 3620 + }, + { + "epoch": 0.7110631620243233, + "grad_norm": 16.89430046081543, + "learning_rate": 9.273722505014615e-06, + "loss": 5.146, + "step": 3625 + }, + { + "epoch": 0.7120439387995292, + "grad_norm": 18.025007247924805, + "learning_rate": 9.271712255153724e-06, + "loss": 5.1371, + "step": 3630 + }, + { + "epoch": 0.7130247155747352, + "grad_norm": 21.702707290649414, + "learning_rate": 9.269699445575462e-06, + "loss": 4.9414, + "step": 3635 + }, + { + "epoch": 0.7140054923499412, + "grad_norm": 16.395092010498047, + "learning_rate": 9.267684077485955e-06, + "loss": 5.0574, + "step": 3640 + }, + { + "epoch": 0.7149862691251471, + "grad_norm": 26.50773048400879, + "learning_rate": 9.26566615209286e-06, + "loss": 5.2367, + "step": 3645 + }, + { + "epoch": 0.7159670459003531, + "grad_norm": 14.42000675201416, + "learning_rate": 9.263645670605373e-06, + "loss": 5.1632, + "step": 3650 + }, + { + "epoch": 0.716947822675559, + "grad_norm": 17.85245704650879, + "learning_rate": 9.261622634234213e-06, + "loss": 4.6273, + "step": 3655 + }, + { + "epoch": 0.717928599450765, + "grad_norm": 37.26712417602539, + "learning_rate": 9.259597044191635e-06, + "loss": 4.8634, + "step": 3660 + }, + { + "epoch": 0.718909376225971, + "grad_norm": 20.40871238708496, + "learning_rate": 9.257568901691428e-06, + "loss": 4.8579, + "step": 3665 + }, + { + "epoch": 0.719890153001177, + "grad_norm": 13.428717613220215, + "learning_rate": 9.2555382079489e-06, + "loss": 4.8662, + "step": 3670 + }, + { + "epoch": 0.7208709297763829, + "grad_norm": 14.121960639953613, + "learning_rate": 9.253504964180897e-06, + "loss": 4.957, + "step": 3675 + }, + { + "epoch": 0.7218517065515888, + "grad_norm": 23.1538143157959, + "learning_rate": 9.25146917160579e-06, + "loss": 4.8803, + "step": 3680 + }, + { + "epoch": 0.7228324833267948, + "grad_norm": 14.52030086517334, + "learning_rate": 9.249430831443474e-06, + "loss": 4.8426, + "step": 3685 + }, + { + "epoch": 0.7238132601020008, + "grad_norm": 18.916912078857422, + "learning_rate": 9.247389944915377e-06, + "loss": 4.7263, + "step": 3690 + }, + { + "epoch": 0.7247940368772068, + "grad_norm": 22.758358001708984, + "learning_rate": 9.245346513244448e-06, + "loss": 5.196, + "step": 3695 + }, + { + "epoch": 0.7257748136524127, + "grad_norm": 15.67486572265625, + "learning_rate": 9.243300537655163e-06, + "loss": 5.1512, + "step": 3700 + }, + { + "epoch": 0.7267555904276187, + "grad_norm": 13.838863372802734, + "learning_rate": 9.241252019373522e-06, + "loss": 5.2147, + "step": 3705 + }, + { + "epoch": 0.7277363672028246, + "grad_norm": 26.13442611694336, + "learning_rate": 9.239200959627048e-06, + "loss": 5.0676, + "step": 3710 + }, + { + "epoch": 0.7287171439780306, + "grad_norm": 17.461381912231445, + "learning_rate": 9.237147359644789e-06, + "loss": 5.6134, + "step": 3715 + }, + { + "epoch": 0.7296979207532366, + "grad_norm": 19.081663131713867, + "learning_rate": 9.235091220657313e-06, + "loss": 5.2973, + "step": 3720 + }, + { + "epoch": 0.7306786975284425, + "grad_norm": 11.549004554748535, + "learning_rate": 9.23303254389671e-06, + "loss": 4.9309, + "step": 3725 + }, + { + "epoch": 0.7316594743036485, + "grad_norm": 16.581857681274414, + "learning_rate": 9.230971330596591e-06, + "loss": 4.7896, + "step": 3730 + }, + { + "epoch": 0.7326402510788544, + "grad_norm": 23.697467803955078, + "learning_rate": 9.228907581992086e-06, + "loss": 5.0056, + "step": 3735 + }, + { + "epoch": 0.7336210278540605, + "grad_norm": 14.382548332214355, + "learning_rate": 9.226841299319846e-06, + "loss": 5.1846, + "step": 3740 + }, + { + "epoch": 0.7346018046292664, + "grad_norm": 16.23896598815918, + "learning_rate": 9.22477248381804e-06, + "loss": 5.2869, + "step": 3745 + }, + { + "epoch": 0.7355825814044723, + "grad_norm": 16.80748748779297, + "learning_rate": 9.222701136726352e-06, + "loss": 4.9048, + "step": 3750 + }, + { + "epoch": 0.7365633581796783, + "grad_norm": 19.622159957885742, + "learning_rate": 9.22062725928599e-06, + "loss": 5.2714, + "step": 3755 + }, + { + "epoch": 0.7375441349548842, + "grad_norm": 15.167123794555664, + "learning_rate": 9.218550852739669e-06, + "loss": 5.358, + "step": 3760 + }, + { + "epoch": 0.7385249117300903, + "grad_norm": 17.994897842407227, + "learning_rate": 9.216471918331625e-06, + "loss": 4.9097, + "step": 3765 + }, + { + "epoch": 0.7395056885052962, + "grad_norm": 15.874114036560059, + "learning_rate": 9.214390457307607e-06, + "loss": 4.8598, + "step": 3770 + }, + { + "epoch": 0.7404864652805021, + "grad_norm": 18.701860427856445, + "learning_rate": 9.212306470914882e-06, + "loss": 5.1611, + "step": 3775 + }, + { + "epoch": 0.7414672420557081, + "grad_norm": 23.78780746459961, + "learning_rate": 9.210219960402223e-06, + "loss": 4.8143, + "step": 3780 + }, + { + "epoch": 0.742448018830914, + "grad_norm": 27.825786590576172, + "learning_rate": 9.208130927019923e-06, + "loss": 4.8035, + "step": 3785 + }, + { + "epoch": 0.7434287956061201, + "grad_norm": 29.78475570678711, + "learning_rate": 9.206039372019779e-06, + "loss": 5.0749, + "step": 3790 + }, + { + "epoch": 0.744409572381326, + "grad_norm": 38.66997146606445, + "learning_rate": 9.203945296655109e-06, + "loss": 5.2162, + "step": 3795 + }, + { + "epoch": 0.745390349156532, + "grad_norm": 14.391605377197266, + "learning_rate": 9.201848702180732e-06, + "loss": 5.4503, + "step": 3800 + }, + { + "epoch": 0.7463711259317379, + "grad_norm": 25.25947380065918, + "learning_rate": 9.19974958985298e-06, + "loss": 5.092, + "step": 3805 + }, + { + "epoch": 0.7473519027069438, + "grad_norm": 23.04009437561035, + "learning_rate": 9.197647960929697e-06, + "loss": 5.103, + "step": 3810 + }, + { + "epoch": 0.7483326794821499, + "grad_norm": 17.638652801513672, + "learning_rate": 9.195543816670228e-06, + "loss": 5.0807, + "step": 3815 + }, + { + "epoch": 0.7493134562573558, + "grad_norm": 25.059757232666016, + "learning_rate": 9.19343715833543e-06, + "loss": 4.9474, + "step": 3820 + }, + { + "epoch": 0.7502942330325618, + "grad_norm": 21.401994705200195, + "learning_rate": 9.191327987187667e-06, + "loss": 4.66, + "step": 3825 + }, + { + "epoch": 0.7502942330325618, + "eval_loss": 5.044307231903076, + "eval_runtime": 7.6237, + "eval_samples_per_second": 27.414, + "eval_steps_per_second": 13.773, + "step": 3825 + }, + { + "epoch": 0.7512750098077677, + "grad_norm": 16.83993911743164, + "learning_rate": 9.189216304490806e-06, + "loss": 5.4801, + "step": 3830 + }, + { + "epoch": 0.7522557865829738, + "grad_norm": 28.338539123535156, + "learning_rate": 9.187102111510223e-06, + "loss": 5.0957, + "step": 3835 + }, + { + "epoch": 0.7532365633581797, + "grad_norm": 16.17990493774414, + "learning_rate": 9.184985409512793e-06, + "loss": 5.1511, + "step": 3840 + }, + { + "epoch": 0.7542173401333856, + "grad_norm": 24.36326026916504, + "learning_rate": 9.182866199766898e-06, + "loss": 5.1696, + "step": 3845 + }, + { + "epoch": 0.7551981169085916, + "grad_norm": 20.284696578979492, + "learning_rate": 9.180744483542421e-06, + "loss": 5.2461, + "step": 3850 + }, + { + "epoch": 0.7561788936837975, + "grad_norm": 14.811017990112305, + "learning_rate": 9.178620262110748e-06, + "loss": 4.868, + "step": 3855 + }, + { + "epoch": 0.7571596704590036, + "grad_norm": 26.946754455566406, + "learning_rate": 9.176493536744767e-06, + "loss": 5.244, + "step": 3860 + }, + { + "epoch": 0.7581404472342095, + "grad_norm": 26.35390281677246, + "learning_rate": 9.174364308718862e-06, + "loss": 4.9377, + "step": 3865 + }, + { + "epoch": 0.7591212240094155, + "grad_norm": 42.68120574951172, + "learning_rate": 9.172232579308924e-06, + "loss": 5.5446, + "step": 3870 + }, + { + "epoch": 0.7601020007846214, + "grad_norm": 16.318851470947266, + "learning_rate": 9.170098349792339e-06, + "loss": 5.2302, + "step": 3875 + }, + { + "epoch": 0.7610827775598273, + "grad_norm": 19.897960662841797, + "learning_rate": 9.167961621447984e-06, + "loss": 4.8648, + "step": 3880 + }, + { + "epoch": 0.7620635543350334, + "grad_norm": 32.09339904785156, + "learning_rate": 9.16582239555625e-06, + "loss": 4.791, + "step": 3885 + }, + { + "epoch": 0.7630443311102393, + "grad_norm": 25.286733627319336, + "learning_rate": 9.16368067339901e-06, + "loss": 5.1848, + "step": 3890 + }, + { + "epoch": 0.7640251078854453, + "grad_norm": 30.160734176635742, + "learning_rate": 9.161536456259637e-06, + "loss": 5.0082, + "step": 3895 + }, + { + "epoch": 0.7650058846606512, + "grad_norm": 27.863746643066406, + "learning_rate": 9.159389745423003e-06, + "loss": 5.1531, + "step": 3900 + }, + { + "epoch": 0.7659866614358573, + "grad_norm": 20.288660049438477, + "learning_rate": 9.157240542175468e-06, + "loss": 4.9016, + "step": 3905 + }, + { + "epoch": 0.7669674382110632, + "grad_norm": 13.439239501953125, + "learning_rate": 9.155088847804888e-06, + "loss": 5.1312, + "step": 3910 + }, + { + "epoch": 0.7679482149862691, + "grad_norm": 31.238624572753906, + "learning_rate": 9.152934663600615e-06, + "loss": 5.3148, + "step": 3915 + }, + { + "epoch": 0.7689289917614751, + "grad_norm": 20.205110549926758, + "learning_rate": 9.15077799085349e-06, + "loss": 4.9455, + "step": 3920 + }, + { + "epoch": 0.769909768536681, + "grad_norm": 29.910932540893555, + "learning_rate": 9.148618830855846e-06, + "loss": 5.4219, + "step": 3925 + }, + { + "epoch": 0.770890545311887, + "grad_norm": 17.717374801635742, + "learning_rate": 9.146457184901502e-06, + "loss": 5.3079, + "step": 3930 + }, + { + "epoch": 0.771871322087093, + "grad_norm": 14.105619430541992, + "learning_rate": 9.144293054285776e-06, + "loss": 5.173, + "step": 3935 + }, + { + "epoch": 0.7728520988622989, + "grad_norm": 18.493331909179688, + "learning_rate": 9.142126440305466e-06, + "loss": 4.9215, + "step": 3940 + }, + { + "epoch": 0.7738328756375049, + "grad_norm": 17.42450523376465, + "learning_rate": 9.139957344258863e-06, + "loss": 5.0038, + "step": 3945 + }, + { + "epoch": 0.7748136524127108, + "grad_norm": 37.434608459472656, + "learning_rate": 9.137785767445743e-06, + "loss": 4.7231, + "step": 3950 + }, + { + "epoch": 0.7757944291879169, + "grad_norm": 18.50735092163086, + "learning_rate": 9.135611711167371e-06, + "loss": 4.792, + "step": 3955 + }, + { + "epoch": 0.7767752059631228, + "grad_norm": 18.376445770263672, + "learning_rate": 9.133435176726494e-06, + "loss": 4.7746, + "step": 3960 + }, + { + "epoch": 0.7777559827383288, + "grad_norm": 13.254705429077148, + "learning_rate": 9.131256165427347e-06, + "loss": 5.318, + "step": 3965 + }, + { + "epoch": 0.7787367595135347, + "grad_norm": 14.707402229309082, + "learning_rate": 9.129074678575649e-06, + "loss": 5.0746, + "step": 3970 + }, + { + "epoch": 0.7797175362887406, + "grad_norm": 17.6321964263916, + "learning_rate": 9.1268907174786e-06, + "loss": 5.022, + "step": 3975 + }, + { + "epoch": 0.7806983130639467, + "grad_norm": 19.831993103027344, + "learning_rate": 9.124704283444887e-06, + "loss": 4.9905, + "step": 3980 + }, + { + "epoch": 0.7816790898391526, + "grad_norm": 19.51742935180664, + "learning_rate": 9.122515377784676e-06, + "loss": 5.1026, + "step": 3985 + }, + { + "epoch": 0.7826598666143586, + "grad_norm": 13.889021873474121, + "learning_rate": 9.12032400180961e-06, + "loss": 4.7532, + "step": 3990 + }, + { + "epoch": 0.7836406433895645, + "grad_norm": 20.075048446655273, + "learning_rate": 9.118130156832823e-06, + "loss": 4.9112, + "step": 3995 + }, + { + "epoch": 0.7846214201647705, + "grad_norm": 43.44004821777344, + "learning_rate": 9.115933844168918e-06, + "loss": 5.2201, + "step": 4000 + }, + { + "epoch": 0.7856021969399765, + "grad_norm": 37.11042404174805, + "learning_rate": 9.11373506513398e-06, + "loss": 5.1144, + "step": 4005 + }, + { + "epoch": 0.7865829737151824, + "grad_norm": 23.289901733398438, + "learning_rate": 9.111533821045576e-06, + "loss": 4.9628, + "step": 4010 + }, + { + "epoch": 0.7875637504903884, + "grad_norm": 12.136765480041504, + "learning_rate": 9.109330113222745e-06, + "loss": 4.9547, + "step": 4015 + }, + { + "epoch": 0.7885445272655943, + "grad_norm": 21.204288482666016, + "learning_rate": 9.107123942986003e-06, + "loss": 4.8932, + "step": 4020 + }, + { + "epoch": 0.7895253040408003, + "grad_norm": 19.411935806274414, + "learning_rate": 9.104915311657346e-06, + "loss": 4.7352, + "step": 4025 + }, + { + "epoch": 0.7905060808160063, + "grad_norm": 19.139673233032227, + "learning_rate": 9.102704220560237e-06, + "loss": 5.011, + "step": 4030 + }, + { + "epoch": 0.7914868575912123, + "grad_norm": 22.491958618164062, + "learning_rate": 9.10049067101962e-06, + "loss": 5.1561, + "step": 4035 + }, + { + "epoch": 0.7924676343664182, + "grad_norm": 23.593151092529297, + "learning_rate": 9.09827466436191e-06, + "loss": 5.3284, + "step": 4040 + }, + { + "epoch": 0.7934484111416241, + "grad_norm": 25.913562774658203, + "learning_rate": 9.096056201914993e-06, + "loss": 4.8373, + "step": 4045 + }, + { + "epoch": 0.7944291879168301, + "grad_norm": 19.09825897216797, + "learning_rate": 9.093835285008228e-06, + "loss": 5.422, + "step": 4050 + }, + { + "epoch": 0.7954099646920361, + "grad_norm": 16.97250747680664, + "learning_rate": 9.091611914972443e-06, + "loss": 5.1956, + "step": 4055 + }, + { + "epoch": 0.7963907414672421, + "grad_norm": 14.669245719909668, + "learning_rate": 9.089386093139937e-06, + "loss": 4.7867, + "step": 4060 + }, + { + "epoch": 0.797371518242448, + "grad_norm": 38.453433990478516, + "learning_rate": 9.087157820844482e-06, + "loss": 5.1385, + "step": 4065 + }, + { + "epoch": 0.798352295017654, + "grad_norm": 13.812004089355469, + "learning_rate": 9.08492709942131e-06, + "loss": 5.0448, + "step": 4070 + }, + { + "epoch": 0.79933307179286, + "grad_norm": 15.264069557189941, + "learning_rate": 9.082693930207128e-06, + "loss": 5.0391, + "step": 4075 + }, + { + "epoch": 0.8003138485680659, + "grad_norm": 28.84646224975586, + "learning_rate": 9.080458314540107e-06, + "loss": 4.9142, + "step": 4080 + }, + { + "epoch": 0.8012946253432719, + "grad_norm": 53.37618637084961, + "learning_rate": 9.078220253759884e-06, + "loss": 4.6441, + "step": 4085 + }, + { + "epoch": 0.8022754021184778, + "grad_norm": 14.650290489196777, + "learning_rate": 9.07597974920756e-06, + "loss": 4.9577, + "step": 4090 + }, + { + "epoch": 0.8032561788936838, + "grad_norm": 20.597881317138672, + "learning_rate": 9.073736802225705e-06, + "loss": 5.203, + "step": 4095 + }, + { + "epoch": 0.8042369556688898, + "grad_norm": 10.912410736083984, + "learning_rate": 9.071491414158345e-06, + "loss": 5.1236, + "step": 4100 + }, + { + "epoch": 0.8052177324440957, + "grad_norm": 19.348920822143555, + "learning_rate": 9.069243586350976e-06, + "loss": 4.9828, + "step": 4105 + }, + { + "epoch": 0.8061985092193017, + "grad_norm": 12.223658561706543, + "learning_rate": 9.066993320150552e-06, + "loss": 5.5052, + "step": 4110 + }, + { + "epoch": 0.8071792859945076, + "grad_norm": 19.24483299255371, + "learning_rate": 9.064740616905487e-06, + "loss": 5.2803, + "step": 4115 + }, + { + "epoch": 0.8081600627697136, + "grad_norm": 18.650951385498047, + "learning_rate": 9.062485477965661e-06, + "loss": 5.1652, + "step": 4120 + }, + { + "epoch": 0.8091408395449196, + "grad_norm": 25.61029052734375, + "learning_rate": 9.060227904682408e-06, + "loss": 4.7679, + "step": 4125 + }, + { + "epoch": 0.8101216163201256, + "grad_norm": 16.603193283081055, + "learning_rate": 9.057967898408523e-06, + "loss": 4.8387, + "step": 4130 + }, + { + "epoch": 0.8111023930953315, + "grad_norm": 14.41788101196289, + "learning_rate": 9.055705460498258e-06, + "loss": 5.0304, + "step": 4135 + }, + { + "epoch": 0.8120831698705374, + "grad_norm": 17.828062057495117, + "learning_rate": 9.053440592307322e-06, + "loss": 5.3152, + "step": 4140 + }, + { + "epoch": 0.8130639466457434, + "grad_norm": 15.405653953552246, + "learning_rate": 9.051173295192885e-06, + "loss": 4.9053, + "step": 4145 + }, + { + "epoch": 0.8140447234209494, + "grad_norm": 18.29840087890625, + "learning_rate": 9.048903570513565e-06, + "loss": 4.9139, + "step": 4150 + }, + { + "epoch": 0.8150255001961554, + "grad_norm": 10.129656791687012, + "learning_rate": 9.046631419629438e-06, + "loss": 4.8187, + "step": 4155 + }, + { + "epoch": 0.8160062769713613, + "grad_norm": 21.516311645507812, + "learning_rate": 9.044356843902036e-06, + "loss": 4.8428, + "step": 4160 + }, + { + "epoch": 0.8169870537465673, + "grad_norm": 15.47047233581543, + "learning_rate": 9.042079844694339e-06, + "loss": 4.984, + "step": 4165 + }, + { + "epoch": 0.8179678305217732, + "grad_norm": 21.687891006469727, + "learning_rate": 9.039800423370783e-06, + "loss": 5.1401, + "step": 4170 + }, + { + "epoch": 0.8189486072969792, + "grad_norm": 28.038644790649414, + "learning_rate": 9.037518581297257e-06, + "loss": 5.1431, + "step": 4175 + }, + { + "epoch": 0.8199293840721852, + "grad_norm": 30.172744750976562, + "learning_rate": 9.035234319841095e-06, + "loss": 4.8415, + "step": 4180 + }, + { + "epoch": 0.8209101608473911, + "grad_norm": 16.931224822998047, + "learning_rate": 9.032947640371086e-06, + "loss": 4.5958, + "step": 4185 + }, + { + "epoch": 0.8218909376225971, + "grad_norm": 23.898576736450195, + "learning_rate": 9.030658544257466e-06, + "loss": 4.7125, + "step": 4190 + }, + { + "epoch": 0.822871714397803, + "grad_norm": 12.608359336853027, + "learning_rate": 9.028367032871917e-06, + "loss": 5.1428, + "step": 4195 + }, + { + "epoch": 0.8238524911730091, + "grad_norm": 12.2660493850708, + "learning_rate": 9.026073107587571e-06, + "loss": 5.0363, + "step": 4200 + }, + { + "epoch": 0.824833267948215, + "grad_norm": 10.48144245147705, + "learning_rate": 9.023776769779007e-06, + "loss": 5.0101, + "step": 4205 + }, + { + "epoch": 0.8258140447234209, + "grad_norm": 26.566226959228516, + "learning_rate": 9.021478020822248e-06, + "loss": 5.0142, + "step": 4210 + }, + { + "epoch": 0.8267948214986269, + "grad_norm": 11.696605682373047, + "learning_rate": 9.01917686209476e-06, + "loss": 5.2014, + "step": 4215 + }, + { + "epoch": 0.8277755982738328, + "grad_norm": 28.729278564453125, + "learning_rate": 9.016873294975457e-06, + "loss": 5.2874, + "step": 4220 + }, + { + "epoch": 0.8287563750490389, + "grad_norm": 21.497310638427734, + "learning_rate": 9.014567320844694e-06, + "loss": 5.1681, + "step": 4225 + }, + { + "epoch": 0.8297371518242448, + "grad_norm": 13.806232452392578, + "learning_rate": 9.012258941084269e-06, + "loss": 4.664, + "step": 4230 + }, + { + "epoch": 0.8307179285994508, + "grad_norm": 15.800619125366211, + "learning_rate": 9.009948157077421e-06, + "loss": 5.1314, + "step": 4235 + }, + { + "epoch": 0.8316987053746567, + "grad_norm": 12.598173141479492, + "learning_rate": 9.007634970208829e-06, + "loss": 5.0503, + "step": 4240 + }, + { + "epoch": 0.8326794821498626, + "grad_norm": 19.550113677978516, + "learning_rate": 9.005319381864615e-06, + "loss": 5.059, + "step": 4245 + }, + { + "epoch": 0.8336602589250687, + "grad_norm": 11.087757110595703, + "learning_rate": 9.003001393432334e-06, + "loss": 4.9414, + "step": 4250 + }, + { + "epoch": 0.8346410357002746, + "grad_norm": 11.731034278869629, + "learning_rate": 9.000681006300986e-06, + "loss": 4.8664, + "step": 4255 + }, + { + "epoch": 0.8356218124754806, + "grad_norm": 21.4744873046875, + "learning_rate": 8.998358221861006e-06, + "loss": 5.2014, + "step": 4260 + }, + { + "epoch": 0.8366025892506865, + "grad_norm": 14.924543380737305, + "learning_rate": 8.996033041504262e-06, + "loss": 4.8598, + "step": 4265 + }, + { + "epoch": 0.8375833660258926, + "grad_norm": 21.695417404174805, + "learning_rate": 8.993705466624061e-06, + "loss": 4.8761, + "step": 4270 + }, + { + "epoch": 0.8385641428010985, + "grad_norm": 22.94925880432129, + "learning_rate": 8.991375498615147e-06, + "loss": 5.0729, + "step": 4275 + }, + { + "epoch": 0.8395449195763044, + "grad_norm": 24.85720443725586, + "learning_rate": 8.98904313887369e-06, + "loss": 4.8534, + "step": 4280 + }, + { + "epoch": 0.8405256963515104, + "grad_norm": 15.123157501220703, + "learning_rate": 8.986708388797306e-06, + "loss": 4.7483, + "step": 4285 + }, + { + "epoch": 0.8415064731267163, + "grad_norm": 17.581621170043945, + "learning_rate": 8.984371249785031e-06, + "loss": 4.9156, + "step": 4290 + }, + { + "epoch": 0.8424872499019224, + "grad_norm": 14.177846908569336, + "learning_rate": 8.982031723237338e-06, + "loss": 5.0995, + "step": 4295 + }, + { + "epoch": 0.8434680266771283, + "grad_norm": 12.062125205993652, + "learning_rate": 8.979689810556132e-06, + "loss": 4.9483, + "step": 4300 + }, + { + "epoch": 0.8444488034523342, + "grad_norm": 21.800048828125, + "learning_rate": 8.977345513144743e-06, + "loss": 5.163, + "step": 4305 + }, + { + "epoch": 0.8454295802275402, + "grad_norm": 13.504755020141602, + "learning_rate": 8.974998832407935e-06, + "loss": 5.4037, + "step": 4310 + }, + { + "epoch": 0.8464103570027461, + "grad_norm": 31.20657730102539, + "learning_rate": 8.972649769751897e-06, + "loss": 5.3662, + "step": 4315 + }, + { + "epoch": 0.8473911337779522, + "grad_norm": 17.510353088378906, + "learning_rate": 8.97029832658425e-06, + "loss": 5.0793, + "step": 4320 + }, + { + "epoch": 0.8483719105531581, + "grad_norm": 35.864559173583984, + "learning_rate": 8.967944504314033e-06, + "loss": 5.2564, + "step": 4325 + }, + { + "epoch": 0.8493526873283641, + "grad_norm": 26.920175552368164, + "learning_rate": 8.965588304351716e-06, + "loss": 4.9264, + "step": 4330 + }, + { + "epoch": 0.85033346410357, + "grad_norm": 12.373115539550781, + "learning_rate": 8.963229728109196e-06, + "loss": 4.8945, + "step": 4335 + }, + { + "epoch": 0.8513142408787759, + "grad_norm": 19.75405502319336, + "learning_rate": 8.96086877699979e-06, + "loss": 5.0325, + "step": 4340 + }, + { + "epoch": 0.852295017653982, + "grad_norm": 25.117504119873047, + "learning_rate": 8.95850545243824e-06, + "loss": 4.9604, + "step": 4345 + }, + { + "epoch": 0.8532757944291879, + "grad_norm": 14.683432579040527, + "learning_rate": 8.956139755840706e-06, + "loss": 4.8917, + "step": 4350 + }, + { + "epoch": 0.8542565712043939, + "grad_norm": 10.729886054992676, + "learning_rate": 8.953771688624777e-06, + "loss": 4.9119, + "step": 4355 + }, + { + "epoch": 0.8552373479795998, + "grad_norm": 18.196348190307617, + "learning_rate": 8.951401252209457e-06, + "loss": 4.8839, + "step": 4360 + }, + { + "epoch": 0.8562181247548059, + "grad_norm": 13.992422103881836, + "learning_rate": 8.94902844801517e-06, + "loss": 4.9166, + "step": 4365 + }, + { + "epoch": 0.8571989015300118, + "grad_norm": 17.236942291259766, + "learning_rate": 8.946653277463763e-06, + "loss": 4.702, + "step": 4370 + }, + { + "epoch": 0.8581796783052177, + "grad_norm": 16.63797950744629, + "learning_rate": 8.944275741978495e-06, + "loss": 4.9513, + "step": 4375 + }, + { + "epoch": 0.8591604550804237, + "grad_norm": 11.08305549621582, + "learning_rate": 8.941895842984045e-06, + "loss": 5.0286, + "step": 4380 + }, + { + "epoch": 0.8601412318556296, + "grad_norm": 20.1209774017334, + "learning_rate": 8.939513581906509e-06, + "loss": 4.9298, + "step": 4385 + }, + { + "epoch": 0.8611220086308357, + "grad_norm": 25.10521125793457, + "learning_rate": 8.937128960173399e-06, + "loss": 5.3603, + "step": 4390 + }, + { + "epoch": 0.8621027854060416, + "grad_norm": 24.15997314453125, + "learning_rate": 8.934741979213638e-06, + "loss": 5.0146, + "step": 4395 + }, + { + "epoch": 0.8630835621812476, + "grad_norm": 21.98234748840332, + "learning_rate": 8.932352640457566e-06, + "loss": 5.3546, + "step": 4400 + }, + { + "epoch": 0.8640643389564535, + "grad_norm": 34.10669708251953, + "learning_rate": 8.929960945336936e-06, + "loss": 5.2731, + "step": 4405 + }, + { + "epoch": 0.8650451157316594, + "grad_norm": 13.953348159790039, + "learning_rate": 8.927566895284912e-06, + "loss": 5.4325, + "step": 4410 + }, + { + "epoch": 0.8660258925068655, + "grad_norm": 32.35000228881836, + "learning_rate": 8.925170491736065e-06, + "loss": 4.8432, + "step": 4415 + }, + { + "epoch": 0.8670066692820714, + "grad_norm": 11.26505184173584, + "learning_rate": 8.922771736126384e-06, + "loss": 5.1536, + "step": 4420 + }, + { + "epoch": 0.8679874460572774, + "grad_norm": 18.455360412597656, + "learning_rate": 8.920370629893263e-06, + "loss": 5.0085, + "step": 4425 + }, + { + "epoch": 0.8689682228324833, + "grad_norm": 15.50165843963623, + "learning_rate": 8.917967174475505e-06, + "loss": 4.9807, + "step": 4430 + }, + { + "epoch": 0.8699489996076893, + "grad_norm": 12.871978759765625, + "learning_rate": 8.915561371313321e-06, + "loss": 5.2293, + "step": 4435 + }, + { + "epoch": 0.8709297763828953, + "grad_norm": 23.6169490814209, + "learning_rate": 8.913153221848328e-06, + "loss": 5.1125, + "step": 4440 + }, + { + "epoch": 0.8719105531581012, + "grad_norm": 32.19282913208008, + "learning_rate": 8.91074272752355e-06, + "loss": 4.9331, + "step": 4445 + }, + { + "epoch": 0.8728913299333072, + "grad_norm": 21.433151245117188, + "learning_rate": 8.908329889783418e-06, + "loss": 4.9661, + "step": 4450 + }, + { + "epoch": 0.8738721067085131, + "grad_norm": 24.601730346679688, + "learning_rate": 8.905914710073761e-06, + "loss": 4.9997, + "step": 4455 + }, + { + "epoch": 0.8748528834837191, + "grad_norm": 27.876657485961914, + "learning_rate": 8.903497189841819e-06, + "loss": 5.1279, + "step": 4460 + }, + { + "epoch": 0.8758336602589251, + "grad_norm": 20.576623916625977, + "learning_rate": 8.901077330536228e-06, + "loss": 5.1912, + "step": 4465 + }, + { + "epoch": 0.876814437034131, + "grad_norm": 20.63385009765625, + "learning_rate": 8.89865513360703e-06, + "loss": 4.8031, + "step": 4470 + }, + { + "epoch": 0.877795213809337, + "grad_norm": 35.024566650390625, + "learning_rate": 8.896230600505668e-06, + "loss": 5.1266, + "step": 4475 + }, + { + "epoch": 0.8787759905845429, + "grad_norm": 21.97711181640625, + "learning_rate": 8.893803732684981e-06, + "loss": 4.8322, + "step": 4480 + }, + { + "epoch": 0.879756767359749, + "grad_norm": 50.84933853149414, + "learning_rate": 8.891374531599209e-06, + "loss": 5.0662, + "step": 4485 + }, + { + "epoch": 0.8807375441349549, + "grad_norm": 17.233617782592773, + "learning_rate": 8.88894299870399e-06, + "loss": 4.7075, + "step": 4490 + }, + { + "epoch": 0.8817183209101609, + "grad_norm": 22.550418853759766, + "learning_rate": 8.886509135456362e-06, + "loss": 5.4782, + "step": 4495 + }, + { + "epoch": 0.8826990976853668, + "grad_norm": 11.513266563415527, + "learning_rate": 8.884072943314754e-06, + "loss": 5.2502, + "step": 4500 + }, + { + "epoch": 0.8836798744605727, + "grad_norm": 40.44765090942383, + "learning_rate": 8.881634423738995e-06, + "loss": 4.8474, + "step": 4505 + }, + { + "epoch": 0.8846606512357787, + "grad_norm": 20.928241729736328, + "learning_rate": 8.879193578190311e-06, + "loss": 5.2969, + "step": 4510 + }, + { + "epoch": 0.8856414280109847, + "grad_norm": 28.95954132080078, + "learning_rate": 8.876750408131312e-06, + "loss": 4.937, + "step": 4515 + }, + { + "epoch": 0.8866222047861907, + "grad_norm": 16.0018253326416, + "learning_rate": 8.874304915026012e-06, + "loss": 5.0333, + "step": 4520 + }, + { + "epoch": 0.8876029815613966, + "grad_norm": 29.66618537902832, + "learning_rate": 8.871857100339805e-06, + "loss": 5.0874, + "step": 4525 + }, + { + "epoch": 0.8885837583366026, + "grad_norm": 21.657180786132812, + "learning_rate": 8.869406965539489e-06, + "loss": 4.7309, + "step": 4530 + }, + { + "epoch": 0.8895645351118086, + "grad_norm": 15.75297737121582, + "learning_rate": 8.866954512093246e-06, + "loss": 5.0516, + "step": 4535 + }, + { + "epoch": 0.8905453118870145, + "grad_norm": 11.395059585571289, + "learning_rate": 8.864499741470646e-06, + "loss": 4.8039, + "step": 4540 + }, + { + "epoch": 0.8915260886622205, + "grad_norm": 20.670623779296875, + "learning_rate": 8.86204265514265e-06, + "loss": 5.0751, + "step": 4545 + }, + { + "epoch": 0.8925068654374264, + "grad_norm": 19.001562118530273, + "learning_rate": 8.859583254581604e-06, + "loss": 4.948, + "step": 4550 + }, + { + "epoch": 0.8934876422126324, + "grad_norm": 17.733688354492188, + "learning_rate": 8.857121541261247e-06, + "loss": 4.8752, + "step": 4555 + }, + { + "epoch": 0.8944684189878384, + "grad_norm": 28.683687210083008, + "learning_rate": 8.854657516656697e-06, + "loss": 5.201, + "step": 4560 + }, + { + "epoch": 0.8954491957630444, + "grad_norm": 18.13066291809082, + "learning_rate": 8.852191182244456e-06, + "loss": 5.0124, + "step": 4565 + }, + { + "epoch": 0.8964299725382503, + "grad_norm": 27.07872200012207, + "learning_rate": 8.849722539502419e-06, + "loss": 5.3658, + "step": 4570 + }, + { + "epoch": 0.8974107493134562, + "grad_norm": 21.85537338256836, + "learning_rate": 8.847251589909857e-06, + "loss": 5.3605, + "step": 4575 + }, + { + "epoch": 0.8983915260886622, + "grad_norm": 21.174556732177734, + "learning_rate": 8.844778334947426e-06, + "loss": 4.5005, + "step": 4580 + }, + { + "epoch": 0.8993723028638682, + "grad_norm": 19.17597007751465, + "learning_rate": 8.84230277609716e-06, + "loss": 5.2061, + "step": 4585 + }, + { + "epoch": 0.9003530796390742, + "grad_norm": 20.14710235595703, + "learning_rate": 8.839824914842477e-06, + "loss": 5.0189, + "step": 4590 + }, + { + "epoch": 0.9013338564142801, + "grad_norm": 13.179183006286621, + "learning_rate": 8.837344752668176e-06, + "loss": 5.2221, + "step": 4595 + }, + { + "epoch": 0.9023146331894861, + "grad_norm": 22.216659545898438, + "learning_rate": 8.83486229106043e-06, + "loss": 4.9365, + "step": 4600 + }, + { + "epoch": 0.903295409964692, + "grad_norm": 26.944229125976562, + "learning_rate": 8.832377531506794e-06, + "loss": 5.4394, + "step": 4605 + }, + { + "epoch": 0.904276186739898, + "grad_norm": 21.65399742126465, + "learning_rate": 8.829890475496195e-06, + "loss": 5.0546, + "step": 4610 + }, + { + "epoch": 0.905256963515104, + "grad_norm": 19.045804977416992, + "learning_rate": 8.827401124518945e-06, + "loss": 5.0907, + "step": 4615 + }, + { + "epoch": 0.9062377402903099, + "grad_norm": 11.208507537841797, + "learning_rate": 8.82490948006672e-06, + "loss": 4.7015, + "step": 4620 + }, + { + "epoch": 0.9072185170655159, + "grad_norm": 26.1632022857666, + "learning_rate": 8.82241554363258e-06, + "loss": 5.1097, + "step": 4625 + }, + { + "epoch": 0.9081992938407218, + "grad_norm": 34.274539947509766, + "learning_rate": 8.819919316710954e-06, + "loss": 5.1715, + "step": 4630 + }, + { + "epoch": 0.9091800706159278, + "grad_norm": 14.437838554382324, + "learning_rate": 8.817420800797641e-06, + "loss": 4.8171, + "step": 4635 + }, + { + "epoch": 0.9101608473911338, + "grad_norm": 19.037561416625977, + "learning_rate": 8.814919997389818e-06, + "loss": 4.8992, + "step": 4640 + }, + { + "epoch": 0.9111416241663397, + "grad_norm": 18.625173568725586, + "learning_rate": 8.812416907986027e-06, + "loss": 5.0305, + "step": 4645 + }, + { + "epoch": 0.9121224009415457, + "grad_norm": 17.157176971435547, + "learning_rate": 8.809911534086185e-06, + "loss": 4.582, + "step": 4650 + }, + { + "epoch": 0.9131031777167516, + "grad_norm": 26.29932975769043, + "learning_rate": 8.807403877191572e-06, + "loss": 5.138, + "step": 4655 + }, + { + "epoch": 0.9140839544919577, + "grad_norm": 20.871774673461914, + "learning_rate": 8.804893938804839e-06, + "loss": 4.9286, + "step": 4660 + }, + { + "epoch": 0.9150647312671636, + "grad_norm": 17.02748680114746, + "learning_rate": 8.802381720430006e-06, + "loss": 4.9197, + "step": 4665 + }, + { + "epoch": 0.9160455080423695, + "grad_norm": 13.891420364379883, + "learning_rate": 8.799867223572457e-06, + "loss": 5.003, + "step": 4670 + }, + { + "epoch": 0.9170262848175755, + "grad_norm": 20.51250457763672, + "learning_rate": 8.797350449738941e-06, + "loss": 4.9468, + "step": 4675 + }, + { + "epoch": 0.9180070615927814, + "grad_norm": 16.219396591186523, + "learning_rate": 8.794831400437573e-06, + "loss": 4.8844, + "step": 4680 + }, + { + "epoch": 0.9189878383679875, + "grad_norm": 23.17514419555664, + "learning_rate": 8.79231007717783e-06, + "loss": 4.654, + "step": 4685 + }, + { + "epoch": 0.9199686151431934, + "grad_norm": 19.793642044067383, + "learning_rate": 8.789786481470553e-06, + "loss": 4.9788, + "step": 4690 + }, + { + "epoch": 0.9209493919183994, + "grad_norm": 17.20256233215332, + "learning_rate": 8.787260614827942e-06, + "loss": 5.4916, + "step": 4695 + }, + { + "epoch": 0.9219301686936053, + "grad_norm": 21.612836837768555, + "learning_rate": 8.784732478763562e-06, + "loss": 5.2886, + "step": 4700 + }, + { + "epoch": 0.9229109454688113, + "grad_norm": 23.609474182128906, + "learning_rate": 8.782202074792336e-06, + "loss": 4.7733, + "step": 4705 + }, + { + "epoch": 0.9238917222440173, + "grad_norm": 25.944740295410156, + "learning_rate": 8.779669404430545e-06, + "loss": 5.068, + "step": 4710 + }, + { + "epoch": 0.9248724990192232, + "grad_norm": 13.784878730773926, + "learning_rate": 8.777134469195826e-06, + "loss": 4.8825, + "step": 4715 + }, + { + "epoch": 0.9258532757944292, + "grad_norm": 21.367332458496094, + "learning_rate": 8.77459727060718e-06, + "loss": 4.8307, + "step": 4720 + }, + { + "epoch": 0.9268340525696351, + "grad_norm": 21.81898307800293, + "learning_rate": 8.772057810184957e-06, + "loss": 5.2136, + "step": 4725 + }, + { + "epoch": 0.9278148293448412, + "grad_norm": 13.044177055358887, + "learning_rate": 8.769516089450869e-06, + "loss": 4.8561, + "step": 4730 + }, + { + "epoch": 0.9287956061200471, + "grad_norm": 21.14825439453125, + "learning_rate": 8.766972109927976e-06, + "loss": 4.9493, + "step": 4735 + }, + { + "epoch": 0.929776382895253, + "grad_norm": 21.38848304748535, + "learning_rate": 8.764425873140693e-06, + "loss": 5.1738, + "step": 4740 + }, + { + "epoch": 0.930757159670459, + "grad_norm": 18.144960403442383, + "learning_rate": 8.761877380614796e-06, + "loss": 4.8048, + "step": 4745 + }, + { + "epoch": 0.9317379364456649, + "grad_norm": 23.306209564208984, + "learning_rate": 8.759326633877398e-06, + "loss": 4.4888, + "step": 4750 + }, + { + "epoch": 0.932718713220871, + "grad_norm": 19.98696517944336, + "learning_rate": 8.756773634456975e-06, + "loss": 4.7224, + "step": 4755 + }, + { + "epoch": 0.9336994899960769, + "grad_norm": 12.257347106933594, + "learning_rate": 8.754218383883349e-06, + "loss": 4.9588, + "step": 4760 + }, + { + "epoch": 0.9346802667712829, + "grad_norm": 12.258370399475098, + "learning_rate": 8.751660883687685e-06, + "loss": 5.0761, + "step": 4765 + }, + { + "epoch": 0.9356610435464888, + "grad_norm": 16.816762924194336, + "learning_rate": 8.749101135402508e-06, + "loss": 5.1723, + "step": 4770 + }, + { + "epoch": 0.9366418203216947, + "grad_norm": 24.14014434814453, + "learning_rate": 8.74653914056168e-06, + "loss": 4.8346, + "step": 4775 + }, + { + "epoch": 0.9376225970969008, + "grad_norm": 18.49698829650879, + "learning_rate": 8.743974900700415e-06, + "loss": 4.8741, + "step": 4780 + }, + { + "epoch": 0.9386033738721067, + "grad_norm": 17.863910675048828, + "learning_rate": 8.741408417355264e-06, + "loss": 5.0614, + "step": 4785 + }, + { + "epoch": 0.9395841506473127, + "grad_norm": 20.26540184020996, + "learning_rate": 8.738839692064136e-06, + "loss": 4.7714, + "step": 4790 + }, + { + "epoch": 0.9405649274225186, + "grad_norm": 19.46352195739746, + "learning_rate": 8.736268726366272e-06, + "loss": 5.0038, + "step": 4795 + }, + { + "epoch": 0.9415457041977247, + "grad_norm": 17.201187133789062, + "learning_rate": 8.733695521802259e-06, + "loss": 5.0616, + "step": 4800 + }, + { + "epoch": 0.9425264809729306, + "grad_norm": 17.212223052978516, + "learning_rate": 8.731120079914026e-06, + "loss": 4.7626, + "step": 4805 + }, + { + "epoch": 0.9435072577481365, + "grad_norm": 27.876567840576172, + "learning_rate": 8.728542402244847e-06, + "loss": 5.2046, + "step": 4810 + }, + { + "epoch": 0.9444880345233425, + "grad_norm": 16.261171340942383, + "learning_rate": 8.725962490339323e-06, + "loss": 4.6926, + "step": 4815 + }, + { + "epoch": 0.9454688112985484, + "grad_norm": 15.20531177520752, + "learning_rate": 8.723380345743408e-06, + "loss": 4.8294, + "step": 4820 + }, + { + "epoch": 0.9464495880737545, + "grad_norm": 23.71307945251465, + "learning_rate": 8.720795970004385e-06, + "loss": 4.9227, + "step": 4825 + }, + { + "epoch": 0.9474303648489604, + "grad_norm": 23.945533752441406, + "learning_rate": 8.718209364670881e-06, + "loss": 5.343, + "step": 4830 + }, + { + "epoch": 0.9484111416241663, + "grad_norm": 32.02574920654297, + "learning_rate": 8.71562053129285e-06, + "loss": 5.208, + "step": 4835 + }, + { + "epoch": 0.9493919183993723, + "grad_norm": 29.752044677734375, + "learning_rate": 8.71302947142159e-06, + "loss": 4.6208, + "step": 4840 + }, + { + "epoch": 0.9503726951745782, + "grad_norm": 10.127486228942871, + "learning_rate": 8.710436186609728e-06, + "loss": 4.8635, + "step": 4845 + }, + { + "epoch": 0.9513534719497843, + "grad_norm": 14.631573677062988, + "learning_rate": 8.707840678411223e-06, + "loss": 4.8818, + "step": 4850 + }, + { + "epoch": 0.9523342487249902, + "grad_norm": 42.126060485839844, + "learning_rate": 8.705242948381372e-06, + "loss": 5.2032, + "step": 4855 + }, + { + "epoch": 0.9533150255001962, + "grad_norm": 21.709590911865234, + "learning_rate": 8.702642998076798e-06, + "loss": 4.9076, + "step": 4860 + }, + { + "epoch": 0.9542958022754021, + "grad_norm": 19.50485610961914, + "learning_rate": 8.700040829055458e-06, + "loss": 4.8002, + "step": 4865 + }, + { + "epoch": 0.955276579050608, + "grad_norm": 17.49419593811035, + "learning_rate": 8.697436442876637e-06, + "loss": 4.8608, + "step": 4870 + }, + { + "epoch": 0.9562573558258141, + "grad_norm": 20.620445251464844, + "learning_rate": 8.694829841100946e-06, + "loss": 5.0148, + "step": 4875 + }, + { + "epoch": 0.95723813260102, + "grad_norm": 12.148300170898438, + "learning_rate": 8.69222102529033e-06, + "loss": 4.8127, + "step": 4880 + }, + { + "epoch": 0.958218909376226, + "grad_norm": 21.182106018066406, + "learning_rate": 8.689609997008057e-06, + "loss": 5.0137, + "step": 4885 + }, + { + "epoch": 0.9591996861514319, + "grad_norm": 11.576159477233887, + "learning_rate": 8.686996757818718e-06, + "loss": 5.0396, + "step": 4890 + }, + { + "epoch": 0.960180462926638, + "grad_norm": 11.674856185913086, + "learning_rate": 8.684381309288232e-06, + "loss": 4.742, + "step": 4895 + }, + { + "epoch": 0.9611612397018439, + "grad_norm": 26.99156951904297, + "learning_rate": 8.681763652983846e-06, + "loss": 5.0905, + "step": 4900 + }, + { + "epoch": 0.9621420164770498, + "grad_norm": 24.627355575561523, + "learning_rate": 8.679143790474119e-06, + "loss": 4.873, + "step": 4905 + }, + { + "epoch": 0.9631227932522558, + "grad_norm": 21.86775016784668, + "learning_rate": 8.676521723328942e-06, + "loss": 5.1994, + "step": 4910 + }, + { + "epoch": 0.9641035700274617, + "grad_norm": 14.598928451538086, + "learning_rate": 8.673897453119521e-06, + "loss": 4.8057, + "step": 4915 + }, + { + "epoch": 0.9650843468026677, + "grad_norm": 27.489707946777344, + "learning_rate": 8.67127098141839e-06, + "loss": 5.2588, + "step": 4920 + }, + { + "epoch": 0.9660651235778737, + "grad_norm": 10.293922424316406, + "learning_rate": 8.66864230979939e-06, + "loss": 5.0615, + "step": 4925 + }, + { + "epoch": 0.9670459003530797, + "grad_norm": 18.869178771972656, + "learning_rate": 8.666011439837694e-06, + "loss": 5.2834, + "step": 4930 + }, + { + "epoch": 0.9680266771282856, + "grad_norm": 40.34228515625, + "learning_rate": 8.66337837310978e-06, + "loss": 4.9996, + "step": 4935 + }, + { + "epoch": 0.9690074539034915, + "grad_norm": 23.41005516052246, + "learning_rate": 8.66074311119345e-06, + "loss": 5.3371, + "step": 4940 + }, + { + "epoch": 0.9699882306786975, + "grad_norm": 33.91444396972656, + "learning_rate": 8.65810565566782e-06, + "loss": 5.2898, + "step": 4945 + }, + { + "epoch": 0.9709690074539035, + "grad_norm": 26.008817672729492, + "learning_rate": 8.655466008113318e-06, + "loss": 5.0883, + "step": 4950 + }, + { + "epoch": 0.9719497842291095, + "grad_norm": 17.434890747070312, + "learning_rate": 8.652824170111689e-06, + "loss": 4.8602, + "step": 4955 + }, + { + "epoch": 0.9729305610043154, + "grad_norm": 13.335190773010254, + "learning_rate": 8.650180143245985e-06, + "loss": 4.5868, + "step": 4960 + }, + { + "epoch": 0.9739113377795214, + "grad_norm": 21.08846664428711, + "learning_rate": 8.647533929100577e-06, + "loss": 4.6415, + "step": 4965 + }, + { + "epoch": 0.9748921145547274, + "grad_norm": 11.147554397583008, + "learning_rate": 8.644885529261144e-06, + "loss": 5.104, + "step": 4970 + }, + { + "epoch": 0.9758728913299333, + "grad_norm": 18.926776885986328, + "learning_rate": 8.642234945314671e-06, + "loss": 4.9096, + "step": 4975 + }, + { + "epoch": 0.9768536681051393, + "grad_norm": 18.32732582092285, + "learning_rate": 8.639582178849454e-06, + "loss": 4.8699, + "step": 4980 + }, + { + "epoch": 0.9778344448803452, + "grad_norm": 16.150941848754883, + "learning_rate": 8.6369272314551e-06, + "loss": 4.6607, + "step": 4985 + }, + { + "epoch": 0.9788152216555512, + "grad_norm": 24.335636138916016, + "learning_rate": 8.634270104722518e-06, + "loss": 4.7315, + "step": 4990 + }, + { + "epoch": 0.9797959984307572, + "grad_norm": 13.760662078857422, + "learning_rate": 8.631610800243926e-06, + "loss": 4.9446, + "step": 4995 + }, + { + "epoch": 0.9807767752059631, + "grad_norm": 27.17913818359375, + "learning_rate": 8.628949319612845e-06, + "loss": 5.0926, + "step": 5000 + }, + { + "epoch": 0.9817575519811691, + "grad_norm": 30.56397247314453, + "learning_rate": 8.626285664424104e-06, + "loss": 4.9878, + "step": 5005 + }, + { + "epoch": 0.982738328756375, + "grad_norm": 22.218719482421875, + "learning_rate": 8.62361983627383e-06, + "loss": 4.8556, + "step": 5010 + }, + { + "epoch": 0.983719105531581, + "grad_norm": 14.08055305480957, + "learning_rate": 8.620951836759454e-06, + "loss": 5.0962, + "step": 5015 + }, + { + "epoch": 0.984699882306787, + "grad_norm": 35.20744705200195, + "learning_rate": 8.61828166747971e-06, + "loss": 4.5052, + "step": 5020 + }, + { + "epoch": 0.985680659081993, + "grad_norm": 52.820247650146484, + "learning_rate": 8.615609330034628e-06, + "loss": 5.0339, + "step": 5025 + }, + { + "epoch": 0.9866614358571989, + "grad_norm": 22.243247985839844, + "learning_rate": 8.612934826025542e-06, + "loss": 5.1378, + "step": 5030 + }, + { + "epoch": 0.9876422126324048, + "grad_norm": 9.840145111083984, + "learning_rate": 8.610258157055082e-06, + "loss": 4.9607, + "step": 5035 + }, + { + "epoch": 0.9886229894076108, + "grad_norm": 15.050167083740234, + "learning_rate": 8.607579324727175e-06, + "loss": 5.3641, + "step": 5040 + }, + { + "epoch": 0.9896037661828168, + "grad_norm": 18.553813934326172, + "learning_rate": 8.604898330647043e-06, + "loss": 5.3992, + "step": 5045 + }, + { + "epoch": 0.9905845429580228, + "grad_norm": 25.090362548828125, + "learning_rate": 8.602215176421206e-06, + "loss": 4.8542, + "step": 5050 + }, + { + "epoch": 0.9915653197332287, + "grad_norm": 24.53265380859375, + "learning_rate": 8.59952986365748e-06, + "loss": 4.7045, + "step": 5055 + }, + { + "epoch": 0.9925460965084347, + "grad_norm": 13.577052116394043, + "learning_rate": 8.59684239396497e-06, + "loss": 4.8215, + "step": 5060 + }, + { + "epoch": 0.9935268732836406, + "grad_norm": 26.031982421875, + "learning_rate": 8.594152768954072e-06, + "loss": 5.115, + "step": 5065 + }, + { + "epoch": 0.9945076500588466, + "grad_norm": 17.36546516418457, + "learning_rate": 8.591460990236482e-06, + "loss": 4.911, + "step": 5070 + }, + { + "epoch": 0.9954884268340526, + "grad_norm": 15.414812088012695, + "learning_rate": 8.58876705942518e-06, + "loss": 4.9138, + "step": 5075 + }, + { + "epoch": 0.9964692036092585, + "grad_norm": 21.61842155456543, + "learning_rate": 8.586070978134437e-06, + "loss": 4.7276, + "step": 5080 + }, + { + "epoch": 0.9974499803844645, + "grad_norm": 20.003103256225586, + "learning_rate": 8.583372747979813e-06, + "loss": 4.9446, + "step": 5085 + }, + { + "epoch": 0.9984307571596704, + "grad_norm": 18.7646541595459, + "learning_rate": 8.580672370578152e-06, + "loss": 4.8425, + "step": 5090 + }, + { + "epoch": 0.9994115339348765, + "grad_norm": 14.617816925048828, + "learning_rate": 8.577969847547591e-06, + "loss": 4.7787, + "step": 5095 + }, + { + "epoch": 1.0003923107100823, + "grad_norm": 25.93317222595215, + "learning_rate": 8.575265180507553e-06, + "loss": 5.0419, + "step": 5100 + }, + { + "epoch": 1.0003923107100823, + "eval_loss": 4.966672420501709, + "eval_runtime": 7.6131, + "eval_samples_per_second": 27.453, + "eval_steps_per_second": 13.792, + "step": 5100 + }, + { + "epoch": 1.0013730874852884, + "grad_norm": 19.052959442138672, + "learning_rate": 8.572558371078736e-06, + "loss": 4.6995, + "step": 5105 + }, + { + "epoch": 1.0023538642604943, + "grad_norm": 24.985578536987305, + "learning_rate": 8.56984942088313e-06, + "loss": 5.0097, + "step": 5110 + }, + { + "epoch": 1.0033346410357002, + "grad_norm": 13.549357414245605, + "learning_rate": 8.567138331544009e-06, + "loss": 4.813, + "step": 5115 + }, + { + "epoch": 1.0043154178109062, + "grad_norm": 24.388721466064453, + "learning_rate": 8.564425104685926e-06, + "loss": 4.9489, + "step": 5120 + }, + { + "epoch": 1.0052961945861123, + "grad_norm": 11.15587043762207, + "learning_rate": 8.56170974193471e-06, + "loss": 4.6149, + "step": 5125 + }, + { + "epoch": 1.0062769713613182, + "grad_norm": 15.220309257507324, + "learning_rate": 8.55899224491748e-06, + "loss": 4.8196, + "step": 5130 + }, + { + "epoch": 1.0072577481365241, + "grad_norm": 15.071609497070312, + "learning_rate": 8.556272615262623e-06, + "loss": 5.3157, + "step": 5135 + }, + { + "epoch": 1.00823852491173, + "grad_norm": 16.934743881225586, + "learning_rate": 8.553550854599815e-06, + "loss": 4.7028, + "step": 5140 + }, + { + "epoch": 1.009219301686936, + "grad_norm": 13.945096015930176, + "learning_rate": 8.55082696456e-06, + "loss": 4.3538, + "step": 5145 + }, + { + "epoch": 1.010200078462142, + "grad_norm": 21.077960968017578, + "learning_rate": 8.548100946775402e-06, + "loss": 4.5671, + "step": 5150 + }, + { + "epoch": 1.011180855237348, + "grad_norm": 23.00332260131836, + "learning_rate": 8.54537280287952e-06, + "loss": 4.8257, + "step": 5155 + }, + { + "epoch": 1.012161632012554, + "grad_norm": 12.397933959960938, + "learning_rate": 8.542642534507126e-06, + "loss": 4.3358, + "step": 5160 + }, + { + "epoch": 1.0131424087877599, + "grad_norm": 12.854864120483398, + "learning_rate": 8.539910143294265e-06, + "loss": 4.7781, + "step": 5165 + }, + { + "epoch": 1.0141231855629658, + "grad_norm": 12.894524574279785, + "learning_rate": 8.537175630878256e-06, + "loss": 4.6318, + "step": 5170 + }, + { + "epoch": 1.015103962338172, + "grad_norm": 11.00051498413086, + "learning_rate": 8.534438998897686e-06, + "loss": 4.6955, + "step": 5175 + }, + { + "epoch": 1.0160847391133778, + "grad_norm": 12.633216857910156, + "learning_rate": 8.531700248992414e-06, + "loss": 4.6592, + "step": 5180 + }, + { + "epoch": 1.0170655158885837, + "grad_norm": 17.211999893188477, + "learning_rate": 8.52895938280357e-06, + "loss": 4.8464, + "step": 5185 + }, + { + "epoch": 1.0180462926637897, + "grad_norm": 19.490310668945312, + "learning_rate": 8.526216401973546e-06, + "loss": 4.5088, + "step": 5190 + }, + { + "epoch": 1.0190270694389958, + "grad_norm": 26.104711532592773, + "learning_rate": 8.523471308146007e-06, + "loss": 4.7509, + "step": 5195 + }, + { + "epoch": 1.0200078462142017, + "grad_norm": 19.656057357788086, + "learning_rate": 8.520724102965883e-06, + "loss": 5.0214, + "step": 5200 + }, + { + "epoch": 1.0209886229894076, + "grad_norm": 13.035916328430176, + "learning_rate": 8.517974788079369e-06, + "loss": 4.5281, + "step": 5205 + }, + { + "epoch": 1.0219693997646135, + "grad_norm": 18.815303802490234, + "learning_rate": 8.51522336513392e-06, + "loss": 4.6767, + "step": 5210 + }, + { + "epoch": 1.0229501765398195, + "grad_norm": 16.87318229675293, + "learning_rate": 8.512469835778262e-06, + "loss": 4.8249, + "step": 5215 + }, + { + "epoch": 1.0239309533150256, + "grad_norm": 26.774776458740234, + "learning_rate": 8.509714201662377e-06, + "loss": 4.8232, + "step": 5220 + }, + { + "epoch": 1.0249117300902315, + "grad_norm": 17.386625289916992, + "learning_rate": 8.506956464437509e-06, + "loss": 5.3507, + "step": 5225 + }, + { + "epoch": 1.0258925068654374, + "grad_norm": 19.190387725830078, + "learning_rate": 8.504196625756166e-06, + "loss": 4.5408, + "step": 5230 + }, + { + "epoch": 1.0268732836406433, + "grad_norm": 22.238534927368164, + "learning_rate": 8.50143468727211e-06, + "loss": 4.6218, + "step": 5235 + }, + { + "epoch": 1.0278540604158493, + "grad_norm": 19.188385009765625, + "learning_rate": 8.498670650640368e-06, + "loss": 4.8509, + "step": 5240 + }, + { + "epoch": 1.0288348371910554, + "grad_norm": 17.50986099243164, + "learning_rate": 8.495904517517217e-06, + "loss": 4.919, + "step": 5245 + }, + { + "epoch": 1.0298156139662613, + "grad_norm": 24.868322372436523, + "learning_rate": 8.493136289560194e-06, + "loss": 4.8401, + "step": 5250 + }, + { + "epoch": 1.0307963907414672, + "grad_norm": 19.065523147583008, + "learning_rate": 8.49036596842809e-06, + "loss": 4.729, + "step": 5255 + }, + { + "epoch": 1.0317771675166731, + "grad_norm": 17.544790267944336, + "learning_rate": 8.487593555780954e-06, + "loss": 4.4417, + "step": 5260 + }, + { + "epoch": 1.032757944291879, + "grad_norm": 13.028230667114258, + "learning_rate": 8.484819053280082e-06, + "loss": 4.7517, + "step": 5265 + }, + { + "epoch": 1.0337387210670852, + "grad_norm": 12.053363800048828, + "learning_rate": 8.482042462588028e-06, + "loss": 4.6513, + "step": 5270 + }, + { + "epoch": 1.0347194978422911, + "grad_norm": 16.837976455688477, + "learning_rate": 8.479263785368594e-06, + "loss": 4.9133, + "step": 5275 + }, + { + "epoch": 1.035700274617497, + "grad_norm": 23.320768356323242, + "learning_rate": 8.476483023286832e-06, + "loss": 4.7035, + "step": 5280 + }, + { + "epoch": 1.036681051392703, + "grad_norm": 14.706212043762207, + "learning_rate": 8.473700178009047e-06, + "loss": 4.7084, + "step": 5285 + }, + { + "epoch": 1.037661828167909, + "grad_norm": 16.379053115844727, + "learning_rate": 8.470915251202789e-06, + "loss": 4.8806, + "step": 5290 + }, + { + "epoch": 1.038642604943115, + "grad_norm": 16.73505210876465, + "learning_rate": 8.468128244536854e-06, + "loss": 4.5875, + "step": 5295 + }, + { + "epoch": 1.039623381718321, + "grad_norm": 13.927240371704102, + "learning_rate": 8.465339159681291e-06, + "loss": 4.5403, + "step": 5300 + }, + { + "epoch": 1.0406041584935268, + "grad_norm": 18.347909927368164, + "learning_rate": 8.462547998307386e-06, + "loss": 4.9035, + "step": 5305 + }, + { + "epoch": 1.0415849352687327, + "grad_norm": 35.21794891357422, + "learning_rate": 8.459754762087675e-06, + "loss": 5.0395, + "step": 5310 + }, + { + "epoch": 1.0425657120439389, + "grad_norm": 9.092461585998535, + "learning_rate": 8.456959452695934e-06, + "loss": 4.8834, + "step": 5315 + }, + { + "epoch": 1.0435464888191448, + "grad_norm": 17.34891700744629, + "learning_rate": 8.454162071807181e-06, + "loss": 4.6001, + "step": 5320 + }, + { + "epoch": 1.0445272655943507, + "grad_norm": 19.068683624267578, + "learning_rate": 8.45136262109768e-06, + "loss": 5.0256, + "step": 5325 + }, + { + "epoch": 1.0455080423695566, + "grad_norm": 19.529052734375, + "learning_rate": 8.448561102244934e-06, + "loss": 4.6542, + "step": 5330 + }, + { + "epoch": 1.0464888191447625, + "grad_norm": 28.510236740112305, + "learning_rate": 8.445757516927679e-06, + "loss": 4.6586, + "step": 5335 + }, + { + "epoch": 1.0474695959199687, + "grad_norm": 10.34117603302002, + "learning_rate": 8.442951866825898e-06, + "loss": 4.8925, + "step": 5340 + }, + { + "epoch": 1.0484503726951746, + "grad_norm": 20.325603485107422, + "learning_rate": 8.4401441536208e-06, + "loss": 4.927, + "step": 5345 + }, + { + "epoch": 1.0494311494703805, + "grad_norm": 15.056465148925781, + "learning_rate": 8.437334378994846e-06, + "loss": 4.807, + "step": 5350 + }, + { + "epoch": 1.0504119262455864, + "grad_norm": 23.482568740844727, + "learning_rate": 8.434522544631718e-06, + "loss": 4.4933, + "step": 5355 + }, + { + "epoch": 1.0513927030207926, + "grad_norm": 29.62999725341797, + "learning_rate": 8.43170865221634e-06, + "loss": 5.0732, + "step": 5360 + }, + { + "epoch": 1.0523734797959985, + "grad_norm": 14.750212669372559, + "learning_rate": 8.428892703434867e-06, + "loss": 4.5473, + "step": 5365 + }, + { + "epoch": 1.0533542565712044, + "grad_norm": 16.44927978515625, + "learning_rate": 8.426074699974686e-06, + "loss": 4.8326, + "step": 5370 + }, + { + "epoch": 1.0543350333464103, + "grad_norm": 33.85694122314453, + "learning_rate": 8.423254643524415e-06, + "loss": 4.9815, + "step": 5375 + }, + { + "epoch": 1.0553158101216162, + "grad_norm": 18.947261810302734, + "learning_rate": 8.420432535773902e-06, + "loss": 4.6267, + "step": 5380 + }, + { + "epoch": 1.0562965868968224, + "grad_norm": 21.307687759399414, + "learning_rate": 8.417608378414228e-06, + "loss": 4.6284, + "step": 5385 + }, + { + "epoch": 1.0572773636720283, + "grad_norm": 14.791841506958008, + "learning_rate": 8.414782173137697e-06, + "loss": 4.7716, + "step": 5390 + }, + { + "epoch": 1.0582581404472342, + "grad_norm": 10.958831787109375, + "learning_rate": 8.41195392163784e-06, + "loss": 4.7179, + "step": 5395 + }, + { + "epoch": 1.0592389172224401, + "grad_norm": 17.133495330810547, + "learning_rate": 8.409123625609421e-06, + "loss": 4.7059, + "step": 5400 + }, + { + "epoch": 1.060219693997646, + "grad_norm": 18.041934967041016, + "learning_rate": 8.406291286748423e-06, + "loss": 4.7454, + "step": 5405 + }, + { + "epoch": 1.0612004707728522, + "grad_norm": 20.192089080810547, + "learning_rate": 8.403456906752053e-06, + "loss": 4.7348, + "step": 5410 + }, + { + "epoch": 1.062181247548058, + "grad_norm": 13.601256370544434, + "learning_rate": 8.400620487318743e-06, + "loss": 4.763, + "step": 5415 + }, + { + "epoch": 1.063162024323264, + "grad_norm": 23.853984832763672, + "learning_rate": 8.397782030148147e-06, + "loss": 4.5687, + "step": 5420 + }, + { + "epoch": 1.06414280109847, + "grad_norm": 14.830785751342773, + "learning_rate": 8.394941536941141e-06, + "loss": 4.9468, + "step": 5425 + }, + { + "epoch": 1.0651235778736758, + "grad_norm": 22.439790725708008, + "learning_rate": 8.39209900939982e-06, + "loss": 4.716, + "step": 5430 + }, + { + "epoch": 1.066104354648882, + "grad_norm": 31.342418670654297, + "learning_rate": 8.389254449227498e-06, + "loss": 4.9564, + "step": 5435 + }, + { + "epoch": 1.067085131424088, + "grad_norm": 15.614798545837402, + "learning_rate": 8.386407858128707e-06, + "loss": 4.9951, + "step": 5440 + }, + { + "epoch": 1.0680659081992938, + "grad_norm": 23.963844299316406, + "learning_rate": 8.383559237809194e-06, + "loss": 4.5133, + "step": 5445 + }, + { + "epoch": 1.0690466849744997, + "grad_norm": 26.141693115234375, + "learning_rate": 8.380708589975923e-06, + "loss": 4.7709, + "step": 5450 + }, + { + "epoch": 1.0700274617497059, + "grad_norm": 16.767637252807617, + "learning_rate": 8.377855916337078e-06, + "loss": 4.636, + "step": 5455 + }, + { + "epoch": 1.0710082385249118, + "grad_norm": 19.09316635131836, + "learning_rate": 8.375001218602053e-06, + "loss": 4.671, + "step": 5460 + }, + { + "epoch": 1.0719890153001177, + "grad_norm": 14.91485595703125, + "learning_rate": 8.372144498481449e-06, + "loss": 4.9976, + "step": 5465 + }, + { + "epoch": 1.0729697920753236, + "grad_norm": 26.471715927124023, + "learning_rate": 8.36928575768709e-06, + "loss": 4.8231, + "step": 5470 + }, + { + "epoch": 1.0739505688505295, + "grad_norm": 15.676648139953613, + "learning_rate": 8.366424997932003e-06, + "loss": 4.6148, + "step": 5475 + }, + { + "epoch": 1.0749313456257357, + "grad_norm": 20.513164520263672, + "learning_rate": 8.363562220930426e-06, + "loss": 5.0241, + "step": 5480 + }, + { + "epoch": 1.0759121224009416, + "grad_norm": 11.651981353759766, + "learning_rate": 8.36069742839781e-06, + "loss": 4.6736, + "step": 5485 + }, + { + "epoch": 1.0768928991761475, + "grad_norm": 20.40721321105957, + "learning_rate": 8.357830622050809e-06, + "loss": 4.7698, + "step": 5490 + }, + { + "epoch": 1.0778736759513534, + "grad_norm": 17.44342613220215, + "learning_rate": 8.354961803607285e-06, + "loss": 4.7641, + "step": 5495 + }, + { + "epoch": 1.0788544527265593, + "grad_norm": 18.868745803833008, + "learning_rate": 8.352090974786305e-06, + "loss": 4.8438, + "step": 5500 + }, + { + "epoch": 1.0798352295017655, + "grad_norm": 11.686888694763184, + "learning_rate": 8.349218137308146e-06, + "loss": 4.4356, + "step": 5505 + }, + { + "epoch": 1.0808160062769714, + "grad_norm": 13.204122543334961, + "learning_rate": 8.34634329289428e-06, + "loss": 4.652, + "step": 5510 + }, + { + "epoch": 1.0817967830521773, + "grad_norm": 27.09731674194336, + "learning_rate": 8.34346644326739e-06, + "loss": 4.6513, + "step": 5515 + }, + { + "epoch": 1.0827775598273832, + "grad_norm": 16.423181533813477, + "learning_rate": 8.340587590151355e-06, + "loss": 4.6354, + "step": 5520 + }, + { + "epoch": 1.0837583366025894, + "grad_norm": 16.599773406982422, + "learning_rate": 8.337706735271252e-06, + "loss": 5.013, + "step": 5525 + }, + { + "epoch": 1.0847391133777953, + "grad_norm": 18.727693557739258, + "learning_rate": 8.334823880353368e-06, + "loss": 4.6582, + "step": 5530 + }, + { + "epoch": 1.0857198901530012, + "grad_norm": 22.449386596679688, + "learning_rate": 8.33193902712518e-06, + "loss": 4.8486, + "step": 5535 + }, + { + "epoch": 1.086700666928207, + "grad_norm": 10.492510795593262, + "learning_rate": 8.329052177315365e-06, + "loss": 4.6019, + "step": 5540 + }, + { + "epoch": 1.087681443703413, + "grad_norm": 23.221635818481445, + "learning_rate": 8.326163332653791e-06, + "loss": 4.8868, + "step": 5545 + }, + { + "epoch": 1.0886622204786192, + "grad_norm": 15.17660140991211, + "learning_rate": 8.323272494871534e-06, + "loss": 4.8388, + "step": 5550 + }, + { + "epoch": 1.089642997253825, + "grad_norm": 14.142809867858887, + "learning_rate": 8.320379665700852e-06, + "loss": 4.7848, + "step": 5555 + }, + { + "epoch": 1.090623774029031, + "grad_norm": 21.153562545776367, + "learning_rate": 8.317484846875202e-06, + "loss": 4.7567, + "step": 5560 + }, + { + "epoch": 1.091604550804237, + "grad_norm": 20.27756118774414, + "learning_rate": 8.314588040129232e-06, + "loss": 4.6365, + "step": 5565 + }, + { + "epoch": 1.0925853275794428, + "grad_norm": 17.335216522216797, + "learning_rate": 8.311689247198783e-06, + "loss": 4.8954, + "step": 5570 + }, + { + "epoch": 1.093566104354649, + "grad_norm": 12.399637222290039, + "learning_rate": 8.308788469820881e-06, + "loss": 4.8607, + "step": 5575 + }, + { + "epoch": 1.0945468811298549, + "grad_norm": 20.954959869384766, + "learning_rate": 8.30588570973375e-06, + "loss": 4.5106, + "step": 5580 + }, + { + "epoch": 1.0955276579050608, + "grad_norm": 32.353904724121094, + "learning_rate": 8.302980968676792e-06, + "loss": 4.7893, + "step": 5585 + }, + { + "epoch": 1.0965084346802667, + "grad_norm": 16.396020889282227, + "learning_rate": 8.300074248390603e-06, + "loss": 4.4562, + "step": 5590 + }, + { + "epoch": 1.0974892114554726, + "grad_norm": 20.024126052856445, + "learning_rate": 8.297165550616964e-06, + "loss": 4.486, + "step": 5595 + }, + { + "epoch": 1.0984699882306788, + "grad_norm": 26.761669158935547, + "learning_rate": 8.294254877098834e-06, + "loss": 4.5415, + "step": 5600 + }, + { + "epoch": 1.0994507650058847, + "grad_norm": 14.93077564239502, + "learning_rate": 8.29134222958037e-06, + "loss": 4.7062, + "step": 5605 + }, + { + "epoch": 1.1004315417810906, + "grad_norm": 25.9860782623291, + "learning_rate": 8.288427609806899e-06, + "loss": 4.9213, + "step": 5610 + }, + { + "epoch": 1.1014123185562965, + "grad_norm": 15.712201118469238, + "learning_rate": 8.285511019524937e-06, + "loss": 4.5184, + "step": 5615 + }, + { + "epoch": 1.1023930953315026, + "grad_norm": 29.031225204467773, + "learning_rate": 8.282592460482175e-06, + "loss": 4.6388, + "step": 5620 + }, + { + "epoch": 1.1033738721067086, + "grad_norm": 13.145207405090332, + "learning_rate": 8.279671934427486e-06, + "loss": 4.7952, + "step": 5625 + }, + { + "epoch": 1.1043546488819145, + "grad_norm": 15.207327842712402, + "learning_rate": 8.276749443110928e-06, + "loss": 4.9209, + "step": 5630 + }, + { + "epoch": 1.1053354256571204, + "grad_norm": 14.627603530883789, + "learning_rate": 8.273824988283727e-06, + "loss": 4.5341, + "step": 5635 + }, + { + "epoch": 1.1063162024323263, + "grad_norm": 15.305438995361328, + "learning_rate": 8.270898571698291e-06, + "loss": 4.454, + "step": 5640 + }, + { + "epoch": 1.1072969792075325, + "grad_norm": 30.631746292114258, + "learning_rate": 8.267970195108204e-06, + "loss": 4.8473, + "step": 5645 + }, + { + "epoch": 1.1082777559827384, + "grad_norm": 11.449334144592285, + "learning_rate": 8.26503986026822e-06, + "loss": 4.5177, + "step": 5650 + }, + { + "epoch": 1.1092585327579443, + "grad_norm": 22.79661750793457, + "learning_rate": 8.262107568934271e-06, + "loss": 4.788, + "step": 5655 + }, + { + "epoch": 1.1102393095331502, + "grad_norm": 24.50604820251465, + "learning_rate": 8.25917332286346e-06, + "loss": 4.8102, + "step": 5660 + }, + { + "epoch": 1.1112200863083561, + "grad_norm": 22.736764907836914, + "learning_rate": 8.256237123814059e-06, + "loss": 4.8124, + "step": 5665 + }, + { + "epoch": 1.1122008630835623, + "grad_norm": 10.96435260772705, + "learning_rate": 8.253298973545516e-06, + "loss": 4.3873, + "step": 5670 + }, + { + "epoch": 1.1131816398587682, + "grad_norm": 26.919565200805664, + "learning_rate": 8.25035887381844e-06, + "loss": 4.8894, + "step": 5675 + }, + { + "epoch": 1.114162416633974, + "grad_norm": 19.470022201538086, + "learning_rate": 8.247416826394616e-06, + "loss": 4.7621, + "step": 5680 + }, + { + "epoch": 1.11514319340918, + "grad_norm": 29.276655197143555, + "learning_rate": 8.24447283303699e-06, + "loss": 4.505, + "step": 5685 + }, + { + "epoch": 1.1161239701843861, + "grad_norm": 20.404621124267578, + "learning_rate": 8.241526895509681e-06, + "loss": 4.8768, + "step": 5690 + }, + { + "epoch": 1.117104746959592, + "grad_norm": 21.071603775024414, + "learning_rate": 8.238579015577966e-06, + "loss": 4.6903, + "step": 5695 + }, + { + "epoch": 1.118085523734798, + "grad_norm": 24.47282600402832, + "learning_rate": 8.235629195008286e-06, + "loss": 4.7896, + "step": 5700 + }, + { + "epoch": 1.1190663005100039, + "grad_norm": 15.374588966369629, + "learning_rate": 8.232677435568252e-06, + "loss": 4.9747, + "step": 5705 + }, + { + "epoch": 1.1200470772852098, + "grad_norm": 23.620338439941406, + "learning_rate": 8.229723739026634e-06, + "loss": 4.8028, + "step": 5710 + }, + { + "epoch": 1.121027854060416, + "grad_norm": 13.628473281860352, + "learning_rate": 8.226768107153356e-06, + "loss": 4.6099, + "step": 5715 + }, + { + "epoch": 1.1220086308356219, + "grad_norm": 30.455013275146484, + "learning_rate": 8.22381054171951e-06, + "loss": 4.7731, + "step": 5720 + }, + { + "epoch": 1.1229894076108278, + "grad_norm": 16.254222869873047, + "learning_rate": 8.220851044497342e-06, + "loss": 4.8022, + "step": 5725 + }, + { + "epoch": 1.1239701843860337, + "grad_norm": 15.20760440826416, + "learning_rate": 8.217889617260257e-06, + "loss": 4.5828, + "step": 5730 + }, + { + "epoch": 1.1249509611612396, + "grad_norm": 45.388980865478516, + "learning_rate": 8.214926261782818e-06, + "loss": 4.8385, + "step": 5735 + }, + { + "epoch": 1.1259317379364457, + "grad_norm": 29.33622932434082, + "learning_rate": 8.211960979840743e-06, + "loss": 4.2081, + "step": 5740 + }, + { + "epoch": 1.1269125147116517, + "grad_norm": 25.06155014038086, + "learning_rate": 8.208993773210903e-06, + "loss": 4.7609, + "step": 5745 + }, + { + "epoch": 1.1278932914868576, + "grad_norm": 29.778722763061523, + "learning_rate": 8.20602464367132e-06, + "loss": 4.4514, + "step": 5750 + }, + { + "epoch": 1.1288740682620635, + "grad_norm": 21.625354766845703, + "learning_rate": 8.203053593001174e-06, + "loss": 4.6612, + "step": 5755 + }, + { + "epoch": 1.1298548450372694, + "grad_norm": 12.36301040649414, + "learning_rate": 8.200080622980793e-06, + "loss": 4.8831, + "step": 5760 + }, + { + "epoch": 1.1308356218124755, + "grad_norm": 13.567903518676758, + "learning_rate": 8.197105735391655e-06, + "loss": 4.7941, + "step": 5765 + }, + { + "epoch": 1.1318163985876815, + "grad_norm": 33.889305114746094, + "learning_rate": 8.194128932016385e-06, + "loss": 4.9926, + "step": 5770 + }, + { + "epoch": 1.1327971753628874, + "grad_norm": 17.675067901611328, + "learning_rate": 8.19115021463876e-06, + "loss": 4.6703, + "step": 5775 + }, + { + "epoch": 1.1337779521380933, + "grad_norm": 42.091495513916016, + "learning_rate": 8.188169585043706e-06, + "loss": 4.5128, + "step": 5780 + }, + { + "epoch": 1.1347587289132992, + "grad_norm": 37.27928161621094, + "learning_rate": 8.185187045017289e-06, + "loss": 4.835, + "step": 5785 + }, + { + "epoch": 1.1357395056885053, + "grad_norm": 19.49931526184082, + "learning_rate": 8.182202596346718e-06, + "loss": 4.6441, + "step": 5790 + }, + { + "epoch": 1.1367202824637113, + "grad_norm": 21.166391372680664, + "learning_rate": 8.179216240820354e-06, + "loss": 4.9681, + "step": 5795 + }, + { + "epoch": 1.1377010592389172, + "grad_norm": 15.765752792358398, + "learning_rate": 8.176227980227693e-06, + "loss": 4.809, + "step": 5800 + }, + { + "epoch": 1.138681836014123, + "grad_norm": 19.515087127685547, + "learning_rate": 8.17323781635938e-06, + "loss": 4.8872, + "step": 5805 + }, + { + "epoch": 1.1396626127893292, + "grad_norm": 28.068483352661133, + "learning_rate": 8.170245751007194e-06, + "loss": 4.4617, + "step": 5810 + }, + { + "epoch": 1.1406433895645351, + "grad_norm": 13.95158863067627, + "learning_rate": 8.167251785964055e-06, + "loss": 4.5233, + "step": 5815 + }, + { + "epoch": 1.141624166339741, + "grad_norm": 13.904541015625, + "learning_rate": 8.164255923024025e-06, + "loss": 4.6834, + "step": 5820 + }, + { + "epoch": 1.142604943114947, + "grad_norm": 25.7268123626709, + "learning_rate": 8.161258163982298e-06, + "loss": 4.7352, + "step": 5825 + }, + { + "epoch": 1.1435857198901531, + "grad_norm": 21.95464515686035, + "learning_rate": 8.158258510635206e-06, + "loss": 4.6658, + "step": 5830 + }, + { + "epoch": 1.144566496665359, + "grad_norm": 11.288395881652832, + "learning_rate": 8.155256964780218e-06, + "loss": 4.7413, + "step": 5835 + }, + { + "epoch": 1.145547273440565, + "grad_norm": 24.911361694335938, + "learning_rate": 8.152253528215937e-06, + "loss": 4.67, + "step": 5840 + }, + { + "epoch": 1.1465280502157709, + "grad_norm": 10.945342063903809, + "learning_rate": 8.149248202742096e-06, + "loss": 4.8911, + "step": 5845 + }, + { + "epoch": 1.1475088269909768, + "grad_norm": 18.10870361328125, + "learning_rate": 8.146240990159558e-06, + "loss": 4.9292, + "step": 5850 + }, + { + "epoch": 1.148489603766183, + "grad_norm": 23.122905731201172, + "learning_rate": 8.143231892270327e-06, + "loss": 5.0509, + "step": 5855 + }, + { + "epoch": 1.1494703805413888, + "grad_norm": 14.539555549621582, + "learning_rate": 8.140220910877529e-06, + "loss": 4.8416, + "step": 5860 + }, + { + "epoch": 1.1504511573165948, + "grad_norm": 18.56782341003418, + "learning_rate": 8.137208047785417e-06, + "loss": 4.8764, + "step": 5865 + }, + { + "epoch": 1.1514319340918007, + "grad_norm": 22.753496170043945, + "learning_rate": 8.134193304799373e-06, + "loss": 4.921, + "step": 5870 + }, + { + "epoch": 1.1524127108670066, + "grad_norm": 15.556824684143066, + "learning_rate": 8.131176683725912e-06, + "loss": 5.0195, + "step": 5875 + }, + { + "epoch": 1.1533934876422127, + "grad_norm": 16.221290588378906, + "learning_rate": 8.128158186372666e-06, + "loss": 4.6949, + "step": 5880 + }, + { + "epoch": 1.1543742644174186, + "grad_norm": 24.236032485961914, + "learning_rate": 8.125137814548394e-06, + "loss": 5.0952, + "step": 5885 + }, + { + "epoch": 1.1553550411926246, + "grad_norm": 21.219877243041992, + "learning_rate": 8.122115570062978e-06, + "loss": 4.8158, + "step": 5890 + }, + { + "epoch": 1.1563358179678305, + "grad_norm": 23.525745391845703, + "learning_rate": 8.119091454727427e-06, + "loss": 4.5122, + "step": 5895 + }, + { + "epoch": 1.1573165947430364, + "grad_norm": 17.191791534423828, + "learning_rate": 8.116065470353863e-06, + "loss": 4.5277, + "step": 5900 + }, + { + "epoch": 1.1582973715182425, + "grad_norm": 14.015380859375, + "learning_rate": 8.113037618755533e-06, + "loss": 4.5689, + "step": 5905 + }, + { + "epoch": 1.1592781482934484, + "grad_norm": 24.197856903076172, + "learning_rate": 8.110007901746804e-06, + "loss": 4.7434, + "step": 5910 + }, + { + "epoch": 1.1602589250686544, + "grad_norm": 11.606034278869629, + "learning_rate": 8.106976321143155e-06, + "loss": 4.9903, + "step": 5915 + }, + { + "epoch": 1.1612397018438603, + "grad_norm": 10.518669128417969, + "learning_rate": 8.103942878761189e-06, + "loss": 4.8785, + "step": 5920 + }, + { + "epoch": 1.1622204786190662, + "grad_norm": 27.280576705932617, + "learning_rate": 8.100907576418616e-06, + "loss": 4.7905, + "step": 5925 + }, + { + "epoch": 1.1632012553942723, + "grad_norm": 111.20673370361328, + "learning_rate": 8.097870415934269e-06, + "loss": 4.5365, + "step": 5930 + }, + { + "epoch": 1.1641820321694782, + "grad_norm": 17.625286102294922, + "learning_rate": 8.094831399128092e-06, + "loss": 4.7943, + "step": 5935 + }, + { + "epoch": 1.1651628089446842, + "grad_norm": 11.342884063720703, + "learning_rate": 8.091790527821138e-06, + "loss": 4.6058, + "step": 5940 + }, + { + "epoch": 1.16614358571989, + "grad_norm": 20.569257736206055, + "learning_rate": 8.088747803835573e-06, + "loss": 4.6913, + "step": 5945 + }, + { + "epoch": 1.1671243624950962, + "grad_norm": 13.408282279968262, + "learning_rate": 8.085703228994674e-06, + "loss": 4.9914, + "step": 5950 + }, + { + "epoch": 1.1681051392703021, + "grad_norm": 28.350143432617188, + "learning_rate": 8.082656805122829e-06, + "loss": 4.5938, + "step": 5955 + }, + { + "epoch": 1.169085916045508, + "grad_norm": 25.818490982055664, + "learning_rate": 8.07960853404553e-06, + "loss": 4.4677, + "step": 5960 + }, + { + "epoch": 1.170066692820714, + "grad_norm": 16.237733840942383, + "learning_rate": 8.07655841758938e-06, + "loss": 4.5051, + "step": 5965 + }, + { + "epoch": 1.1710474695959199, + "grad_norm": 24.858333587646484, + "learning_rate": 8.073506457582082e-06, + "loss": 4.574, + "step": 5970 + }, + { + "epoch": 1.172028246371126, + "grad_norm": 10.83703327178955, + "learning_rate": 8.070452655852445e-06, + "loss": 5.1138, + "step": 5975 + }, + { + "epoch": 1.173009023146332, + "grad_norm": 22.579744338989258, + "learning_rate": 8.067397014230391e-06, + "loss": 4.759, + "step": 5980 + }, + { + "epoch": 1.1739897999215378, + "grad_norm": 17.268518447875977, + "learning_rate": 8.064339534546935e-06, + "loss": 4.7689, + "step": 5985 + }, + { + "epoch": 1.1749705766967438, + "grad_norm": 34.30562973022461, + "learning_rate": 8.061280218634192e-06, + "loss": 4.7157, + "step": 5990 + }, + { + "epoch": 1.17595135347195, + "grad_norm": 16.77152442932129, + "learning_rate": 8.058219068325383e-06, + "loss": 4.5394, + "step": 5995 + }, + { + "epoch": 1.1769321302471558, + "grad_norm": 10.069709777832031, + "learning_rate": 8.055156085454828e-06, + "loss": 4.8192, + "step": 6000 + }, + { + "epoch": 1.1779129070223617, + "grad_norm": 36.60639190673828, + "learning_rate": 8.052091271857942e-06, + "loss": 4.5211, + "step": 6005 + }, + { + "epoch": 1.1788936837975676, + "grad_norm": 28.5606689453125, + "learning_rate": 8.049024629371237e-06, + "loss": 4.4561, + "step": 6010 + }, + { + "epoch": 1.1798744605727736, + "grad_norm": 18.83449935913086, + "learning_rate": 8.045956159832324e-06, + "loss": 4.89, + "step": 6015 + }, + { + "epoch": 1.1808552373479797, + "grad_norm": 14.824164390563965, + "learning_rate": 8.042885865079909e-06, + "loss": 4.8188, + "step": 6020 + }, + { + "epoch": 1.1818360141231856, + "grad_norm": 17.74277114868164, + "learning_rate": 8.039813746953785e-06, + "loss": 5.0979, + "step": 6025 + }, + { + "epoch": 1.1828167908983915, + "grad_norm": 19.057353973388672, + "learning_rate": 8.036739807294844e-06, + "loss": 4.915, + "step": 6030 + }, + { + "epoch": 1.1837975676735975, + "grad_norm": 24.1026554107666, + "learning_rate": 8.03366404794507e-06, + "loss": 4.9094, + "step": 6035 + }, + { + "epoch": 1.1847783444488034, + "grad_norm": 17.447006225585938, + "learning_rate": 8.030586470747535e-06, + "loss": 4.7573, + "step": 6040 + }, + { + "epoch": 1.1857591212240095, + "grad_norm": 34.93657302856445, + "learning_rate": 8.027507077546398e-06, + "loss": 4.5797, + "step": 6045 + }, + { + "epoch": 1.1867398979992154, + "grad_norm": 19.555604934692383, + "learning_rate": 8.024425870186912e-06, + "loss": 4.5836, + "step": 6050 + }, + { + "epoch": 1.1877206747744213, + "grad_norm": 26.290122985839844, + "learning_rate": 8.02134285051541e-06, + "loss": 4.7241, + "step": 6055 + }, + { + "epoch": 1.1887014515496273, + "grad_norm": 39.10771942138672, + "learning_rate": 8.018258020379319e-06, + "loss": 4.8729, + "step": 6060 + }, + { + "epoch": 1.1896822283248332, + "grad_norm": 12.844032287597656, + "learning_rate": 8.015171381627145e-06, + "loss": 4.5366, + "step": 6065 + }, + { + "epoch": 1.1906630051000393, + "grad_norm": 17.043214797973633, + "learning_rate": 8.01208293610848e-06, + "loss": 4.7391, + "step": 6070 + }, + { + "epoch": 1.1916437818752452, + "grad_norm": 14.163534164428711, + "learning_rate": 8.008992685673998e-06, + "loss": 5.0134, + "step": 6075 + }, + { + "epoch": 1.1926245586504511, + "grad_norm": 18.311864852905273, + "learning_rate": 8.005900632175453e-06, + "loss": 4.5244, + "step": 6080 + }, + { + "epoch": 1.193605335425657, + "grad_norm": 15.40824031829834, + "learning_rate": 8.002806777465685e-06, + "loss": 5.0018, + "step": 6085 + }, + { + "epoch": 1.194586112200863, + "grad_norm": 16.03959846496582, + "learning_rate": 7.999711123398607e-06, + "loss": 4.8075, + "step": 6090 + }, + { + "epoch": 1.195566888976069, + "grad_norm": 12.000391960144043, + "learning_rate": 7.996613671829211e-06, + "loss": 4.6103, + "step": 6095 + }, + { + "epoch": 1.196547665751275, + "grad_norm": 17.460683822631836, + "learning_rate": 7.993514424613572e-06, + "loss": 4.8899, + "step": 6100 + }, + { + "epoch": 1.197528442526481, + "grad_norm": 31.026866912841797, + "learning_rate": 7.990413383608833e-06, + "loss": 4.7524, + "step": 6105 + }, + { + "epoch": 1.1985092193016869, + "grad_norm": 18.090789794921875, + "learning_rate": 7.98731055067322e-06, + "loss": 4.6904, + "step": 6110 + }, + { + "epoch": 1.199489996076893, + "grad_norm": 14.723376274108887, + "learning_rate": 7.984205927666023e-06, + "loss": 4.5939, + "step": 6115 + }, + { + "epoch": 1.200470772852099, + "grad_norm": 25.426658630371094, + "learning_rate": 7.981099516447614e-06, + "loss": 4.6615, + "step": 6120 + }, + { + "epoch": 1.2014515496273048, + "grad_norm": 13.51884651184082, + "learning_rate": 7.977991318879432e-06, + "loss": 4.9369, + "step": 6125 + }, + { + "epoch": 1.2024323264025107, + "grad_norm": 28.351192474365234, + "learning_rate": 7.974881336823988e-06, + "loss": 4.7937, + "step": 6130 + }, + { + "epoch": 1.2034131031777167, + "grad_norm": 39.130531311035156, + "learning_rate": 7.971769572144858e-06, + "loss": 4.8798, + "step": 6135 + }, + { + "epoch": 1.2043938799529228, + "grad_norm": 16.273771286010742, + "learning_rate": 7.968656026706693e-06, + "loss": 5.1519, + "step": 6140 + }, + { + "epoch": 1.2053746567281287, + "grad_norm": 20.515405654907227, + "learning_rate": 7.965540702375207e-06, + "loss": 4.9034, + "step": 6145 + }, + { + "epoch": 1.2063554335033346, + "grad_norm": 21.847187042236328, + "learning_rate": 7.962423601017183e-06, + "loss": 4.5778, + "step": 6150 + }, + { + "epoch": 1.2073362102785405, + "grad_norm": 24.054405212402344, + "learning_rate": 7.95930472450046e-06, + "loss": 4.8069, + "step": 6155 + }, + { + "epoch": 1.2083169870537467, + "grad_norm": 16.962587356567383, + "learning_rate": 7.956184074693952e-06, + "loss": 4.6251, + "step": 6160 + }, + { + "epoch": 1.2092977638289526, + "grad_norm": 37.481475830078125, + "learning_rate": 7.953061653467631e-06, + "loss": 4.7728, + "step": 6165 + }, + { + "epoch": 1.2102785406041585, + "grad_norm": 24.814760208129883, + "learning_rate": 7.949937462692528e-06, + "loss": 4.7485, + "step": 6170 + }, + { + "epoch": 1.2112593173793644, + "grad_norm": 15.355949401855469, + "learning_rate": 7.946811504240736e-06, + "loss": 4.6448, + "step": 6175 + }, + { + "epoch": 1.2122400941545703, + "grad_norm": 30.65787696838379, + "learning_rate": 7.943683779985412e-06, + "loss": 4.6957, + "step": 6180 + }, + { + "epoch": 1.2132208709297765, + "grad_norm": 13.861238479614258, + "learning_rate": 7.940554291800766e-06, + "loss": 4.4497, + "step": 6185 + }, + { + "epoch": 1.2142016477049824, + "grad_norm": 52.959903717041016, + "learning_rate": 7.937423041562063e-06, + "loss": 4.8312, + "step": 6190 + }, + { + "epoch": 1.2151824244801883, + "grad_norm": 20.761072158813477, + "learning_rate": 7.934290031145629e-06, + "loss": 4.6906, + "step": 6195 + }, + { + "epoch": 1.2161632012553942, + "grad_norm": 32.41350173950195, + "learning_rate": 7.93115526242884e-06, + "loss": 4.9518, + "step": 6200 + }, + { + "epoch": 1.2171439780306001, + "grad_norm": 16.019487380981445, + "learning_rate": 7.928018737290132e-06, + "loss": 4.7748, + "step": 6205 + }, + { + "epoch": 1.2181247548058063, + "grad_norm": 12.364213943481445, + "learning_rate": 7.924880457608987e-06, + "loss": 4.5267, + "step": 6210 + }, + { + "epoch": 1.2191055315810122, + "grad_norm": 27.74942970275879, + "learning_rate": 7.921740425265944e-06, + "loss": 4.6387, + "step": 6215 + }, + { + "epoch": 1.2200863083562181, + "grad_norm": 14.577978134155273, + "learning_rate": 7.918598642142588e-06, + "loss": 4.7566, + "step": 6220 + }, + { + "epoch": 1.221067085131424, + "grad_norm": 18.41602325439453, + "learning_rate": 7.915455110121553e-06, + "loss": 4.613, + "step": 6225 + }, + { + "epoch": 1.22204786190663, + "grad_norm": 13.009770393371582, + "learning_rate": 7.912309831086522e-06, + "loss": 4.8313, + "step": 6230 + }, + { + "epoch": 1.223028638681836, + "grad_norm": 16.142860412597656, + "learning_rate": 7.909162806922229e-06, + "loss": 4.4975, + "step": 6235 + }, + { + "epoch": 1.224009415457042, + "grad_norm": 19.88920021057129, + "learning_rate": 7.906014039514446e-06, + "loss": 4.7498, + "step": 6240 + }, + { + "epoch": 1.224990192232248, + "grad_norm": 15.997335433959961, + "learning_rate": 7.902863530749995e-06, + "loss": 4.8573, + "step": 6245 + }, + { + "epoch": 1.2259709690074538, + "grad_norm": 25.197553634643555, + "learning_rate": 7.89971128251674e-06, + "loss": 4.8058, + "step": 6250 + }, + { + "epoch": 1.2269517457826598, + "grad_norm": 25.884624481201172, + "learning_rate": 7.896557296703589e-06, + "loss": 4.7384, + "step": 6255 + }, + { + "epoch": 1.227932522557866, + "grad_norm": 17.546234130859375, + "learning_rate": 7.893401575200488e-06, + "loss": 4.9061, + "step": 6260 + }, + { + "epoch": 1.2289132993330718, + "grad_norm": 15.71548843383789, + "learning_rate": 7.890244119898423e-06, + "loss": 4.5454, + "step": 6265 + }, + { + "epoch": 1.2298940761082777, + "grad_norm": 15.9916353225708, + "learning_rate": 7.887084932689424e-06, + "loss": 5.0506, + "step": 6270 + }, + { + "epoch": 1.2308748528834836, + "grad_norm": 34.314453125, + "learning_rate": 7.883924015466554e-06, + "loss": 4.604, + "step": 6275 + }, + { + "epoch": 1.2318556296586898, + "grad_norm": 15.04297924041748, + "learning_rate": 7.880761370123914e-06, + "loss": 4.7883, + "step": 6280 + }, + { + "epoch": 1.2328364064338957, + "grad_norm": 26.045204162597656, + "learning_rate": 7.87759699855664e-06, + "loss": 4.6663, + "step": 6285 + }, + { + "epoch": 1.2338171832091016, + "grad_norm": 19.145492553710938, + "learning_rate": 7.874430902660903e-06, + "loss": 4.6448, + "step": 6290 + }, + { + "epoch": 1.2347979599843075, + "grad_norm": 12.347979545593262, + "learning_rate": 7.87126308433391e-06, + "loss": 4.9435, + "step": 6295 + }, + { + "epoch": 1.2357787367595134, + "grad_norm": 16.063499450683594, + "learning_rate": 7.868093545473891e-06, + "loss": 4.6589, + "step": 6300 + }, + { + "epoch": 1.2367595135347196, + "grad_norm": 12.78153133392334, + "learning_rate": 7.864922287980121e-06, + "loss": 4.6432, + "step": 6305 + }, + { + "epoch": 1.2377402903099255, + "grad_norm": 15.366764068603516, + "learning_rate": 7.861749313752893e-06, + "loss": 4.5588, + "step": 6310 + }, + { + "epoch": 1.2387210670851314, + "grad_norm": 13.756194114685059, + "learning_rate": 7.858574624693533e-06, + "loss": 4.5369, + "step": 6315 + }, + { + "epoch": 1.2397018438603373, + "grad_norm": 25.25469207763672, + "learning_rate": 7.855398222704395e-06, + "loss": 5.0234, + "step": 6320 + }, + { + "epoch": 1.2406826206355435, + "grad_norm": 20.50889015197754, + "learning_rate": 7.85222010968886e-06, + "loss": 4.6325, + "step": 6325 + }, + { + "epoch": 1.2416633974107494, + "grad_norm": 24.124738693237305, + "learning_rate": 7.849040287551331e-06, + "loss": 4.8982, + "step": 6330 + }, + { + "epoch": 1.2426441741859553, + "grad_norm": 27.7025146484375, + "learning_rate": 7.84585875819724e-06, + "loss": 5.0759, + "step": 6335 + }, + { + "epoch": 1.2436249509611612, + "grad_norm": 11.230903625488281, + "learning_rate": 7.842675523533038e-06, + "loss": 4.9903, + "step": 6340 + }, + { + "epoch": 1.2446057277363671, + "grad_norm": 11.444461822509766, + "learning_rate": 7.839490585466198e-06, + "loss": 4.6177, + "step": 6345 + }, + { + "epoch": 1.2455865045115733, + "grad_norm": 19.044597625732422, + "learning_rate": 7.836303945905217e-06, + "loss": 4.7733, + "step": 6350 + }, + { + "epoch": 1.2465672812867792, + "grad_norm": 15.269647598266602, + "learning_rate": 7.833115606759608e-06, + "loss": 4.7614, + "step": 6355 + }, + { + "epoch": 1.247548058061985, + "grad_norm": 16.577899932861328, + "learning_rate": 7.829925569939908e-06, + "loss": 4.601, + "step": 6360 + }, + { + "epoch": 1.248528834837191, + "grad_norm": 19.186344146728516, + "learning_rate": 7.82673383735766e-06, + "loss": 4.5914, + "step": 6365 + }, + { + "epoch": 1.249509611612397, + "grad_norm": 18.438297271728516, + "learning_rate": 7.823540410925434e-06, + "loss": 4.7085, + "step": 6370 + }, + { + "epoch": 1.250490388387603, + "grad_norm": 15.657546043395996, + "learning_rate": 7.820345292556815e-06, + "loss": 4.605, + "step": 6375 + }, + { + "epoch": 1.250490388387603, + "eval_loss": 4.937655448913574, + "eval_runtime": 7.7325, + "eval_samples_per_second": 27.029, + "eval_steps_per_second": 13.579, + "step": 6375 + }, + { + "epoch": 1.251471165162809, + "grad_norm": 24.359493255615234, + "learning_rate": 7.817148484166392e-06, + "loss": 4.7048, + "step": 6380 + }, + { + "epoch": 1.252451941938015, + "grad_norm": 16.691558837890625, + "learning_rate": 7.813949987669777e-06, + "loss": 4.7663, + "step": 6385 + }, + { + "epoch": 1.2534327187132208, + "grad_norm": 26.948749542236328, + "learning_rate": 7.81074980498359e-06, + "loss": 4.7941, + "step": 6390 + }, + { + "epoch": 1.2544134954884267, + "grad_norm": 13.582273483276367, + "learning_rate": 7.807547938025458e-06, + "loss": 4.6875, + "step": 6395 + }, + { + "epoch": 1.2553942722636329, + "grad_norm": 14.236882209777832, + "learning_rate": 7.804344388714022e-06, + "loss": 4.5558, + "step": 6400 + }, + { + "epoch": 1.2563750490388388, + "grad_norm": 30.24952507019043, + "learning_rate": 7.801139158968928e-06, + "loss": 4.6246, + "step": 6405 + }, + { + "epoch": 1.2573558258140447, + "grad_norm": 24.6456298828125, + "learning_rate": 7.797932250710832e-06, + "loss": 4.6808, + "step": 6410 + }, + { + "epoch": 1.2583366025892506, + "grad_norm": 14.85630989074707, + "learning_rate": 7.794723665861392e-06, + "loss": 4.9929, + "step": 6415 + }, + { + "epoch": 1.2593173793644565, + "grad_norm": 20.140844345092773, + "learning_rate": 7.791513406343276e-06, + "loss": 4.516, + "step": 6420 + }, + { + "epoch": 1.2602981561396627, + "grad_norm": 16.46764373779297, + "learning_rate": 7.788301474080148e-06, + "loss": 4.7002, + "step": 6425 + }, + { + "epoch": 1.2612789329148686, + "grad_norm": 16.4792537689209, + "learning_rate": 7.785087870996682e-06, + "loss": 4.7079, + "step": 6430 + }, + { + "epoch": 1.2622597096900745, + "grad_norm": 14.717877388000488, + "learning_rate": 7.781872599018547e-06, + "loss": 4.5069, + "step": 6435 + }, + { + "epoch": 1.2632404864652804, + "grad_norm": 31.211990356445312, + "learning_rate": 7.778655660072417e-06, + "loss": 4.915, + "step": 6440 + }, + { + "epoch": 1.2642212632404863, + "grad_norm": 19.69355010986328, + "learning_rate": 7.775437056085961e-06, + "loss": 4.7949, + "step": 6445 + }, + { + "epoch": 1.2652020400156925, + "grad_norm": 22.162994384765625, + "learning_rate": 7.77221678898785e-06, + "loss": 4.7987, + "step": 6450 + }, + { + "epoch": 1.2661828167908984, + "grad_norm": 57.02979278564453, + "learning_rate": 7.768994860707745e-06, + "loss": 4.7767, + "step": 6455 + }, + { + "epoch": 1.2671635935661043, + "grad_norm": 20.068218231201172, + "learning_rate": 7.76577127317631e-06, + "loss": 4.4555, + "step": 6460 + }, + { + "epoch": 1.2681443703413104, + "grad_norm": 29.125776290893555, + "learning_rate": 7.7625460283252e-06, + "loss": 4.7459, + "step": 6465 + }, + { + "epoch": 1.2691251471165164, + "grad_norm": 13.710939407348633, + "learning_rate": 7.759319128087058e-06, + "loss": 4.6614, + "step": 6470 + }, + { + "epoch": 1.2701059238917223, + "grad_norm": 12.004446029663086, + "learning_rate": 7.756090574395528e-06, + "loss": 4.6544, + "step": 6475 + }, + { + "epoch": 1.2710867006669282, + "grad_norm": 13.436174392700195, + "learning_rate": 7.75286036918524e-06, + "loss": 4.6053, + "step": 6480 + }, + { + "epoch": 1.272067477442134, + "grad_norm": 12.79939079284668, + "learning_rate": 7.749628514391814e-06, + "loss": 4.9016, + "step": 6485 + }, + { + "epoch": 1.2730482542173402, + "grad_norm": 12.1824312210083, + "learning_rate": 7.746395011951857e-06, + "loss": 4.7288, + "step": 6490 + }, + { + "epoch": 1.2740290309925462, + "grad_norm": 17.693960189819336, + "learning_rate": 7.743159863802967e-06, + "loss": 4.6414, + "step": 6495 + }, + { + "epoch": 1.275009807767752, + "grad_norm": 22.020206451416016, + "learning_rate": 7.739923071883725e-06, + "loss": 4.8802, + "step": 6500 + }, + { + "epoch": 1.275990584542958, + "grad_norm": 14.62353801727295, + "learning_rate": 7.736684638133699e-06, + "loss": 4.7882, + "step": 6505 + }, + { + "epoch": 1.276971361318164, + "grad_norm": 22.087949752807617, + "learning_rate": 7.73344456449344e-06, + "loss": 4.8307, + "step": 6510 + }, + { + "epoch": 1.27795213809337, + "grad_norm": 23.798274993896484, + "learning_rate": 7.73020285290448e-06, + "loss": 4.7251, + "step": 6515 + }, + { + "epoch": 1.278932914868576, + "grad_norm": 15.14513874053955, + "learning_rate": 7.726959505309335e-06, + "loss": 4.7146, + "step": 6520 + }, + { + "epoch": 1.2799136916437819, + "grad_norm": 12.246772766113281, + "learning_rate": 7.7237145236515e-06, + "loss": 4.8786, + "step": 6525 + }, + { + "epoch": 1.2808944684189878, + "grad_norm": 25.760866165161133, + "learning_rate": 7.720467909875448e-06, + "loss": 4.3852, + "step": 6530 + }, + { + "epoch": 1.2818752451941937, + "grad_norm": 23.278759002685547, + "learning_rate": 7.717219665926635e-06, + "loss": 4.7413, + "step": 6535 + }, + { + "epoch": 1.2828560219693999, + "grad_norm": 10.611922264099121, + "learning_rate": 7.713969793751493e-06, + "loss": 4.4106, + "step": 6540 + }, + { + "epoch": 1.2838367987446058, + "grad_norm": 21.27843475341797, + "learning_rate": 7.710718295297418e-06, + "loss": 5.0443, + "step": 6545 + }, + { + "epoch": 1.2848175755198117, + "grad_norm": 17.157188415527344, + "learning_rate": 7.707465172512798e-06, + "loss": 4.9554, + "step": 6550 + }, + { + "epoch": 1.2857983522950176, + "grad_norm": 13.657434463500977, + "learning_rate": 7.704210427346979e-06, + "loss": 4.6168, + "step": 6555 + }, + { + "epoch": 1.2867791290702235, + "grad_norm": 20.332624435424805, + "learning_rate": 7.700954061750295e-06, + "loss": 4.8259, + "step": 6560 + }, + { + "epoch": 1.2877599058454297, + "grad_norm": 11.391372680664062, + "learning_rate": 7.697696077674032e-06, + "loss": 4.7383, + "step": 6565 + }, + { + "epoch": 1.2887406826206356, + "grad_norm": 21.024250030517578, + "learning_rate": 7.694436477070464e-06, + "loss": 4.6618, + "step": 6570 + }, + { + "epoch": 1.2897214593958415, + "grad_norm": 17.038434982299805, + "learning_rate": 7.691175261892821e-06, + "loss": 5.1161, + "step": 6575 + }, + { + "epoch": 1.2907022361710474, + "grad_norm": 15.151735305786133, + "learning_rate": 7.687912434095306e-06, + "loss": 4.7389, + "step": 6580 + }, + { + "epoch": 1.2916830129462533, + "grad_norm": 33.51579284667969, + "learning_rate": 7.68464799563309e-06, + "loss": 4.5458, + "step": 6585 + }, + { + "epoch": 1.2926637897214595, + "grad_norm": 22.85167121887207, + "learning_rate": 7.681381948462304e-06, + "loss": 4.7308, + "step": 6590 + }, + { + "epoch": 1.2936445664966654, + "grad_norm": 23.826412200927734, + "learning_rate": 7.678114294540046e-06, + "loss": 5.0318, + "step": 6595 + }, + { + "epoch": 1.2946253432718713, + "grad_norm": 19.270708084106445, + "learning_rate": 7.674845035824377e-06, + "loss": 4.6337, + "step": 6600 + }, + { + "epoch": 1.2956061200470772, + "grad_norm": 22.91469955444336, + "learning_rate": 7.671574174274317e-06, + "loss": 4.8004, + "step": 6605 + }, + { + "epoch": 1.2965868968222831, + "grad_norm": 19.99880027770996, + "learning_rate": 7.668301711849852e-06, + "loss": 4.7976, + "step": 6610 + }, + { + "epoch": 1.2975676735974893, + "grad_norm": 18.21483039855957, + "learning_rate": 7.665027650511921e-06, + "loss": 4.826, + "step": 6615 + }, + { + "epoch": 1.2985484503726952, + "grad_norm": 23.880598068237305, + "learning_rate": 7.661751992222425e-06, + "loss": 4.335, + "step": 6620 + }, + { + "epoch": 1.299529227147901, + "grad_norm": 34.46923065185547, + "learning_rate": 7.65847473894422e-06, + "loss": 4.6269, + "step": 6625 + }, + { + "epoch": 1.3005100039231072, + "grad_norm": 10.862066268920898, + "learning_rate": 7.65519589264112e-06, + "loss": 4.7542, + "step": 6630 + }, + { + "epoch": 1.3014907806983131, + "grad_norm": 18.505020141601562, + "learning_rate": 7.65191545527789e-06, + "loss": 4.6034, + "step": 6635 + }, + { + "epoch": 1.302471557473519, + "grad_norm": 14.459244728088379, + "learning_rate": 7.648633428820254e-06, + "loss": 4.6442, + "step": 6640 + }, + { + "epoch": 1.303452334248725, + "grad_norm": 18.99560546875, + "learning_rate": 7.64534981523488e-06, + "loss": 4.5086, + "step": 6645 + }, + { + "epoch": 1.304433111023931, + "grad_norm": 15.123648643493652, + "learning_rate": 7.642064616489394e-06, + "loss": 4.9788, + "step": 6650 + }, + { + "epoch": 1.305413887799137, + "grad_norm": 30.390419006347656, + "learning_rate": 7.638777834552372e-06, + "loss": 4.9361, + "step": 6655 + }, + { + "epoch": 1.306394664574343, + "grad_norm": 16.628713607788086, + "learning_rate": 7.635489471393334e-06, + "loss": 4.4967, + "step": 6660 + }, + { + "epoch": 1.3073754413495489, + "grad_norm": 23.70064353942871, + "learning_rate": 7.632199528982748e-06, + "loss": 4.7082, + "step": 6665 + }, + { + "epoch": 1.3083562181247548, + "grad_norm": 12.797712326049805, + "learning_rate": 7.6289080092920354e-06, + "loss": 4.9439, + "step": 6670 + }, + { + "epoch": 1.3093369948999607, + "grad_norm": 23.16967010498047, + "learning_rate": 7.625614914293553e-06, + "loss": 4.6688, + "step": 6675 + }, + { + "epoch": 1.3103177716751668, + "grad_norm": 27.266939163208008, + "learning_rate": 7.622320245960607e-06, + "loss": 4.427, + "step": 6680 + }, + { + "epoch": 1.3112985484503727, + "grad_norm": 24.94828224182129, + "learning_rate": 7.619024006267448e-06, + "loss": 4.6799, + "step": 6685 + }, + { + "epoch": 1.3122793252255787, + "grad_norm": 13.128344535827637, + "learning_rate": 7.6157261971892626e-06, + "loss": 4.4892, + "step": 6690 + }, + { + "epoch": 1.3132601020007846, + "grad_norm": 22.916282653808594, + "learning_rate": 7.612426820702182e-06, + "loss": 4.781, + "step": 6695 + }, + { + "epoch": 1.3142408787759905, + "grad_norm": 18.023021697998047, + "learning_rate": 7.6091258787832765e-06, + "loss": 4.7486, + "step": 6700 + }, + { + "epoch": 1.3152216555511966, + "grad_norm": 20.854341506958008, + "learning_rate": 7.605823373410553e-06, + "loss": 4.8066, + "step": 6705 + }, + { + "epoch": 1.3162024323264025, + "grad_norm": 25.060224533081055, + "learning_rate": 7.602519306562954e-06, + "loss": 4.7733, + "step": 6710 + }, + { + "epoch": 1.3171832091016085, + "grad_norm": 34.962406158447266, + "learning_rate": 7.599213680220362e-06, + "loss": 4.7923, + "step": 6715 + }, + { + "epoch": 1.3181639858768144, + "grad_norm": 22.008214950561523, + "learning_rate": 7.59590649636359e-06, + "loss": 5.0123, + "step": 6720 + }, + { + "epoch": 1.3191447626520203, + "grad_norm": 16.601545333862305, + "learning_rate": 7.592597756974385e-06, + "loss": 4.9437, + "step": 6725 + }, + { + "epoch": 1.3201255394272264, + "grad_norm": 24.702821731567383, + "learning_rate": 7.589287464035429e-06, + "loss": 4.8218, + "step": 6730 + }, + { + "epoch": 1.3211063162024324, + "grad_norm": 15.623889923095703, + "learning_rate": 7.58597561953033e-06, + "loss": 4.7291, + "step": 6735 + }, + { + "epoch": 1.3220870929776383, + "grad_norm": 16.421722412109375, + "learning_rate": 7.582662225443631e-06, + "loss": 4.87, + "step": 6740 + }, + { + "epoch": 1.3230678697528442, + "grad_norm": 23.022462844848633, + "learning_rate": 7.579347283760801e-06, + "loss": 4.9356, + "step": 6745 + }, + { + "epoch": 1.32404864652805, + "grad_norm": 26.935955047607422, + "learning_rate": 7.576030796468233e-06, + "loss": 4.5722, + "step": 6750 + }, + { + "epoch": 1.3250294233032562, + "grad_norm": 16.350265502929688, + "learning_rate": 7.572712765553254e-06, + "loss": 4.7951, + "step": 6755 + }, + { + "epoch": 1.3260102000784622, + "grad_norm": 10.10744571685791, + "learning_rate": 7.569393193004109e-06, + "loss": 4.9494, + "step": 6760 + }, + { + "epoch": 1.326990976853668, + "grad_norm": 20.70772933959961, + "learning_rate": 7.56607208080997e-06, + "loss": 4.4884, + "step": 6765 + }, + { + "epoch": 1.327971753628874, + "grad_norm": 18.814462661743164, + "learning_rate": 7.562749430960931e-06, + "loss": 4.7232, + "step": 6770 + }, + { + "epoch": 1.32895253040408, + "grad_norm": 17.414348602294922, + "learning_rate": 7.559425245448006e-06, + "loss": 4.7775, + "step": 6775 + }, + { + "epoch": 1.329933307179286, + "grad_norm": 36.159873962402344, + "learning_rate": 7.556099526263132e-06, + "loss": 4.5531, + "step": 6780 + }, + { + "epoch": 1.330914083954492, + "grad_norm": 22.765953063964844, + "learning_rate": 7.552772275399163e-06, + "loss": 4.6333, + "step": 6785 + }, + { + "epoch": 1.3318948607296979, + "grad_norm": 17.589569091796875, + "learning_rate": 7.549443494849872e-06, + "loss": 4.7986, + "step": 6790 + }, + { + "epoch": 1.332875637504904, + "grad_norm": 19.393115997314453, + "learning_rate": 7.5461131866099465e-06, + "loss": 4.722, + "step": 6795 + }, + { + "epoch": 1.33385641428011, + "grad_norm": 35.45538330078125, + "learning_rate": 7.542781352674994e-06, + "loss": 4.5966, + "step": 6800 + }, + { + "epoch": 1.3348371910553158, + "grad_norm": 11.774484634399414, + "learning_rate": 7.539447995041529e-06, + "loss": 4.6637, + "step": 6805 + }, + { + "epoch": 1.3358179678305218, + "grad_norm": 26.95338249206543, + "learning_rate": 7.536113115706987e-06, + "loss": 4.8572, + "step": 6810 + }, + { + "epoch": 1.3367987446057277, + "grad_norm": 30.311328887939453, + "learning_rate": 7.532776716669708e-06, + "loss": 4.8531, + "step": 6815 + }, + { + "epoch": 1.3377795213809338, + "grad_norm": 24.06073570251465, + "learning_rate": 7.52943879992895e-06, + "loss": 4.5024, + "step": 6820 + }, + { + "epoch": 1.3387602981561397, + "grad_norm": 37.04559326171875, + "learning_rate": 7.526099367484871e-06, + "loss": 4.9231, + "step": 6825 + }, + { + "epoch": 1.3397410749313456, + "grad_norm": 35.2464485168457, + "learning_rate": 7.52275842133855e-06, + "loss": 5.2958, + "step": 6830 + }, + { + "epoch": 1.3407218517065516, + "grad_norm": 19.340726852416992, + "learning_rate": 7.519415963491961e-06, + "loss": 5.0359, + "step": 6835 + }, + { + "epoch": 1.3417026284817575, + "grad_norm": 29.035587310791016, + "learning_rate": 7.516071995947991e-06, + "loss": 5.0164, + "step": 6840 + }, + { + "epoch": 1.3426834052569636, + "grad_norm": 11.302748680114746, + "learning_rate": 7.512726520710429e-06, + "loss": 4.809, + "step": 6845 + }, + { + "epoch": 1.3436641820321695, + "grad_norm": 49.59445571899414, + "learning_rate": 7.5093795397839655e-06, + "loss": 4.7413, + "step": 6850 + }, + { + "epoch": 1.3446449588073754, + "grad_norm": 24.61562728881836, + "learning_rate": 7.5060310551741986e-06, + "loss": 4.6153, + "step": 6855 + }, + { + "epoch": 1.3456257355825814, + "grad_norm": 20.13814353942871, + "learning_rate": 7.5026810688876225e-06, + "loss": 5.0, + "step": 6860 + }, + { + "epoch": 1.3466065123577873, + "grad_norm": 49.62564468383789, + "learning_rate": 7.499329582931636e-06, + "loss": 5.0259, + "step": 6865 + }, + { + "epoch": 1.3475872891329934, + "grad_norm": 18.966625213623047, + "learning_rate": 7.495976599314531e-06, + "loss": 4.8656, + "step": 6870 + }, + { + "epoch": 1.3485680659081993, + "grad_norm": 28.413414001464844, + "learning_rate": 7.4926221200455e-06, + "loss": 4.6754, + "step": 6875 + }, + { + "epoch": 1.3495488426834052, + "grad_norm": 13.881267547607422, + "learning_rate": 7.489266147134631e-06, + "loss": 4.3717, + "step": 6880 + }, + { + "epoch": 1.3505296194586112, + "grad_norm": 29.266584396362305, + "learning_rate": 7.485908682592909e-06, + "loss": 4.6337, + "step": 6885 + }, + { + "epoch": 1.351510396233817, + "grad_norm": 24.69556999206543, + "learning_rate": 7.482549728432211e-06, + "loss": 4.604, + "step": 6890 + }, + { + "epoch": 1.3524911730090232, + "grad_norm": 34.28022003173828, + "learning_rate": 7.479189286665305e-06, + "loss": 4.3239, + "step": 6895 + }, + { + "epoch": 1.3534719497842291, + "grad_norm": 26.678653717041016, + "learning_rate": 7.475827359305853e-06, + "loss": 4.7429, + "step": 6900 + }, + { + "epoch": 1.354452726559435, + "grad_norm": 21.746240615844727, + "learning_rate": 7.472463948368407e-06, + "loss": 4.7656, + "step": 6905 + }, + { + "epoch": 1.355433503334641, + "grad_norm": 23.173107147216797, + "learning_rate": 7.469099055868406e-06, + "loss": 4.6698, + "step": 6910 + }, + { + "epoch": 1.3564142801098469, + "grad_norm": 38.12723159790039, + "learning_rate": 7.465732683822182e-06, + "loss": 4.7617, + "step": 6915 + }, + { + "epoch": 1.357395056885053, + "grad_norm": 17.623767852783203, + "learning_rate": 7.462364834246945e-06, + "loss": 4.6744, + "step": 6920 + }, + { + "epoch": 1.358375833660259, + "grad_norm": 31.644580841064453, + "learning_rate": 7.4589955091607954e-06, + "loss": 5.1335, + "step": 6925 + }, + { + "epoch": 1.3593566104354649, + "grad_norm": 23.06364631652832, + "learning_rate": 7.455624710582721e-06, + "loss": 4.7955, + "step": 6930 + }, + { + "epoch": 1.360337387210671, + "grad_norm": 12.311569213867188, + "learning_rate": 7.452252440532587e-06, + "loss": 4.7353, + "step": 6935 + }, + { + "epoch": 1.3613181639858767, + "grad_norm": 17.3242244720459, + "learning_rate": 7.4488787010311425e-06, + "loss": 4.639, + "step": 6940 + }, + { + "epoch": 1.3622989407610828, + "grad_norm": 25.3474063873291, + "learning_rate": 7.445503494100017e-06, + "loss": 4.9055, + "step": 6945 + }, + { + "epoch": 1.3632797175362887, + "grad_norm": 21.65390396118164, + "learning_rate": 7.442126821761719e-06, + "loss": 4.7359, + "step": 6950 + }, + { + "epoch": 1.3642604943114947, + "grad_norm": 28.077287673950195, + "learning_rate": 7.438748686039637e-06, + "loss": 4.9454, + "step": 6955 + }, + { + "epoch": 1.3652412710867008, + "grad_norm": 19.87175941467285, + "learning_rate": 7.435369088958033e-06, + "loss": 4.5149, + "step": 6960 + }, + { + "epoch": 1.3662220478619067, + "grad_norm": 23.754520416259766, + "learning_rate": 7.431988032542048e-06, + "loss": 4.5241, + "step": 6965 + }, + { + "epoch": 1.3672028246371126, + "grad_norm": 14.080316543579102, + "learning_rate": 7.4286055188176945e-06, + "loss": 4.7205, + "step": 6970 + }, + { + "epoch": 1.3681836014123185, + "grad_norm": 20.002689361572266, + "learning_rate": 7.42522154981186e-06, + "loss": 4.9367, + "step": 6975 + }, + { + "epoch": 1.3691643781875245, + "grad_norm": 18.009675979614258, + "learning_rate": 7.4218361275523046e-06, + "loss": 5.1465, + "step": 6980 + }, + { + "epoch": 1.3701451549627306, + "grad_norm": 18.44239044189453, + "learning_rate": 7.418449254067659e-06, + "loss": 4.6064, + "step": 6985 + }, + { + "epoch": 1.3711259317379365, + "grad_norm": 23.188587188720703, + "learning_rate": 7.415060931387422e-06, + "loss": 4.4947, + "step": 6990 + }, + { + "epoch": 1.3721067085131424, + "grad_norm": 23.407365798950195, + "learning_rate": 7.411671161541961e-06, + "loss": 4.5955, + "step": 6995 + }, + { + "epoch": 1.3730874852883483, + "grad_norm": 17.974319458007812, + "learning_rate": 7.408279946562512e-06, + "loss": 4.4836, + "step": 7000 + }, + { + "epoch": 1.3740682620635543, + "grad_norm": 17.078502655029297, + "learning_rate": 7.404887288481177e-06, + "loss": 4.8717, + "step": 7005 + }, + { + "epoch": 1.3750490388387604, + "grad_norm": 20.545015335083008, + "learning_rate": 7.401493189330921e-06, + "loss": 4.7096, + "step": 7010 + }, + { + "epoch": 1.3760298156139663, + "grad_norm": 22.460535049438477, + "learning_rate": 7.398097651145575e-06, + "loss": 4.8465, + "step": 7015 + }, + { + "epoch": 1.3770105923891722, + "grad_norm": 21.549362182617188, + "learning_rate": 7.3947006759598295e-06, + "loss": 4.8413, + "step": 7020 + }, + { + "epoch": 1.3779913691643781, + "grad_norm": 22.767526626586914, + "learning_rate": 7.391302265809237e-06, + "loss": 4.7785, + "step": 7025 + }, + { + "epoch": 1.378972145939584, + "grad_norm": 21.575889587402344, + "learning_rate": 7.387902422730211e-06, + "loss": 4.6862, + "step": 7030 + }, + { + "epoch": 1.3799529227147902, + "grad_norm": 11.060517311096191, + "learning_rate": 7.384501148760024e-06, + "loss": 4.8493, + "step": 7035 + }, + { + "epoch": 1.3809336994899961, + "grad_norm": 22.244422912597656, + "learning_rate": 7.381098445936803e-06, + "loss": 4.609, + "step": 7040 + }, + { + "epoch": 1.381914476265202, + "grad_norm": 19.83547019958496, + "learning_rate": 7.377694316299533e-06, + "loss": 4.8589, + "step": 7045 + }, + { + "epoch": 1.382895253040408, + "grad_norm": 26.3979434967041, + "learning_rate": 7.374288761888056e-06, + "loss": 4.6734, + "step": 7050 + }, + { + "epoch": 1.3838760298156139, + "grad_norm": 17.394556045532227, + "learning_rate": 7.370881784743065e-06, + "loss": 4.7569, + "step": 7055 + }, + { + "epoch": 1.38485680659082, + "grad_norm": 39.59242630004883, + "learning_rate": 7.367473386906106e-06, + "loss": 4.6074, + "step": 7060 + }, + { + "epoch": 1.385837583366026, + "grad_norm": 10.681520462036133, + "learning_rate": 7.364063570419576e-06, + "loss": 4.7027, + "step": 7065 + }, + { + "epoch": 1.3868183601412318, + "grad_norm": 17.754701614379883, + "learning_rate": 7.360652337326725e-06, + "loss": 4.5475, + "step": 7070 + }, + { + "epoch": 1.3877991369164377, + "grad_norm": 12.982447624206543, + "learning_rate": 7.357239689671646e-06, + "loss": 4.4779, + "step": 7075 + }, + { + "epoch": 1.3887799136916437, + "grad_norm": 22.83730125427246, + "learning_rate": 7.353825629499287e-06, + "loss": 4.5497, + "step": 7080 + }, + { + "epoch": 1.3897606904668498, + "grad_norm": 23.83551025390625, + "learning_rate": 7.350410158855437e-06, + "loss": 4.7299, + "step": 7085 + }, + { + "epoch": 1.3907414672420557, + "grad_norm": 22.732009887695312, + "learning_rate": 7.346993279786732e-06, + "loss": 4.5305, + "step": 7090 + }, + { + "epoch": 1.3917222440172616, + "grad_norm": 19.805837631225586, + "learning_rate": 7.343574994340652e-06, + "loss": 4.7342, + "step": 7095 + }, + { + "epoch": 1.3927030207924678, + "grad_norm": 30.17318344116211, + "learning_rate": 7.340155304565518e-06, + "loss": 4.6992, + "step": 7100 + }, + { + "epoch": 1.3936837975676735, + "grad_norm": 12.482978820800781, + "learning_rate": 7.336734212510497e-06, + "loss": 4.5203, + "step": 7105 + }, + { + "epoch": 1.3946645743428796, + "grad_norm": 24.85907554626465, + "learning_rate": 7.333311720225591e-06, + "loss": 4.3356, + "step": 7110 + }, + { + "epoch": 1.3956453511180855, + "grad_norm": 33.45339584350586, + "learning_rate": 7.329887829761645e-06, + "loss": 5.0046, + "step": 7115 + }, + { + "epoch": 1.3966261278932914, + "grad_norm": 11.957460403442383, + "learning_rate": 7.326462543170339e-06, + "loss": 4.6839, + "step": 7120 + }, + { + "epoch": 1.3976069046684976, + "grad_norm": 27.296606063842773, + "learning_rate": 7.323035862504191e-06, + "loss": 4.5867, + "step": 7125 + }, + { + "epoch": 1.3985876814437035, + "grad_norm": 15.323542594909668, + "learning_rate": 7.319607789816555e-06, + "loss": 4.6469, + "step": 7130 + }, + { + "epoch": 1.3995684582189094, + "grad_norm": 31.46601676940918, + "learning_rate": 7.316178327161618e-06, + "loss": 4.7618, + "step": 7135 + }, + { + "epoch": 1.4005492349941153, + "grad_norm": 16.684764862060547, + "learning_rate": 7.3127474765944004e-06, + "loss": 4.5262, + "step": 7140 + }, + { + "epoch": 1.4015300117693212, + "grad_norm": 17.97416877746582, + "learning_rate": 7.309315240170753e-06, + "loss": 4.9621, + "step": 7145 + }, + { + "epoch": 1.4025107885445274, + "grad_norm": 13.907105445861816, + "learning_rate": 7.305881619947359e-06, + "loss": 4.7714, + "step": 7150 + }, + { + "epoch": 1.4034915653197333, + "grad_norm": 26.989219665527344, + "learning_rate": 7.302446617981731e-06, + "loss": 4.7963, + "step": 7155 + }, + { + "epoch": 1.4044723420949392, + "grad_norm": 11.415125846862793, + "learning_rate": 7.2990102363322065e-06, + "loss": 4.6166, + "step": 7160 + }, + { + "epoch": 1.4054531188701451, + "grad_norm": 15.411608695983887, + "learning_rate": 7.295572477057952e-06, + "loss": 4.8559, + "step": 7165 + }, + { + "epoch": 1.406433895645351, + "grad_norm": 18.74057388305664, + "learning_rate": 7.292133342218963e-06, + "loss": 4.8423, + "step": 7170 + }, + { + "epoch": 1.4074146724205572, + "grad_norm": 15.429743766784668, + "learning_rate": 7.28869283387605e-06, + "loss": 4.6179, + "step": 7175 + }, + { + "epoch": 1.408395449195763, + "grad_norm": 20.775442123413086, + "learning_rate": 7.2852509540908546e-06, + "loss": 4.5703, + "step": 7180 + }, + { + "epoch": 1.409376225970969, + "grad_norm": 10.963190078735352, + "learning_rate": 7.281807704925839e-06, + "loss": 4.5943, + "step": 7185 + }, + { + "epoch": 1.410357002746175, + "grad_norm": 22.696077346801758, + "learning_rate": 7.278363088444283e-06, + "loss": 4.8727, + "step": 7190 + }, + { + "epoch": 1.4113377795213808, + "grad_norm": 22.353727340698242, + "learning_rate": 7.2749171067102875e-06, + "loss": 4.9767, + "step": 7195 + }, + { + "epoch": 1.412318556296587, + "grad_norm": 24.793746948242188, + "learning_rate": 7.271469761788772e-06, + "loss": 4.4291, + "step": 7200 + }, + { + "epoch": 1.413299333071793, + "grad_norm": 24.146011352539062, + "learning_rate": 7.2680210557454715e-06, + "loss": 4.646, + "step": 7205 + }, + { + "epoch": 1.4142801098469988, + "grad_norm": 20.337390899658203, + "learning_rate": 7.264570990646938e-06, + "loss": 5.1823, + "step": 7210 + }, + { + "epoch": 1.4152608866222047, + "grad_norm": 31.162933349609375, + "learning_rate": 7.261119568560537e-06, + "loss": 4.5848, + "step": 7215 + }, + { + "epoch": 1.4162416633974106, + "grad_norm": 17.75670623779297, + "learning_rate": 7.257666791554448e-06, + "loss": 4.6185, + "step": 7220 + }, + { + "epoch": 1.4172224401726168, + "grad_norm": 13.3048734664917, + "learning_rate": 7.2542126616976596e-06, + "loss": 4.7974, + "step": 7225 + }, + { + "epoch": 1.4182032169478227, + "grad_norm": 15.745701789855957, + "learning_rate": 7.2507571810599755e-06, + "loss": 4.5467, + "step": 7230 + }, + { + "epoch": 1.4191839937230286, + "grad_norm": 12.2063627243042, + "learning_rate": 7.247300351712007e-06, + "loss": 4.6024, + "step": 7235 + }, + { + "epoch": 1.4201647704982345, + "grad_norm": 36.18549346923828, + "learning_rate": 7.243842175725172e-06, + "loss": 4.9873, + "step": 7240 + }, + { + "epoch": 1.4211455472734404, + "grad_norm": 26.773792266845703, + "learning_rate": 7.240382655171696e-06, + "loss": 4.6385, + "step": 7245 + }, + { + "epoch": 1.4221263240486466, + "grad_norm": 21.363792419433594, + "learning_rate": 7.236921792124611e-06, + "loss": 4.647, + "step": 7250 + }, + { + "epoch": 1.4231071008238525, + "grad_norm": 15.612807273864746, + "learning_rate": 7.233459588657753e-06, + "loss": 4.7398, + "step": 7255 + }, + { + "epoch": 1.4240878775990584, + "grad_norm": 42.73514175415039, + "learning_rate": 7.229996046845762e-06, + "loss": 4.9747, + "step": 7260 + }, + { + "epoch": 1.4250686543742646, + "grad_norm": 26.65818977355957, + "learning_rate": 7.226531168764079e-06, + "loss": 5.0051, + "step": 7265 + }, + { + "epoch": 1.4260494311494702, + "grad_norm": 15.626964569091797, + "learning_rate": 7.223064956488946e-06, + "loss": 4.618, + "step": 7270 + }, + { + "epoch": 1.4270302079246764, + "grad_norm": 10.92302417755127, + "learning_rate": 7.219597412097405e-06, + "loss": 5.0778, + "step": 7275 + }, + { + "epoch": 1.4280109846998823, + "grad_norm": 19.099761962890625, + "learning_rate": 7.216128537667296e-06, + "loss": 4.7971, + "step": 7280 + }, + { + "epoch": 1.4289917614750882, + "grad_norm": 16.517105102539062, + "learning_rate": 7.212658335277255e-06, + "loss": 4.3451, + "step": 7285 + }, + { + "epoch": 1.4299725382502944, + "grad_norm": 30.557945251464844, + "learning_rate": 7.209186807006714e-06, + "loss": 4.6488, + "step": 7290 + }, + { + "epoch": 1.4309533150255003, + "grad_norm": 10.725804328918457, + "learning_rate": 7.205713954935901e-06, + "loss": 4.7013, + "step": 7295 + }, + { + "epoch": 1.4319340918007062, + "grad_norm": 25.416257858276367, + "learning_rate": 7.202239781145834e-06, + "loss": 4.814, + "step": 7300 + }, + { + "epoch": 1.432914868575912, + "grad_norm": 28.029569625854492, + "learning_rate": 7.19876428771833e-06, + "loss": 4.6645, + "step": 7305 + }, + { + "epoch": 1.433895645351118, + "grad_norm": 11.939929962158203, + "learning_rate": 7.195287476735989e-06, + "loss": 5.2664, + "step": 7310 + }, + { + "epoch": 1.4348764221263242, + "grad_norm": 15.845399856567383, + "learning_rate": 7.191809350282204e-06, + "loss": 4.6847, + "step": 7315 + }, + { + "epoch": 1.43585719890153, + "grad_norm": 15.885704040527344, + "learning_rate": 7.188329910441154e-06, + "loss": 4.6084, + "step": 7320 + }, + { + "epoch": 1.436837975676736, + "grad_norm": 36.25415802001953, + "learning_rate": 7.184849159297809e-06, + "loss": 5.2779, + "step": 7325 + }, + { + "epoch": 1.437818752451942, + "grad_norm": 15.539462089538574, + "learning_rate": 7.1813670989379215e-06, + "loss": 4.3759, + "step": 7330 + }, + { + "epoch": 1.4387995292271478, + "grad_norm": 16.43358612060547, + "learning_rate": 7.177883731448031e-06, + "loss": 4.4987, + "step": 7335 + }, + { + "epoch": 1.439780306002354, + "grad_norm": 14.630133628845215, + "learning_rate": 7.174399058915458e-06, + "loss": 4.4032, + "step": 7340 + }, + { + "epoch": 1.4407610827775599, + "grad_norm": 23.953678131103516, + "learning_rate": 7.170913083428306e-06, + "loss": 4.9749, + "step": 7345 + }, + { + "epoch": 1.4417418595527658, + "grad_norm": 20.43268585205078, + "learning_rate": 7.167425807075459e-06, + "loss": 4.644, + "step": 7350 + }, + { + "epoch": 1.4427226363279717, + "grad_norm": 20.701139450073242, + "learning_rate": 7.163937231946581e-06, + "loss": 5.2018, + "step": 7355 + }, + { + "epoch": 1.4437034131031776, + "grad_norm": 25.42399024963379, + "learning_rate": 7.1604473601321125e-06, + "loss": 4.5791, + "step": 7360 + }, + { + "epoch": 1.4446841898783838, + "grad_norm": 10.516730308532715, + "learning_rate": 7.156956193723275e-06, + "loss": 4.7935, + "step": 7365 + }, + { + "epoch": 1.4456649666535897, + "grad_norm": 31.056705474853516, + "learning_rate": 7.153463734812059e-06, + "loss": 4.721, + "step": 7370 + }, + { + "epoch": 1.4466457434287956, + "grad_norm": 18.38773536682129, + "learning_rate": 7.1499699854912385e-06, + "loss": 4.4783, + "step": 7375 + }, + { + "epoch": 1.4476265202040015, + "grad_norm": 21.014942169189453, + "learning_rate": 7.146474947854354e-06, + "loss": 4.7552, + "step": 7380 + }, + { + "epoch": 1.4486072969792074, + "grad_norm": 26.835750579833984, + "learning_rate": 7.1429786239957195e-06, + "loss": 4.5055, + "step": 7385 + }, + { + "epoch": 1.4495880737544136, + "grad_norm": 23.621606826782227, + "learning_rate": 7.13948101601042e-06, + "loss": 5.0276, + "step": 7390 + }, + { + "epoch": 1.4505688505296195, + "grad_norm": 22.148571014404297, + "learning_rate": 7.135982125994311e-06, + "loss": 4.6589, + "step": 7395 + }, + { + "epoch": 1.4515496273048254, + "grad_norm": 15.912561416625977, + "learning_rate": 7.132481956044013e-06, + "loss": 4.7373, + "step": 7400 + }, + { + "epoch": 1.4525304040800313, + "grad_norm": 15.783080101013184, + "learning_rate": 7.128980508256919e-06, + "loss": 4.763, + "step": 7405 + }, + { + "epoch": 1.4535111808552372, + "grad_norm": 14.95667839050293, + "learning_rate": 7.125477784731184e-06, + "loss": 4.959, + "step": 7410 + }, + { + "epoch": 1.4544919576304434, + "grad_norm": 23.89700698852539, + "learning_rate": 7.121973787565727e-06, + "loss": 4.8652, + "step": 7415 + }, + { + "epoch": 1.4554727344056493, + "grad_norm": 23.417661666870117, + "learning_rate": 7.118468518860232e-06, + "loss": 4.697, + "step": 7420 + }, + { + "epoch": 1.4564535111808552, + "grad_norm": 23.191051483154297, + "learning_rate": 7.114961980715142e-06, + "loss": 4.4113, + "step": 7425 + }, + { + "epoch": 1.4574342879560613, + "grad_norm": 12.547563552856445, + "learning_rate": 7.111454175231664e-06, + "loss": 4.427, + "step": 7430 + }, + { + "epoch": 1.458415064731267, + "grad_norm": 22.79470443725586, + "learning_rate": 7.107945104511766e-06, + "loss": 4.7032, + "step": 7435 + }, + { + "epoch": 1.4593958415064732, + "grad_norm": 21.279233932495117, + "learning_rate": 7.1044347706581664e-06, + "loss": 4.9074, + "step": 7440 + }, + { + "epoch": 1.460376618281679, + "grad_norm": 24.7849178314209, + "learning_rate": 7.10092317577435e-06, + "loss": 4.7103, + "step": 7445 + }, + { + "epoch": 1.461357395056885, + "grad_norm": 12.6660795211792, + "learning_rate": 7.09741032196455e-06, + "loss": 4.7538, + "step": 7450 + }, + { + "epoch": 1.4623381718320911, + "grad_norm": 21.938674926757812, + "learning_rate": 7.093896211333757e-06, + "loss": 5.1549, + "step": 7455 + }, + { + "epoch": 1.463318948607297, + "grad_norm": 25.91387176513672, + "learning_rate": 7.090380845987716e-06, + "loss": 4.6054, + "step": 7460 + }, + { + "epoch": 1.464299725382503, + "grad_norm": 23.84564971923828, + "learning_rate": 7.08686422803292e-06, + "loss": 4.6331, + "step": 7465 + }, + { + "epoch": 1.4652805021577089, + "grad_norm": 18.71299171447754, + "learning_rate": 7.083346359576617e-06, + "loss": 4.8511, + "step": 7470 + }, + { + "epoch": 1.4662612789329148, + "grad_norm": 13.39997673034668, + "learning_rate": 7.079827242726801e-06, + "loss": 4.7739, + "step": 7475 + }, + { + "epoch": 1.467242055708121, + "grad_norm": 21.340524673461914, + "learning_rate": 7.076306879592215e-06, + "loss": 4.7725, + "step": 7480 + }, + { + "epoch": 1.4682228324833269, + "grad_norm": 24.9217529296875, + "learning_rate": 7.072785272282351e-06, + "loss": 4.7112, + "step": 7485 + }, + { + "epoch": 1.4692036092585328, + "grad_norm": 18.006662368774414, + "learning_rate": 7.069262422907444e-06, + "loss": 4.5367, + "step": 7490 + }, + { + "epoch": 1.4701843860337387, + "grad_norm": 17.566797256469727, + "learning_rate": 7.065738333578473e-06, + "loss": 4.7325, + "step": 7495 + }, + { + "epoch": 1.4711651628089446, + "grad_norm": 16.905597686767578, + "learning_rate": 7.0622130064071584e-06, + "loss": 4.6231, + "step": 7500 + }, + { + "epoch": 1.4721459395841507, + "grad_norm": 13.76935863494873, + "learning_rate": 7.05868644350597e-06, + "loss": 4.7777, + "step": 7505 + }, + { + "epoch": 1.4731267163593567, + "grad_norm": 20.6540470123291, + "learning_rate": 7.05515864698811e-06, + "loss": 4.6186, + "step": 7510 + }, + { + "epoch": 1.4741074931345626, + "grad_norm": 19.557889938354492, + "learning_rate": 7.051629618967523e-06, + "loss": 4.957, + "step": 7515 + }, + { + "epoch": 1.4750882699097685, + "grad_norm": 35.486270904541016, + "learning_rate": 7.048099361558892e-06, + "loss": 4.5927, + "step": 7520 + }, + { + "epoch": 1.4760690466849744, + "grad_norm": 23.648691177368164, + "learning_rate": 7.044567876877636e-06, + "loss": 4.7108, + "step": 7525 + }, + { + "epoch": 1.4770498234601805, + "grad_norm": 15.685006141662598, + "learning_rate": 7.041035167039909e-06, + "loss": 4.689, + "step": 7530 + }, + { + "epoch": 1.4780306002353865, + "grad_norm": 21.108903884887695, + "learning_rate": 7.037501234162599e-06, + "loss": 4.4376, + "step": 7535 + }, + { + "epoch": 1.4790113770105924, + "grad_norm": 16.158863067626953, + "learning_rate": 7.033966080363328e-06, + "loss": 4.6436, + "step": 7540 + }, + { + "epoch": 1.4799921537857983, + "grad_norm": 18.32492446899414, + "learning_rate": 7.03042970776045e-06, + "loss": 4.5336, + "step": 7545 + }, + { + "epoch": 1.4809729305610042, + "grad_norm": 15.529178619384766, + "learning_rate": 7.026892118473045e-06, + "loss": 4.6415, + "step": 7550 + }, + { + "epoch": 1.4819537073362103, + "grad_norm": 23.759431838989258, + "learning_rate": 7.023353314620931e-06, + "loss": 4.6697, + "step": 7555 + }, + { + "epoch": 1.4829344841114163, + "grad_norm": 14.494952201843262, + "learning_rate": 7.019813298324642e-06, + "loss": 5.0709, + "step": 7560 + }, + { + "epoch": 1.4839152608866222, + "grad_norm": 23.806947708129883, + "learning_rate": 7.016272071705452e-06, + "loss": 4.5083, + "step": 7565 + }, + { + "epoch": 1.484896037661828, + "grad_norm": 17.701692581176758, + "learning_rate": 7.012729636885346e-06, + "loss": 4.7351, + "step": 7570 + }, + { + "epoch": 1.485876814437034, + "grad_norm": 13.665419578552246, + "learning_rate": 7.009185995987042e-06, + "loss": 4.5931, + "step": 7575 + }, + { + "epoch": 1.4868575912122401, + "grad_norm": 25.867773056030273, + "learning_rate": 7.0056411511339805e-06, + "loss": 5.2031, + "step": 7580 + }, + { + "epoch": 1.487838367987446, + "grad_norm": 23.662742614746094, + "learning_rate": 7.002095104450322e-06, + "loss": 4.4658, + "step": 7585 + }, + { + "epoch": 1.488819144762652, + "grad_norm": 53.169151306152344, + "learning_rate": 6.998547858060944e-06, + "loss": 5.3187, + "step": 7590 + }, + { + "epoch": 1.4897999215378581, + "grad_norm": 15.928996086120605, + "learning_rate": 6.994999414091448e-06, + "loss": 4.9528, + "step": 7595 + }, + { + "epoch": 1.4907806983130638, + "grad_norm": 28.862323760986328, + "learning_rate": 6.991449774668149e-06, + "loss": 4.8229, + "step": 7600 + }, + { + "epoch": 1.49176147508827, + "grad_norm": 23.40715789794922, + "learning_rate": 6.987898941918082e-06, + "loss": 4.4993, + "step": 7605 + }, + { + "epoch": 1.4927422518634759, + "grad_norm": 10.611961364746094, + "learning_rate": 6.984346917968994e-06, + "loss": 4.3586, + "step": 7610 + }, + { + "epoch": 1.4937230286386818, + "grad_norm": 20.525981903076172, + "learning_rate": 6.980793704949348e-06, + "loss": 4.4613, + "step": 7615 + }, + { + "epoch": 1.494703805413888, + "grad_norm": 17.283946990966797, + "learning_rate": 6.977239304988318e-06, + "loss": 4.8228, + "step": 7620 + }, + { + "epoch": 1.4956845821890938, + "grad_norm": 21.752166748046875, + "learning_rate": 6.973683720215789e-06, + "loss": 4.7708, + "step": 7625 + }, + { + "epoch": 1.4966653589642998, + "grad_norm": 20.8831787109375, + "learning_rate": 6.970126952762359e-06, + "loss": 4.7839, + "step": 7630 + }, + { + "epoch": 1.4976461357395057, + "grad_norm": 13.42829418182373, + "learning_rate": 6.966569004759331e-06, + "loss": 4.7172, + "step": 7635 + }, + { + "epoch": 1.4986269125147116, + "grad_norm": 21.750574111938477, + "learning_rate": 6.963009878338718e-06, + "loss": 4.6011, + "step": 7640 + }, + { + "epoch": 1.4996076892899177, + "grad_norm": 19.537803649902344, + "learning_rate": 6.959449575633236e-06, + "loss": 4.7368, + "step": 7645 + }, + { + "epoch": 1.5005884660651234, + "grad_norm": 21.34620475769043, + "learning_rate": 6.955888098776308e-06, + "loss": 4.6103, + "step": 7650 + }, + { + "epoch": 1.5005884660651234, + "eval_loss": 4.907505512237549, + "eval_runtime": 7.6604, + "eval_samples_per_second": 27.283, + "eval_steps_per_second": 13.707, + "step": 7650 + }, + { + "epoch": 1.5015692428403296, + "grad_norm": 13.40380573272705, + "learning_rate": 6.952325449902062e-06, + "loss": 4.6435, + "step": 7655 + }, + { + "epoch": 1.5025500196155355, + "grad_norm": 14.482090950012207, + "learning_rate": 6.948761631145327e-06, + "loss": 4.8812, + "step": 7660 + }, + { + "epoch": 1.5035307963907414, + "grad_norm": 22.7137451171875, + "learning_rate": 6.945196644641631e-06, + "loss": 4.8619, + "step": 7665 + }, + { + "epoch": 1.5045115731659475, + "grad_norm": 19.11748504638672, + "learning_rate": 6.941630492527205e-06, + "loss": 5.1203, + "step": 7670 + }, + { + "epoch": 1.5054923499411534, + "grad_norm": 24.688961029052734, + "learning_rate": 6.938063176938976e-06, + "loss": 4.7891, + "step": 7675 + }, + { + "epoch": 1.5064731267163594, + "grad_norm": 20.401952743530273, + "learning_rate": 6.934494700014572e-06, + "loss": 4.5722, + "step": 7680 + }, + { + "epoch": 1.5074539034915653, + "grad_norm": 17.21699333190918, + "learning_rate": 6.9309250638923085e-06, + "loss": 4.588, + "step": 7685 + }, + { + "epoch": 1.5084346802667712, + "grad_norm": 29.611526489257812, + "learning_rate": 6.927354270711206e-06, + "loss": 4.4101, + "step": 7690 + }, + { + "epoch": 1.5094154570419773, + "grad_norm": 19.163101196289062, + "learning_rate": 6.923782322610972e-06, + "loss": 4.6286, + "step": 7695 + }, + { + "epoch": 1.5103962338171832, + "grad_norm": 20.316064834594727, + "learning_rate": 6.920209221732007e-06, + "loss": 4.7434, + "step": 7700 + }, + { + "epoch": 1.5113770105923892, + "grad_norm": 21.396059036254883, + "learning_rate": 6.916634970215406e-06, + "loss": 4.7623, + "step": 7705 + }, + { + "epoch": 1.5123577873675953, + "grad_norm": 33.224063873291016, + "learning_rate": 6.913059570202945e-06, + "loss": 4.6921, + "step": 7710 + }, + { + "epoch": 1.513338564142801, + "grad_norm": 19.221223831176758, + "learning_rate": 6.909483023837098e-06, + "loss": 4.6417, + "step": 7715 + }, + { + "epoch": 1.5143193409180071, + "grad_norm": 14.870227813720703, + "learning_rate": 6.905905333261019e-06, + "loss": 4.8107, + "step": 7720 + }, + { + "epoch": 1.515300117693213, + "grad_norm": 14.234113693237305, + "learning_rate": 6.90232650061855e-06, + "loss": 4.6731, + "step": 7725 + }, + { + "epoch": 1.516280894468419, + "grad_norm": 12.864129066467285, + "learning_rate": 6.898746528054221e-06, + "loss": 4.7254, + "step": 7730 + }, + { + "epoch": 1.517261671243625, + "grad_norm": 32.53975296020508, + "learning_rate": 6.895165417713238e-06, + "loss": 4.6027, + "step": 7735 + }, + { + "epoch": 1.5182424480188308, + "grad_norm": 17.56818199157715, + "learning_rate": 6.891583171741494e-06, + "loss": 4.6698, + "step": 7740 + }, + { + "epoch": 1.519223224794037, + "grad_norm": 24.81783676147461, + "learning_rate": 6.88799979228556e-06, + "loss": 4.8477, + "step": 7745 + }, + { + "epoch": 1.5202040015692428, + "grad_norm": 12.959918975830078, + "learning_rate": 6.884415281492686e-06, + "loss": 4.8092, + "step": 7750 + }, + { + "epoch": 1.5211847783444488, + "grad_norm": 26.625934600830078, + "learning_rate": 6.880829641510805e-06, + "loss": 4.9008, + "step": 7755 + }, + { + "epoch": 1.522165555119655, + "grad_norm": 24.741802215576172, + "learning_rate": 6.877242874488518e-06, + "loss": 4.4703, + "step": 7760 + }, + { + "epoch": 1.5231463318948606, + "grad_norm": 21.69708824157715, + "learning_rate": 6.873654982575108e-06, + "loss": 4.6701, + "step": 7765 + }, + { + "epoch": 1.5241271086700667, + "grad_norm": 30.454282760620117, + "learning_rate": 6.8700659679205296e-06, + "loss": 4.9591, + "step": 7770 + }, + { + "epoch": 1.5251078854452726, + "grad_norm": 25.414043426513672, + "learning_rate": 6.866475832675412e-06, + "loss": 4.8959, + "step": 7775 + }, + { + "epoch": 1.5260886622204786, + "grad_norm": 27.82634925842285, + "learning_rate": 6.862884578991054e-06, + "loss": 4.9565, + "step": 7780 + }, + { + "epoch": 1.5270694389956847, + "grad_norm": 37.48154067993164, + "learning_rate": 6.859292209019424e-06, + "loss": 4.6826, + "step": 7785 + }, + { + "epoch": 1.5280502157708904, + "grad_norm": 14.848986625671387, + "learning_rate": 6.85569872491316e-06, + "loss": 4.6118, + "step": 7790 + }, + { + "epoch": 1.5290309925460965, + "grad_norm": 20.567981719970703, + "learning_rate": 6.85210412882557e-06, + "loss": 4.7033, + "step": 7795 + }, + { + "epoch": 1.5300117693213025, + "grad_norm": 37.882362365722656, + "learning_rate": 6.848508422910622e-06, + "loss": 4.9443, + "step": 7800 + }, + { + "epoch": 1.5309925460965084, + "grad_norm": 15.06411075592041, + "learning_rate": 6.8449116093229605e-06, + "loss": 4.3824, + "step": 7805 + }, + { + "epoch": 1.5319733228717145, + "grad_norm": 11.747258186340332, + "learning_rate": 6.841313690217881e-06, + "loss": 4.6974, + "step": 7810 + }, + { + "epoch": 1.5329540996469202, + "grad_norm": 13.8905611038208, + "learning_rate": 6.837714667751351e-06, + "loss": 4.7445, + "step": 7815 + }, + { + "epoch": 1.5339348764221263, + "grad_norm": 19.0726318359375, + "learning_rate": 6.834114544079993e-06, + "loss": 4.927, + "step": 7820 + }, + { + "epoch": 1.5349156531973323, + "grad_norm": 17.133312225341797, + "learning_rate": 6.830513321361089e-06, + "loss": 4.8282, + "step": 7825 + }, + { + "epoch": 1.5358964299725382, + "grad_norm": 12.729392051696777, + "learning_rate": 6.826911001752586e-06, + "loss": 4.7162, + "step": 7830 + }, + { + "epoch": 1.5368772067477443, + "grad_norm": 30.635669708251953, + "learning_rate": 6.823307587413084e-06, + "loss": 4.5225, + "step": 7835 + }, + { + "epoch": 1.5378579835229502, + "grad_norm": 17.1396427154541, + "learning_rate": 6.8197030805018385e-06, + "loss": 4.9225, + "step": 7840 + }, + { + "epoch": 1.5388387602981561, + "grad_norm": 16.432329177856445, + "learning_rate": 6.8160974831787605e-06, + "loss": 4.5991, + "step": 7845 + }, + { + "epoch": 1.539819537073362, + "grad_norm": 13.991456031799316, + "learning_rate": 6.812490797604416e-06, + "loss": 4.5396, + "step": 7850 + }, + { + "epoch": 1.540800313848568, + "grad_norm": 20.396547317504883, + "learning_rate": 6.808883025940019e-06, + "loss": 4.8735, + "step": 7855 + }, + { + "epoch": 1.541781090623774, + "grad_norm": 18.803667068481445, + "learning_rate": 6.805274170347441e-06, + "loss": 4.9716, + "step": 7860 + }, + { + "epoch": 1.54276186739898, + "grad_norm": 16.86100196838379, + "learning_rate": 6.801664232989196e-06, + "loss": 4.5278, + "step": 7865 + }, + { + "epoch": 1.543742644174186, + "grad_norm": 19.707059860229492, + "learning_rate": 6.798053216028448e-06, + "loss": 4.719, + "step": 7870 + }, + { + "epoch": 1.544723420949392, + "grad_norm": 17.481386184692383, + "learning_rate": 6.794441121629013e-06, + "loss": 4.5006, + "step": 7875 + }, + { + "epoch": 1.5457041977245978, + "grad_norm": 11.351982116699219, + "learning_rate": 6.790827951955345e-06, + "loss": 4.8233, + "step": 7880 + }, + { + "epoch": 1.546684974499804, + "grad_norm": 15.135643005371094, + "learning_rate": 6.787213709172551e-06, + "loss": 4.9262, + "step": 7885 + }, + { + "epoch": 1.5476657512750098, + "grad_norm": 11.407129287719727, + "learning_rate": 6.783598395446371e-06, + "loss": 5.4497, + "step": 7890 + }, + { + "epoch": 1.5486465280502157, + "grad_norm": 13.211956024169922, + "learning_rate": 6.779982012943195e-06, + "loss": 4.975, + "step": 7895 + }, + { + "epoch": 1.5496273048254219, + "grad_norm": 23.667285919189453, + "learning_rate": 6.776364563830047e-06, + "loss": 4.7498, + "step": 7900 + }, + { + "epoch": 1.5506080816006276, + "grad_norm": 16.621427536010742, + "learning_rate": 6.772746050274598e-06, + "loss": 4.6203, + "step": 7905 + }, + { + "epoch": 1.5515888583758337, + "grad_norm": 14.443175315856934, + "learning_rate": 6.769126474445149e-06, + "loss": 4.4905, + "step": 7910 + }, + { + "epoch": 1.5525696351510396, + "grad_norm": 20.291501998901367, + "learning_rate": 6.765505838510642e-06, + "loss": 4.7088, + "step": 7915 + }, + { + "epoch": 1.5535504119262455, + "grad_norm": 19.331323623657227, + "learning_rate": 6.761884144640652e-06, + "loss": 4.5726, + "step": 7920 + }, + { + "epoch": 1.5545311887014517, + "grad_norm": 22.91404914855957, + "learning_rate": 6.758261395005391e-06, + "loss": 4.7076, + "step": 7925 + }, + { + "epoch": 1.5555119654766574, + "grad_norm": 24.691198348999023, + "learning_rate": 6.7546375917757e-06, + "loss": 4.8906, + "step": 7930 + }, + { + "epoch": 1.5564927422518635, + "grad_norm": 28.088998794555664, + "learning_rate": 6.751012737123054e-06, + "loss": 4.6132, + "step": 7935 + }, + { + "epoch": 1.5574735190270694, + "grad_norm": 10.642810821533203, + "learning_rate": 6.747386833219556e-06, + "loss": 4.9542, + "step": 7940 + }, + { + "epoch": 1.5584542958022753, + "grad_norm": 13.011480331420898, + "learning_rate": 6.7437598822379405e-06, + "loss": 4.631, + "step": 7945 + }, + { + "epoch": 1.5594350725774815, + "grad_norm": 29.10993194580078, + "learning_rate": 6.740131886351564e-06, + "loss": 4.5673, + "step": 7950 + }, + { + "epoch": 1.5604158493526872, + "grad_norm": 17.598003387451172, + "learning_rate": 6.736502847734417e-06, + "loss": 4.6614, + "step": 7955 + }, + { + "epoch": 1.5613966261278933, + "grad_norm": 20.10211753845215, + "learning_rate": 6.732872768561111e-06, + "loss": 4.5551, + "step": 7960 + }, + { + "epoch": 1.5623774029030992, + "grad_norm": 38.78089141845703, + "learning_rate": 6.729241651006876e-06, + "loss": 4.7825, + "step": 7965 + }, + { + "epoch": 1.5633581796783051, + "grad_norm": 10.49329662322998, + "learning_rate": 6.725609497247573e-06, + "loss": 5.0862, + "step": 7970 + }, + { + "epoch": 1.5643389564535113, + "grad_norm": 26.494342803955078, + "learning_rate": 6.721976309459677e-06, + "loss": 4.8202, + "step": 7975 + }, + { + "epoch": 1.565319733228717, + "grad_norm": 24.8291015625, + "learning_rate": 6.718342089820288e-06, + "loss": 4.8502, + "step": 7980 + }, + { + "epoch": 1.5663005100039231, + "grad_norm": 22.414064407348633, + "learning_rate": 6.714706840507122e-06, + "loss": 4.8922, + "step": 7985 + }, + { + "epoch": 1.567281286779129, + "grad_norm": 14.861505508422852, + "learning_rate": 6.711070563698508e-06, + "loss": 4.8528, + "step": 7990 + }, + { + "epoch": 1.568262063554335, + "grad_norm": 18.311233520507812, + "learning_rate": 6.707433261573399e-06, + "loss": 4.4056, + "step": 7995 + }, + { + "epoch": 1.569242840329541, + "grad_norm": 22.220447540283203, + "learning_rate": 6.703794936311354e-06, + "loss": 4.7109, + "step": 8000 + }, + { + "epoch": 1.570223617104747, + "grad_norm": 12.350397109985352, + "learning_rate": 6.700155590092553e-06, + "loss": 4.8512, + "step": 8005 + }, + { + "epoch": 1.571204393879953, + "grad_norm": 15.76508903503418, + "learning_rate": 6.6965152250977805e-06, + "loss": 4.8235, + "step": 8010 + }, + { + "epoch": 1.5721851706551588, + "grad_norm": 16.39112663269043, + "learning_rate": 6.692873843508436e-06, + "loss": 4.5981, + "step": 8015 + }, + { + "epoch": 1.5731659474303648, + "grad_norm": 27.190099716186523, + "learning_rate": 6.689231447506527e-06, + "loss": 4.6361, + "step": 8020 + }, + { + "epoch": 1.574146724205571, + "grad_norm": 11.988786697387695, + "learning_rate": 6.685588039274666e-06, + "loss": 4.5448, + "step": 8025 + }, + { + "epoch": 1.5751275009807768, + "grad_norm": 15.980277061462402, + "learning_rate": 6.681943620996081e-06, + "loss": 4.8218, + "step": 8030 + }, + { + "epoch": 1.5761082777559827, + "grad_norm": 16.668498992919922, + "learning_rate": 6.678298194854594e-06, + "loss": 4.2428, + "step": 8035 + }, + { + "epoch": 1.5770890545311889, + "grad_norm": 14.539552688598633, + "learning_rate": 6.674651763034636e-06, + "loss": 4.7311, + "step": 8040 + }, + { + "epoch": 1.5780698313063946, + "grad_norm": 21.027690887451172, + "learning_rate": 6.671004327721243e-06, + "loss": 4.3581, + "step": 8045 + }, + { + "epoch": 1.5790506080816007, + "grad_norm": 14.93703556060791, + "learning_rate": 6.667355891100049e-06, + "loss": 4.5448, + "step": 8050 + }, + { + "epoch": 1.5800313848568066, + "grad_norm": 30.272512435913086, + "learning_rate": 6.663706455357288e-06, + "loss": 4.6557, + "step": 8055 + }, + { + "epoch": 1.5810121616320125, + "grad_norm": 21.9456787109375, + "learning_rate": 6.660056022679795e-06, + "loss": 4.483, + "step": 8060 + }, + { + "epoch": 1.5819929384072187, + "grad_norm": 13.090523719787598, + "learning_rate": 6.6564045952549994e-06, + "loss": 4.7171, + "step": 8065 + }, + { + "epoch": 1.5829737151824244, + "grad_norm": 16.17782974243164, + "learning_rate": 6.652752175270933e-06, + "loss": 4.4794, + "step": 8070 + }, + { + "epoch": 1.5839544919576305, + "grad_norm": 17.59096908569336, + "learning_rate": 6.649098764916211e-06, + "loss": 4.4093, + "step": 8075 + }, + { + "epoch": 1.5849352687328364, + "grad_norm": 22.827747344970703, + "learning_rate": 6.64544436638005e-06, + "loss": 4.6104, + "step": 8080 + }, + { + "epoch": 1.5859160455080423, + "grad_norm": 34.059200286865234, + "learning_rate": 6.641788981852262e-06, + "loss": 5.1327, + "step": 8085 + }, + { + "epoch": 1.5868968222832485, + "grad_norm": 18.246440887451172, + "learning_rate": 6.6381326135232415e-06, + "loss": 4.7764, + "step": 8090 + }, + { + "epoch": 1.5878775990584542, + "grad_norm": 26.03481101989746, + "learning_rate": 6.634475263583978e-06, + "loss": 4.8857, + "step": 8095 + }, + { + "epoch": 1.5888583758336603, + "grad_norm": 11.254276275634766, + "learning_rate": 6.630816934226047e-06, + "loss": 5.087, + "step": 8100 + }, + { + "epoch": 1.5898391526088662, + "grad_norm": 37.02158737182617, + "learning_rate": 6.627157627641611e-06, + "loss": 4.7483, + "step": 8105 + }, + { + "epoch": 1.5908199293840721, + "grad_norm": 11.20740795135498, + "learning_rate": 6.6234973460234184e-06, + "loss": 4.747, + "step": 8110 + }, + { + "epoch": 1.5918007061592783, + "grad_norm": 17.87250328063965, + "learning_rate": 6.619836091564803e-06, + "loss": 4.5122, + "step": 8115 + }, + { + "epoch": 1.592781482934484, + "grad_norm": 21.036434173583984, + "learning_rate": 6.61617386645968e-06, + "loss": 4.6486, + "step": 8120 + }, + { + "epoch": 1.59376225970969, + "grad_norm": 12.921380996704102, + "learning_rate": 6.612510672902545e-06, + "loss": 4.5646, + "step": 8125 + }, + { + "epoch": 1.594743036484896, + "grad_norm": 23.73027992248535, + "learning_rate": 6.608846513088478e-06, + "loss": 4.7245, + "step": 8130 + }, + { + "epoch": 1.595723813260102, + "grad_norm": 22.765920639038086, + "learning_rate": 6.6051813892131355e-06, + "loss": 4.8803, + "step": 8135 + }, + { + "epoch": 1.596704590035308, + "grad_norm": 13.69679069519043, + "learning_rate": 6.601515303472752e-06, + "loss": 4.9128, + "step": 8140 + }, + { + "epoch": 1.5976853668105138, + "grad_norm": 31.66204071044922, + "learning_rate": 6.597848258064138e-06, + "loss": 4.7675, + "step": 8145 + }, + { + "epoch": 1.59866614358572, + "grad_norm": 37.54737091064453, + "learning_rate": 6.594180255184678e-06, + "loss": 4.4534, + "step": 8150 + }, + { + "epoch": 1.5996469203609258, + "grad_norm": 13.1535005569458, + "learning_rate": 6.59051129703233e-06, + "loss": 4.8452, + "step": 8155 + }, + { + "epoch": 1.6006276971361317, + "grad_norm": 16.961715698242188, + "learning_rate": 6.5868413858056315e-06, + "loss": 4.4688, + "step": 8160 + }, + { + "epoch": 1.6016084739113379, + "grad_norm": 20.165821075439453, + "learning_rate": 6.583170523703682e-06, + "loss": 4.5493, + "step": 8165 + }, + { + "epoch": 1.6025892506865438, + "grad_norm": 16.436073303222656, + "learning_rate": 6.579498712926153e-06, + "loss": 4.526, + "step": 8170 + }, + { + "epoch": 1.6035700274617497, + "grad_norm": 22.747486114501953, + "learning_rate": 6.5758259556732896e-06, + "loss": 4.8861, + "step": 8175 + }, + { + "epoch": 1.6045508042369556, + "grad_norm": 12.746312141418457, + "learning_rate": 6.572152254145898e-06, + "loss": 4.8588, + "step": 8180 + }, + { + "epoch": 1.6055315810121615, + "grad_norm": 23.855030059814453, + "learning_rate": 6.568477610545352e-06, + "loss": 4.9615, + "step": 8185 + }, + { + "epoch": 1.6065123577873677, + "grad_norm": 17.665613174438477, + "learning_rate": 6.564802027073592e-06, + "loss": 4.7032, + "step": 8190 + }, + { + "epoch": 1.6074931345625736, + "grad_norm": 31.55120849609375, + "learning_rate": 6.561125505933119e-06, + "loss": 4.8295, + "step": 8195 + }, + { + "epoch": 1.6084739113377795, + "grad_norm": 14.968411445617676, + "learning_rate": 6.557448049326997e-06, + "loss": 4.8213, + "step": 8200 + }, + { + "epoch": 1.6094546881129856, + "grad_norm": 20.886701583862305, + "learning_rate": 6.55376965945885e-06, + "loss": 4.6678, + "step": 8205 + }, + { + "epoch": 1.6104354648881913, + "grad_norm": 14.710062026977539, + "learning_rate": 6.550090338532863e-06, + "loss": 4.7658, + "step": 8210 + }, + { + "epoch": 1.6114162416633975, + "grad_norm": 13.487741470336914, + "learning_rate": 6.546410088753777e-06, + "loss": 4.5046, + "step": 8215 + }, + { + "epoch": 1.6123970184386034, + "grad_norm": 14.448518753051758, + "learning_rate": 6.54272891232689e-06, + "loss": 5.073, + "step": 8220 + }, + { + "epoch": 1.6133777952138093, + "grad_norm": 22.348711013793945, + "learning_rate": 6.539046811458056e-06, + "loss": 4.794, + "step": 8225 + }, + { + "epoch": 1.6143585719890154, + "grad_norm": 11.976225852966309, + "learning_rate": 6.53536378835368e-06, + "loss": 4.6712, + "step": 8230 + }, + { + "epoch": 1.6153393487642211, + "grad_norm": 19.27861213684082, + "learning_rate": 6.531679845220725e-06, + "loss": 4.4471, + "step": 8235 + }, + { + "epoch": 1.6163201255394273, + "grad_norm": 20.683855056762695, + "learning_rate": 6.527994984266702e-06, + "loss": 4.8398, + "step": 8240 + }, + { + "epoch": 1.6173009023146332, + "grad_norm": 23.678176879882812, + "learning_rate": 6.524309207699671e-06, + "loss": 5.0151, + "step": 8245 + }, + { + "epoch": 1.618281679089839, + "grad_norm": 25.82317543029785, + "learning_rate": 6.5206225177282435e-06, + "loss": 4.7891, + "step": 8250 + }, + { + "epoch": 1.6192624558650452, + "grad_norm": 28.66914939880371, + "learning_rate": 6.516934916561575e-06, + "loss": 4.8626, + "step": 8255 + }, + { + "epoch": 1.620243232640251, + "grad_norm": 11.398292541503906, + "learning_rate": 6.513246406409369e-06, + "loss": 4.4237, + "step": 8260 + }, + { + "epoch": 1.621224009415457, + "grad_norm": 25.888748168945312, + "learning_rate": 6.509556989481875e-06, + "loss": 4.416, + "step": 8265 + }, + { + "epoch": 1.622204786190663, + "grad_norm": 11.734145164489746, + "learning_rate": 6.505866667989884e-06, + "loss": 4.6947, + "step": 8270 + }, + { + "epoch": 1.623185562965869, + "grad_norm": 12.392928123474121, + "learning_rate": 6.50217544414473e-06, + "loss": 4.4491, + "step": 8275 + }, + { + "epoch": 1.624166339741075, + "grad_norm": 11.715088844299316, + "learning_rate": 6.498483320158282e-06, + "loss": 4.564, + "step": 8280 + }, + { + "epoch": 1.6251471165162807, + "grad_norm": 34.25434112548828, + "learning_rate": 6.494790298242962e-06, + "loss": 4.8514, + "step": 8285 + }, + { + "epoch": 1.6261278932914869, + "grad_norm": 25.465301513671875, + "learning_rate": 6.491096380611716e-06, + "loss": 4.814, + "step": 8290 + }, + { + "epoch": 1.6271086700666928, + "grad_norm": 15.51944351196289, + "learning_rate": 6.487401569478033e-06, + "loss": 4.5486, + "step": 8295 + }, + { + "epoch": 1.6280894468418987, + "grad_norm": 26.168691635131836, + "learning_rate": 6.483705867055937e-06, + "loss": 4.9638, + "step": 8300 + }, + { + "epoch": 1.6290702236171049, + "grad_norm": 40.255760192871094, + "learning_rate": 6.480009275559985e-06, + "loss": 5.1347, + "step": 8305 + }, + { + "epoch": 1.6300510003923105, + "grad_norm": 26.99652671813965, + "learning_rate": 6.4763117972052704e-06, + "loss": 4.3371, + "step": 8310 + }, + { + "epoch": 1.6310317771675167, + "grad_norm": 31.933998107910156, + "learning_rate": 6.472613434207413e-06, + "loss": 4.4121, + "step": 8315 + }, + { + "epoch": 1.6320125539427226, + "grad_norm": 22.316579818725586, + "learning_rate": 6.4689141887825655e-06, + "loss": 4.8148, + "step": 8320 + }, + { + "epoch": 1.6329933307179285, + "grad_norm": 27.438989639282227, + "learning_rate": 6.465214063147409e-06, + "loss": 4.6428, + "step": 8325 + }, + { + "epoch": 1.6339741074931347, + "grad_norm": 15.515409469604492, + "learning_rate": 6.46151305951915e-06, + "loss": 4.9047, + "step": 8330 + }, + { + "epoch": 1.6349548842683406, + "grad_norm": 14.11785888671875, + "learning_rate": 6.457811180115525e-06, + "loss": 4.4316, + "step": 8335 + }, + { + "epoch": 1.6359356610435465, + "grad_norm": 13.665918350219727, + "learning_rate": 6.454108427154792e-06, + "loss": 4.8829, + "step": 8340 + }, + { + "epoch": 1.6369164378187524, + "grad_norm": 12.321572303771973, + "learning_rate": 6.450404802855734e-06, + "loss": 4.4735, + "step": 8345 + }, + { + "epoch": 1.6378972145939583, + "grad_norm": 14.074231147766113, + "learning_rate": 6.446700309437657e-06, + "loss": 4.614, + "step": 8350 + }, + { + "epoch": 1.6388779913691645, + "grad_norm": 23.989662170410156, + "learning_rate": 6.442994949120385e-06, + "loss": 4.4395, + "step": 8355 + }, + { + "epoch": 1.6398587681443704, + "grad_norm": 23.025890350341797, + "learning_rate": 6.439288724124262e-06, + "loss": 4.642, + "step": 8360 + }, + { + "epoch": 1.6408395449195763, + "grad_norm": 22.322298049926758, + "learning_rate": 6.435581636670154e-06, + "loss": 5.0991, + "step": 8365 + }, + { + "epoch": 1.6418203216947824, + "grad_norm": 15.972933769226074, + "learning_rate": 6.43187368897944e-06, + "loss": 4.6842, + "step": 8370 + }, + { + "epoch": 1.6428010984699881, + "grad_norm": 18.866586685180664, + "learning_rate": 6.4281648832740155e-06, + "loss": 4.7774, + "step": 8375 + }, + { + "epoch": 1.6437818752451943, + "grad_norm": 20.908037185668945, + "learning_rate": 6.424455221776286e-06, + "loss": 4.9527, + "step": 8380 + }, + { + "epoch": 1.6447626520204002, + "grad_norm": 22.348464965820312, + "learning_rate": 6.420744706709181e-06, + "loss": 4.6132, + "step": 8385 + }, + { + "epoch": 1.645743428795606, + "grad_norm": 27.352182388305664, + "learning_rate": 6.417033340296131e-06, + "loss": 4.8313, + "step": 8390 + }, + { + "epoch": 1.6467242055708122, + "grad_norm": 16.535625457763672, + "learning_rate": 6.413321124761082e-06, + "loss": 4.9389, + "step": 8395 + }, + { + "epoch": 1.647704982346018, + "grad_norm": 17.407255172729492, + "learning_rate": 6.409608062328483e-06, + "loss": 4.7184, + "step": 8400 + }, + { + "epoch": 1.648685759121224, + "grad_norm": 20.512407302856445, + "learning_rate": 6.405894155223296e-06, + "loss": 4.524, + "step": 8405 + }, + { + "epoch": 1.64966653589643, + "grad_norm": 16.161840438842773, + "learning_rate": 6.402179405670987e-06, + "loss": 4.9194, + "step": 8410 + }, + { + "epoch": 1.650647312671636, + "grad_norm": 16.505544662475586, + "learning_rate": 6.39846381589753e-06, + "loss": 4.9257, + "step": 8415 + }, + { + "epoch": 1.651628089446842, + "grad_norm": 21.150869369506836, + "learning_rate": 6.394747388129397e-06, + "loss": 5.1818, + "step": 8420 + }, + { + "epoch": 1.6526088662220477, + "grad_norm": 21.39322853088379, + "learning_rate": 6.391030124593567e-06, + "loss": 4.4558, + "step": 8425 + }, + { + "epoch": 1.6535896429972539, + "grad_norm": 22.720727920532227, + "learning_rate": 6.387312027517516e-06, + "loss": 5.0587, + "step": 8430 + }, + { + "epoch": 1.6545704197724598, + "grad_norm": 24.188188552856445, + "learning_rate": 6.383593099129223e-06, + "loss": 4.6071, + "step": 8435 + }, + { + "epoch": 1.6555511965476657, + "grad_norm": 42.62229537963867, + "learning_rate": 6.3798733416571615e-06, + "loss": 4.7521, + "step": 8440 + }, + { + "epoch": 1.6565319733228718, + "grad_norm": 18.189743041992188, + "learning_rate": 6.376152757330305e-06, + "loss": 4.9016, + "step": 8445 + }, + { + "epoch": 1.6575127500980775, + "grad_norm": 23.545324325561523, + "learning_rate": 6.37243134837812e-06, + "loss": 4.9774, + "step": 8450 + }, + { + "epoch": 1.6584935268732837, + "grad_norm": 12.068930625915527, + "learning_rate": 6.368709117030568e-06, + "loss": 4.3265, + "step": 8455 + }, + { + "epoch": 1.6594743036484896, + "grad_norm": 19.71160316467285, + "learning_rate": 6.364986065518106e-06, + "loss": 4.7148, + "step": 8460 + }, + { + "epoch": 1.6604550804236955, + "grad_norm": 12.18775463104248, + "learning_rate": 6.361262196071679e-06, + "loss": 4.5165, + "step": 8465 + }, + { + "epoch": 1.6614358571989016, + "grad_norm": 20.218761444091797, + "learning_rate": 6.357537510922723e-06, + "loss": 4.7621, + "step": 8470 + }, + { + "epoch": 1.6624166339741073, + "grad_norm": 24.970178604125977, + "learning_rate": 6.353812012303162e-06, + "loss": 4.9106, + "step": 8475 + }, + { + "epoch": 1.6633974107493135, + "grad_norm": 15.730690956115723, + "learning_rate": 6.3500857024454085e-06, + "loss": 4.616, + "step": 8480 + }, + { + "epoch": 1.6643781875245194, + "grad_norm": 17.307218551635742, + "learning_rate": 6.346358583582364e-06, + "loss": 4.4639, + "step": 8485 + }, + { + "epoch": 1.6653589642997253, + "grad_norm": 27.932443618774414, + "learning_rate": 6.342630657947409e-06, + "loss": 4.9017, + "step": 8490 + }, + { + "epoch": 1.6663397410749314, + "grad_norm": 32.948543548583984, + "learning_rate": 6.338901927774409e-06, + "loss": 4.433, + "step": 8495 + }, + { + "epoch": 1.6673205178501374, + "grad_norm": 16.31046485900879, + "learning_rate": 6.335172395297716e-06, + "loss": 4.6377, + "step": 8500 + }, + { + "epoch": 1.6683012946253433, + "grad_norm": 34.126861572265625, + "learning_rate": 6.331442062752159e-06, + "loss": 5.0827, + "step": 8505 + }, + { + "epoch": 1.6692820714005492, + "grad_norm": 25.11467170715332, + "learning_rate": 6.327710932373046e-06, + "loss": 4.8379, + "step": 8510 + }, + { + "epoch": 1.670262848175755, + "grad_norm": 23.186079025268555, + "learning_rate": 6.3239790063961635e-06, + "loss": 4.4733, + "step": 8515 + }, + { + "epoch": 1.6712436249509612, + "grad_norm": 29.186731338500977, + "learning_rate": 6.320246287057778e-06, + "loss": 4.5714, + "step": 8520 + }, + { + "epoch": 1.6722244017261672, + "grad_norm": 17.12517738342285, + "learning_rate": 6.316512776594626e-06, + "loss": 4.3804, + "step": 8525 + }, + { + "epoch": 1.673205178501373, + "grad_norm": 10.104792594909668, + "learning_rate": 6.3127784772439215e-06, + "loss": 5.0567, + "step": 8530 + }, + { + "epoch": 1.6741859552765792, + "grad_norm": 15.159939765930176, + "learning_rate": 6.309043391243351e-06, + "loss": 4.5438, + "step": 8535 + }, + { + "epoch": 1.675166732051785, + "grad_norm": 20.949722290039062, + "learning_rate": 6.305307520831075e-06, + "loss": 4.4797, + "step": 8540 + }, + { + "epoch": 1.676147508826991, + "grad_norm": 14.368339538574219, + "learning_rate": 6.3015708682457155e-06, + "loss": 4.7613, + "step": 8545 + }, + { + "epoch": 1.677128285602197, + "grad_norm": 13.72527027130127, + "learning_rate": 6.29783343572637e-06, + "loss": 4.9441, + "step": 8550 + }, + { + "epoch": 1.6781090623774029, + "grad_norm": 37.21997833251953, + "learning_rate": 6.294095225512604e-06, + "loss": 4.5065, + "step": 8555 + }, + { + "epoch": 1.679089839152609, + "grad_norm": 19.89255714416504, + "learning_rate": 6.290356239844446e-06, + "loss": 4.605, + "step": 8560 + }, + { + "epoch": 1.6800706159278147, + "grad_norm": 30.77366828918457, + "learning_rate": 6.286616480962392e-06, + "loss": 4.7847, + "step": 8565 + }, + { + "epoch": 1.6810513927030208, + "grad_norm": 15.917234420776367, + "learning_rate": 6.282875951107396e-06, + "loss": 4.408, + "step": 8570 + }, + { + "epoch": 1.6820321694782268, + "grad_norm": 18.189117431640625, + "learning_rate": 6.279134652520881e-06, + "loss": 4.7609, + "step": 8575 + }, + { + "epoch": 1.6830129462534327, + "grad_norm": 30.72614097595215, + "learning_rate": 6.275392587444724e-06, + "loss": 4.7158, + "step": 8580 + }, + { + "epoch": 1.6839937230286388, + "grad_norm": 11.812881469726562, + "learning_rate": 6.271649758121268e-06, + "loss": 4.4278, + "step": 8585 + }, + { + "epoch": 1.6849744998038445, + "grad_norm": 17.11888313293457, + "learning_rate": 6.267906166793306e-06, + "loss": 4.3961, + "step": 8590 + }, + { + "epoch": 1.6859552765790506, + "grad_norm": 22.773406982421875, + "learning_rate": 6.264161815704096e-06, + "loss": 4.9114, + "step": 8595 + }, + { + "epoch": 1.6869360533542566, + "grad_norm": 26.260541915893555, + "learning_rate": 6.260416707097345e-06, + "loss": 4.6699, + "step": 8600 + }, + { + "epoch": 1.6879168301294625, + "grad_norm": 24.89080238342285, + "learning_rate": 6.256670843217217e-06, + "loss": 4.6868, + "step": 8605 + }, + { + "epoch": 1.6888976069046686, + "grad_norm": 17.895580291748047, + "learning_rate": 6.2529242263083305e-06, + "loss": 4.6321, + "step": 8610 + }, + { + "epoch": 1.6898783836798743, + "grad_norm": 15.826708793640137, + "learning_rate": 6.249176858615746e-06, + "loss": 4.4789, + "step": 8615 + }, + { + "epoch": 1.6908591604550804, + "grad_norm": 20.98354148864746, + "learning_rate": 6.2454287423849865e-06, + "loss": 4.5294, + "step": 8620 + }, + { + "epoch": 1.6918399372302864, + "grad_norm": 12.527328491210938, + "learning_rate": 6.241679879862015e-06, + "loss": 4.5682, + "step": 8625 + }, + { + "epoch": 1.6928207140054923, + "grad_norm": 12.26150131225586, + "learning_rate": 6.237930273293244e-06, + "loss": 4.6067, + "step": 8630 + }, + { + "epoch": 1.6938014907806984, + "grad_norm": 14.805673599243164, + "learning_rate": 6.234179924925532e-06, + "loss": 4.5776, + "step": 8635 + }, + { + "epoch": 1.6947822675559043, + "grad_norm": 27.439050674438477, + "learning_rate": 6.230428837006184e-06, + "loss": 4.6483, + "step": 8640 + }, + { + "epoch": 1.6957630443311102, + "grad_norm": 17.67620277404785, + "learning_rate": 6.226677011782944e-06, + "loss": 4.6386, + "step": 8645 + }, + { + "epoch": 1.6967438211063162, + "grad_norm": 27.96957015991211, + "learning_rate": 6.222924451504001e-06, + "loss": 4.2794, + "step": 8650 + }, + { + "epoch": 1.697724597881522, + "grad_norm": 31.630573272705078, + "learning_rate": 6.219171158417981e-06, + "loss": 5.1995, + "step": 8655 + }, + { + "epoch": 1.6987053746567282, + "grad_norm": 18.49655532836914, + "learning_rate": 6.215417134773956e-06, + "loss": 4.5782, + "step": 8660 + }, + { + "epoch": 1.6996861514319341, + "grad_norm": 14.828500747680664, + "learning_rate": 6.211662382821428e-06, + "loss": 4.5066, + "step": 8665 + }, + { + "epoch": 1.70066692820714, + "grad_norm": 18.147098541259766, + "learning_rate": 6.207906904810341e-06, + "loss": 4.6078, + "step": 8670 + }, + { + "epoch": 1.701647704982346, + "grad_norm": 10.830955505371094, + "learning_rate": 6.20415070299107e-06, + "loss": 4.5747, + "step": 8675 + }, + { + "epoch": 1.7026284817575519, + "grad_norm": 17.871294021606445, + "learning_rate": 6.200393779614425e-06, + "loss": 4.7012, + "step": 8680 + }, + { + "epoch": 1.703609258532758, + "grad_norm": 16.49895477294922, + "learning_rate": 6.196636136931652e-06, + "loss": 4.5185, + "step": 8685 + }, + { + "epoch": 1.704590035307964, + "grad_norm": 15.448025703430176, + "learning_rate": 6.192877777194422e-06, + "loss": 4.8708, + "step": 8690 + }, + { + "epoch": 1.7055708120831699, + "grad_norm": 21.347553253173828, + "learning_rate": 6.18911870265484e-06, + "loss": 4.6123, + "step": 8695 + }, + { + "epoch": 1.706551588858376, + "grad_norm": 13.813310623168945, + "learning_rate": 6.185358915565438e-06, + "loss": 4.7046, + "step": 8700 + }, + { + "epoch": 1.7075323656335817, + "grad_norm": 24.650951385498047, + "learning_rate": 6.181598418179173e-06, + "loss": 5.0376, + "step": 8705 + }, + { + "epoch": 1.7085131424087878, + "grad_norm": 26.78155517578125, + "learning_rate": 6.177837212749432e-06, + "loss": 4.452, + "step": 8710 + }, + { + "epoch": 1.7094939191839937, + "grad_norm": 14.59371566772461, + "learning_rate": 6.174075301530024e-06, + "loss": 4.9905, + "step": 8715 + }, + { + "epoch": 1.7104746959591997, + "grad_norm": 27.0151424407959, + "learning_rate": 6.17031268677518e-06, + "loss": 4.612, + "step": 8720 + }, + { + "epoch": 1.7114554727344058, + "grad_norm": 27.519006729125977, + "learning_rate": 6.166549370739553e-06, + "loss": 4.7261, + "step": 8725 + }, + { + "epoch": 1.7124362495096115, + "grad_norm": 21.014606475830078, + "learning_rate": 6.162785355678215e-06, + "loss": 4.664, + "step": 8730 + }, + { + "epoch": 1.7134170262848176, + "grad_norm": 21.153648376464844, + "learning_rate": 6.1590206438466605e-06, + "loss": 4.6907, + "step": 8735 + }, + { + "epoch": 1.7143978030600235, + "grad_norm": 20.082855224609375, + "learning_rate": 6.1552552375008e-06, + "loss": 4.4854, + "step": 8740 + }, + { + "epoch": 1.7153785798352295, + "grad_norm": 26.873451232910156, + "learning_rate": 6.15148913889696e-06, + "loss": 5.1138, + "step": 8745 + }, + { + "epoch": 1.7163593566104356, + "grad_norm": 12.644761085510254, + "learning_rate": 6.147722350291878e-06, + "loss": 4.847, + "step": 8750 + }, + { + "epoch": 1.7173401333856413, + "grad_norm": 30.004619598388672, + "learning_rate": 6.143954873942712e-06, + "loss": 4.6217, + "step": 8755 + }, + { + "epoch": 1.7183209101608474, + "grad_norm": 15.3263578414917, + "learning_rate": 6.140186712107027e-06, + "loss": 4.5504, + "step": 8760 + }, + { + "epoch": 1.7193016869360533, + "grad_norm": 17.260662078857422, + "learning_rate": 6.136417867042801e-06, + "loss": 4.3313, + "step": 8765 + }, + { + "epoch": 1.7202824637112593, + "grad_norm": 22.777530670166016, + "learning_rate": 6.132648341008421e-06, + "loss": 4.7004, + "step": 8770 + }, + { + "epoch": 1.7212632404864654, + "grad_norm": 9.795730590820312, + "learning_rate": 6.128878136262678e-06, + "loss": 4.9465, + "step": 8775 + }, + { + "epoch": 1.722244017261671, + "grad_norm": 13.725994110107422, + "learning_rate": 6.1251072550647775e-06, + "loss": 4.4856, + "step": 8780 + }, + { + "epoch": 1.7232247940368772, + "grad_norm": 12.93813705444336, + "learning_rate": 6.1213356996743265e-06, + "loss": 4.4976, + "step": 8785 + }, + { + "epoch": 1.7242055708120831, + "grad_norm": 28.854127883911133, + "learning_rate": 6.117563472351334e-06, + "loss": 4.643, + "step": 8790 + }, + { + "epoch": 1.725186347587289, + "grad_norm": 17.60565757751465, + "learning_rate": 6.1137905753562155e-06, + "loss": 4.8717, + "step": 8795 + }, + { + "epoch": 1.7261671243624952, + "grad_norm": 18.80259132385254, + "learning_rate": 6.110017010949783e-06, + "loss": 4.4856, + "step": 8800 + }, + { + "epoch": 1.7271479011377011, + "grad_norm": 8.956795692443848, + "learning_rate": 6.106242781393251e-06, + "loss": 4.6121, + "step": 8805 + }, + { + "epoch": 1.728128677912907, + "grad_norm": 17.677858352661133, + "learning_rate": 6.102467888948236e-06, + "loss": 4.448, + "step": 8810 + }, + { + "epoch": 1.729109454688113, + "grad_norm": 16.494901657104492, + "learning_rate": 6.098692335876746e-06, + "loss": 4.4203, + "step": 8815 + }, + { + "epoch": 1.7300902314633189, + "grad_norm": 14.752246856689453, + "learning_rate": 6.0949161244411885e-06, + "loss": 4.6896, + "step": 8820 + }, + { + "epoch": 1.731071008238525, + "grad_norm": 10.59720516204834, + "learning_rate": 6.091139256904363e-06, + "loss": 4.4049, + "step": 8825 + }, + { + "epoch": 1.732051785013731, + "grad_norm": 22.1069393157959, + "learning_rate": 6.087361735529464e-06, + "loss": 4.7651, + "step": 8830 + }, + { + "epoch": 1.7330325617889368, + "grad_norm": 16.202037811279297, + "learning_rate": 6.083583562580078e-06, + "loss": 4.6475, + "step": 8835 + }, + { + "epoch": 1.7340133385641427, + "grad_norm": 32.283241271972656, + "learning_rate": 6.079804740320181e-06, + "loss": 4.8508, + "step": 8840 + }, + { + "epoch": 1.7349941153393487, + "grad_norm": 15.124344825744629, + "learning_rate": 6.076025271014138e-06, + "loss": 4.5117, + "step": 8845 + }, + { + "epoch": 1.7359748921145548, + "grad_norm": 22.1917724609375, + "learning_rate": 6.0722451569267015e-06, + "loss": 4.5844, + "step": 8850 + }, + { + "epoch": 1.7369556688897607, + "grad_norm": 21.06514549255371, + "learning_rate": 6.06846440032301e-06, + "loss": 4.647, + "step": 8855 + }, + { + "epoch": 1.7379364456649666, + "grad_norm": 29.728837966918945, + "learning_rate": 6.064683003468591e-06, + "loss": 4.5468, + "step": 8860 + }, + { + "epoch": 1.7389172224401728, + "grad_norm": 15.97719669342041, + "learning_rate": 6.060900968629352e-06, + "loss": 4.7311, + "step": 8865 + }, + { + "epoch": 1.7398979992153785, + "grad_norm": 15.47018814086914, + "learning_rate": 6.05711829807158e-06, + "loss": 4.6125, + "step": 8870 + }, + { + "epoch": 1.7408787759905846, + "grad_norm": 22.31346893310547, + "learning_rate": 6.05333499406195e-06, + "loss": 4.6249, + "step": 8875 + }, + { + "epoch": 1.7418595527657905, + "grad_norm": 38.01810836791992, + "learning_rate": 6.04955105886751e-06, + "loss": 5.4116, + "step": 8880 + }, + { + "epoch": 1.7428403295409964, + "grad_norm": 16.22886085510254, + "learning_rate": 6.045766494755692e-06, + "loss": 4.9962, + "step": 8885 + }, + { + "epoch": 1.7438211063162026, + "grad_norm": 25.318523406982422, + "learning_rate": 6.0419813039943e-06, + "loss": 4.7032, + "step": 8890 + }, + { + "epoch": 1.7448018830914083, + "grad_norm": 28.022945404052734, + "learning_rate": 6.038195488851515e-06, + "loss": 4.5836, + "step": 8895 + }, + { + "epoch": 1.7457826598666144, + "grad_norm": 9.770703315734863, + "learning_rate": 6.0344090515958946e-06, + "loss": 4.6167, + "step": 8900 + }, + { + "epoch": 1.7467634366418203, + "grad_norm": 19.886987686157227, + "learning_rate": 6.030621994496365e-06, + "loss": 4.3776, + "step": 8905 + }, + { + "epoch": 1.7477442134170262, + "grad_norm": 14.081059455871582, + "learning_rate": 6.026834319822228e-06, + "loss": 4.5549, + "step": 8910 + }, + { + "epoch": 1.7487249901922324, + "grad_norm": 21.361549377441406, + "learning_rate": 6.0230460298431525e-06, + "loss": 4.8304, + "step": 8915 + }, + { + "epoch": 1.749705766967438, + "grad_norm": 12.771543502807617, + "learning_rate": 6.019257126829178e-06, + "loss": 4.5428, + "step": 8920 + }, + { + "epoch": 1.7506865437426442, + "grad_norm": 36.33089065551758, + "learning_rate": 6.015467613050708e-06, + "loss": 5.1167, + "step": 8925 + }, + { + "epoch": 1.7506865437426442, + "eval_loss": 4.88515567779541, + "eval_runtime": 7.8212, + "eval_samples_per_second": 26.722, + "eval_steps_per_second": 13.425, + "step": 8925 + }, + { + "epoch": 1.7516673205178501, + "grad_norm": 15.354480743408203, + "learning_rate": 6.0116774907785154e-06, + "loss": 5.1371, + "step": 8930 + }, + { + "epoch": 1.752648097293056, + "grad_norm": 9.22092056274414, + "learning_rate": 6.00788676228374e-06, + "loss": 4.2449, + "step": 8935 + }, + { + "epoch": 1.7536288740682622, + "grad_norm": 14.879875183105469, + "learning_rate": 6.004095429837878e-06, + "loss": 4.9914, + "step": 8940 + }, + { + "epoch": 1.7546096508434679, + "grad_norm": 20.033912658691406, + "learning_rate": 6.000303495712791e-06, + "loss": 4.5591, + "step": 8945 + }, + { + "epoch": 1.755590427618674, + "grad_norm": 19.408241271972656, + "learning_rate": 5.996510962180704e-06, + "loss": 4.9889, + "step": 8950 + }, + { + "epoch": 1.75657120439388, + "grad_norm": 18.337635040283203, + "learning_rate": 5.992717831514196e-06, + "loss": 4.8683, + "step": 8955 + }, + { + "epoch": 1.7575519811690858, + "grad_norm": 10.698108673095703, + "learning_rate": 5.988924105986207e-06, + "loss": 4.7408, + "step": 8960 + }, + { + "epoch": 1.758532757944292, + "grad_norm": 20.650182723999023, + "learning_rate": 5.985129787870032e-06, + "loss": 4.748, + "step": 8965 + }, + { + "epoch": 1.759513534719498, + "grad_norm": 16.508623123168945, + "learning_rate": 5.981334879439324e-06, + "loss": 4.7143, + "step": 8970 + }, + { + "epoch": 1.7604943114947038, + "grad_norm": 38.822044372558594, + "learning_rate": 5.9775393829680865e-06, + "loss": 4.8905, + "step": 8975 + }, + { + "epoch": 1.7614750882699097, + "grad_norm": 16.134923934936523, + "learning_rate": 5.973743300730674e-06, + "loss": 4.4573, + "step": 8980 + }, + { + "epoch": 1.7624558650451156, + "grad_norm": 29.342491149902344, + "learning_rate": 5.9699466350017975e-06, + "loss": 4.5743, + "step": 8985 + }, + { + "epoch": 1.7634366418203218, + "grad_norm": 30.499624252319336, + "learning_rate": 5.9661493880565136e-06, + "loss": 4.512, + "step": 8990 + }, + { + "epoch": 1.7644174185955277, + "grad_norm": 25.990304946899414, + "learning_rate": 5.9623515621702275e-06, + "loss": 4.7905, + "step": 8995 + }, + { + "epoch": 1.7653981953707336, + "grad_norm": 13.431137084960938, + "learning_rate": 5.958553159618693e-06, + "loss": 5.4404, + "step": 9000 + }, + { + "epoch": 1.7663789721459395, + "grad_norm": 24.7120361328125, + "learning_rate": 5.954754182678008e-06, + "loss": 4.8807, + "step": 9005 + }, + { + "epoch": 1.7673597489211454, + "grad_norm": 22.7612361907959, + "learning_rate": 5.9509546336246135e-06, + "loss": 4.3851, + "step": 9010 + }, + { + "epoch": 1.7683405256963516, + "grad_norm": 20.71037483215332, + "learning_rate": 5.947154514735299e-06, + "loss": 4.7264, + "step": 9015 + }, + { + "epoch": 1.7693213024715575, + "grad_norm": 20.132646560668945, + "learning_rate": 5.943353828287185e-06, + "loss": 4.525, + "step": 9020 + }, + { + "epoch": 1.7703020792467634, + "grad_norm": 21.293230056762695, + "learning_rate": 5.939552576557743e-06, + "loss": 4.5384, + "step": 9025 + }, + { + "epoch": 1.7712828560219696, + "grad_norm": 33.4547233581543, + "learning_rate": 5.935750761824777e-06, + "loss": 4.6896, + "step": 9030 + }, + { + "epoch": 1.7722636327971752, + "grad_norm": 28.96019744873047, + "learning_rate": 5.9319483863664306e-06, + "loss": 4.4839, + "step": 9035 + }, + { + "epoch": 1.7732444095723814, + "grad_norm": 21.133338928222656, + "learning_rate": 5.928145452461183e-06, + "loss": 4.696, + "step": 9040 + }, + { + "epoch": 1.7742251863475873, + "grad_norm": 32.1081657409668, + "learning_rate": 5.9243419623878485e-06, + "loss": 4.739, + "step": 9045 + }, + { + "epoch": 1.7752059631227932, + "grad_norm": 22.64603614807129, + "learning_rate": 5.920537918425571e-06, + "loss": 4.445, + "step": 9050 + }, + { + "epoch": 1.7761867398979994, + "grad_norm": 24.658477783203125, + "learning_rate": 5.916733322853831e-06, + "loss": 4.4289, + "step": 9055 + }, + { + "epoch": 1.777167516673205, + "grad_norm": 20.534879684448242, + "learning_rate": 5.912928177952438e-06, + "loss": 4.7319, + "step": 9060 + }, + { + "epoch": 1.7781482934484112, + "grad_norm": 23.79903793334961, + "learning_rate": 5.909122486001531e-06, + "loss": 4.5808, + "step": 9065 + }, + { + "epoch": 1.779129070223617, + "grad_norm": 13.706748962402344, + "learning_rate": 5.905316249281575e-06, + "loss": 4.624, + "step": 9070 + }, + { + "epoch": 1.780109846998823, + "grad_norm": 37.71672439575195, + "learning_rate": 5.901509470073364e-06, + "loss": 4.6139, + "step": 9075 + }, + { + "epoch": 1.7810906237740292, + "grad_norm": 18.964744567871094, + "learning_rate": 5.897702150658015e-06, + "loss": 4.6728, + "step": 9080 + }, + { + "epoch": 1.7820714005492349, + "grad_norm": 11.068283081054688, + "learning_rate": 5.89389429331697e-06, + "loss": 4.5614, + "step": 9085 + }, + { + "epoch": 1.783052177324441, + "grad_norm": 14.902594566345215, + "learning_rate": 5.890085900331991e-06, + "loss": 4.6259, + "step": 9090 + }, + { + "epoch": 1.784032954099647, + "grad_norm": 26.177350997924805, + "learning_rate": 5.8862769739851655e-06, + "loss": 4.7789, + "step": 9095 + }, + { + "epoch": 1.7850137308748528, + "grad_norm": 11.954874038696289, + "learning_rate": 5.882467516558896e-06, + "loss": 4.6356, + "step": 9100 + }, + { + "epoch": 1.785994507650059, + "grad_norm": 26.418127059936523, + "learning_rate": 5.878657530335906e-06, + "loss": 4.6969, + "step": 9105 + }, + { + "epoch": 1.7869752844252647, + "grad_norm": 24.923782348632812, + "learning_rate": 5.874847017599236e-06, + "loss": 4.3997, + "step": 9110 + }, + { + "epoch": 1.7879560612004708, + "grad_norm": 24.46477699279785, + "learning_rate": 5.87103598063224e-06, + "loss": 4.5095, + "step": 9115 + }, + { + "epoch": 1.7889368379756767, + "grad_norm": 21.268714904785156, + "learning_rate": 5.867224421718587e-06, + "loss": 4.9617, + "step": 9120 + }, + { + "epoch": 1.7899176147508826, + "grad_norm": 16.486682891845703, + "learning_rate": 5.863412343142258e-06, + "loss": 4.1348, + "step": 9125 + }, + { + "epoch": 1.7908983915260888, + "grad_norm": 15.189291954040527, + "learning_rate": 5.8595997471875465e-06, + "loss": 4.442, + "step": 9130 + }, + { + "epoch": 1.7918791683012947, + "grad_norm": 12.895219802856445, + "learning_rate": 5.855786636139058e-06, + "loss": 4.9543, + "step": 9135 + }, + { + "epoch": 1.7928599450765006, + "grad_norm": 13.546518325805664, + "learning_rate": 5.8519730122817045e-06, + "loss": 4.6226, + "step": 9140 + }, + { + "epoch": 1.7938407218517065, + "grad_norm": 16.05805206298828, + "learning_rate": 5.848158877900702e-06, + "loss": 4.5933, + "step": 9145 + }, + { + "epoch": 1.7948214986269124, + "grad_norm": 19.39604377746582, + "learning_rate": 5.844344235281578e-06, + "loss": 4.5569, + "step": 9150 + }, + { + "epoch": 1.7958022754021186, + "grad_norm": 20.664596557617188, + "learning_rate": 5.840529086710163e-06, + "loss": 4.5215, + "step": 9155 + }, + { + "epoch": 1.7967830521773245, + "grad_norm": 24.75821304321289, + "learning_rate": 5.836713434472587e-06, + "loss": 4.7306, + "step": 9160 + }, + { + "epoch": 1.7977638289525304, + "grad_norm": 38.528533935546875, + "learning_rate": 5.832897280855289e-06, + "loss": 4.8929, + "step": 9165 + }, + { + "epoch": 1.7987446057277363, + "grad_norm": 17.18060874938965, + "learning_rate": 5.8290806281450004e-06, + "loss": 4.6035, + "step": 9170 + }, + { + "epoch": 1.7997253825029422, + "grad_norm": 20.632356643676758, + "learning_rate": 5.8252634786287574e-06, + "loss": 4.5474, + "step": 9175 + }, + { + "epoch": 1.8007061592781484, + "grad_norm": 21.149751663208008, + "learning_rate": 5.821445834593889e-06, + "loss": 4.7935, + "step": 9180 + }, + { + "epoch": 1.8016869360533543, + "grad_norm": 21.72342300415039, + "learning_rate": 5.817627698328029e-06, + "loss": 4.3023, + "step": 9185 + }, + { + "epoch": 1.8026677128285602, + "grad_norm": 21.94120216369629, + "learning_rate": 5.813809072119098e-06, + "loss": 4.719, + "step": 9190 + }, + { + "epoch": 1.8036484896037663, + "grad_norm": 21.152067184448242, + "learning_rate": 5.80998995825531e-06, + "loss": 5.0133, + "step": 9195 + }, + { + "epoch": 1.804629266378972, + "grad_norm": 17.194652557373047, + "learning_rate": 5.806170359025177e-06, + "loss": 4.6929, + "step": 9200 + }, + { + "epoch": 1.8056100431541782, + "grad_norm": 22.314577102661133, + "learning_rate": 5.802350276717498e-06, + "loss": 4.5256, + "step": 9205 + }, + { + "epoch": 1.806590819929384, + "grad_norm": 18.893587112426758, + "learning_rate": 5.798529713621364e-06, + "loss": 4.5493, + "step": 9210 + }, + { + "epoch": 1.80757159670459, + "grad_norm": 25.69357681274414, + "learning_rate": 5.7947086720261495e-06, + "loss": 4.5069, + "step": 9215 + }, + { + "epoch": 1.8085523734797961, + "grad_norm": 13.256308555603027, + "learning_rate": 5.790887154221521e-06, + "loss": 4.6355, + "step": 9220 + }, + { + "epoch": 1.8095331502550018, + "grad_norm": 26.34750747680664, + "learning_rate": 5.787065162497427e-06, + "loss": 4.2925, + "step": 9225 + }, + { + "epoch": 1.810513927030208, + "grad_norm": 21.691946029663086, + "learning_rate": 5.7832426991441014e-06, + "loss": 4.7569, + "step": 9230 + }, + { + "epoch": 1.8114947038054139, + "grad_norm": 22.89154052734375, + "learning_rate": 5.77941976645206e-06, + "loss": 4.5489, + "step": 9235 + }, + { + "epoch": 1.8124754805806198, + "grad_norm": 13.551591873168945, + "learning_rate": 5.775596366712101e-06, + "loss": 4.6459, + "step": 9240 + }, + { + "epoch": 1.813456257355826, + "grad_norm": 35.56877899169922, + "learning_rate": 5.771772502215301e-06, + "loss": 5.1302, + "step": 9245 + }, + { + "epoch": 1.8144370341310316, + "grad_norm": 22.027677536010742, + "learning_rate": 5.767948175253015e-06, + "loss": 4.9661, + "step": 9250 + }, + { + "epoch": 1.8154178109062378, + "grad_norm": 10.572216987609863, + "learning_rate": 5.764123388116877e-06, + "loss": 4.5735, + "step": 9255 + }, + { + "epoch": 1.8163985876814437, + "grad_norm": 17.56256866455078, + "learning_rate": 5.760298143098797e-06, + "loss": 4.725, + "step": 9260 + }, + { + "epoch": 1.8173793644566496, + "grad_norm": 36.0407829284668, + "learning_rate": 5.756472442490954e-06, + "loss": 4.8934, + "step": 9265 + }, + { + "epoch": 1.8183601412318557, + "grad_norm": 21.272165298461914, + "learning_rate": 5.752646288585808e-06, + "loss": 4.488, + "step": 9270 + }, + { + "epoch": 1.8193409180070614, + "grad_norm": 19.818387985229492, + "learning_rate": 5.748819683676083e-06, + "loss": 4.6846, + "step": 9275 + }, + { + "epoch": 1.8203216947822676, + "grad_norm": 28.447729110717773, + "learning_rate": 5.744992630054779e-06, + "loss": 4.6269, + "step": 9280 + }, + { + "epoch": 1.8213024715574735, + "grad_norm": 27.741268157958984, + "learning_rate": 5.7411651300151624e-06, + "loss": 4.8284, + "step": 9285 + }, + { + "epoch": 1.8222832483326794, + "grad_norm": 17.49547576904297, + "learning_rate": 5.737337185850769e-06, + "loss": 4.5584, + "step": 9290 + }, + { + "epoch": 1.8232640251078855, + "grad_norm": 24.814552307128906, + "learning_rate": 5.733508799855396e-06, + "loss": 4.6543, + "step": 9295 + }, + { + "epoch": 1.8242448018830915, + "grad_norm": 16.014293670654297, + "learning_rate": 5.7296799743231125e-06, + "loss": 4.4332, + "step": 9300 + }, + { + "epoch": 1.8252255786582974, + "grad_norm": 27.853015899658203, + "learning_rate": 5.725850711548242e-06, + "loss": 4.3259, + "step": 9305 + }, + { + "epoch": 1.8262063554335033, + "grad_norm": 16.20301628112793, + "learning_rate": 5.722021013825378e-06, + "loss": 4.5385, + "step": 9310 + }, + { + "epoch": 1.8271871322087092, + "grad_norm": 26.664472579956055, + "learning_rate": 5.718190883449373e-06, + "loss": 4.4814, + "step": 9315 + }, + { + "epoch": 1.8281679089839153, + "grad_norm": 39.86537170410156, + "learning_rate": 5.714360322715335e-06, + "loss": 4.6435, + "step": 9320 + }, + { + "epoch": 1.8291486857591213, + "grad_norm": 32.14000701904297, + "learning_rate": 5.710529333918633e-06, + "loss": 4.529, + "step": 9325 + }, + { + "epoch": 1.8301294625343272, + "grad_norm": 21.911773681640625, + "learning_rate": 5.706697919354892e-06, + "loss": 4.801, + "step": 9330 + }, + { + "epoch": 1.8311102393095333, + "grad_norm": 17.9941463470459, + "learning_rate": 5.702866081319992e-06, + "loss": 4.6752, + "step": 9335 + }, + { + "epoch": 1.832091016084739, + "grad_norm": 24.887266159057617, + "learning_rate": 5.699033822110066e-06, + "loss": 4.8302, + "step": 9340 + }, + { + "epoch": 1.8330717928599451, + "grad_norm": 18.100421905517578, + "learning_rate": 5.6952011440215e-06, + "loss": 4.4905, + "step": 9345 + }, + { + "epoch": 1.834052569635151, + "grad_norm": 12.726563453674316, + "learning_rate": 5.691368049350932e-06, + "loss": 5.1001, + "step": 9350 + }, + { + "epoch": 1.835033346410357, + "grad_norm": 13.885564804077148, + "learning_rate": 5.687534540395247e-06, + "loss": 4.3967, + "step": 9355 + }, + { + "epoch": 1.8360141231855631, + "grad_norm": 13.186820030212402, + "learning_rate": 5.683700619451584e-06, + "loss": 4.536, + "step": 9360 + }, + { + "epoch": 1.8369948999607688, + "grad_norm": 21.86033058166504, + "learning_rate": 5.679866288817321e-06, + "loss": 4.5876, + "step": 9365 + }, + { + "epoch": 1.837975676735975, + "grad_norm": 23.89190101623535, + "learning_rate": 5.676031550790087e-06, + "loss": 4.9731, + "step": 9370 + }, + { + "epoch": 1.8389564535111809, + "grad_norm": 23.900188446044922, + "learning_rate": 5.6721964076677515e-06, + "loss": 4.6855, + "step": 9375 + }, + { + "epoch": 1.8399372302863868, + "grad_norm": 24.231233596801758, + "learning_rate": 5.66836086174843e-06, + "loss": 4.4809, + "step": 9380 + }, + { + "epoch": 1.840918007061593, + "grad_norm": 20.207731246948242, + "learning_rate": 5.664524915330478e-06, + "loss": 4.6487, + "step": 9385 + }, + { + "epoch": 1.8418987838367986, + "grad_norm": 15.975571632385254, + "learning_rate": 5.660688570712492e-06, + "loss": 4.8748, + "step": 9390 + }, + { + "epoch": 1.8428795606120048, + "grad_norm": 29.445188522338867, + "learning_rate": 5.656851830193304e-06, + "loss": 4.9362, + "step": 9395 + }, + { + "epoch": 1.8438603373872107, + "grad_norm": 16.028915405273438, + "learning_rate": 5.653014696071987e-06, + "loss": 4.7763, + "step": 9400 + }, + { + "epoch": 1.8448411141624166, + "grad_norm": 24.72963523864746, + "learning_rate": 5.649177170647847e-06, + "loss": 4.6154, + "step": 9405 + }, + { + "epoch": 1.8458218909376227, + "grad_norm": 44.67896270751953, + "learning_rate": 5.645339256220427e-06, + "loss": 4.8692, + "step": 9410 + }, + { + "epoch": 1.8468026677128284, + "grad_norm": 31.613073348999023, + "learning_rate": 5.641500955089502e-06, + "loss": 4.7649, + "step": 9415 + }, + { + "epoch": 1.8477834444880346, + "grad_norm": 23.47369956970215, + "learning_rate": 5.6376622695550764e-06, + "loss": 4.9661, + "step": 9420 + }, + { + "epoch": 1.8487642212632405, + "grad_norm": 11.438894271850586, + "learning_rate": 5.63382320191739e-06, + "loss": 4.7752, + "step": 9425 + }, + { + "epoch": 1.8497449980384464, + "grad_norm": 17.62010383605957, + "learning_rate": 5.6299837544769046e-06, + "loss": 4.7805, + "step": 9430 + }, + { + "epoch": 1.8507257748136525, + "grad_norm": 18.699684143066406, + "learning_rate": 5.6261439295343175e-06, + "loss": 4.5472, + "step": 9435 + }, + { + "epoch": 1.8517065515888582, + "grad_norm": 13.261539459228516, + "learning_rate": 5.622303729390548e-06, + "loss": 4.8652, + "step": 9440 + }, + { + "epoch": 1.8526873283640644, + "grad_norm": 17.09125518798828, + "learning_rate": 5.61846315634674e-06, + "loss": 4.6028, + "step": 9445 + }, + { + "epoch": 1.8536681051392703, + "grad_norm": 34.29975891113281, + "learning_rate": 5.61462221270426e-06, + "loss": 4.392, + "step": 9450 + }, + { + "epoch": 1.8546488819144762, + "grad_norm": 16.109909057617188, + "learning_rate": 5.6107809007646966e-06, + "loss": 4.6932, + "step": 9455 + }, + { + "epoch": 1.8556296586896823, + "grad_norm": 18.13594627380371, + "learning_rate": 5.606939222829865e-06, + "loss": 4.7523, + "step": 9460 + }, + { + "epoch": 1.8566104354648882, + "grad_norm": 19.660789489746094, + "learning_rate": 5.603097181201793e-06, + "loss": 4.4013, + "step": 9465 + }, + { + "epoch": 1.8575912122400942, + "grad_norm": 24.52638053894043, + "learning_rate": 5.599254778182729e-06, + "loss": 5.0819, + "step": 9470 + }, + { + "epoch": 1.8585719890153, + "grad_norm": 11.734413146972656, + "learning_rate": 5.5954120160751354e-06, + "loss": 4.4883, + "step": 9475 + }, + { + "epoch": 1.859552765790506, + "grad_norm": 22.79249382019043, + "learning_rate": 5.5915688971816955e-06, + "loss": 4.6221, + "step": 9480 + }, + { + "epoch": 1.8605335425657121, + "grad_norm": 18.433395385742188, + "learning_rate": 5.587725423805299e-06, + "loss": 4.6917, + "step": 9485 + }, + { + "epoch": 1.861514319340918, + "grad_norm": 17.560726165771484, + "learning_rate": 5.583881598249054e-06, + "loss": 4.6395, + "step": 9490 + }, + { + "epoch": 1.862495096116124, + "grad_norm": 18.91305160522461, + "learning_rate": 5.5800374228162776e-06, + "loss": 4.4603, + "step": 9495 + }, + { + "epoch": 1.86347587289133, + "grad_norm": 22.717443466186523, + "learning_rate": 5.576192899810495e-06, + "loss": 4.8376, + "step": 9500 + }, + { + "epoch": 1.8644566496665358, + "grad_norm": 14.52634334564209, + "learning_rate": 5.572348031535442e-06, + "loss": 4.8505, + "step": 9505 + }, + { + "epoch": 1.865437426441742, + "grad_norm": 20.6605167388916, + "learning_rate": 5.5685028202950595e-06, + "loss": 4.8469, + "step": 9510 + }, + { + "epoch": 1.8664182032169478, + "grad_norm": 18.79912757873535, + "learning_rate": 5.5646572683934975e-06, + "loss": 4.9637, + "step": 9515 + }, + { + "epoch": 1.8673989799921538, + "grad_norm": 13.271584510803223, + "learning_rate": 5.560811378135104e-06, + "loss": 4.8746, + "step": 9520 + }, + { + "epoch": 1.86837975676736, + "grad_norm": 15.022200584411621, + "learning_rate": 5.556965151824433e-06, + "loss": 4.5181, + "step": 9525 + }, + { + "epoch": 1.8693605335425656, + "grad_norm": 20.869287490844727, + "learning_rate": 5.553118591766241e-06, + "loss": 4.4984, + "step": 9530 + }, + { + "epoch": 1.8703413103177717, + "grad_norm": 26.764667510986328, + "learning_rate": 5.549271700265485e-06, + "loss": 4.7425, + "step": 9535 + }, + { + "epoch": 1.8713220870929776, + "grad_norm": 20.59969139099121, + "learning_rate": 5.5454244796273175e-06, + "loss": 4.6921, + "step": 9540 + }, + { + "epoch": 1.8723028638681836, + "grad_norm": 19.59276008605957, + "learning_rate": 5.54157693215709e-06, + "loss": 4.8437, + "step": 9545 + }, + { + "epoch": 1.8732836406433897, + "grad_norm": 13.677157402038574, + "learning_rate": 5.5377290601603504e-06, + "loss": 4.721, + "step": 9550 + }, + { + "epoch": 1.8742644174185954, + "grad_norm": 18.6031551361084, + "learning_rate": 5.53388086594284e-06, + "loss": 4.4678, + "step": 9555 + }, + { + "epoch": 1.8752451941938015, + "grad_norm": 28.015052795410156, + "learning_rate": 5.5300323518104925e-06, + "loss": 4.8587, + "step": 9560 + }, + { + "epoch": 1.8762259709690075, + "grad_norm": 23.171489715576172, + "learning_rate": 5.526183520069436e-06, + "loss": 4.6468, + "step": 9565 + }, + { + "epoch": 1.8772067477442134, + "grad_norm": 23.11787223815918, + "learning_rate": 5.522334373025986e-06, + "loss": 4.5523, + "step": 9570 + }, + { + "epoch": 1.8781875245194195, + "grad_norm": 11.215222358703613, + "learning_rate": 5.518484912986648e-06, + "loss": 4.7702, + "step": 9575 + }, + { + "epoch": 1.8791683012946252, + "grad_norm": 22.043649673461914, + "learning_rate": 5.514635142258116e-06, + "loss": 4.426, + "step": 9580 + }, + { + "epoch": 1.8801490780698313, + "grad_norm": 15.956389427185059, + "learning_rate": 5.510785063147269e-06, + "loss": 4.7553, + "step": 9585 + }, + { + "epoch": 1.8811298548450373, + "grad_norm": 31.348739624023438, + "learning_rate": 5.506934677961172e-06, + "loss": 4.2031, + "step": 9590 + }, + { + "epoch": 1.8821106316202432, + "grad_norm": 14.20702838897705, + "learning_rate": 5.503083989007072e-06, + "loss": 4.6814, + "step": 9595 + }, + { + "epoch": 1.8830914083954493, + "grad_norm": 20.319719314575195, + "learning_rate": 5.499232998592399e-06, + "loss": 4.5755, + "step": 9600 + }, + { + "epoch": 1.884072185170655, + "grad_norm": 25.81941795349121, + "learning_rate": 5.49538170902476e-06, + "loss": 4.801, + "step": 9605 + }, + { + "epoch": 1.8850529619458611, + "grad_norm": 24.612640380859375, + "learning_rate": 5.49153012261195e-06, + "loss": 4.9738, + "step": 9610 + }, + { + "epoch": 1.886033738721067, + "grad_norm": 25.011335372924805, + "learning_rate": 5.487678241661933e-06, + "loss": 4.7212, + "step": 9615 + }, + { + "epoch": 1.887014515496273, + "grad_norm": 27.755661010742188, + "learning_rate": 5.483826068482854e-06, + "loss": 4.6, + "step": 9620 + }, + { + "epoch": 1.887995292271479, + "grad_norm": 18.196792602539062, + "learning_rate": 5.4799736053830324e-06, + "loss": 4.53, + "step": 9625 + }, + { + "epoch": 1.888976069046685, + "grad_norm": 13.700377464294434, + "learning_rate": 5.476120854670957e-06, + "loss": 4.8347, + "step": 9630 + }, + { + "epoch": 1.889956845821891, + "grad_norm": 30.03533935546875, + "learning_rate": 5.4722678186552995e-06, + "loss": 4.7082, + "step": 9635 + }, + { + "epoch": 1.8909376225970969, + "grad_norm": 16.419034957885742, + "learning_rate": 5.468414499644892e-06, + "loss": 4.3029, + "step": 9640 + }, + { + "epoch": 1.8919183993723028, + "grad_norm": 14.156940460205078, + "learning_rate": 5.4645608999487395e-06, + "loss": 4.3517, + "step": 9645 + }, + { + "epoch": 1.892899176147509, + "grad_norm": 28.83321762084961, + "learning_rate": 5.4607070218760184e-06, + "loss": 4.5463, + "step": 9650 + }, + { + "epoch": 1.8938799529227148, + "grad_norm": 45.10062026977539, + "learning_rate": 5.456852867736067e-06, + "loss": 4.8991, + "step": 9655 + }, + { + "epoch": 1.8948607296979207, + "grad_norm": 24.96133041381836, + "learning_rate": 5.452998439838392e-06, + "loss": 4.5104, + "step": 9660 + }, + { + "epoch": 1.8958415064731269, + "grad_norm": 11.74023151397705, + "learning_rate": 5.449143740492664e-06, + "loss": 4.665, + "step": 9665 + }, + { + "epoch": 1.8968222832483326, + "grad_norm": 19.273738861083984, + "learning_rate": 5.4452887720087165e-06, + "loss": 4.7088, + "step": 9670 + }, + { + "epoch": 1.8978030600235387, + "grad_norm": 15.15214729309082, + "learning_rate": 5.441433536696541e-06, + "loss": 4.8068, + "step": 9675 + }, + { + "epoch": 1.8987838367987446, + "grad_norm": 18.632030487060547, + "learning_rate": 5.437578036866293e-06, + "loss": 4.9114, + "step": 9680 + }, + { + "epoch": 1.8997646135739505, + "grad_norm": 19.880435943603516, + "learning_rate": 5.433722274828286e-06, + "loss": 4.7176, + "step": 9685 + }, + { + "epoch": 1.9007453903491567, + "grad_norm": 29.079919815063477, + "learning_rate": 5.429866252892988e-06, + "loss": 3.8863, + "step": 9690 + }, + { + "epoch": 1.9017261671243624, + "grad_norm": 20.837482452392578, + "learning_rate": 5.426009973371026e-06, + "loss": 4.7429, + "step": 9695 + }, + { + "epoch": 1.9027069438995685, + "grad_norm": 18.1405029296875, + "learning_rate": 5.422153438573176e-06, + "loss": 5.1525, + "step": 9700 + }, + { + "epoch": 1.9036877206747744, + "grad_norm": 41.7537841796875, + "learning_rate": 5.418296650810373e-06, + "loss": 4.7584, + "step": 9705 + }, + { + "epoch": 1.9046684974499803, + "grad_norm": 13.388349533081055, + "learning_rate": 5.414439612393703e-06, + "loss": 4.6856, + "step": 9710 + }, + { + "epoch": 1.9056492742251865, + "grad_norm": 15.971819877624512, + "learning_rate": 5.410582325634397e-06, + "loss": 4.8956, + "step": 9715 + }, + { + "epoch": 1.9066300510003922, + "grad_norm": 20.830162048339844, + "learning_rate": 5.40672479284384e-06, + "loss": 4.5245, + "step": 9720 + }, + { + "epoch": 1.9076108277755983, + "grad_norm": 12.127934455871582, + "learning_rate": 5.402867016333563e-06, + "loss": 4.4878, + "step": 9725 + }, + { + "epoch": 1.9085916045508042, + "grad_norm": 21.272417068481445, + "learning_rate": 5.399008998415242e-06, + "loss": 4.5526, + "step": 9730 + }, + { + "epoch": 1.9095723813260101, + "grad_norm": 14.217857360839844, + "learning_rate": 5.395150741400698e-06, + "loss": 4.5744, + "step": 9735 + }, + { + "epoch": 1.9105531581012163, + "grad_norm": 24.945274353027344, + "learning_rate": 5.3912922476018956e-06, + "loss": 4.5555, + "step": 9740 + }, + { + "epoch": 1.911533934876422, + "grad_norm": 17.92483139038086, + "learning_rate": 5.387433519330941e-06, + "loss": 5.0362, + "step": 9745 + }, + { + "epoch": 1.9125147116516281, + "grad_norm": 19.815181732177734, + "learning_rate": 5.383574558900083e-06, + "loss": 4.5256, + "step": 9750 + }, + { + "epoch": 1.913495488426834, + "grad_norm": 18.88251304626465, + "learning_rate": 5.3797153686217054e-06, + "loss": 4.6896, + "step": 9755 + }, + { + "epoch": 1.91447626520204, + "grad_norm": 28.06948471069336, + "learning_rate": 5.375855950808334e-06, + "loss": 4.5653, + "step": 9760 + }, + { + "epoch": 1.915457041977246, + "grad_norm": 19.297407150268555, + "learning_rate": 5.371996307772628e-06, + "loss": 4.6416, + "step": 9765 + }, + { + "epoch": 1.9164378187524518, + "grad_norm": 14.056635856628418, + "learning_rate": 5.368136441827383e-06, + "loss": 4.7386, + "step": 9770 + }, + { + "epoch": 1.917418595527658, + "grad_norm": 17.106592178344727, + "learning_rate": 5.364276355285527e-06, + "loss": 4.5563, + "step": 9775 + }, + { + "epoch": 1.9183993723028638, + "grad_norm": 21.838733673095703, + "learning_rate": 5.36041605046012e-06, + "loss": 4.6798, + "step": 9780 + }, + { + "epoch": 1.9193801490780698, + "grad_norm": 17.363008499145508, + "learning_rate": 5.3565555296643555e-06, + "loss": 4.868, + "step": 9785 + }, + { + "epoch": 1.920360925853276, + "grad_norm": 20.810707092285156, + "learning_rate": 5.352694795211555e-06, + "loss": 4.8239, + "step": 9790 + }, + { + "epoch": 1.9213417026284818, + "grad_norm": 29.47817039489746, + "learning_rate": 5.348833849415167e-06, + "loss": 4.8366, + "step": 9795 + }, + { + "epoch": 1.9223224794036877, + "grad_norm": 21.65797233581543, + "learning_rate": 5.344972694588766e-06, + "loss": 4.3141, + "step": 9800 + }, + { + "epoch": 1.9233032561788936, + "grad_norm": 15.772238731384277, + "learning_rate": 5.341111333046054e-06, + "loss": 4.7682, + "step": 9805 + }, + { + "epoch": 1.9242840329540996, + "grad_norm": 17.035268783569336, + "learning_rate": 5.337249767100856e-06, + "loss": 4.5995, + "step": 9810 + }, + { + "epoch": 1.9252648097293057, + "grad_norm": 20.667861938476562, + "learning_rate": 5.33338799906712e-06, + "loss": 4.7046, + "step": 9815 + }, + { + "epoch": 1.9262455865045116, + "grad_norm": 18.487794876098633, + "learning_rate": 5.329526031258914e-06, + "loss": 4.567, + "step": 9820 + }, + { + "epoch": 1.9272263632797175, + "grad_norm": 18.39491844177246, + "learning_rate": 5.325663865990425e-06, + "loss": 5.004, + "step": 9825 + }, + { + "epoch": 1.9282071400549237, + "grad_norm": 17.848005294799805, + "learning_rate": 5.321801505575959e-06, + "loss": 4.7211, + "step": 9830 + }, + { + "epoch": 1.9291879168301294, + "grad_norm": 25.833599090576172, + "learning_rate": 5.317938952329943e-06, + "loss": 4.4809, + "step": 9835 + }, + { + "epoch": 1.9301686936053355, + "grad_norm": 15.303146362304688, + "learning_rate": 5.314076208566915e-06, + "loss": 4.6026, + "step": 9840 + }, + { + "epoch": 1.9311494703805414, + "grad_norm": 16.975481033325195, + "learning_rate": 5.310213276601525e-06, + "loss": 4.9513, + "step": 9845 + }, + { + "epoch": 1.9321302471557473, + "grad_norm": 45.39344024658203, + "learning_rate": 5.306350158748544e-06, + "loss": 4.7095, + "step": 9850 + }, + { + "epoch": 1.9331110239309535, + "grad_norm": 19.3656063079834, + "learning_rate": 5.302486857322841e-06, + "loss": 4.7902, + "step": 9855 + }, + { + "epoch": 1.9340918007061592, + "grad_norm": 18.749727249145508, + "learning_rate": 5.298623374639413e-06, + "loss": 4.5287, + "step": 9860 + }, + { + "epoch": 1.9350725774813653, + "grad_norm": 26.79215431213379, + "learning_rate": 5.294759713013351e-06, + "loss": 4.5868, + "step": 9865 + }, + { + "epoch": 1.9360533542565712, + "grad_norm": 14.35335636138916, + "learning_rate": 5.290895874759859e-06, + "loss": 5.0044, + "step": 9870 + }, + { + "epoch": 1.9370341310317771, + "grad_norm": 12.449397087097168, + "learning_rate": 5.287031862194246e-06, + "loss": 4.6489, + "step": 9875 + }, + { + "epoch": 1.9380149078069833, + "grad_norm": 13.96717357635498, + "learning_rate": 5.283167677631926e-06, + "loss": 4.772, + "step": 9880 + }, + { + "epoch": 1.938995684582189, + "grad_norm": 12.712096214294434, + "learning_rate": 5.279303323388413e-06, + "loss": 4.7292, + "step": 9885 + }, + { + "epoch": 1.939976461357395, + "grad_norm": 18.295469284057617, + "learning_rate": 5.275438801779328e-06, + "loss": 4.6064, + "step": 9890 + }, + { + "epoch": 1.940957238132601, + "grad_norm": 14.68940544128418, + "learning_rate": 5.2715741151203895e-06, + "loss": 4.7464, + "step": 9895 + }, + { + "epoch": 1.941938014907807, + "grad_norm": 27.88214111328125, + "learning_rate": 5.267709265727412e-06, + "loss": 4.5663, + "step": 9900 + }, + { + "epoch": 1.942918791683013, + "grad_norm": 12.562390327453613, + "learning_rate": 5.263844255916313e-06, + "loss": 4.5185, + "step": 9905 + }, + { + "epoch": 1.9438995684582188, + "grad_norm": 17.847429275512695, + "learning_rate": 5.259979088003104e-06, + "loss": 4.3947, + "step": 9910 + }, + { + "epoch": 1.944880345233425, + "grad_norm": 14.35951042175293, + "learning_rate": 5.256113764303891e-06, + "loss": 4.7351, + "step": 9915 + }, + { + "epoch": 1.9458611220086308, + "grad_norm": 18.09912872314453, + "learning_rate": 5.252248287134869e-06, + "loss": 4.498, + "step": 9920 + }, + { + "epoch": 1.9468418987838367, + "grad_norm": 14.626702308654785, + "learning_rate": 5.248382658812334e-06, + "loss": 4.6051, + "step": 9925 + }, + { + "epoch": 1.9478226755590429, + "grad_norm": 15.402030944824219, + "learning_rate": 5.2445168816526635e-06, + "loss": 4.8815, + "step": 9930 + }, + { + "epoch": 1.9488034523342486, + "grad_norm": 30.433496475219727, + "learning_rate": 5.2406509579723315e-06, + "loss": 4.643, + "step": 9935 + }, + { + "epoch": 1.9497842291094547, + "grad_norm": 19.342151641845703, + "learning_rate": 5.236784890087897e-06, + "loss": 4.3592, + "step": 9940 + }, + { + "epoch": 1.9507650058846606, + "grad_norm": 20.78154754638672, + "learning_rate": 5.232918680316003e-06, + "loss": 4.4561, + "step": 9945 + }, + { + "epoch": 1.9517457826598665, + "grad_norm": 14.727789878845215, + "learning_rate": 5.229052330973381e-06, + "loss": 4.9764, + "step": 9950 + }, + { + "epoch": 1.9527265594350727, + "grad_norm": 26.10953712463379, + "learning_rate": 5.225185844376842e-06, + "loss": 4.7121, + "step": 9955 + }, + { + "epoch": 1.9537073362102786, + "grad_norm": 24.18365478515625, + "learning_rate": 5.221319222843285e-06, + "loss": 4.6004, + "step": 9960 + }, + { + "epoch": 1.9546881129854845, + "grad_norm": 23.998422622680664, + "learning_rate": 5.217452468689687e-06, + "loss": 4.5914, + "step": 9965 + }, + { + "epoch": 1.9556688897606904, + "grad_norm": 17.29246711730957, + "learning_rate": 5.2135855842331015e-06, + "loss": 4.7927, + "step": 9970 + }, + { + "epoch": 1.9566496665358963, + "grad_norm": 12.302995681762695, + "learning_rate": 5.2097185717906654e-06, + "loss": 4.629, + "step": 9975 + }, + { + "epoch": 1.9576304433111025, + "grad_norm": 17.5142765045166, + "learning_rate": 5.20585143367959e-06, + "loss": 4.6272, + "step": 9980 + }, + { + "epoch": 1.9586112200863084, + "grad_norm": 59.8892822265625, + "learning_rate": 5.201984172217158e-06, + "loss": 4.8155, + "step": 9985 + }, + { + "epoch": 1.9595919968615143, + "grad_norm": 21.148422241210938, + "learning_rate": 5.1981167897207345e-06, + "loss": 4.3854, + "step": 9990 + }, + { + "epoch": 1.9605727736367204, + "grad_norm": 10.437204360961914, + "learning_rate": 5.194249288507749e-06, + "loss": 4.3812, + "step": 9995 + }, + { + "epoch": 1.9615535504119261, + "grad_norm": 12.685667991638184, + "learning_rate": 5.190381670895707e-06, + "loss": 4.5556, + "step": 10000 + }, + { + "epoch": 1.9625343271871323, + "grad_norm": 120.73411560058594, + "learning_rate": 5.18651393920218e-06, + "loss": 5.078, + "step": 10005 + }, + { + "epoch": 1.9635151039623382, + "grad_norm": 16.686086654663086, + "learning_rate": 5.182646095744813e-06, + "loss": 4.8024, + "step": 10010 + }, + { + "epoch": 1.964495880737544, + "grad_norm": 19.33757209777832, + "learning_rate": 5.178778142841315e-06, + "loss": 4.6619, + "step": 10015 + }, + { + "epoch": 1.9654766575127502, + "grad_norm": 12.6121187210083, + "learning_rate": 5.174910082809459e-06, + "loss": 5.1033, + "step": 10020 + }, + { + "epoch": 1.966457434287956, + "grad_norm": 28.6193790435791, + "learning_rate": 5.171041917967083e-06, + "loss": 4.3412, + "step": 10025 + }, + { + "epoch": 1.967438211063162, + "grad_norm": 14.460850715637207, + "learning_rate": 5.16717365063209e-06, + "loss": 4.7468, + "step": 10030 + }, + { + "epoch": 1.968418987838368, + "grad_norm": 14.494165420532227, + "learning_rate": 5.163305283122443e-06, + "loss": 4.9953, + "step": 10035 + }, + { + "epoch": 1.969399764613574, + "grad_norm": 15.552408218383789, + "learning_rate": 5.159436817756166e-06, + "loss": 4.7655, + "step": 10040 + }, + { + "epoch": 1.97038054138878, + "grad_norm": 14.32104778289795, + "learning_rate": 5.155568256851339e-06, + "loss": 4.4171, + "step": 10045 + }, + { + "epoch": 1.9713613181639857, + "grad_norm": 32.562442779541016, + "learning_rate": 5.151699602726101e-06, + "loss": 4.7189, + "step": 10050 + }, + { + "epoch": 1.9723420949391919, + "grad_norm": 28.78983497619629, + "learning_rate": 5.147830857698649e-06, + "loss": 4.8201, + "step": 10055 + }, + { + "epoch": 1.9733228717143978, + "grad_norm": 39.05999755859375, + "learning_rate": 5.143962024087229e-06, + "loss": 4.7264, + "step": 10060 + }, + { + "epoch": 1.9743036484896037, + "grad_norm": 23.98911476135254, + "learning_rate": 5.140093104210147e-06, + "loss": 4.383, + "step": 10065 + }, + { + "epoch": 1.9752844252648099, + "grad_norm": 21.3807373046875, + "learning_rate": 5.136224100385754e-06, + "loss": 4.7054, + "step": 10070 + }, + { + "epoch": 1.9762652020400155, + "grad_norm": 27.155351638793945, + "learning_rate": 5.132355014932455e-06, + "loss": 4.6505, + "step": 10075 + }, + { + "epoch": 1.9772459788152217, + "grad_norm": 32.85955047607422, + "learning_rate": 5.128485850168703e-06, + "loss": 4.8904, + "step": 10080 + }, + { + "epoch": 1.9782267555904276, + "grad_norm": 16.200801849365234, + "learning_rate": 5.124616608413e-06, + "loss": 4.6475, + "step": 10085 + }, + { + "epoch": 1.9792075323656335, + "grad_norm": 20.49567222595215, + "learning_rate": 5.1207472919838945e-06, + "loss": 4.8862, + "step": 10090 + }, + { + "epoch": 1.9801883091408397, + "grad_norm": 21.595518112182617, + "learning_rate": 5.116877903199975e-06, + "loss": 4.891, + "step": 10095 + }, + { + "epoch": 1.9811690859160453, + "grad_norm": 42.03987121582031, + "learning_rate": 5.113008444379878e-06, + "loss": 4.8607, + "step": 10100 + }, + { + "epoch": 1.9821498626912515, + "grad_norm": 21.260618209838867, + "learning_rate": 5.109138917842278e-06, + "loss": 4.6659, + "step": 10105 + }, + { + "epoch": 1.9831306394664574, + "grad_norm": 23.255855560302734, + "learning_rate": 5.105269325905896e-06, + "loss": 4.5065, + "step": 10110 + }, + { + "epoch": 1.9841114162416633, + "grad_norm": 30.049131393432617, + "learning_rate": 5.101399670889489e-06, + "loss": 5.2045, + "step": 10115 + }, + { + "epoch": 1.9850921930168695, + "grad_norm": 28.775293350219727, + "learning_rate": 5.097529955111848e-06, + "loss": 4.8581, + "step": 10120 + }, + { + "epoch": 1.9860729697920754, + "grad_norm": 25.73450469970703, + "learning_rate": 5.093660180891807e-06, + "loss": 4.6503, + "step": 10125 + }, + { + "epoch": 1.9870537465672813, + "grad_norm": 12.520339965820312, + "learning_rate": 5.089790350548232e-06, + "loss": 4.793, + "step": 10130 + }, + { + "epoch": 1.9880345233424872, + "grad_norm": 10.180679321289062, + "learning_rate": 5.085920466400021e-06, + "loss": 4.7151, + "step": 10135 + }, + { + "epoch": 1.9890153001176931, + "grad_norm": 23.294992446899414, + "learning_rate": 5.082050530766105e-06, + "loss": 4.5855, + "step": 10140 + }, + { + "epoch": 1.9899960768928993, + "grad_norm": 16.03803825378418, + "learning_rate": 5.07818054596545e-06, + "loss": 4.7725, + "step": 10145 + }, + { + "epoch": 1.9909768536681052, + "grad_norm": 16.226547241210938, + "learning_rate": 5.074310514317046e-06, + "loss": 4.5754, + "step": 10150 + }, + { + "epoch": 1.991957630443311, + "grad_norm": 16.746049880981445, + "learning_rate": 5.070440438139913e-06, + "loss": 5.0154, + "step": 10155 + }, + { + "epoch": 1.9929384072185172, + "grad_norm": 27.564208984375, + "learning_rate": 5.066570319753099e-06, + "loss": 4.4396, + "step": 10160 + }, + { + "epoch": 1.993919183993723, + "grad_norm": 24.835065841674805, + "learning_rate": 5.0627001614756775e-06, + "loss": 4.9128, + "step": 10165 + }, + { + "epoch": 1.994899960768929, + "grad_norm": 15.367796897888184, + "learning_rate": 5.058829965626742e-06, + "loss": 4.5579, + "step": 10170 + }, + { + "epoch": 1.995880737544135, + "grad_norm": 15.437244415283203, + "learning_rate": 5.054959734525412e-06, + "loss": 4.6782, + "step": 10175 + }, + { + "epoch": 1.996861514319341, + "grad_norm": 13.151288032531738, + "learning_rate": 5.051089470490825e-06, + "loss": 4.5289, + "step": 10180 + }, + { + "epoch": 1.997842291094547, + "grad_norm": 19.87721824645996, + "learning_rate": 5.047219175842146e-06, + "loss": 5.0906, + "step": 10185 + }, + { + "epoch": 1.9988230678697527, + "grad_norm": 24.418581008911133, + "learning_rate": 5.043348852898549e-06, + "loss": 4.7632, + "step": 10190 + }, + { + "epoch": 1.9998038446449589, + "grad_norm": 34.285369873046875, + "learning_rate": 5.03947850397923e-06, + "loss": 4.7502, + "step": 10195 + }, + { + "epoch": 2.0007846214201646, + "grad_norm": 13.637475967407227, + "learning_rate": 5.035608131403397e-06, + "loss": 4.6337, + "step": 10200 + }, + { + "epoch": 2.0007846214201646, + "eval_loss": 4.867619514465332, + "eval_runtime": 7.7779, + "eval_samples_per_second": 26.871, + "eval_steps_per_second": 13.5, + "step": 10200 + }, + { + "epoch": 2.0017653981953707, + "grad_norm": 20.579256057739258, + "learning_rate": 5.031737737490278e-06, + "loss": 4.2294, + "step": 10205 + }, + { + "epoch": 2.002746174970577, + "grad_norm": 22.140090942382812, + "learning_rate": 5.027867324559111e-06, + "loss": 4.6012, + "step": 10210 + }, + { + "epoch": 2.0037269517457825, + "grad_norm": 31.860458374023438, + "learning_rate": 5.02399689492914e-06, + "loss": 5.1452, + "step": 10215 + }, + { + "epoch": 2.0047077285209887, + "grad_norm": 12.973932266235352, + "learning_rate": 5.020126450919626e-06, + "loss": 4.5311, + "step": 10220 + }, + { + "epoch": 2.005688505296195, + "grad_norm": 25.211833953857422, + "learning_rate": 5.016255994849837e-06, + "loss": 4.8593, + "step": 10225 + }, + { + "epoch": 2.0066692820714005, + "grad_norm": 17.093555450439453, + "learning_rate": 5.0123855290390465e-06, + "loss": 4.8051, + "step": 10230 + }, + { + "epoch": 2.0076500588466066, + "grad_norm": 33.78654861450195, + "learning_rate": 5.008515055806538e-06, + "loss": 4.4996, + "step": 10235 + }, + { + "epoch": 2.0086308356218123, + "grad_norm": 18.358837127685547, + "learning_rate": 5.004644577471592e-06, + "loss": 4.3743, + "step": 10240 + }, + { + "epoch": 2.0096116123970185, + "grad_norm": 18.64828109741211, + "learning_rate": 5.0007740963535e-06, + "loss": 4.4953, + "step": 10245 + }, + { + "epoch": 2.0105923891722246, + "grad_norm": 12.689542770385742, + "learning_rate": 4.996903614771548e-06, + "loss": 4.6645, + "step": 10250 + }, + { + "epoch": 2.0115731659474303, + "grad_norm": 35.74906921386719, + "learning_rate": 4.99303313504503e-06, + "loss": 4.5491, + "step": 10255 + }, + { + "epoch": 2.0125539427226364, + "grad_norm": 17.008872985839844, + "learning_rate": 4.9891626594932304e-06, + "loss": 4.1487, + "step": 10260 + }, + { + "epoch": 2.013534719497842, + "grad_norm": 15.55538558959961, + "learning_rate": 4.98529219043544e-06, + "loss": 4.4378, + "step": 10265 + }, + { + "epoch": 2.0145154962730483, + "grad_norm": 15.817734718322754, + "learning_rate": 4.981421730190937e-06, + "loss": 4.5915, + "step": 10270 + }, + { + "epoch": 2.0154962730482544, + "grad_norm": 13.019744873046875, + "learning_rate": 4.977551281079001e-06, + "loss": 4.4266, + "step": 10275 + }, + { + "epoch": 2.01647704982346, + "grad_norm": 18.480030059814453, + "learning_rate": 4.973680845418903e-06, + "loss": 4.3731, + "step": 10280 + }, + { + "epoch": 2.0174578265986662, + "grad_norm": 18.247615814208984, + "learning_rate": 4.9698104255299015e-06, + "loss": 4.5799, + "step": 10285 + }, + { + "epoch": 2.018438603373872, + "grad_norm": 29.975618362426758, + "learning_rate": 4.965940023731255e-06, + "loss": 4.2928, + "step": 10290 + }, + { + "epoch": 2.019419380149078, + "grad_norm": 16.613618850708008, + "learning_rate": 4.9620696423422e-06, + "loss": 4.6338, + "step": 10295 + }, + { + "epoch": 2.020400156924284, + "grad_norm": 11.347552299499512, + "learning_rate": 4.958199283681968e-06, + "loss": 4.3939, + "step": 10300 + }, + { + "epoch": 2.02138093369949, + "grad_norm": 16.170337677001953, + "learning_rate": 4.954328950069778e-06, + "loss": 4.7013, + "step": 10305 + }, + { + "epoch": 2.022361710474696, + "grad_norm": 15.764708518981934, + "learning_rate": 4.95045864382483e-06, + "loss": 4.6302, + "step": 10310 + }, + { + "epoch": 2.0233424872499017, + "grad_norm": 20.800151824951172, + "learning_rate": 4.946588367266308e-06, + "loss": 4.7581, + "step": 10315 + }, + { + "epoch": 2.024323264025108, + "grad_norm": 14.423624038696289, + "learning_rate": 4.942718122713377e-06, + "loss": 4.5362, + "step": 10320 + }, + { + "epoch": 2.025304040800314, + "grad_norm": 27.55731773376465, + "learning_rate": 4.938847912485187e-06, + "loss": 4.485, + "step": 10325 + }, + { + "epoch": 2.0262848175755197, + "grad_norm": 30.498001098632812, + "learning_rate": 4.934977738900867e-06, + "loss": 4.3044, + "step": 10330 + }, + { + "epoch": 2.027265594350726, + "grad_norm": 28.44693374633789, + "learning_rate": 4.9311076042795185e-06, + "loss": 4.729, + "step": 10335 + }, + { + "epoch": 2.0282463711259315, + "grad_norm": 16.130205154418945, + "learning_rate": 4.927237510940228e-06, + "loss": 4.6165, + "step": 10340 + }, + { + "epoch": 2.0292271479011377, + "grad_norm": 14.545047760009766, + "learning_rate": 4.9233674612020485e-06, + "loss": 4.7173, + "step": 10345 + }, + { + "epoch": 2.030207924676344, + "grad_norm": 44.33458709716797, + "learning_rate": 4.919497457384012e-06, + "loss": 4.7184, + "step": 10350 + }, + { + "epoch": 2.0311887014515495, + "grad_norm": 16.259366989135742, + "learning_rate": 4.915627501805125e-06, + "loss": 4.555, + "step": 10355 + }, + { + "epoch": 2.0321694782267556, + "grad_norm": 20.897994995117188, + "learning_rate": 4.911757596784358e-06, + "loss": 4.2933, + "step": 10360 + }, + { + "epoch": 2.0331502550019613, + "grad_norm": 22.479900360107422, + "learning_rate": 4.907887744640659e-06, + "loss": 4.6158, + "step": 10365 + }, + { + "epoch": 2.0341310317771675, + "grad_norm": 21.30280876159668, + "learning_rate": 4.9040179476929364e-06, + "loss": 4.3196, + "step": 10370 + }, + { + "epoch": 2.0351118085523736, + "grad_norm": 14.088079452514648, + "learning_rate": 4.900148208260075e-06, + "loss": 4.6136, + "step": 10375 + }, + { + "epoch": 2.0360925853275793, + "grad_norm": 21.850318908691406, + "learning_rate": 4.896278528660916e-06, + "loss": 4.6937, + "step": 10380 + }, + { + "epoch": 2.0370733621027854, + "grad_norm": 24.06600570678711, + "learning_rate": 4.892408911214271e-06, + "loss": 4.5812, + "step": 10385 + }, + { + "epoch": 2.0380541388779916, + "grad_norm": 17.342010498046875, + "learning_rate": 4.888539358238912e-06, + "loss": 4.5512, + "step": 10390 + }, + { + "epoch": 2.0390349156531973, + "grad_norm": 21.402070999145508, + "learning_rate": 4.88466987205357e-06, + "loss": 4.3862, + "step": 10395 + }, + { + "epoch": 2.0400156924284034, + "grad_norm": 30.924278259277344, + "learning_rate": 4.880800454976939e-06, + "loss": 4.3662, + "step": 10400 + }, + { + "epoch": 2.040996469203609, + "grad_norm": 20.360410690307617, + "learning_rate": 4.876931109327675e-06, + "loss": 4.5773, + "step": 10405 + }, + { + "epoch": 2.0419772459788152, + "grad_norm": 26.364816665649414, + "learning_rate": 4.873061837424382e-06, + "loss": 4.3995, + "step": 10410 + }, + { + "epoch": 2.0429580227540214, + "grad_norm": 18.199127197265625, + "learning_rate": 4.869192641585628e-06, + "loss": 4.4404, + "step": 10415 + }, + { + "epoch": 2.043938799529227, + "grad_norm": 31.47274398803711, + "learning_rate": 4.8653235241299315e-06, + "loss": 4.5386, + "step": 10420 + }, + { + "epoch": 2.044919576304433, + "grad_norm": 36.052207946777344, + "learning_rate": 4.861454487375765e-06, + "loss": 4.5961, + "step": 10425 + }, + { + "epoch": 2.045900353079639, + "grad_norm": 30.397974014282227, + "learning_rate": 4.8575855336415536e-06, + "loss": 4.6295, + "step": 10430 + }, + { + "epoch": 2.046881129854845, + "grad_norm": 13.34601879119873, + "learning_rate": 4.853716665245668e-06, + "loss": 4.5577, + "step": 10435 + }, + { + "epoch": 2.047861906630051, + "grad_norm": 32.35442352294922, + "learning_rate": 4.849847884506437e-06, + "loss": 4.6545, + "step": 10440 + }, + { + "epoch": 2.048842683405257, + "grad_norm": 29.84942054748535, + "learning_rate": 4.8459791937421255e-06, + "loss": 4.6705, + "step": 10445 + }, + { + "epoch": 2.049823460180463, + "grad_norm": 41.33201599121094, + "learning_rate": 4.842110595270955e-06, + "loss": 4.295, + "step": 10450 + }, + { + "epoch": 2.0508042369556687, + "grad_norm": 30.651723861694336, + "learning_rate": 4.838242091411085e-06, + "loss": 4.4773, + "step": 10455 + }, + { + "epoch": 2.051785013730875, + "grad_norm": 17.958133697509766, + "learning_rate": 4.83437368448062e-06, + "loss": 4.7877, + "step": 10460 + }, + { + "epoch": 2.052765790506081, + "grad_norm": 21.545969009399414, + "learning_rate": 4.8305053767976075e-06, + "loss": 4.3349, + "step": 10465 + }, + { + "epoch": 2.0537465672812867, + "grad_norm": 17.481050491333008, + "learning_rate": 4.826637170680033e-06, + "loss": 4.5657, + "step": 10470 + }, + { + "epoch": 2.054727344056493, + "grad_norm": 24.961719512939453, + "learning_rate": 4.822769068445824e-06, + "loss": 4.6152, + "step": 10475 + }, + { + "epoch": 2.0557081208316985, + "grad_norm": 21.356874465942383, + "learning_rate": 4.818901072412846e-06, + "loss": 4.5025, + "step": 10480 + }, + { + "epoch": 2.0566888976069047, + "grad_norm": 19.501522064208984, + "learning_rate": 4.8150331848988965e-06, + "loss": 4.8735, + "step": 10485 + }, + { + "epoch": 2.057669674382111, + "grad_norm": 21.428464889526367, + "learning_rate": 4.811165408221715e-06, + "loss": 4.5817, + "step": 10490 + }, + { + "epoch": 2.0586504511573165, + "grad_norm": 10.177115440368652, + "learning_rate": 4.8072977446989665e-06, + "loss": 4.3053, + "step": 10495 + }, + { + "epoch": 2.0596312279325226, + "grad_norm": 32.65115737915039, + "learning_rate": 4.803430196648252e-06, + "loss": 3.9838, + "step": 10500 + }, + { + "epoch": 2.0606120047077283, + "grad_norm": 18.025699615478516, + "learning_rate": 4.799562766387109e-06, + "loss": 4.4535, + "step": 10505 + }, + { + "epoch": 2.0615927814829345, + "grad_norm": 13.657340049743652, + "learning_rate": 4.795695456232993e-06, + "loss": 4.7212, + "step": 10510 + }, + { + "epoch": 2.0625735582581406, + "grad_norm": 16.720195770263672, + "learning_rate": 4.791828268503297e-06, + "loss": 4.3234, + "step": 10515 + }, + { + "epoch": 2.0635543350333463, + "grad_norm": 25.29947853088379, + "learning_rate": 4.7879612055153335e-06, + "loss": 4.5086, + "step": 10520 + }, + { + "epoch": 2.0645351118085524, + "grad_norm": 15.523651123046875, + "learning_rate": 4.784094269586348e-06, + "loss": 4.567, + "step": 10525 + }, + { + "epoch": 2.065515888583758, + "grad_norm": 38.62691879272461, + "learning_rate": 4.780227463033505e-06, + "loss": 4.7915, + "step": 10530 + }, + { + "epoch": 2.0664966653589643, + "grad_norm": 23.588415145874023, + "learning_rate": 4.7763607881738884e-06, + "loss": 4.5609, + "step": 10535 + }, + { + "epoch": 2.0674774421341704, + "grad_norm": 23.60785484313965, + "learning_rate": 4.772494247324512e-06, + "loss": 4.8313, + "step": 10540 + }, + { + "epoch": 2.068458218909376, + "grad_norm": 26.25421905517578, + "learning_rate": 4.7686278428023e-06, + "loss": 4.4832, + "step": 10545 + }, + { + "epoch": 2.0694389956845822, + "grad_norm": 19.635887145996094, + "learning_rate": 4.7647615769241e-06, + "loss": 4.5175, + "step": 10550 + }, + { + "epoch": 2.0704197724597884, + "grad_norm": 11.955699920654297, + "learning_rate": 4.760895452006681e-06, + "loss": 4.7148, + "step": 10555 + }, + { + "epoch": 2.071400549234994, + "grad_norm": 27.53982162475586, + "learning_rate": 4.757029470366716e-06, + "loss": 4.3946, + "step": 10560 + }, + { + "epoch": 2.0723813260102, + "grad_norm": 14.448670387268066, + "learning_rate": 4.753163634320801e-06, + "loss": 4.7461, + "step": 10565 + }, + { + "epoch": 2.073362102785406, + "grad_norm": 19.246601104736328, + "learning_rate": 4.7492979461854405e-06, + "loss": 4.349, + "step": 10570 + }, + { + "epoch": 2.074342879560612, + "grad_norm": 17.460773468017578, + "learning_rate": 4.745432408277053e-06, + "loss": 4.1874, + "step": 10575 + }, + { + "epoch": 2.075323656335818, + "grad_norm": 21.156417846679688, + "learning_rate": 4.741567022911968e-06, + "loss": 4.2897, + "step": 10580 + }, + { + "epoch": 2.076304433111024, + "grad_norm": 27.80026626586914, + "learning_rate": 4.7377017924064175e-06, + "loss": 4.3745, + "step": 10585 + }, + { + "epoch": 2.07728520988623, + "grad_norm": 12.806593894958496, + "learning_rate": 4.733836719076549e-06, + "loss": 4.4964, + "step": 10590 + }, + { + "epoch": 2.0782659866614357, + "grad_norm": 11.110158920288086, + "learning_rate": 4.729971805238407e-06, + "loss": 4.4094, + "step": 10595 + }, + { + "epoch": 2.079246763436642, + "grad_norm": 14.680946350097656, + "learning_rate": 4.72610705320795e-06, + "loss": 4.976, + "step": 10600 + }, + { + "epoch": 2.080227540211848, + "grad_norm": 28.969358444213867, + "learning_rate": 4.722242465301033e-06, + "loss": 4.5051, + "step": 10605 + }, + { + "epoch": 2.0812083169870537, + "grad_norm": 13.442532539367676, + "learning_rate": 4.718378043833411e-06, + "loss": 4.4828, + "step": 10610 + }, + { + "epoch": 2.08218909376226, + "grad_norm": 14.584362983703613, + "learning_rate": 4.714513791120746e-06, + "loss": 4.1986, + "step": 10615 + }, + { + "epoch": 2.0831698705374655, + "grad_norm": 8.882427215576172, + "learning_rate": 4.710649709478593e-06, + "loss": 4.6214, + "step": 10620 + }, + { + "epoch": 2.0841506473126716, + "grad_norm": 32.854148864746094, + "learning_rate": 4.706785801222409e-06, + "loss": 4.904, + "step": 10625 + }, + { + "epoch": 2.0851314240878778, + "grad_norm": 12.650853157043457, + "learning_rate": 4.702922068667546e-06, + "loss": 4.6661, + "step": 10630 + }, + { + "epoch": 2.0861122008630835, + "grad_norm": 24.82941436767578, + "learning_rate": 4.699058514129246e-06, + "loss": 4.5173, + "step": 10635 + }, + { + "epoch": 2.0870929776382896, + "grad_norm": 32.43788528442383, + "learning_rate": 4.695195139922652e-06, + "loss": 4.6476, + "step": 10640 + }, + { + "epoch": 2.0880737544134953, + "grad_norm": 12.398055076599121, + "learning_rate": 4.691331948362789e-06, + "loss": 4.5709, + "step": 10645 + }, + { + "epoch": 2.0890545311887014, + "grad_norm": 19.99089241027832, + "learning_rate": 4.687468941764583e-06, + "loss": 4.6279, + "step": 10650 + }, + { + "epoch": 2.0900353079639076, + "grad_norm": 19.84067726135254, + "learning_rate": 4.683606122442846e-06, + "loss": 4.7469, + "step": 10655 + }, + { + "epoch": 2.0910160847391133, + "grad_norm": 21.618118286132812, + "learning_rate": 4.679743492712273e-06, + "loss": 5.2934, + "step": 10660 + }, + { + "epoch": 2.0919968615143194, + "grad_norm": 15.103788375854492, + "learning_rate": 4.675881054887451e-06, + "loss": 4.5272, + "step": 10665 + }, + { + "epoch": 2.092977638289525, + "grad_norm": 26.1807861328125, + "learning_rate": 4.672018811282849e-06, + "loss": 4.5789, + "step": 10670 + }, + { + "epoch": 2.0939584150647312, + "grad_norm": 14.807060241699219, + "learning_rate": 4.6681567642128195e-06, + "loss": 5.14, + "step": 10675 + }, + { + "epoch": 2.0949391918399374, + "grad_norm": 22.160566329956055, + "learning_rate": 4.664294915991601e-06, + "loss": 4.3984, + "step": 10680 + }, + { + "epoch": 2.095919968615143, + "grad_norm": 24.023670196533203, + "learning_rate": 4.660433268933306e-06, + "loss": 4.5185, + "step": 10685 + }, + { + "epoch": 2.096900745390349, + "grad_norm": 13.71312427520752, + "learning_rate": 4.656571825351936e-06, + "loss": 4.5645, + "step": 10690 + }, + { + "epoch": 2.0978815221655553, + "grad_norm": 19.164081573486328, + "learning_rate": 4.6527105875613574e-06, + "loss": 4.2813, + "step": 10695 + }, + { + "epoch": 2.098862298940761, + "grad_norm": 22.777475357055664, + "learning_rate": 4.6488495578753285e-06, + "loss": 4.3725, + "step": 10700 + }, + { + "epoch": 2.099843075715967, + "grad_norm": 20.250986099243164, + "learning_rate": 4.644988738607471e-06, + "loss": 4.4706, + "step": 10705 + }, + { + "epoch": 2.100823852491173, + "grad_norm": 11.561543464660645, + "learning_rate": 4.641128132071287e-06, + "loss": 4.6326, + "step": 10710 + }, + { + "epoch": 2.101804629266379, + "grad_norm": 20.83006477355957, + "learning_rate": 4.637267740580149e-06, + "loss": 4.3374, + "step": 10715 + }, + { + "epoch": 2.102785406041585, + "grad_norm": 10.181340217590332, + "learning_rate": 4.633407566447297e-06, + "loss": 4.5099, + "step": 10720 + }, + { + "epoch": 2.103766182816791, + "grad_norm": 12.635647773742676, + "learning_rate": 4.629547611985848e-06, + "loss": 4.2601, + "step": 10725 + }, + { + "epoch": 2.104746959591997, + "grad_norm": 15.131701469421387, + "learning_rate": 4.625687879508783e-06, + "loss": 4.5051, + "step": 10730 + }, + { + "epoch": 2.1057277363672027, + "grad_norm": 16.336679458618164, + "learning_rate": 4.62182837132895e-06, + "loss": 4.3319, + "step": 10735 + }, + { + "epoch": 2.106708513142409, + "grad_norm": 32.087459564208984, + "learning_rate": 4.617969089759066e-06, + "loss": 4.6912, + "step": 10740 + }, + { + "epoch": 2.107689289917615, + "grad_norm": 13.97162914276123, + "learning_rate": 4.614110037111706e-06, + "loss": 4.6789, + "step": 10745 + }, + { + "epoch": 2.1086700666928206, + "grad_norm": 49.92332077026367, + "learning_rate": 4.6102512156993116e-06, + "loss": 4.7705, + "step": 10750 + }, + { + "epoch": 2.109650843468027, + "grad_norm": 31.643251419067383, + "learning_rate": 4.6063926278341895e-06, + "loss": 4.4707, + "step": 10755 + }, + { + "epoch": 2.1106316202432325, + "grad_norm": 17.356582641601562, + "learning_rate": 4.602534275828498e-06, + "loss": 4.3755, + "step": 10760 + }, + { + "epoch": 2.1116123970184386, + "grad_norm": 25.40827178955078, + "learning_rate": 4.598676161994262e-06, + "loss": 4.5291, + "step": 10765 + }, + { + "epoch": 2.1125931737936448, + "grad_norm": 12.720136642456055, + "learning_rate": 4.594818288643356e-06, + "loss": 4.5491, + "step": 10770 + }, + { + "epoch": 2.1135739505688504, + "grad_norm": 17.85308074951172, + "learning_rate": 4.59096065808752e-06, + "loss": 4.6167, + "step": 10775 + }, + { + "epoch": 2.1145547273440566, + "grad_norm": 38.109039306640625, + "learning_rate": 4.587103272638339e-06, + "loss": 4.8616, + "step": 10780 + }, + { + "epoch": 2.1155355041192623, + "grad_norm": 19.8563289642334, + "learning_rate": 4.583246134607258e-06, + "loss": 4.6665, + "step": 10785 + }, + { + "epoch": 2.1165162808944684, + "grad_norm": 39.26411437988281, + "learning_rate": 4.57938924630557e-06, + "loss": 4.3647, + "step": 10790 + }, + { + "epoch": 2.1174970576696746, + "grad_norm": 25.38083267211914, + "learning_rate": 4.575532610044419e-06, + "loss": 4.6439, + "step": 10795 + }, + { + "epoch": 2.1184778344448802, + "grad_norm": 18.37013816833496, + "learning_rate": 4.571676228134798e-06, + "loss": 4.7012, + "step": 10800 + }, + { + "epoch": 2.1194586112200864, + "grad_norm": 25.82213020324707, + "learning_rate": 4.567820102887552e-06, + "loss": 4.4759, + "step": 10805 + }, + { + "epoch": 2.120439387995292, + "grad_norm": 18.050235748291016, + "learning_rate": 4.563964236613362e-06, + "loss": 4.3479, + "step": 10810 + }, + { + "epoch": 2.121420164770498, + "grad_norm": 21.06696891784668, + "learning_rate": 4.560108631622765e-06, + "loss": 4.6009, + "step": 10815 + }, + { + "epoch": 2.1224009415457044, + "grad_norm": 26.383195877075195, + "learning_rate": 4.556253290226135e-06, + "loss": 4.4963, + "step": 10820 + }, + { + "epoch": 2.12338171832091, + "grad_norm": 15.887774467468262, + "learning_rate": 4.552398214733686e-06, + "loss": 4.3878, + "step": 10825 + }, + { + "epoch": 2.124362495096116, + "grad_norm": 23.23624038696289, + "learning_rate": 4.548543407455482e-06, + "loss": 4.7093, + "step": 10830 + }, + { + "epoch": 2.125343271871322, + "grad_norm": 17.61455535888672, + "learning_rate": 4.544688870701416e-06, + "loss": 4.5022, + "step": 10835 + }, + { + "epoch": 2.126324048646528, + "grad_norm": 12.441754341125488, + "learning_rate": 4.540834606781226e-06, + "loss": 4.4838, + "step": 10840 + }, + { + "epoch": 2.127304825421734, + "grad_norm": 15.740215301513672, + "learning_rate": 4.536980618004481e-06, + "loss": 4.3949, + "step": 10845 + }, + { + "epoch": 2.12828560219694, + "grad_norm": 13.362967491149902, + "learning_rate": 4.533126906680591e-06, + "loss": 4.495, + "step": 10850 + }, + { + "epoch": 2.129266378972146, + "grad_norm": 11.252303123474121, + "learning_rate": 4.529273475118797e-06, + "loss": 4.438, + "step": 10855 + }, + { + "epoch": 2.1302471557473517, + "grad_norm": 11.080646514892578, + "learning_rate": 4.525420325628167e-06, + "loss": 4.3683, + "step": 10860 + }, + { + "epoch": 2.131227932522558, + "grad_norm": 17.542789459228516, + "learning_rate": 4.521567460517612e-06, + "loss": 4.7441, + "step": 10865 + }, + { + "epoch": 2.132208709297764, + "grad_norm": 14.918002128601074, + "learning_rate": 4.517714882095859e-06, + "loss": 4.8096, + "step": 10870 + }, + { + "epoch": 2.1331894860729697, + "grad_norm": 16.332870483398438, + "learning_rate": 4.5138625926714734e-06, + "loss": 4.3374, + "step": 10875 + }, + { + "epoch": 2.134170262848176, + "grad_norm": 14.32162857055664, + "learning_rate": 4.510010594552846e-06, + "loss": 4.4616, + "step": 10880 + }, + { + "epoch": 2.1351510396233815, + "grad_norm": 26.749788284301758, + "learning_rate": 4.506158890048187e-06, + "loss": 4.7608, + "step": 10885 + }, + { + "epoch": 2.1361318163985876, + "grad_norm": 16.43140983581543, + "learning_rate": 4.502307481465536e-06, + "loss": 4.4194, + "step": 10890 + }, + { + "epoch": 2.1371125931737938, + "grad_norm": 36.98377990722656, + "learning_rate": 4.498456371112753e-06, + "loss": 4.265, + "step": 10895 + }, + { + "epoch": 2.1380933699489995, + "grad_norm": 12.881791114807129, + "learning_rate": 4.494605561297521e-06, + "loss": 4.7524, + "step": 10900 + }, + { + "epoch": 2.1390741467242056, + "grad_norm": 12.371302604675293, + "learning_rate": 4.4907550543273436e-06, + "loss": 4.4738, + "step": 10905 + }, + { + "epoch": 2.1400549234994117, + "grad_norm": 17.81466293334961, + "learning_rate": 4.486904852509537e-06, + "loss": 4.5088, + "step": 10910 + }, + { + "epoch": 2.1410357002746174, + "grad_norm": 14.954310417175293, + "learning_rate": 4.483054958151244e-06, + "loss": 4.4856, + "step": 10915 + }, + { + "epoch": 2.1420164770498236, + "grad_norm": 25.58282470703125, + "learning_rate": 4.479205373559415e-06, + "loss": 4.5459, + "step": 10920 + }, + { + "epoch": 2.1429972538250293, + "grad_norm": 13.035386085510254, + "learning_rate": 4.475356101040818e-06, + "loss": 4.2163, + "step": 10925 + }, + { + "epoch": 2.1439780306002354, + "grad_norm": 21.911907196044922, + "learning_rate": 4.471507142902036e-06, + "loss": 4.2241, + "step": 10930 + }, + { + "epoch": 2.1449588073754415, + "grad_norm": 13.07209300994873, + "learning_rate": 4.467658501449458e-06, + "loss": 4.6244, + "step": 10935 + }, + { + "epoch": 2.1459395841506472, + "grad_norm": 20.160175323486328, + "learning_rate": 4.463810178989291e-06, + "loss": 4.3972, + "step": 10940 + }, + { + "epoch": 2.1469203609258534, + "grad_norm": 15.461299896240234, + "learning_rate": 4.459962177827543e-06, + "loss": 4.3091, + "step": 10945 + }, + { + "epoch": 2.147901137701059, + "grad_norm": 28.049442291259766, + "learning_rate": 4.4561145002700325e-06, + "loss": 4.393, + "step": 10950 + }, + { + "epoch": 2.148881914476265, + "grad_norm": 21.617782592773438, + "learning_rate": 4.452267148622389e-06, + "loss": 4.5262, + "step": 10955 + }, + { + "epoch": 2.1498626912514713, + "grad_norm": 16.35369873046875, + "learning_rate": 4.448420125190039e-06, + "loss": 4.462, + "step": 10960 + }, + { + "epoch": 2.150843468026677, + "grad_norm": 19.262086868286133, + "learning_rate": 4.444573432278217e-06, + "loss": 4.565, + "step": 10965 + }, + { + "epoch": 2.151824244801883, + "grad_norm": 30.79473876953125, + "learning_rate": 4.440727072191956e-06, + "loss": 4.5552, + "step": 10970 + }, + { + "epoch": 2.152805021577089, + "grad_norm": 19.929492950439453, + "learning_rate": 4.436881047236092e-06, + "loss": 4.6865, + "step": 10975 + }, + { + "epoch": 2.153785798352295, + "grad_norm": 22.288394927978516, + "learning_rate": 4.433035359715264e-06, + "loss": 4.4575, + "step": 10980 + }, + { + "epoch": 2.154766575127501, + "grad_norm": 20.23619842529297, + "learning_rate": 4.429190011933899e-06, + "loss": 4.4639, + "step": 10985 + }, + { + "epoch": 2.155747351902707, + "grad_norm": 29.274555206298828, + "learning_rate": 4.425345006196231e-06, + "loss": 4.3485, + "step": 10990 + }, + { + "epoch": 2.156728128677913, + "grad_norm": 17.464130401611328, + "learning_rate": 4.421500344806281e-06, + "loss": 4.5714, + "step": 10995 + }, + { + "epoch": 2.1577089054531187, + "grad_norm": 33.32584762573242, + "learning_rate": 4.417656030067866e-06, + "loss": 4.645, + "step": 11000 + }, + { + "epoch": 2.158689682228325, + "grad_norm": 23.476160049438477, + "learning_rate": 4.4138120642846e-06, + "loss": 4.5932, + "step": 11005 + }, + { + "epoch": 2.159670459003531, + "grad_norm": 30.347623825073242, + "learning_rate": 4.409968449759879e-06, + "loss": 4.3533, + "step": 11010 + }, + { + "epoch": 2.1606512357787366, + "grad_norm": 24.87689781188965, + "learning_rate": 4.406125188796898e-06, + "loss": 4.6235, + "step": 11015 + }, + { + "epoch": 2.1616320125539428, + "grad_norm": 15.419154167175293, + "learning_rate": 4.4022822836986315e-06, + "loss": 4.4848, + "step": 11020 + }, + { + "epoch": 2.162612789329149, + "grad_norm": 21.806236267089844, + "learning_rate": 4.3984397367678475e-06, + "loss": 4.7521, + "step": 11025 + }, + { + "epoch": 2.1635935661043546, + "grad_norm": 29.77591896057129, + "learning_rate": 4.394597550307097e-06, + "loss": 4.8401, + "step": 11030 + }, + { + "epoch": 2.1645743428795607, + "grad_norm": 13.51814079284668, + "learning_rate": 4.390755726618714e-06, + "loss": 4.5055, + "step": 11035 + }, + { + "epoch": 2.1655551196547664, + "grad_norm": 13.01634407043457, + "learning_rate": 4.386914268004815e-06, + "loss": 4.69, + "step": 11040 + }, + { + "epoch": 2.1665358964299726, + "grad_norm": 16.430099487304688, + "learning_rate": 4.383073176767299e-06, + "loss": 4.8869, + "step": 11045 + }, + { + "epoch": 2.1675166732051787, + "grad_norm": 15.964555740356445, + "learning_rate": 4.379232455207843e-06, + "loss": 4.2059, + "step": 11050 + }, + { + "epoch": 2.1684974499803844, + "grad_norm": 12.990334510803223, + "learning_rate": 4.375392105627909e-06, + "loss": 4.5416, + "step": 11055 + }, + { + "epoch": 2.1694782267555905, + "grad_norm": 28.908823013305664, + "learning_rate": 4.371552130328725e-06, + "loss": 4.3389, + "step": 11060 + }, + { + "epoch": 2.1704590035307962, + "grad_norm": 34.378807067871094, + "learning_rate": 4.367712531611305e-06, + "loss": 4.5571, + "step": 11065 + }, + { + "epoch": 2.1714397803060024, + "grad_norm": 18.134485244750977, + "learning_rate": 4.3638733117764295e-06, + "loss": 4.2235, + "step": 11070 + }, + { + "epoch": 2.1724205570812085, + "grad_norm": 25.642366409301758, + "learning_rate": 4.360034473124658e-06, + "loss": 4.6163, + "step": 11075 + }, + { + "epoch": 2.173401333856414, + "grad_norm": 10.750842094421387, + "learning_rate": 4.35619601795632e-06, + "loss": 4.6841, + "step": 11080 + }, + { + "epoch": 2.1743821106316203, + "grad_norm": 12.613737106323242, + "learning_rate": 4.3523579485715105e-06, + "loss": 4.5547, + "step": 11085 + }, + { + "epoch": 2.175362887406826, + "grad_norm": 14.869281768798828, + "learning_rate": 4.348520267270102e-06, + "loss": 4.6588, + "step": 11090 + }, + { + "epoch": 2.176343664182032, + "grad_norm": 21.500011444091797, + "learning_rate": 4.344682976351725e-06, + "loss": 4.3966, + "step": 11095 + }, + { + "epoch": 2.1773244409572383, + "grad_norm": 13.638593673706055, + "learning_rate": 4.340846078115784e-06, + "loss": 4.429, + "step": 11100 + }, + { + "epoch": 2.178305217732444, + "grad_norm": 22.606096267700195, + "learning_rate": 4.337009574861443e-06, + "loss": 5.035, + "step": 11105 + }, + { + "epoch": 2.17928599450765, + "grad_norm": 13.18099594116211, + "learning_rate": 4.333173468887632e-06, + "loss": 4.4518, + "step": 11110 + }, + { + "epoch": 2.180266771282856, + "grad_norm": 10.771079063415527, + "learning_rate": 4.329337762493044e-06, + "loss": 4.8277, + "step": 11115 + }, + { + "epoch": 2.181247548058062, + "grad_norm": 15.482091903686523, + "learning_rate": 4.325502457976126e-06, + "loss": 4.3848, + "step": 11120 + }, + { + "epoch": 2.182228324833268, + "grad_norm": 17.949617385864258, + "learning_rate": 4.321667557635092e-06, + "loss": 4.3615, + "step": 11125 + }, + { + "epoch": 2.183209101608474, + "grad_norm": 18.554176330566406, + "learning_rate": 4.317833063767912e-06, + "loss": 4.4504, + "step": 11130 + }, + { + "epoch": 2.18418987838368, + "grad_norm": 30.101398468017578, + "learning_rate": 4.313998978672308e-06, + "loss": 4.6963, + "step": 11135 + }, + { + "epoch": 2.1851706551588856, + "grad_norm": 13.602922439575195, + "learning_rate": 4.310165304645763e-06, + "loss": 4.5375, + "step": 11140 + }, + { + "epoch": 2.186151431934092, + "grad_norm": 15.757498741149902, + "learning_rate": 4.3063320439855085e-06, + "loss": 4.5059, + "step": 11145 + }, + { + "epoch": 2.187132208709298, + "grad_norm": 23.122838973999023, + "learning_rate": 4.302499198988531e-06, + "loss": 4.4306, + "step": 11150 + }, + { + "epoch": 2.1881129854845036, + "grad_norm": 23.106101989746094, + "learning_rate": 4.29866677195157e-06, + "loss": 4.1968, + "step": 11155 + }, + { + "epoch": 2.1890937622597098, + "grad_norm": 17.85923194885254, + "learning_rate": 4.294834765171108e-06, + "loss": 4.6488, + "step": 11160 + }, + { + "epoch": 2.1900745390349154, + "grad_norm": 8.963134765625, + "learning_rate": 4.291003180943385e-06, + "loss": 4.3469, + "step": 11165 + }, + { + "epoch": 2.1910553158101216, + "grad_norm": 28.65680503845215, + "learning_rate": 4.287172021564377e-06, + "loss": 4.349, + "step": 11170 + }, + { + "epoch": 2.1920360925853277, + "grad_norm": 23.335914611816406, + "learning_rate": 4.283341289329815e-06, + "loss": 4.6713, + "step": 11175 + }, + { + "epoch": 2.1930168693605334, + "grad_norm": 10.928681373596191, + "learning_rate": 4.279510986535169e-06, + "loss": 4.388, + "step": 11180 + }, + { + "epoch": 2.1939976461357396, + "grad_norm": 15.44399642944336, + "learning_rate": 4.275681115475651e-06, + "loss": 4.4258, + "step": 11185 + }, + { + "epoch": 2.1949784229109452, + "grad_norm": 32.2411003112793, + "learning_rate": 4.27185167844622e-06, + "loss": 4.3384, + "step": 11190 + }, + { + "epoch": 2.1959591996861514, + "grad_norm": 18.149885177612305, + "learning_rate": 4.268022677741566e-06, + "loss": 4.7455, + "step": 11195 + }, + { + "epoch": 2.1969399764613575, + "grad_norm": 18.782310485839844, + "learning_rate": 4.264194115656124e-06, + "loss": 4.6965, + "step": 11200 + }, + { + "epoch": 2.197920753236563, + "grad_norm": 11.856428146362305, + "learning_rate": 4.260365994484069e-06, + "loss": 4.4527, + "step": 11205 + }, + { + "epoch": 2.1989015300117694, + "grad_norm": 21.500896453857422, + "learning_rate": 4.256538316519303e-06, + "loss": 4.3852, + "step": 11210 + }, + { + "epoch": 2.199882306786975, + "grad_norm": 16.60177993774414, + "learning_rate": 4.252711084055468e-06, + "loss": 4.3806, + "step": 11215 + }, + { + "epoch": 2.200863083562181, + "grad_norm": 18.734601974487305, + "learning_rate": 4.248884299385937e-06, + "loss": 4.4284, + "step": 11220 + }, + { + "epoch": 2.2018438603373873, + "grad_norm": 21.753625869750977, + "learning_rate": 4.245057964803815e-06, + "loss": 4.5484, + "step": 11225 + }, + { + "epoch": 2.202824637112593, + "grad_norm": 13.715068817138672, + "learning_rate": 4.2412320826019425e-06, + "loss": 4.5648, + "step": 11230 + }, + { + "epoch": 2.203805413887799, + "grad_norm": 28.02129554748535, + "learning_rate": 4.237406655072879e-06, + "loss": 4.5578, + "step": 11235 + }, + { + "epoch": 2.2047861906630053, + "grad_norm": 19.558780670166016, + "learning_rate": 4.23358168450892e-06, + "loss": 4.7332, + "step": 11240 + }, + { + "epoch": 2.205766967438211, + "grad_norm": 27.703575134277344, + "learning_rate": 4.229757173202082e-06, + "loss": 4.7346, + "step": 11245 + }, + { + "epoch": 2.206747744213417, + "grad_norm": 20.822980880737305, + "learning_rate": 4.225933123444108e-06, + "loss": 4.5904, + "step": 11250 + }, + { + "epoch": 2.207728520988623, + "grad_norm": 23.84830665588379, + "learning_rate": 4.22210953752647e-06, + "loss": 4.6531, + "step": 11255 + }, + { + "epoch": 2.208709297763829, + "grad_norm": 10.470589637756348, + "learning_rate": 4.218286417740348e-06, + "loss": 4.2309, + "step": 11260 + }, + { + "epoch": 2.209690074539035, + "grad_norm": 22.220144271850586, + "learning_rate": 4.21446376637666e-06, + "loss": 4.3394, + "step": 11265 + }, + { + "epoch": 2.210670851314241, + "grad_norm": 22.664947509765625, + "learning_rate": 4.210641585726029e-06, + "loss": 4.6662, + "step": 11270 + }, + { + "epoch": 2.211651628089447, + "grad_norm": 18.833683013916016, + "learning_rate": 4.206819878078803e-06, + "loss": 4.368, + "step": 11275 + }, + { + "epoch": 2.2126324048646526, + "grad_norm": 15.988152503967285, + "learning_rate": 4.2029986457250495e-06, + "loss": 4.3532, + "step": 11280 + }, + { + "epoch": 2.2136131816398588, + "grad_norm": 37.39259719848633, + "learning_rate": 4.199177890954541e-06, + "loss": 4.5568, + "step": 11285 + }, + { + "epoch": 2.214593958415065, + "grad_norm": 10.75390625, + "learning_rate": 4.195357616056774e-06, + "loss": 4.4022, + "step": 11290 + }, + { + "epoch": 2.2155747351902706, + "grad_norm": 25.854948043823242, + "learning_rate": 4.19153782332095e-06, + "loss": 4.4454, + "step": 11295 + }, + { + "epoch": 2.2165555119654767, + "grad_norm": 16.350391387939453, + "learning_rate": 4.187718515035986e-06, + "loss": 4.3026, + "step": 11300 + }, + { + "epoch": 2.2175362887406824, + "grad_norm": 11.197614669799805, + "learning_rate": 4.18389969349051e-06, + "loss": 4.378, + "step": 11305 + }, + { + "epoch": 2.2185170655158886, + "grad_norm": 21.271650314331055, + "learning_rate": 4.180081360972852e-06, + "loss": 4.5606, + "step": 11310 + }, + { + "epoch": 2.2194978422910947, + "grad_norm": 31.11151123046875, + "learning_rate": 4.176263519771058e-06, + "loss": 4.7756, + "step": 11315 + }, + { + "epoch": 2.2204786190663004, + "grad_norm": 18.244281768798828, + "learning_rate": 4.172446172172868e-06, + "loss": 4.5988, + "step": 11320 + }, + { + "epoch": 2.2214593958415065, + "grad_norm": 17.656965255737305, + "learning_rate": 4.168629320465737e-06, + "loss": 4.2405, + "step": 11325 + }, + { + "epoch": 2.2224401726167122, + "grad_norm": 17.68485450744629, + "learning_rate": 4.164812966936818e-06, + "loss": 4.6696, + "step": 11330 + }, + { + "epoch": 2.2234209493919184, + "grad_norm": 19.626468658447266, + "learning_rate": 4.160997113872964e-06, + "loss": 4.6334, + "step": 11335 + }, + { + "epoch": 2.2244017261671245, + "grad_norm": 12.249372482299805, + "learning_rate": 4.157181763560732e-06, + "loss": 4.4522, + "step": 11340 + }, + { + "epoch": 2.22538250294233, + "grad_norm": 18.31877326965332, + "learning_rate": 4.153366918286374e-06, + "loss": 4.6883, + "step": 11345 + }, + { + "epoch": 2.2263632797175363, + "grad_norm": 16.244726181030273, + "learning_rate": 4.149552580335843e-06, + "loss": 4.6329, + "step": 11350 + }, + { + "epoch": 2.2273440564927425, + "grad_norm": 38.69582748413086, + "learning_rate": 4.1457387519947864e-06, + "loss": 4.6677, + "step": 11355 + }, + { + "epoch": 2.228324833267948, + "grad_norm": 16.428791046142578, + "learning_rate": 4.141925435548545e-06, + "loss": 4.4718, + "step": 11360 + }, + { + "epoch": 2.2293056100431543, + "grad_norm": 30.045249938964844, + "learning_rate": 4.138112633282154e-06, + "loss": 4.5068, + "step": 11365 + }, + { + "epoch": 2.23028638681836, + "grad_norm": 21.004417419433594, + "learning_rate": 4.13430034748034e-06, + "loss": 4.5831, + "step": 11370 + }, + { + "epoch": 2.231267163593566, + "grad_norm": 18.86147689819336, + "learning_rate": 4.13048858042752e-06, + "loss": 4.4865, + "step": 11375 + }, + { + "epoch": 2.2322479403687723, + "grad_norm": 26.14810562133789, + "learning_rate": 4.126677334407804e-06, + "loss": 4.5101, + "step": 11380 + }, + { + "epoch": 2.233228717143978, + "grad_norm": 23.22881507873535, + "learning_rate": 4.122866611704981e-06, + "loss": 4.6181, + "step": 11385 + }, + { + "epoch": 2.234209493919184, + "grad_norm": 21.121158599853516, + "learning_rate": 4.119056414602538e-06, + "loss": 4.397, + "step": 11390 + }, + { + "epoch": 2.23519027069439, + "grad_norm": 22.4326229095459, + "learning_rate": 4.115246745383636e-06, + "loss": 4.5258, + "step": 11395 + }, + { + "epoch": 2.236171047469596, + "grad_norm": 11.575772285461426, + "learning_rate": 4.111437606331126e-06, + "loss": 4.5109, + "step": 11400 + }, + { + "epoch": 2.237151824244802, + "grad_norm": 34.85311508178711, + "learning_rate": 4.107628999727542e-06, + "loss": 4.6383, + "step": 11405 + }, + { + "epoch": 2.2381326010200078, + "grad_norm": 37.00575256347656, + "learning_rate": 4.103820927855092e-06, + "loss": 4.5175, + "step": 11410 + }, + { + "epoch": 2.239113377795214, + "grad_norm": 15.301085472106934, + "learning_rate": 4.1000133929956745e-06, + "loss": 4.5078, + "step": 11415 + }, + { + "epoch": 2.2400941545704196, + "grad_norm": 19.144397735595703, + "learning_rate": 4.096206397430855e-06, + "loss": 4.3975, + "step": 11420 + }, + { + "epoch": 2.2410749313456257, + "grad_norm": 22.98430633544922, + "learning_rate": 4.092399943441884e-06, + "loss": 4.6567, + "step": 11425 + }, + { + "epoch": 2.242055708120832, + "grad_norm": 16.777803421020508, + "learning_rate": 4.088594033309683e-06, + "loss": 4.5046, + "step": 11430 + }, + { + "epoch": 2.2430364848960376, + "grad_norm": 19.35264015197754, + "learning_rate": 4.0847886693148495e-06, + "loss": 4.2147, + "step": 11435 + }, + { + "epoch": 2.2440172616712437, + "grad_norm": 29.10360336303711, + "learning_rate": 4.080983853737654e-06, + "loss": 4.7951, + "step": 11440 + }, + { + "epoch": 2.2449980384464494, + "grad_norm": 16.541683197021484, + "learning_rate": 4.077179588858035e-06, + "loss": 4.389, + "step": 11445 + }, + { + "epoch": 2.2459788152216555, + "grad_norm": 18.082550048828125, + "learning_rate": 4.073375876955606e-06, + "loss": 4.8307, + "step": 11450 + }, + { + "epoch": 2.2469595919968617, + "grad_norm": 13.92489242553711, + "learning_rate": 4.0695727203096466e-06, + "loss": 4.599, + "step": 11455 + }, + { + "epoch": 2.2479403687720674, + "grad_norm": 22.647361755371094, + "learning_rate": 4.065770121199103e-06, + "loss": 4.2624, + "step": 11460 + }, + { + "epoch": 2.2489211455472735, + "grad_norm": 23.03203773498535, + "learning_rate": 4.061968081902591e-06, + "loss": 4.4389, + "step": 11465 + }, + { + "epoch": 2.249901922322479, + "grad_norm": 22.755691528320312, + "learning_rate": 4.058166604698384e-06, + "loss": 4.614, + "step": 11470 + }, + { + "epoch": 2.2508826990976853, + "grad_norm": 28.409160614013672, + "learning_rate": 4.054365691864423e-06, + "loss": 4.5129, + "step": 11475 + }, + { + "epoch": 2.2508826990976853, + "eval_loss": 4.867314338684082, + "eval_runtime": 7.6481, + "eval_samples_per_second": 27.327, + "eval_steps_per_second": 13.729, + "step": 11475 + }, + { + "epoch": 2.2518634758728915, + "grad_norm": 17.517236709594727, + "learning_rate": 4.050565345678316e-06, + "loss": 4.4421, + "step": 11480 + }, + { + "epoch": 2.252844252648097, + "grad_norm": 25.891996383666992, + "learning_rate": 4.046765568417318e-06, + "loss": 4.382, + "step": 11485 + }, + { + "epoch": 2.2538250294233033, + "grad_norm": 42.0782470703125, + "learning_rate": 4.042966362358358e-06, + "loss": 4.7125, + "step": 11490 + }, + { + "epoch": 2.254805806198509, + "grad_norm": 19.0367374420166, + "learning_rate": 4.039167729778011e-06, + "loss": 4.5896, + "step": 11495 + }, + { + "epoch": 2.255786582973715, + "grad_norm": 15.167940139770508, + "learning_rate": 4.035369672952516e-06, + "loss": 4.5573, + "step": 11500 + }, + { + "epoch": 2.2567673597489213, + "grad_norm": 24.878446578979492, + "learning_rate": 4.031572194157764e-06, + "loss": 4.7611, + "step": 11505 + }, + { + "epoch": 2.257748136524127, + "grad_norm": 15.153958320617676, + "learning_rate": 4.027775295669297e-06, + "loss": 4.6099, + "step": 11510 + }, + { + "epoch": 2.258728913299333, + "grad_norm": 16.728437423706055, + "learning_rate": 4.023978979762316e-06, + "loss": 4.5316, + "step": 11515 + }, + { + "epoch": 2.259709690074539, + "grad_norm": 21.663965225219727, + "learning_rate": 4.0201832487116655e-06, + "loss": 4.9828, + "step": 11520 + }, + { + "epoch": 2.260690466849745, + "grad_norm": 18.675153732299805, + "learning_rate": 4.016388104791843e-06, + "loss": 4.2137, + "step": 11525 + }, + { + "epoch": 2.261671243624951, + "grad_norm": 15.859734535217285, + "learning_rate": 4.0125935502769984e-06, + "loss": 4.3856, + "step": 11530 + }, + { + "epoch": 2.262652020400157, + "grad_norm": 25.595434188842773, + "learning_rate": 4.00879958744092e-06, + "loss": 4.3875, + "step": 11535 + }, + { + "epoch": 2.263632797175363, + "grad_norm": 21.921201705932617, + "learning_rate": 4.005006218557048e-06, + "loss": 4.3901, + "step": 11540 + }, + { + "epoch": 2.2646135739505686, + "grad_norm": 25.38811683654785, + "learning_rate": 4.001213445898462e-06, + "loss": 4.8216, + "step": 11545 + }, + { + "epoch": 2.2655943507257748, + "grad_norm": 29.609708786010742, + "learning_rate": 3.997421271737887e-06, + "loss": 4.3386, + "step": 11550 + }, + { + "epoch": 2.266575127500981, + "grad_norm": 22.167301177978516, + "learning_rate": 3.993629698347693e-06, + "loss": 4.552, + "step": 11555 + }, + { + "epoch": 2.2675559042761866, + "grad_norm": 24.351184844970703, + "learning_rate": 3.989838727999881e-06, + "loss": 4.8661, + "step": 11560 + }, + { + "epoch": 2.2685366810513927, + "grad_norm": 16.84194564819336, + "learning_rate": 3.9860483629661e-06, + "loss": 4.5044, + "step": 11565 + }, + { + "epoch": 2.2695174578265984, + "grad_norm": 21.152740478515625, + "learning_rate": 3.982258605517627e-06, + "loss": 4.558, + "step": 11570 + }, + { + "epoch": 2.2704982346018046, + "grad_norm": 16.635719299316406, + "learning_rate": 3.978469457925385e-06, + "loss": 4.4371, + "step": 11575 + }, + { + "epoch": 2.2714790113770107, + "grad_norm": 17.723609924316406, + "learning_rate": 3.974680922459926e-06, + "loss": 4.5154, + "step": 11580 + }, + { + "epoch": 2.2724597881522164, + "grad_norm": 16.740421295166016, + "learning_rate": 3.970893001391431e-06, + "loss": 4.2428, + "step": 11585 + }, + { + "epoch": 2.2734405649274225, + "grad_norm": 27.927316665649414, + "learning_rate": 3.967105696989723e-06, + "loss": 4.3807, + "step": 11590 + }, + { + "epoch": 2.2744213417026287, + "grad_norm": 14.875885963439941, + "learning_rate": 3.963319011524246e-06, + "loss": 4.3996, + "step": 11595 + }, + { + "epoch": 2.2754021184778344, + "grad_norm": 14.314786911010742, + "learning_rate": 3.959532947264078e-06, + "loss": 4.5584, + "step": 11600 + }, + { + "epoch": 2.2763828952530405, + "grad_norm": 17.703577041625977, + "learning_rate": 3.955747506477927e-06, + "loss": 4.4526, + "step": 11605 + }, + { + "epoch": 2.277363672028246, + "grad_norm": 51.20561981201172, + "learning_rate": 3.951962691434121e-06, + "loss": 4.802, + "step": 11610 + }, + { + "epoch": 2.2783444488034523, + "grad_norm": 14.392014503479004, + "learning_rate": 3.948178504400619e-06, + "loss": 4.8058, + "step": 11615 + }, + { + "epoch": 2.2793252255786585, + "grad_norm": 14.023991584777832, + "learning_rate": 3.944394947644996e-06, + "loss": 4.1792, + "step": 11620 + }, + { + "epoch": 2.280306002353864, + "grad_norm": 20.808156967163086, + "learning_rate": 3.940612023434459e-06, + "loss": 4.5473, + "step": 11625 + }, + { + "epoch": 2.2812867791290703, + "grad_norm": 42.275081634521484, + "learning_rate": 3.936829734035831e-06, + "loss": 4.7274, + "step": 11630 + }, + { + "epoch": 2.2822675559042764, + "grad_norm": 21.149211883544922, + "learning_rate": 3.933048081715553e-06, + "loss": 4.6594, + "step": 11635 + }, + { + "epoch": 2.283248332679482, + "grad_norm": 29.46340560913086, + "learning_rate": 3.929267068739687e-06, + "loss": 4.4786, + "step": 11640 + }, + { + "epoch": 2.2842291094546883, + "grad_norm": 13.892107009887695, + "learning_rate": 3.925486697373911e-06, + "loss": 4.5621, + "step": 11645 + }, + { + "epoch": 2.285209886229894, + "grad_norm": 15.604551315307617, + "learning_rate": 3.9217069698835175e-06, + "loss": 4.7846, + "step": 11650 + }, + { + "epoch": 2.2861906630051, + "grad_norm": 27.05708885192871, + "learning_rate": 3.917927888533418e-06, + "loss": 4.2979, + "step": 11655 + }, + { + "epoch": 2.2871714397803062, + "grad_norm": 24.699222564697266, + "learning_rate": 3.914149455588127e-06, + "loss": 4.6958, + "step": 11660 + }, + { + "epoch": 2.288152216555512, + "grad_norm": 15.108890533447266, + "learning_rate": 3.910371673311783e-06, + "loss": 4.522, + "step": 11665 + }, + { + "epoch": 2.289132993330718, + "grad_norm": 24.2240047454834, + "learning_rate": 3.906594543968122e-06, + "loss": 4.2788, + "step": 11670 + }, + { + "epoch": 2.2901137701059238, + "grad_norm": 20.101787567138672, + "learning_rate": 3.902818069820498e-06, + "loss": 4.5444, + "step": 11675 + }, + { + "epoch": 2.29109454688113, + "grad_norm": 16.885448455810547, + "learning_rate": 3.8990422531318705e-06, + "loss": 4.5266, + "step": 11680 + }, + { + "epoch": 2.292075323656336, + "grad_norm": 15.544132232666016, + "learning_rate": 3.895267096164802e-06, + "loss": 4.984, + "step": 11685 + }, + { + "epoch": 2.2930561004315417, + "grad_norm": 19.72736167907715, + "learning_rate": 3.891492601181462e-06, + "loss": 4.5247, + "step": 11690 + }, + { + "epoch": 2.294036877206748, + "grad_norm": 37.8563346862793, + "learning_rate": 3.887718770443622e-06, + "loss": 4.5902, + "step": 11695 + }, + { + "epoch": 2.2950176539819536, + "grad_norm": 14.793488502502441, + "learning_rate": 3.883945606212655e-06, + "loss": 4.7889, + "step": 11700 + }, + { + "epoch": 2.2959984307571597, + "grad_norm": 25.496313095092773, + "learning_rate": 3.880173110749541e-06, + "loss": 4.6955, + "step": 11705 + }, + { + "epoch": 2.296979207532366, + "grad_norm": 21.444076538085938, + "learning_rate": 3.876401286314848e-06, + "loss": 4.4975, + "step": 11710 + }, + { + "epoch": 2.2979599843075715, + "grad_norm": 17.857149124145508, + "learning_rate": 3.872630135168753e-06, + "loss": 4.4218, + "step": 11715 + }, + { + "epoch": 2.2989407610827777, + "grad_norm": 19.443084716796875, + "learning_rate": 3.868859659571022e-06, + "loss": 4.407, + "step": 11720 + }, + { + "epoch": 2.2999215378579834, + "grad_norm": 15.299306869506836, + "learning_rate": 3.865089861781017e-06, + "loss": 4.7506, + "step": 11725 + }, + { + "epoch": 2.3009023146331895, + "grad_norm": 13.797977447509766, + "learning_rate": 3.861320744057701e-06, + "loss": 4.6852, + "step": 11730 + }, + { + "epoch": 2.3018830914083956, + "grad_norm": 28.186782836914062, + "learning_rate": 3.857552308659618e-06, + "loss": 4.7819, + "step": 11735 + }, + { + "epoch": 2.3028638681836013, + "grad_norm": 25.146703720092773, + "learning_rate": 3.8537845578449146e-06, + "loss": 4.6642, + "step": 11740 + }, + { + "epoch": 2.3038446449588075, + "grad_norm": 12.95285415649414, + "learning_rate": 3.850017493871317e-06, + "loss": 4.4853, + "step": 11745 + }, + { + "epoch": 2.304825421734013, + "grad_norm": 21.151214599609375, + "learning_rate": 3.846251118996148e-06, + "loss": 4.5531, + "step": 11750 + }, + { + "epoch": 2.3058061985092193, + "grad_norm": 13.681554794311523, + "learning_rate": 3.842485435476313e-06, + "loss": 4.5772, + "step": 11755 + }, + { + "epoch": 2.3067869752844254, + "grad_norm": 14.076757431030273, + "learning_rate": 3.838720445568304e-06, + "loss": 4.5151, + "step": 11760 + }, + { + "epoch": 2.307767752059631, + "grad_norm": 22.38243865966797, + "learning_rate": 3.834956151528198e-06, + "loss": 4.3797, + "step": 11765 + }, + { + "epoch": 2.3087485288348373, + "grad_norm": 17.647878646850586, + "learning_rate": 3.831192555611654e-06, + "loss": 4.3728, + "step": 11770 + }, + { + "epoch": 2.309729305610043, + "grad_norm": 24.30431365966797, + "learning_rate": 3.827429660073913e-06, + "loss": 4.6663, + "step": 11775 + }, + { + "epoch": 2.310710082385249, + "grad_norm": 16.04897689819336, + "learning_rate": 3.8236674671698e-06, + "loss": 4.3886, + "step": 11780 + }, + { + "epoch": 2.3116908591604552, + "grad_norm": 20.717838287353516, + "learning_rate": 3.8199059791537105e-06, + "loss": 4.2956, + "step": 11785 + }, + { + "epoch": 2.312671635935661, + "grad_norm": 50.36362075805664, + "learning_rate": 3.816145198279626e-06, + "loss": 4.6332, + "step": 11790 + }, + { + "epoch": 2.313652412710867, + "grad_norm": 17.40586280822754, + "learning_rate": 3.8123851268011006e-06, + "loss": 4.6359, + "step": 11795 + }, + { + "epoch": 2.3146331894860728, + "grad_norm": 22.39040756225586, + "learning_rate": 3.8086257669712617e-06, + "loss": 4.4741, + "step": 11800 + }, + { + "epoch": 2.315613966261279, + "grad_norm": 18.884647369384766, + "learning_rate": 3.8048671210428157e-06, + "loss": 4.3912, + "step": 11805 + }, + { + "epoch": 2.316594743036485, + "grad_norm": 29.65871810913086, + "learning_rate": 3.8011091912680337e-06, + "loss": 4.6412, + "step": 11810 + }, + { + "epoch": 2.3175755198116907, + "grad_norm": 21.042131423950195, + "learning_rate": 3.7973519798987653e-06, + "loss": 4.4512, + "step": 11815 + }, + { + "epoch": 2.318556296586897, + "grad_norm": 16.669906616210938, + "learning_rate": 3.7935954891864222e-06, + "loss": 4.654, + "step": 11820 + }, + { + "epoch": 2.3195370733621026, + "grad_norm": 29.70143699645996, + "learning_rate": 3.7898397213819916e-06, + "loss": 4.3917, + "step": 11825 + }, + { + "epoch": 2.3205178501373087, + "grad_norm": 23.048263549804688, + "learning_rate": 3.786084678736024e-06, + "loss": 4.6097, + "step": 11830 + }, + { + "epoch": 2.321498626912515, + "grad_norm": 13.745390892028809, + "learning_rate": 3.7823303634986313e-06, + "loss": 4.4516, + "step": 11835 + }, + { + "epoch": 2.3224794036877205, + "grad_norm": 14.595437049865723, + "learning_rate": 3.7785767779194984e-06, + "loss": 4.469, + "step": 11840 + }, + { + "epoch": 2.3234601804629267, + "grad_norm": 30.718673706054688, + "learning_rate": 3.774823924247864e-06, + "loss": 4.0418, + "step": 11845 + }, + { + "epoch": 2.3244409572381324, + "grad_norm": 20.652172088623047, + "learning_rate": 3.771071804732534e-06, + "loss": 4.3759, + "step": 11850 + }, + { + "epoch": 2.3254217340133385, + "grad_norm": 30.810976028442383, + "learning_rate": 3.7673204216218757e-06, + "loss": 4.4283, + "step": 11855 + }, + { + "epoch": 2.3264025107885447, + "grad_norm": 12.208151817321777, + "learning_rate": 3.763569777163808e-06, + "loss": 4.5245, + "step": 11860 + }, + { + "epoch": 2.3273832875637503, + "grad_norm": 33.74302291870117, + "learning_rate": 3.759819873605813e-06, + "loss": 4.6234, + "step": 11865 + }, + { + "epoch": 2.3283640643389565, + "grad_norm": 22.965965270996094, + "learning_rate": 3.7560707131949276e-06, + "loss": 4.6614, + "step": 11870 + }, + { + "epoch": 2.329344841114162, + "grad_norm": 17.791505813598633, + "learning_rate": 3.752322298177741e-06, + "loss": 4.6324, + "step": 11875 + }, + { + "epoch": 2.3303256178893683, + "grad_norm": 22.28071403503418, + "learning_rate": 3.7485746308004013e-06, + "loss": 4.6046, + "step": 11880 + }, + { + "epoch": 2.3313063946645745, + "grad_norm": 14.98755931854248, + "learning_rate": 3.744827713308601e-06, + "loss": 4.6156, + "step": 11885 + }, + { + "epoch": 2.33228717143978, + "grad_norm": 22.762523651123047, + "learning_rate": 3.7410815479475903e-06, + "loss": 4.4806, + "step": 11890 + }, + { + "epoch": 2.3332679482149863, + "grad_norm": 20.753673553466797, + "learning_rate": 3.7373361369621638e-06, + "loss": 4.3978, + "step": 11895 + }, + { + "epoch": 2.3342487249901924, + "grad_norm": 15.564764022827148, + "learning_rate": 3.733591482596667e-06, + "loss": 4.728, + "step": 11900 + }, + { + "epoch": 2.335229501765398, + "grad_norm": 22.621809005737305, + "learning_rate": 3.729847587094991e-06, + "loss": 4.5043, + "step": 11905 + }, + { + "epoch": 2.3362102785406043, + "grad_norm": 21.3169002532959, + "learning_rate": 3.72610445270057e-06, + "loss": 4.48, + "step": 11910 + }, + { + "epoch": 2.33719105531581, + "grad_norm": 10.993232727050781, + "learning_rate": 3.7223620816563884e-06, + "loss": 4.6117, + "step": 11915 + }, + { + "epoch": 2.338171832091016, + "grad_norm": 22.939523696899414, + "learning_rate": 3.7186204762049638e-06, + "loss": 4.5433, + "step": 11920 + }, + { + "epoch": 2.3391526088662222, + "grad_norm": 24.98711585998535, + "learning_rate": 3.714879638588363e-06, + "loss": 4.6915, + "step": 11925 + }, + { + "epoch": 2.340133385641428, + "grad_norm": 25.30660057067871, + "learning_rate": 3.7111395710481924e-06, + "loss": 4.3815, + "step": 11930 + }, + { + "epoch": 2.341114162416634, + "grad_norm": 19.658090591430664, + "learning_rate": 3.70740027582559e-06, + "loss": 4.4353, + "step": 11935 + }, + { + "epoch": 2.3420949391918398, + "grad_norm": 24.44713592529297, + "learning_rate": 3.7036617551612387e-06, + "loss": 4.267, + "step": 11940 + }, + { + "epoch": 2.343075715967046, + "grad_norm": 12.363530158996582, + "learning_rate": 3.699924011295352e-06, + "loss": 4.7713, + "step": 11945 + }, + { + "epoch": 2.344056492742252, + "grad_norm": 21.272781372070312, + "learning_rate": 3.6961870464676796e-06, + "loss": 4.9373, + "step": 11950 + }, + { + "epoch": 2.3450372695174577, + "grad_norm": 26.136646270751953, + "learning_rate": 3.6924508629175083e-06, + "loss": 4.5627, + "step": 11955 + }, + { + "epoch": 2.346018046292664, + "grad_norm": 18.093294143676758, + "learning_rate": 3.6887154628836492e-06, + "loss": 4.468, + "step": 11960 + }, + { + "epoch": 2.34699882306787, + "grad_norm": 17.701435089111328, + "learning_rate": 3.6849808486044515e-06, + "loss": 4.6783, + "step": 11965 + }, + { + "epoch": 2.3479795998430757, + "grad_norm": 16.679710388183594, + "learning_rate": 3.6812470223177865e-06, + "loss": 4.7766, + "step": 11970 + }, + { + "epoch": 2.348960376618282, + "grad_norm": 17.240333557128906, + "learning_rate": 3.6775139862610577e-06, + "loss": 5.2171, + "step": 11975 + }, + { + "epoch": 2.3499411533934875, + "grad_norm": 16.750885009765625, + "learning_rate": 3.6737817426711973e-06, + "loss": 4.4293, + "step": 11980 + }, + { + "epoch": 2.3509219301686937, + "grad_norm": 32.81517028808594, + "learning_rate": 3.6700502937846543e-06, + "loss": 4.8392, + "step": 11985 + }, + { + "epoch": 2.3519027069439, + "grad_norm": 27.170766830444336, + "learning_rate": 3.6663196418374114e-06, + "loss": 4.4677, + "step": 11990 + }, + { + "epoch": 2.3528834837191055, + "grad_norm": 20.98493766784668, + "learning_rate": 3.6625897890649653e-06, + "loss": 4.6036, + "step": 11995 + }, + { + "epoch": 2.3538642604943116, + "grad_norm": 14.241714477539062, + "learning_rate": 3.65886073770234e-06, + "loss": 4.6737, + "step": 12000 + }, + { + "epoch": 2.3548450372695173, + "grad_norm": 13.720659255981445, + "learning_rate": 3.655132489984077e-06, + "loss": 4.6364, + "step": 12005 + }, + { + "epoch": 2.3558258140447235, + "grad_norm": 20.04840850830078, + "learning_rate": 3.6514050481442336e-06, + "loss": 4.6231, + "step": 12010 + }, + { + "epoch": 2.3568065908199296, + "grad_norm": 26.38751220703125, + "learning_rate": 3.64767841441639e-06, + "loss": 4.3643, + "step": 12015 + }, + { + "epoch": 2.3577873675951353, + "grad_norm": 22.324962615966797, + "learning_rate": 3.6439525910336347e-06, + "loss": 4.1944, + "step": 12020 + }, + { + "epoch": 2.3587681443703414, + "grad_norm": 26.925119400024414, + "learning_rate": 3.640227580228577e-06, + "loss": 4.7769, + "step": 12025 + }, + { + "epoch": 2.359748921145547, + "grad_norm": 14.318288803100586, + "learning_rate": 3.6365033842333396e-06, + "loss": 4.6321, + "step": 12030 + }, + { + "epoch": 2.3607296979207533, + "grad_norm": 30.081195831298828, + "learning_rate": 3.6327800052795492e-06, + "loss": 4.0996, + "step": 12035 + }, + { + "epoch": 2.3617104746959594, + "grad_norm": 14.89295482635498, + "learning_rate": 3.6290574455983528e-06, + "loss": 4.5598, + "step": 12040 + }, + { + "epoch": 2.362691251471165, + "grad_norm": 14.272405624389648, + "learning_rate": 3.625335707420399e-06, + "loss": 4.402, + "step": 12045 + }, + { + "epoch": 2.3636720282463712, + "grad_norm": 15.58284854888916, + "learning_rate": 3.621614792975846e-06, + "loss": 4.3527, + "step": 12050 + }, + { + "epoch": 2.364652805021577, + "grad_norm": 13.304563522338867, + "learning_rate": 3.6178947044943636e-06, + "loss": 4.573, + "step": 12055 + }, + { + "epoch": 2.365633581796783, + "grad_norm": 31.662683486938477, + "learning_rate": 3.614175444205116e-06, + "loss": 4.69, + "step": 12060 + }, + { + "epoch": 2.366614358571989, + "grad_norm": 27.504352569580078, + "learning_rate": 3.6104570143367847e-06, + "loss": 4.3019, + "step": 12065 + }, + { + "epoch": 2.367595135347195, + "grad_norm": 18.727642059326172, + "learning_rate": 3.6067394171175397e-06, + "loss": 4.0567, + "step": 12070 + }, + { + "epoch": 2.368575912122401, + "grad_norm": 16.816965103149414, + "learning_rate": 3.6030226547750625e-06, + "loss": 4.5963, + "step": 12075 + }, + { + "epoch": 2.3695566888976067, + "grad_norm": 18.147640228271484, + "learning_rate": 3.5993067295365303e-06, + "loss": 4.5701, + "step": 12080 + }, + { + "epoch": 2.370537465672813, + "grad_norm": 23.622173309326172, + "learning_rate": 3.5955916436286177e-06, + "loss": 4.6356, + "step": 12085 + }, + { + "epoch": 2.371518242448019, + "grad_norm": 17.842937469482422, + "learning_rate": 3.5918773992774996e-06, + "loss": 4.3194, + "step": 12090 + }, + { + "epoch": 2.3724990192232247, + "grad_norm": 16.464218139648438, + "learning_rate": 3.588163998708841e-06, + "loss": 4.4403, + "step": 12095 + }, + { + "epoch": 2.373479795998431, + "grad_norm": 24.46657943725586, + "learning_rate": 3.5844514441478075e-06, + "loss": 4.4403, + "step": 12100 + }, + { + "epoch": 2.3744605727736365, + "grad_norm": 12.043354034423828, + "learning_rate": 3.5807397378190558e-06, + "loss": 4.0212, + "step": 12105 + }, + { + "epoch": 2.3754413495488427, + "grad_norm": 27.728321075439453, + "learning_rate": 3.5770288819467307e-06, + "loss": 4.785, + "step": 12110 + }, + { + "epoch": 2.376422126324049, + "grad_norm": 15.207083702087402, + "learning_rate": 3.573318878754475e-06, + "loss": 4.392, + "step": 12115 + }, + { + "epoch": 2.3774029030992545, + "grad_norm": 15.682321548461914, + "learning_rate": 3.5696097304654107e-06, + "loss": 4.6056, + "step": 12120 + }, + { + "epoch": 2.3783836798744606, + "grad_norm": 21.045578002929688, + "learning_rate": 3.5659014393021547e-06, + "loss": 4.5539, + "step": 12125 + }, + { + "epoch": 2.3793644566496663, + "grad_norm": 19.962541580200195, + "learning_rate": 3.5621940074868105e-06, + "loss": 4.5095, + "step": 12130 + }, + { + "epoch": 2.3803452334248725, + "grad_norm": 22.4492244720459, + "learning_rate": 3.5584874372409605e-06, + "loss": 4.1886, + "step": 12135 + }, + { + "epoch": 2.3813260102000786, + "grad_norm": 35.58995056152344, + "learning_rate": 3.5547817307856792e-06, + "loss": 4.7669, + "step": 12140 + }, + { + "epoch": 2.3823067869752843, + "grad_norm": 17.150493621826172, + "learning_rate": 3.551076890341514e-06, + "loss": 4.5894, + "step": 12145 + }, + { + "epoch": 2.3832875637504904, + "grad_norm": 26.246667861938477, + "learning_rate": 3.547372918128503e-06, + "loss": 4.6605, + "step": 12150 + }, + { + "epoch": 2.384268340525696, + "grad_norm": 21.408432006835938, + "learning_rate": 3.5436698163661578e-06, + "loss": 4.2601, + "step": 12155 + }, + { + "epoch": 2.3852491173009023, + "grad_norm": 11.675660133361816, + "learning_rate": 3.5399675872734687e-06, + "loss": 4.3653, + "step": 12160 + }, + { + "epoch": 2.3862298940761084, + "grad_norm": 23.07163429260254, + "learning_rate": 3.5362662330689067e-06, + "loss": 4.7729, + "step": 12165 + }, + { + "epoch": 2.387210670851314, + "grad_norm": 28.680078506469727, + "learning_rate": 3.532565755970413e-06, + "loss": 4.302, + "step": 12170 + }, + { + "epoch": 2.3881914476265202, + "grad_norm": 36.65128707885742, + "learning_rate": 3.5288661581954097e-06, + "loss": 4.129, + "step": 12175 + }, + { + "epoch": 2.389172224401726, + "grad_norm": 16.943334579467773, + "learning_rate": 3.525167441960789e-06, + "loss": 4.5921, + "step": 12180 + }, + { + "epoch": 2.390153001176932, + "grad_norm": 23.751453399658203, + "learning_rate": 3.521469609482913e-06, + "loss": 4.9026, + "step": 12185 + }, + { + "epoch": 2.391133777952138, + "grad_norm": 17.941225051879883, + "learning_rate": 3.5177726629776155e-06, + "loss": 4.5361, + "step": 12190 + }, + { + "epoch": 2.392114554727344, + "grad_norm": 18.334062576293945, + "learning_rate": 3.5140766046602014e-06, + "loss": 4.5451, + "step": 12195 + }, + { + "epoch": 2.39309533150255, + "grad_norm": 17.336273193359375, + "learning_rate": 3.5103814367454397e-06, + "loss": 4.3965, + "step": 12200 + }, + { + "epoch": 2.3940761082777557, + "grad_norm": 9.473857879638672, + "learning_rate": 3.506687161447571e-06, + "loss": 4.504, + "step": 12205 + }, + { + "epoch": 2.395056885052962, + "grad_norm": 19.98438262939453, + "learning_rate": 3.5029937809802946e-06, + "loss": 4.5001, + "step": 12210 + }, + { + "epoch": 2.396037661828168, + "grad_norm": 22.35348892211914, + "learning_rate": 3.49930129755678e-06, + "loss": 4.7322, + "step": 12215 + }, + { + "epoch": 2.3970184386033737, + "grad_norm": 20.084745407104492, + "learning_rate": 3.4956097133896525e-06, + "loss": 4.5124, + "step": 12220 + }, + { + "epoch": 2.39799921537858, + "grad_norm": 16.486888885498047, + "learning_rate": 3.491919030691005e-06, + "loss": 4.2832, + "step": 12225 + }, + { + "epoch": 2.398979992153786, + "grad_norm": 12.889225006103516, + "learning_rate": 3.488229251672388e-06, + "loss": 4.6114, + "step": 12230 + }, + { + "epoch": 2.3999607689289917, + "grad_norm": 15.054161071777344, + "learning_rate": 3.484540378544806e-06, + "loss": 4.4341, + "step": 12235 + }, + { + "epoch": 2.400941545704198, + "grad_norm": 21.588682174682617, + "learning_rate": 3.4808524135187294e-06, + "loss": 4.6551, + "step": 12240 + }, + { + "epoch": 2.4019223224794035, + "grad_norm": 18.084217071533203, + "learning_rate": 3.4771653588040742e-06, + "loss": 4.3948, + "step": 12245 + }, + { + "epoch": 2.4029030992546097, + "grad_norm": 18.453102111816406, + "learning_rate": 3.4734792166102193e-06, + "loss": 4.6146, + "step": 12250 + }, + { + "epoch": 2.403883876029816, + "grad_norm": 17.33672332763672, + "learning_rate": 3.4697939891459958e-06, + "loss": 4.5763, + "step": 12255 + }, + { + "epoch": 2.4048646528050215, + "grad_norm": 18.220375061035156, + "learning_rate": 3.466109678619681e-06, + "loss": 4.3382, + "step": 12260 + }, + { + "epoch": 2.4058454295802276, + "grad_norm": 23.422691345214844, + "learning_rate": 3.4624262872390092e-06, + "loss": 4.8424, + "step": 12265 + }, + { + "epoch": 2.4068262063554333, + "grad_norm": 36.47475051879883, + "learning_rate": 3.458743817211158e-06, + "loss": 4.4286, + "step": 12270 + }, + { + "epoch": 2.4078069831306395, + "grad_norm": 30.083444595336914, + "learning_rate": 3.455062270742757e-06, + "loss": 4.6231, + "step": 12275 + }, + { + "epoch": 2.4087877599058456, + "grad_norm": 18.711580276489258, + "learning_rate": 3.451381650039885e-06, + "loss": 4.4978, + "step": 12280 + }, + { + "epoch": 2.4097685366810513, + "grad_norm": 16.53005599975586, + "learning_rate": 3.4477019573080572e-06, + "loss": 4.503, + "step": 12285 + }, + { + "epoch": 2.4107493134562574, + "grad_norm": 15.226875305175781, + "learning_rate": 3.4440231947522424e-06, + "loss": 4.7863, + "step": 12290 + }, + { + "epoch": 2.4117300902314636, + "grad_norm": 22.330272674560547, + "learning_rate": 3.440345364576845e-06, + "loss": 4.4584, + "step": 12295 + }, + { + "epoch": 2.4127108670066693, + "grad_norm": 25.432588577270508, + "learning_rate": 3.4366684689857118e-06, + "loss": 4.6265, + "step": 12300 + }, + { + "epoch": 2.4136916437818754, + "grad_norm": 14.317787170410156, + "learning_rate": 3.432992510182136e-06, + "loss": 4.6704, + "step": 12305 + }, + { + "epoch": 2.414672420557081, + "grad_norm": 26.9719295501709, + "learning_rate": 3.429317490368839e-06, + "loss": 4.5278, + "step": 12310 + }, + { + "epoch": 2.4156531973322872, + "grad_norm": 12.308198928833008, + "learning_rate": 3.4256434117479897e-06, + "loss": 4.3488, + "step": 12315 + }, + { + "epoch": 2.4166339741074934, + "grad_norm": 17.44641876220703, + "learning_rate": 3.4219702765211846e-06, + "loss": 4.1399, + "step": 12320 + }, + { + "epoch": 2.417614750882699, + "grad_norm": 16.461467742919922, + "learning_rate": 3.418298086889462e-06, + "loss": 4.2916, + "step": 12325 + }, + { + "epoch": 2.418595527657905, + "grad_norm": 25.63442611694336, + "learning_rate": 3.4146268450532883e-06, + "loss": 4.6587, + "step": 12330 + }, + { + "epoch": 2.419576304433111, + "grad_norm": 25.120079040527344, + "learning_rate": 3.4109565532125645e-06, + "loss": 4.7741, + "step": 12335 + }, + { + "epoch": 2.420557081208317, + "grad_norm": 27.460861206054688, + "learning_rate": 3.4072872135666223e-06, + "loss": 4.7769, + "step": 12340 + }, + { + "epoch": 2.421537857983523, + "grad_norm": 14.591779708862305, + "learning_rate": 3.40361882831422e-06, + "loss": 4.3034, + "step": 12345 + }, + { + "epoch": 2.422518634758729, + "grad_norm": 18.143795013427734, + "learning_rate": 3.399951399653547e-06, + "loss": 4.4657, + "step": 12350 + }, + { + "epoch": 2.423499411533935, + "grad_norm": 26.938961029052734, + "learning_rate": 3.3962849297822225e-06, + "loss": 4.5643, + "step": 12355 + }, + { + "epoch": 2.4244801883091407, + "grad_norm": 21.520898818969727, + "learning_rate": 3.392619420897282e-06, + "loss": 4.1642, + "step": 12360 + }, + { + "epoch": 2.425460965084347, + "grad_norm": 12.339751243591309, + "learning_rate": 3.388954875195195e-06, + "loss": 4.4566, + "step": 12365 + }, + { + "epoch": 2.426441741859553, + "grad_norm": 15.379558563232422, + "learning_rate": 3.3852912948718463e-06, + "loss": 4.3242, + "step": 12370 + }, + { + "epoch": 2.4274225186347587, + "grad_norm": 30.704980850219727, + "learning_rate": 3.3816286821225454e-06, + "loss": 4.6186, + "step": 12375 + }, + { + "epoch": 2.428403295409965, + "grad_norm": 10.835277557373047, + "learning_rate": 3.3779670391420255e-06, + "loss": 4.6727, + "step": 12380 + }, + { + "epoch": 2.4293840721851705, + "grad_norm": 25.137619018554688, + "learning_rate": 3.3743063681244302e-06, + "loss": 4.697, + "step": 12385 + }, + { + "epoch": 2.4303648489603766, + "grad_norm": 12.802051544189453, + "learning_rate": 3.3706466712633302e-06, + "loss": 4.5622, + "step": 12390 + }, + { + "epoch": 2.4313456257355828, + "grad_norm": 48.51518630981445, + "learning_rate": 3.3669879507517034e-06, + "loss": 4.4323, + "step": 12395 + }, + { + "epoch": 2.4323264025107885, + "grad_norm": 20.264951705932617, + "learning_rate": 3.3633302087819507e-06, + "loss": 4.3534, + "step": 12400 + }, + { + "epoch": 2.4333071792859946, + "grad_norm": 39.245704650878906, + "learning_rate": 3.3596734475458815e-06, + "loss": 4.7553, + "step": 12405 + }, + { + "epoch": 2.4342879560612003, + "grad_norm": 10.262246131896973, + "learning_rate": 3.3560176692347198e-06, + "loss": 4.3705, + "step": 12410 + }, + { + "epoch": 2.4352687328364064, + "grad_norm": 20.34832191467285, + "learning_rate": 3.3523628760391e-06, + "loss": 4.4396, + "step": 12415 + }, + { + "epoch": 2.4362495096116126, + "grad_norm": 22.843975067138672, + "learning_rate": 3.3487090701490633e-06, + "loss": 4.9229, + "step": 12420 + }, + { + "epoch": 2.4372302863868183, + "grad_norm": 19.620826721191406, + "learning_rate": 3.3450562537540643e-06, + "loss": 4.3282, + "step": 12425 + }, + { + "epoch": 2.4382110631620244, + "grad_norm": 23.795570373535156, + "learning_rate": 3.3414044290429647e-06, + "loss": 4.5254, + "step": 12430 + }, + { + "epoch": 2.43919183993723, + "grad_norm": 11.415021896362305, + "learning_rate": 3.3377535982040245e-06, + "loss": 4.442, + "step": 12435 + }, + { + "epoch": 2.4401726167124362, + "grad_norm": 25.729127883911133, + "learning_rate": 3.3341037634249185e-06, + "loss": 4.5446, + "step": 12440 + }, + { + "epoch": 2.4411533934876424, + "grad_norm": 25.259185791015625, + "learning_rate": 3.3304549268927163e-06, + "loss": 4.6079, + "step": 12445 + }, + { + "epoch": 2.442134170262848, + "grad_norm": 22.145832061767578, + "learning_rate": 3.3268070907938915e-06, + "loss": 4.5964, + "step": 12450 + }, + { + "epoch": 2.443114947038054, + "grad_norm": 12.732306480407715, + "learning_rate": 3.3231602573143233e-06, + "loss": 4.651, + "step": 12455 + }, + { + "epoch": 2.44409572381326, + "grad_norm": 16.242197036743164, + "learning_rate": 3.3195144286392816e-06, + "loss": 4.4981, + "step": 12460 + }, + { + "epoch": 2.445076500588466, + "grad_norm": 23.256351470947266, + "learning_rate": 3.3158696069534423e-06, + "loss": 4.6798, + "step": 12465 + }, + { + "epoch": 2.446057277363672, + "grad_norm": 11.750959396362305, + "learning_rate": 3.312225794440871e-06, + "loss": 5.0038, + "step": 12470 + }, + { + "epoch": 2.447038054138878, + "grad_norm": 20.7374267578125, + "learning_rate": 3.3085829932850342e-06, + "loss": 4.5022, + "step": 12475 + }, + { + "epoch": 2.448018830914084, + "grad_norm": 14.744890213012695, + "learning_rate": 3.3049412056687895e-06, + "loss": 4.5182, + "step": 12480 + }, + { + "epoch": 2.4489996076892897, + "grad_norm": 30.004308700561523, + "learning_rate": 3.3013004337743857e-06, + "loss": 4.5898, + "step": 12485 + }, + { + "epoch": 2.449980384464496, + "grad_norm": 16.387863159179688, + "learning_rate": 3.2976606797834678e-06, + "loss": 4.779, + "step": 12490 + }, + { + "epoch": 2.450961161239702, + "grad_norm": 26.760541915893555, + "learning_rate": 3.294021945877064e-06, + "loss": 4.4479, + "step": 12495 + }, + { + "epoch": 2.4519419380149077, + "grad_norm": 18.055334091186523, + "learning_rate": 3.290384234235598e-06, + "loss": 4.6721, + "step": 12500 + }, + { + "epoch": 2.452922714790114, + "grad_norm": 13.502447128295898, + "learning_rate": 3.2867475470388793e-06, + "loss": 4.907, + "step": 12505 + }, + { + "epoch": 2.4539034915653195, + "grad_norm": 21.915271759033203, + "learning_rate": 3.2831118864660994e-06, + "loss": 4.4258, + "step": 12510 + }, + { + "epoch": 2.4548842683405256, + "grad_norm": 24.941059112548828, + "learning_rate": 3.279477254695839e-06, + "loss": 4.923, + "step": 12515 + }, + { + "epoch": 2.455865045115732, + "grad_norm": 14.76949691772461, + "learning_rate": 3.27584365390606e-06, + "loss": 4.5605, + "step": 12520 + }, + { + "epoch": 2.4568458218909375, + "grad_norm": 19.59633445739746, + "learning_rate": 3.272211086274107e-06, + "loss": 4.9415, + "step": 12525 + }, + { + "epoch": 2.4578265986661436, + "grad_norm": 16.15921401977539, + "learning_rate": 3.2685795539767084e-06, + "loss": 4.7336, + "step": 12530 + }, + { + "epoch": 2.4588073754413493, + "grad_norm": 30.89405632019043, + "learning_rate": 3.264949059189966e-06, + "loss": 4.593, + "step": 12535 + }, + { + "epoch": 2.4597881522165554, + "grad_norm": 34.44179916381836, + "learning_rate": 3.2613196040893675e-06, + "loss": 4.5955, + "step": 12540 + }, + { + "epoch": 2.4607689289917616, + "grad_norm": 15.018108367919922, + "learning_rate": 3.2576911908497695e-06, + "loss": 4.575, + "step": 12545 + }, + { + "epoch": 2.4617497057669673, + "grad_norm": 14.546805381774902, + "learning_rate": 3.2540638216454114e-06, + "loss": 4.1292, + "step": 12550 + }, + { + "epoch": 2.4627304825421734, + "grad_norm": 18.152332305908203, + "learning_rate": 3.2504374986499044e-06, + "loss": 4.3065, + "step": 12555 + }, + { + "epoch": 2.4637112593173796, + "grad_norm": 17.869962692260742, + "learning_rate": 3.2468122240362287e-06, + "loss": 4.409, + "step": 12560 + }, + { + "epoch": 2.4646920360925852, + "grad_norm": 12.660460472106934, + "learning_rate": 3.2431879999767445e-06, + "loss": 4.4719, + "step": 12565 + }, + { + "epoch": 2.4656728128677914, + "grad_norm": 18.58382225036621, + "learning_rate": 3.2395648286431735e-06, + "loss": 4.7638, + "step": 12570 + }, + { + "epoch": 2.466653589642997, + "grad_norm": 20.497264862060547, + "learning_rate": 3.235942712206614e-06, + "loss": 4.3147, + "step": 12575 + }, + { + "epoch": 2.467634366418203, + "grad_norm": 16.558277130126953, + "learning_rate": 3.2323216528375302e-06, + "loss": 4.8391, + "step": 12580 + }, + { + "epoch": 2.4686151431934094, + "grad_norm": 19.21946144104004, + "learning_rate": 3.2287016527057497e-06, + "loss": 4.7922, + "step": 12585 + }, + { + "epoch": 2.469595919968615, + "grad_norm": 19.4241886138916, + "learning_rate": 3.225082713980468e-06, + "loss": 4.6347, + "step": 12590 + }, + { + "epoch": 2.470576696743821, + "grad_norm": 36.83026885986328, + "learning_rate": 3.2214648388302445e-06, + "loss": 4.4627, + "step": 12595 + }, + { + "epoch": 2.471557473519027, + "grad_norm": 23.589780807495117, + "learning_rate": 3.2178480294229998e-06, + "loss": 4.4692, + "step": 12600 + }, + { + "epoch": 2.472538250294233, + "grad_norm": 21.215566635131836, + "learning_rate": 3.21423228792602e-06, + "loss": 4.57, + "step": 12605 + }, + { + "epoch": 2.473519027069439, + "grad_norm": 16.17399787902832, + "learning_rate": 3.2106176165059444e-06, + "loss": 4.7717, + "step": 12610 + }, + { + "epoch": 2.474499803844645, + "grad_norm": 25.54176139831543, + "learning_rate": 3.207004017328779e-06, + "loss": 4.5617, + "step": 12615 + }, + { + "epoch": 2.475480580619851, + "grad_norm": 12.003211975097656, + "learning_rate": 3.2033914925598796e-06, + "loss": 4.7012, + "step": 12620 + }, + { + "epoch": 2.476461357395057, + "grad_norm": 11.047526359558105, + "learning_rate": 3.199780044363963e-06, + "loss": 4.2175, + "step": 12625 + }, + { + "epoch": 2.477442134170263, + "grad_norm": 21.22567367553711, + "learning_rate": 3.196169674905102e-06, + "loss": 4.3798, + "step": 12630 + }, + { + "epoch": 2.478422910945469, + "grad_norm": 28.6398983001709, + "learning_rate": 3.192560386346717e-06, + "loss": 4.4773, + "step": 12635 + }, + { + "epoch": 2.4794036877206747, + "grad_norm": 16.08147430419922, + "learning_rate": 3.1889521808515888e-06, + "loss": 4.2833, + "step": 12640 + }, + { + "epoch": 2.480384464495881, + "grad_norm": 16.921104431152344, + "learning_rate": 3.1853450605818403e-06, + "loss": 4.4575, + "step": 12645 + }, + { + "epoch": 2.481365241271087, + "grad_norm": 27.06111717224121, + "learning_rate": 3.1817390276989514e-06, + "loss": 4.558, + "step": 12650 + }, + { + "epoch": 2.4823460180462926, + "grad_norm": 16.07041358947754, + "learning_rate": 3.178134084363747e-06, + "loss": 4.4276, + "step": 12655 + }, + { + "epoch": 2.4833267948214988, + "grad_norm": 17.705095291137695, + "learning_rate": 3.1745302327364e-06, + "loss": 4.2914, + "step": 12660 + }, + { + "epoch": 2.4843075715967045, + "grad_norm": 20.390050888061523, + "learning_rate": 3.1709274749764294e-06, + "loss": 4.6041, + "step": 12665 + }, + { + "epoch": 2.4852883483719106, + "grad_norm": 18.144638061523438, + "learning_rate": 3.1673258132426958e-06, + "loss": 4.6561, + "step": 12670 + }, + { + "epoch": 2.4862691251471167, + "grad_norm": 26.244768142700195, + "learning_rate": 3.1637252496934073e-06, + "loss": 4.0215, + "step": 12675 + }, + { + "epoch": 2.4872499019223224, + "grad_norm": 34.454917907714844, + "learning_rate": 3.160125786486114e-06, + "loss": 4.517, + "step": 12680 + }, + { + "epoch": 2.4882306786975286, + "grad_norm": 24.92203140258789, + "learning_rate": 3.1565274257777e-06, + "loss": 4.5899, + "step": 12685 + }, + { + "epoch": 2.4892114554727343, + "grad_norm": 22.43060874938965, + "learning_rate": 3.152930169724399e-06, + "loss": 4.267, + "step": 12690 + }, + { + "epoch": 2.4901922322479404, + "grad_norm": 18.194143295288086, + "learning_rate": 3.1493340204817735e-06, + "loss": 4.4478, + "step": 12695 + }, + { + "epoch": 2.4911730090231465, + "grad_norm": 22.45499038696289, + "learning_rate": 3.145738980204726e-06, + "loss": 4.4645, + "step": 12700 + }, + { + "epoch": 2.4921537857983522, + "grad_norm": 21.602619171142578, + "learning_rate": 3.1421450510474986e-06, + "loss": 4.532, + "step": 12705 + }, + { + "epoch": 2.4931345625735584, + "grad_norm": 21.310916900634766, + "learning_rate": 3.13855223516366e-06, + "loss": 4.6983, + "step": 12710 + }, + { + "epoch": 2.494115339348764, + "grad_norm": 21.922300338745117, + "learning_rate": 3.1349605347061195e-06, + "loss": 4.5509, + "step": 12715 + }, + { + "epoch": 2.49509611612397, + "grad_norm": 12.221722602844238, + "learning_rate": 3.1313699518271113e-06, + "loss": 4.5321, + "step": 12720 + }, + { + "epoch": 2.4960768928991763, + "grad_norm": 28.056127548217773, + "learning_rate": 3.1277804886782043e-06, + "loss": 4.4323, + "step": 12725 + }, + { + "epoch": 2.497057669674382, + "grad_norm": 13.224735260009766, + "learning_rate": 3.1241921474102952e-06, + "loss": 4.2532, + "step": 12730 + }, + { + "epoch": 2.498038446449588, + "grad_norm": 22.16506004333496, + "learning_rate": 3.120604930173608e-06, + "loss": 4.454, + "step": 12735 + }, + { + "epoch": 2.499019223224794, + "grad_norm": 18.699708938598633, + "learning_rate": 3.1170188391176946e-06, + "loss": 4.7483, + "step": 12740 + }, + { + "epoch": 2.5, + "grad_norm": 12.912882804870605, + "learning_rate": 3.1134338763914272e-06, + "loss": 4.3887, + "step": 12745 + }, + { + "epoch": 2.500980776775206, + "grad_norm": 16.283729553222656, + "learning_rate": 3.1098500441430085e-06, + "loss": 4.3017, + "step": 12750 + }, + { + "epoch": 2.500980776775206, + "eval_loss": 4.861581802368164, + "eval_runtime": 7.7147, + "eval_samples_per_second": 27.091, + "eval_steps_per_second": 13.61, + "step": 12750 + }, + { + "epoch": 2.501961553550412, + "grad_norm": 19.177406311035156, + "learning_rate": 3.1062673445199625e-06, + "loss": 4.4072, + "step": 12755 + }, + { + "epoch": 2.502942330325618, + "grad_norm": 19.394792556762695, + "learning_rate": 3.102685779669129e-06, + "loss": 4.3576, + "step": 12760 + }, + { + "epoch": 2.5039231071008237, + "grad_norm": 47.282066345214844, + "learning_rate": 3.0991053517366753e-06, + "loss": 4.5277, + "step": 12765 + }, + { + "epoch": 2.50490388387603, + "grad_norm": 11.863676071166992, + "learning_rate": 3.095526062868082e-06, + "loss": 4.4062, + "step": 12770 + }, + { + "epoch": 2.505884660651236, + "grad_norm": 13.989859580993652, + "learning_rate": 3.0919479152081468e-06, + "loss": 4.7194, + "step": 12775 + }, + { + "epoch": 2.5068654374264416, + "grad_norm": 14.908836364746094, + "learning_rate": 3.0883709109009907e-06, + "loss": 4.6656, + "step": 12780 + }, + { + "epoch": 2.5078462142016478, + "grad_norm": 16.357318878173828, + "learning_rate": 3.08479505209004e-06, + "loss": 4.6039, + "step": 12785 + }, + { + "epoch": 2.5088269909768535, + "grad_norm": 19.404542922973633, + "learning_rate": 3.081220340918043e-06, + "loss": 4.3132, + "step": 12790 + }, + { + "epoch": 2.5098077677520596, + "grad_norm": 20.268781661987305, + "learning_rate": 3.0776467795270526e-06, + "loss": 4.6035, + "step": 12795 + }, + { + "epoch": 2.5107885445272657, + "grad_norm": 15.913606643676758, + "learning_rate": 3.0740743700584397e-06, + "loss": 4.4568, + "step": 12800 + }, + { + "epoch": 2.5117693213024714, + "grad_norm": 14.354854583740234, + "learning_rate": 3.0705031146528817e-06, + "loss": 4.7239, + "step": 12805 + }, + { + "epoch": 2.5127500980776776, + "grad_norm": 12.96605396270752, + "learning_rate": 3.0669330154503617e-06, + "loss": 4.8816, + "step": 12810 + }, + { + "epoch": 2.5137308748528833, + "grad_norm": 13.831136703491211, + "learning_rate": 3.063364074590177e-06, + "loss": 4.5382, + "step": 12815 + }, + { + "epoch": 2.5147116516280894, + "grad_norm": 17.773210525512695, + "learning_rate": 3.059796294210923e-06, + "loss": 4.4677, + "step": 12820 + }, + { + "epoch": 2.5156924284032955, + "grad_norm": 14.534049987792969, + "learning_rate": 3.056229676450504e-06, + "loss": 4.4686, + "step": 12825 + }, + { + "epoch": 2.5166732051785012, + "grad_norm": 21.867084503173828, + "learning_rate": 3.0526642234461313e-06, + "loss": 4.391, + "step": 12830 + }, + { + "epoch": 2.5176539819537074, + "grad_norm": 23.599096298217773, + "learning_rate": 3.049099937334309e-06, + "loss": 4.4712, + "step": 12835 + }, + { + "epoch": 2.518634758728913, + "grad_norm": 21.118675231933594, + "learning_rate": 3.0455368202508484e-06, + "loss": 4.2714, + "step": 12840 + }, + { + "epoch": 2.519615535504119, + "grad_norm": 10.092329978942871, + "learning_rate": 3.0419748743308595e-06, + "loss": 3.9771, + "step": 12845 + }, + { + "epoch": 2.5205963122793253, + "grad_norm": 28.692956924438477, + "learning_rate": 3.0384141017087483e-06, + "loss": 4.6071, + "step": 12850 + }, + { + "epoch": 2.521577089054531, + "grad_norm": 20.138452529907227, + "learning_rate": 3.034854504518222e-06, + "loss": 4.7027, + "step": 12855 + }, + { + "epoch": 2.522557865829737, + "grad_norm": 15.83338451385498, + "learning_rate": 3.031296084892278e-06, + "loss": 4.4956, + "step": 12860 + }, + { + "epoch": 2.523538642604943, + "grad_norm": 11.186666488647461, + "learning_rate": 3.027738844963213e-06, + "loss": 4.7868, + "step": 12865 + }, + { + "epoch": 2.524519419380149, + "grad_norm": 13.635848999023438, + "learning_rate": 3.024182786862612e-06, + "loss": 4.3454, + "step": 12870 + }, + { + "epoch": 2.525500196155355, + "grad_norm": 16.45800018310547, + "learning_rate": 3.0206279127213565e-06, + "loss": 4.4047, + "step": 12875 + }, + { + "epoch": 2.526480972930561, + "grad_norm": 35.179325103759766, + "learning_rate": 3.017074224669617e-06, + "loss": 4.5525, + "step": 12880 + }, + { + "epoch": 2.527461749705767, + "grad_norm": 19.517745971679688, + "learning_rate": 3.01352172483685e-06, + "loss": 4.5151, + "step": 12885 + }, + { + "epoch": 2.5284425264809727, + "grad_norm": 32.89816665649414, + "learning_rate": 3.0099704153518057e-06, + "loss": 4.5045, + "step": 12890 + }, + { + "epoch": 2.529423303256179, + "grad_norm": 24.201087951660156, + "learning_rate": 3.006420298342515e-06, + "loss": 4.6896, + "step": 12895 + }, + { + "epoch": 2.530404080031385, + "grad_norm": 19.274837493896484, + "learning_rate": 3.002871375936298e-06, + "loss": 4.5481, + "step": 12900 + }, + { + "epoch": 2.531384856806591, + "grad_norm": 26.81613540649414, + "learning_rate": 2.9993236502597624e-06, + "loss": 5.2386, + "step": 12905 + }, + { + "epoch": 2.532365633581797, + "grad_norm": 12.64597225189209, + "learning_rate": 2.99577712343879e-06, + "loss": 4.5118, + "step": 12910 + }, + { + "epoch": 2.5333464103570025, + "grad_norm": 20.370603561401367, + "learning_rate": 2.9922317975985494e-06, + "loss": 4.4, + "step": 12915 + }, + { + "epoch": 2.5343271871322086, + "grad_norm": 48.84408950805664, + "learning_rate": 2.98868767486349e-06, + "loss": 4.903, + "step": 12920 + }, + { + "epoch": 2.5353079639074148, + "grad_norm": 18.01617431640625, + "learning_rate": 2.9851447573573383e-06, + "loss": 4.5406, + "step": 12925 + }, + { + "epoch": 2.536288740682621, + "grad_norm": 13.745526313781738, + "learning_rate": 2.981603047203102e-06, + "loss": 4.7924, + "step": 12930 + }, + { + "epoch": 2.5372695174578266, + "grad_norm": 19.135507583618164, + "learning_rate": 2.9780625465230583e-06, + "loss": 4.5221, + "step": 12935 + }, + { + "epoch": 2.5382502942330327, + "grad_norm": 13.795884132385254, + "learning_rate": 2.97452325743877e-06, + "loss": 4.5289, + "step": 12940 + }, + { + "epoch": 2.5392310710082384, + "grad_norm": 21.768945693969727, + "learning_rate": 2.970985182071063e-06, + "loss": 4.381, + "step": 12945 + }, + { + "epoch": 2.5402118477834446, + "grad_norm": 22.04071807861328, + "learning_rate": 2.9674483225400436e-06, + "loss": 4.3116, + "step": 12950 + }, + { + "epoch": 2.5411926245586507, + "grad_norm": 24.65144920349121, + "learning_rate": 2.9639126809650877e-06, + "loss": 4.9135, + "step": 12955 + }, + { + "epoch": 2.5421734013338564, + "grad_norm": 15.5100736618042, + "learning_rate": 2.9603782594648365e-06, + "loss": 4.3342, + "step": 12960 + }, + { + "epoch": 2.5431541781090625, + "grad_norm": 20.800643920898438, + "learning_rate": 2.9568450601572095e-06, + "loss": 4.7446, + "step": 12965 + }, + { + "epoch": 2.544134954884268, + "grad_norm": 32.53725814819336, + "learning_rate": 2.9533130851593846e-06, + "loss": 4.3541, + "step": 12970 + }, + { + "epoch": 2.5451157316594744, + "grad_norm": 33.92058181762695, + "learning_rate": 2.949782336587812e-06, + "loss": 4.3456, + "step": 12975 + }, + { + "epoch": 2.5460965084346805, + "grad_norm": 29.062822341918945, + "learning_rate": 2.946252816558205e-06, + "loss": 4.6352, + "step": 12980 + }, + { + "epoch": 2.547077285209886, + "grad_norm": 14.796578407287598, + "learning_rate": 2.942724527185539e-06, + "loss": 4.263, + "step": 12985 + }, + { + "epoch": 2.5480580619850923, + "grad_norm": 17.256603240966797, + "learning_rate": 2.939197470584057e-06, + "loss": 4.3981, + "step": 12990 + }, + { + "epoch": 2.549038838760298, + "grad_norm": 13.124008178710938, + "learning_rate": 2.9356716488672556e-06, + "loss": 4.4093, + "step": 12995 + }, + { + "epoch": 2.550019615535504, + "grad_norm": 19.26356315612793, + "learning_rate": 2.9321470641478978e-06, + "loss": 4.4876, + "step": 13000 + }, + { + "epoch": 2.5510003923107103, + "grad_norm": 9.095273971557617, + "learning_rate": 2.928623718538006e-06, + "loss": 4.4389, + "step": 13005 + }, + { + "epoch": 2.551981169085916, + "grad_norm": 29.93180274963379, + "learning_rate": 2.9251016141488532e-06, + "loss": 4.2358, + "step": 13010 + }, + { + "epoch": 2.552961945861122, + "grad_norm": 12.834431648254395, + "learning_rate": 2.921580753090977e-06, + "loss": 4.3116, + "step": 13015 + }, + { + "epoch": 2.553942722636328, + "grad_norm": 27.98518943786621, + "learning_rate": 2.9180611374741623e-06, + "loss": 4.6901, + "step": 13020 + }, + { + "epoch": 2.554923499411534, + "grad_norm": 15.999112129211426, + "learning_rate": 2.914542769407452e-06, + "loss": 4.4974, + "step": 13025 + }, + { + "epoch": 2.55590427618674, + "grad_norm": 13.242138862609863, + "learning_rate": 2.911025650999143e-06, + "loss": 4.2905, + "step": 13030 + }, + { + "epoch": 2.556885052961946, + "grad_norm": 17.243799209594727, + "learning_rate": 2.9075097843567775e-06, + "loss": 4.5918, + "step": 13035 + }, + { + "epoch": 2.557865829737152, + "grad_norm": 13.571252822875977, + "learning_rate": 2.903995171587155e-06, + "loss": 4.2669, + "step": 13040 + }, + { + "epoch": 2.5588466065123576, + "grad_norm": 47.06708908081055, + "learning_rate": 2.900481814796316e-06, + "loss": 5.0637, + "step": 13045 + }, + { + "epoch": 2.5598273832875638, + "grad_norm": 15.13114070892334, + "learning_rate": 2.8969697160895545e-06, + "loss": 4.229, + "step": 13050 + }, + { + "epoch": 2.56080816006277, + "grad_norm": 21.789676666259766, + "learning_rate": 2.893458877571409e-06, + "loss": 4.2882, + "step": 13055 + }, + { + "epoch": 2.5617889368379756, + "grad_norm": 22.40684700012207, + "learning_rate": 2.8899493013456602e-06, + "loss": 4.625, + "step": 13060 + }, + { + "epoch": 2.5627697136131817, + "grad_norm": 33.69407272338867, + "learning_rate": 2.8864409895153365e-06, + "loss": 4.523, + "step": 13065 + }, + { + "epoch": 2.5637504903883874, + "grad_norm": 18.701000213623047, + "learning_rate": 2.8829339441827044e-06, + "loss": 4.5509, + "step": 13070 + }, + { + "epoch": 2.5647312671635936, + "grad_norm": 14.72784423828125, + "learning_rate": 2.879428167449276e-06, + "loss": 4.3159, + "step": 13075 + }, + { + "epoch": 2.5657120439387997, + "grad_norm": 22.692087173461914, + "learning_rate": 2.875923661415799e-06, + "loss": 4.5071, + "step": 13080 + }, + { + "epoch": 2.5666928207140054, + "grad_norm": 15.203378677368164, + "learning_rate": 2.872420428182261e-06, + "loss": 4.4571, + "step": 13085 + }, + { + "epoch": 2.5676735974892115, + "grad_norm": 24.574600219726562, + "learning_rate": 2.86891846984789e-06, + "loss": 4.4902, + "step": 13090 + }, + { + "epoch": 2.5686543742644172, + "grad_norm": 19.6531982421875, + "learning_rate": 2.8654177885111444e-06, + "loss": 4.5339, + "step": 13095 + }, + { + "epoch": 2.5696351510396234, + "grad_norm": 12.66419792175293, + "learning_rate": 2.861918386269721e-06, + "loss": 4.267, + "step": 13100 + }, + { + "epoch": 2.5706159278148295, + "grad_norm": 20.505992889404297, + "learning_rate": 2.8584202652205536e-06, + "loss": 4.4909, + "step": 13105 + }, + { + "epoch": 2.571596704590035, + "grad_norm": 30.728412628173828, + "learning_rate": 2.8549234274597982e-06, + "loss": 4.5643, + "step": 13110 + }, + { + "epoch": 2.5725774813652413, + "grad_norm": 14.343387603759766, + "learning_rate": 2.8514278750828537e-06, + "loss": 4.5294, + "step": 13115 + }, + { + "epoch": 2.573558258140447, + "grad_norm": 26.706621170043945, + "learning_rate": 2.847933610184338e-06, + "loss": 4.8221, + "step": 13120 + }, + { + "epoch": 2.574539034915653, + "grad_norm": 13.684526443481445, + "learning_rate": 2.8444406348581046e-06, + "loss": 4.7329, + "step": 13125 + }, + { + "epoch": 2.5755198116908593, + "grad_norm": 22.70045280456543, + "learning_rate": 2.840948951197234e-06, + "loss": 4.6044, + "step": 13130 + }, + { + "epoch": 2.576500588466065, + "grad_norm": 10.388460159301758, + "learning_rate": 2.8374585612940274e-06, + "loss": 4.5919, + "step": 13135 + }, + { + "epoch": 2.577481365241271, + "grad_norm": 34.217323303222656, + "learning_rate": 2.8339694672400176e-06, + "loss": 4.526, + "step": 13140 + }, + { + "epoch": 2.578462142016477, + "grad_norm": 19.96867561340332, + "learning_rate": 2.8304816711259554e-06, + "loss": 4.416, + "step": 13145 + }, + { + "epoch": 2.579442918791683, + "grad_norm": 11.595075607299805, + "learning_rate": 2.8269951750418144e-06, + "loss": 4.4408, + "step": 13150 + }, + { + "epoch": 2.580423695566889, + "grad_norm": 17.77700424194336, + "learning_rate": 2.823509981076795e-06, + "loss": 4.4397, + "step": 13155 + }, + { + "epoch": 2.581404472342095, + "grad_norm": 12.028759002685547, + "learning_rate": 2.8200260913193077e-06, + "loss": 4.7812, + "step": 13160 + }, + { + "epoch": 2.582385249117301, + "grad_norm": 22.082979202270508, + "learning_rate": 2.816543507856992e-06, + "loss": 4.3825, + "step": 13165 + }, + { + "epoch": 2.5833660258925066, + "grad_norm": 34.65692901611328, + "learning_rate": 2.813062232776695e-06, + "loss": 4.3139, + "step": 13170 + }, + { + "epoch": 2.5843468026677128, + "grad_norm": 22.31768798828125, + "learning_rate": 2.8095822681644864e-06, + "loss": 4.5857, + "step": 13175 + }, + { + "epoch": 2.585327579442919, + "grad_norm": 22.302383422851562, + "learning_rate": 2.8061036161056505e-06, + "loss": 4.7143, + "step": 13180 + }, + { + "epoch": 2.5863083562181246, + "grad_norm": 20.870906829833984, + "learning_rate": 2.802626278684679e-06, + "loss": 4.3662, + "step": 13185 + }, + { + "epoch": 2.5872891329933307, + "grad_norm": 14.885607719421387, + "learning_rate": 2.7991502579852837e-06, + "loss": 4.6355, + "step": 13190 + }, + { + "epoch": 2.5882699097685364, + "grad_norm": 13.58197021484375, + "learning_rate": 2.7956755560903797e-06, + "loss": 4.3287, + "step": 13195 + }, + { + "epoch": 2.5892506865437426, + "grad_norm": 15.81235122680664, + "learning_rate": 2.7922021750820983e-06, + "loss": 4.5046, + "step": 13200 + }, + { + "epoch": 2.5902314633189487, + "grad_norm": 16.68513298034668, + "learning_rate": 2.788730117041778e-06, + "loss": 4.5428, + "step": 13205 + }, + { + "epoch": 2.5912122400941544, + "grad_norm": 14.253608703613281, + "learning_rate": 2.785259384049959e-06, + "loss": 4.8577, + "step": 13210 + }, + { + "epoch": 2.5921930168693605, + "grad_norm": 25.999387741088867, + "learning_rate": 2.7817899781863964e-06, + "loss": 4.498, + "step": 13215 + }, + { + "epoch": 2.5931737936445662, + "grad_norm": 18.823204040527344, + "learning_rate": 2.7783219015300443e-06, + "loss": 4.7903, + "step": 13220 + }, + { + "epoch": 2.5941545704197724, + "grad_norm": 20.179967880249023, + "learning_rate": 2.7748551561590574e-06, + "loss": 4.5352, + "step": 13225 + }, + { + "epoch": 2.5951353471949785, + "grad_norm": 15.403043746948242, + "learning_rate": 2.771389744150802e-06, + "loss": 4.6202, + "step": 13230 + }, + { + "epoch": 2.5961161239701847, + "grad_norm": 23.73948097229004, + "learning_rate": 2.7679256675818357e-06, + "loss": 4.9047, + "step": 13235 + }, + { + "epoch": 2.5970969007453903, + "grad_norm": 16.40243148803711, + "learning_rate": 2.764462928527924e-06, + "loss": 4.384, + "step": 13240 + }, + { + "epoch": 2.598077677520596, + "grad_norm": 21.48336410522461, + "learning_rate": 2.7610015290640237e-06, + "loss": 4.3126, + "step": 13245 + }, + { + "epoch": 2.599058454295802, + "grad_norm": 26.895301818847656, + "learning_rate": 2.7575414712642947e-06, + "loss": 4.5225, + "step": 13250 + }, + { + "epoch": 2.6000392310710083, + "grad_norm": 28.130233764648438, + "learning_rate": 2.754082757202091e-06, + "loss": 4.7777, + "step": 13255 + }, + { + "epoch": 2.6010200078462145, + "grad_norm": 15.378296852111816, + "learning_rate": 2.750625388949959e-06, + "loss": 4.1805, + "step": 13260 + }, + { + "epoch": 2.60200078462142, + "grad_norm": 23.175745010375977, + "learning_rate": 2.7471693685796437e-06, + "loss": 4.5967, + "step": 13265 + }, + { + "epoch": 2.6029815613966263, + "grad_norm": 17.780309677124023, + "learning_rate": 2.7437146981620754e-06, + "loss": 4.3283, + "step": 13270 + }, + { + "epoch": 2.603962338171832, + "grad_norm": 27.35478401184082, + "learning_rate": 2.740261379767382e-06, + "loss": 4.6849, + "step": 13275 + }, + { + "epoch": 2.604943114947038, + "grad_norm": 15.305866241455078, + "learning_rate": 2.7368094154648794e-06, + "loss": 4.4812, + "step": 13280 + }, + { + "epoch": 2.6059238917222443, + "grad_norm": 20.15849494934082, + "learning_rate": 2.7333588073230682e-06, + "loss": 4.6729, + "step": 13285 + }, + { + "epoch": 2.60690466849745, + "grad_norm": 23.166080474853516, + "learning_rate": 2.7299095574096435e-06, + "loss": 4.5444, + "step": 13290 + }, + { + "epoch": 2.607885445272656, + "grad_norm": 13.613513946533203, + "learning_rate": 2.726461667791481e-06, + "loss": 4.9226, + "step": 13295 + }, + { + "epoch": 2.608866222047862, + "grad_norm": 14.808908462524414, + "learning_rate": 2.7230151405346407e-06, + "loss": 4.4457, + "step": 13300 + }, + { + "epoch": 2.609846998823068, + "grad_norm": 25.118194580078125, + "learning_rate": 2.7195699777043723e-06, + "loss": 4.2229, + "step": 13305 + }, + { + "epoch": 2.610827775598274, + "grad_norm": 26.882368087768555, + "learning_rate": 2.7161261813650997e-06, + "loss": 4.6098, + "step": 13310 + }, + { + "epoch": 2.6118085523734798, + "grad_norm": 26.372621536254883, + "learning_rate": 2.7126837535804362e-06, + "loss": 4.6734, + "step": 13315 + }, + { + "epoch": 2.612789329148686, + "grad_norm": 31.41575050354004, + "learning_rate": 2.7092426964131667e-06, + "loss": 4.6502, + "step": 13320 + }, + { + "epoch": 2.6137701059238916, + "grad_norm": 43.63661193847656, + "learning_rate": 2.705803011925262e-06, + "loss": 4.8477, + "step": 13325 + }, + { + "epoch": 2.6147508826990977, + "grad_norm": 37.062068939208984, + "learning_rate": 2.70236470217787e-06, + "loss": 4.5201, + "step": 13330 + }, + { + "epoch": 2.615731659474304, + "grad_norm": 13.730938911437988, + "learning_rate": 2.6989277692313064e-06, + "loss": 4.6581, + "step": 13335 + }, + { + "epoch": 2.6167124362495096, + "grad_norm": 19.538053512573242, + "learning_rate": 2.6954922151450735e-06, + "loss": 4.5461, + "step": 13340 + }, + { + "epoch": 2.6176932130247157, + "grad_norm": 18.18435287475586, + "learning_rate": 2.6920580419778375e-06, + "loss": 4.6776, + "step": 13345 + }, + { + "epoch": 2.6186739897999214, + "grad_norm": 14.151314735412598, + "learning_rate": 2.6886252517874423e-06, + "loss": 4.7098, + "step": 13350 + }, + { + "epoch": 2.6196547665751275, + "grad_norm": 16.580251693725586, + "learning_rate": 2.6851938466309053e-06, + "loss": 4.7572, + "step": 13355 + }, + { + "epoch": 2.6206355433503337, + "grad_norm": 18.998332977294922, + "learning_rate": 2.6817638285644077e-06, + "loss": 4.3379, + "step": 13360 + }, + { + "epoch": 2.6216163201255394, + "grad_norm": 21.826854705810547, + "learning_rate": 2.6783351996433018e-06, + "loss": 4.3292, + "step": 13365 + }, + { + "epoch": 2.6225970969007455, + "grad_norm": 25.782760620117188, + "learning_rate": 2.67490796192211e-06, + "loss": 4.8116, + "step": 13370 + }, + { + "epoch": 2.623577873675951, + "grad_norm": 17.976646423339844, + "learning_rate": 2.671482117454518e-06, + "loss": 4.641, + "step": 13375 + }, + { + "epoch": 2.6245586504511573, + "grad_norm": 20.79668617248535, + "learning_rate": 2.66805766829338e-06, + "loss": 4.7579, + "step": 13380 + }, + { + "epoch": 2.6255394272263635, + "grad_norm": 14.479522705078125, + "learning_rate": 2.6646346164907087e-06, + "loss": 4.7085, + "step": 13385 + }, + { + "epoch": 2.626520204001569, + "grad_norm": 28.237882614135742, + "learning_rate": 2.6612129640976875e-06, + "loss": 4.4694, + "step": 13390 + }, + { + "epoch": 2.6275009807767753, + "grad_norm": 15.98440170288086, + "learning_rate": 2.6577927131646513e-06, + "loss": 4.5517, + "step": 13395 + }, + { + "epoch": 2.628481757551981, + "grad_norm": 24.405895233154297, + "learning_rate": 2.6543738657411033e-06, + "loss": 4.3712, + "step": 13400 + }, + { + "epoch": 2.629462534327187, + "grad_norm": 14.398210525512695, + "learning_rate": 2.650956423875704e-06, + "loss": 4.7705, + "step": 13405 + }, + { + "epoch": 2.6304433111023933, + "grad_norm": 19.128984451293945, + "learning_rate": 2.6475403896162676e-06, + "loss": 4.5675, + "step": 13410 + }, + { + "epoch": 2.631424087877599, + "grad_norm": 20.786842346191406, + "learning_rate": 2.6441257650097705e-06, + "loss": 4.3912, + "step": 13415 + }, + { + "epoch": 2.632404864652805, + "grad_norm": 19.81397819519043, + "learning_rate": 2.6407125521023387e-06, + "loss": 4.6613, + "step": 13420 + }, + { + "epoch": 2.633385641428011, + "grad_norm": 15.989570617675781, + "learning_rate": 2.6373007529392565e-06, + "loss": 4.5901, + "step": 13425 + }, + { + "epoch": 2.634366418203217, + "grad_norm": 19.9948673248291, + "learning_rate": 2.633890369564962e-06, + "loss": 4.4191, + "step": 13430 + }, + { + "epoch": 2.635347194978423, + "grad_norm": 21.669771194458008, + "learning_rate": 2.6304814040230397e-06, + "loss": 4.5845, + "step": 13435 + }, + { + "epoch": 2.6363279717536288, + "grad_norm": 36.29818344116211, + "learning_rate": 2.6270738583562295e-06, + "loss": 4.5574, + "step": 13440 + }, + { + "epoch": 2.637308748528835, + "grad_norm": 14.819993019104004, + "learning_rate": 2.623667734606414e-06, + "loss": 4.4312, + "step": 13445 + }, + { + "epoch": 2.6382895253040406, + "grad_norm": 13.607355117797852, + "learning_rate": 2.6202630348146323e-06, + "loss": 4.5856, + "step": 13450 + }, + { + "epoch": 2.6392703020792467, + "grad_norm": 18.39212417602539, + "learning_rate": 2.6168597610210673e-06, + "loss": 4.4272, + "step": 13455 + }, + { + "epoch": 2.640251078854453, + "grad_norm": 10.951640129089355, + "learning_rate": 2.613457915265042e-06, + "loss": 4.5139, + "step": 13460 + }, + { + "epoch": 2.6412318556296586, + "grad_norm": 15.258952140808105, + "learning_rate": 2.6100574995850316e-06, + "loss": 4.2683, + "step": 13465 + }, + { + "epoch": 2.6422126324048647, + "grad_norm": 16.9803466796875, + "learning_rate": 2.6066585160186477e-06, + "loss": 4.7361, + "step": 13470 + }, + { + "epoch": 2.6431934091800704, + "grad_norm": 18.152610778808594, + "learning_rate": 2.6032609666026476e-06, + "loss": 4.7387, + "step": 13475 + }, + { + "epoch": 2.6441741859552765, + "grad_norm": 24.47258949279785, + "learning_rate": 2.599864853372931e-06, + "loss": 4.4338, + "step": 13480 + }, + { + "epoch": 2.6451549627304827, + "grad_norm": 33.78486633300781, + "learning_rate": 2.5964701783645296e-06, + "loss": 4.4178, + "step": 13485 + }, + { + "epoch": 2.6461357395056884, + "grad_norm": 28.668806076049805, + "learning_rate": 2.593076943611623e-06, + "loss": 4.3034, + "step": 13490 + }, + { + "epoch": 2.6471165162808945, + "grad_norm": 14.24481201171875, + "learning_rate": 2.5896851511475184e-06, + "loss": 4.5715, + "step": 13495 + }, + { + "epoch": 2.6480972930561, + "grad_norm": 23.95184326171875, + "learning_rate": 2.5862948030046676e-06, + "loss": 4.5584, + "step": 13500 + }, + { + "epoch": 2.6490780698313063, + "grad_norm": 18.729578018188477, + "learning_rate": 2.5829059012146466e-06, + "loss": 4.4817, + "step": 13505 + }, + { + "epoch": 2.6500588466065125, + "grad_norm": 38.478668212890625, + "learning_rate": 2.579518447808177e-06, + "loss": 4.7689, + "step": 13510 + }, + { + "epoch": 2.651039623381718, + "grad_norm": 29.371217727661133, + "learning_rate": 2.5761324448151017e-06, + "loss": 4.6578, + "step": 13515 + }, + { + "epoch": 2.6520204001569243, + "grad_norm": 11.126766204833984, + "learning_rate": 2.572747894264399e-06, + "loss": 4.7112, + "step": 13520 + }, + { + "epoch": 2.65300117693213, + "grad_norm": 23.9212646484375, + "learning_rate": 2.5693647981841766e-06, + "loss": 4.4847, + "step": 13525 + }, + { + "epoch": 2.653981953707336, + "grad_norm": 16.88579559326172, + "learning_rate": 2.565983158601675e-06, + "loss": 4.5293, + "step": 13530 + }, + { + "epoch": 2.6549627304825423, + "grad_norm": 21.897232055664062, + "learning_rate": 2.5626029775432513e-06, + "loss": 4.6525, + "step": 13535 + }, + { + "epoch": 2.655943507257748, + "grad_norm": 26.479185104370117, + "learning_rate": 2.5592242570344008e-06, + "loss": 4.477, + "step": 13540 + }, + { + "epoch": 2.656924284032954, + "grad_norm": 17.854076385498047, + "learning_rate": 2.5558469990997327e-06, + "loss": 4.4696, + "step": 13545 + }, + { + "epoch": 2.65790506080816, + "grad_norm": 13.446738243103027, + "learning_rate": 2.5524712057629867e-06, + "loss": 4.8397, + "step": 13550 + }, + { + "epoch": 2.658885837583366, + "grad_norm": 14.61733627319336, + "learning_rate": 2.549096879047026e-06, + "loss": 4.4787, + "step": 13555 + }, + { + "epoch": 2.659866614358572, + "grad_norm": 15.360088348388672, + "learning_rate": 2.5457240209738278e-06, + "loss": 4.4534, + "step": 13560 + }, + { + "epoch": 2.660847391133778, + "grad_norm": 14.018891334533691, + "learning_rate": 2.5423526335644967e-06, + "loss": 4.6227, + "step": 13565 + }, + { + "epoch": 2.661828167908984, + "grad_norm": 13.576262474060059, + "learning_rate": 2.5389827188392495e-06, + "loss": 4.7652, + "step": 13570 + }, + { + "epoch": 2.6628089446841896, + "grad_norm": 18.732263565063477, + "learning_rate": 2.5356142788174277e-06, + "loss": 4.6374, + "step": 13575 + }, + { + "epoch": 2.6637897214593957, + "grad_norm": 20.31509017944336, + "learning_rate": 2.532247315517481e-06, + "loss": 4.3886, + "step": 13580 + }, + { + "epoch": 2.664770498234602, + "grad_norm": 12.62197494506836, + "learning_rate": 2.528881830956983e-06, + "loss": 4.8373, + "step": 13585 + }, + { + "epoch": 2.665751275009808, + "grad_norm": 32.32179260253906, + "learning_rate": 2.525517827152614e-06, + "loss": 4.224, + "step": 13590 + }, + { + "epoch": 2.6667320517850137, + "grad_norm": 21.820711135864258, + "learning_rate": 2.5221553061201678e-06, + "loss": 4.5365, + "step": 13595 + }, + { + "epoch": 2.66771282856022, + "grad_norm": 33.68283462524414, + "learning_rate": 2.518794269874553e-06, + "loss": 4.5051, + "step": 13600 + }, + { + "epoch": 2.6686936053354255, + "grad_norm": 12.54172134399414, + "learning_rate": 2.5154347204297903e-06, + "loss": 4.1707, + "step": 13605 + }, + { + "epoch": 2.6696743821106317, + "grad_norm": 10.5189847946167, + "learning_rate": 2.512076659799001e-06, + "loss": 4.5797, + "step": 13610 + }, + { + "epoch": 2.670655158885838, + "grad_norm": 32.68767547607422, + "learning_rate": 2.508720089994424e-06, + "loss": 4.8186, + "step": 13615 + }, + { + "epoch": 2.6716359356610435, + "grad_norm": 19.620351791381836, + "learning_rate": 2.505365013027397e-06, + "loss": 4.3365, + "step": 13620 + }, + { + "epoch": 2.6726167124362497, + "grad_norm": 27.10638999938965, + "learning_rate": 2.5020114309083676e-06, + "loss": 4.5674, + "step": 13625 + }, + { + "epoch": 2.6735974892114553, + "grad_norm": 24.49579620361328, + "learning_rate": 2.498659345646888e-06, + "loss": 4.3949, + "step": 13630 + }, + { + "epoch": 2.6745782659866615, + "grad_norm": 12.413771629333496, + "learning_rate": 2.4953087592516088e-06, + "loss": 4.4483, + "step": 13635 + }, + { + "epoch": 2.6755590427618676, + "grad_norm": 15.775691032409668, + "learning_rate": 2.49195967373029e-06, + "loss": 4.3685, + "step": 13640 + }, + { + "epoch": 2.6765398195370733, + "grad_norm": 14.530928611755371, + "learning_rate": 2.4886120910897826e-06, + "loss": 4.4426, + "step": 13645 + }, + { + "epoch": 2.6775205963122795, + "grad_norm": 36.24016189575195, + "learning_rate": 2.485266013336047e-06, + "loss": 4.3312, + "step": 13650 + }, + { + "epoch": 2.678501373087485, + "grad_norm": 16.918540954589844, + "learning_rate": 2.481921442474135e-06, + "loss": 4.4554, + "step": 13655 + }, + { + "epoch": 2.6794821498626913, + "grad_norm": 21.943265914916992, + "learning_rate": 2.478578380508196e-06, + "loss": 4.9183, + "step": 13660 + }, + { + "epoch": 2.6804629266378974, + "grad_norm": 28.09203338623047, + "learning_rate": 2.47523682944148e-06, + "loss": 4.5556, + "step": 13665 + }, + { + "epoch": 2.681443703413103, + "grad_norm": 12.404359817504883, + "learning_rate": 2.471896791276325e-06, + "loss": 4.4326, + "step": 13670 + }, + { + "epoch": 2.6824244801883093, + "grad_norm": 32.98741912841797, + "learning_rate": 2.4685582680141672e-06, + "loss": 4.1956, + "step": 13675 + }, + { + "epoch": 2.683405256963515, + "grad_norm": 18.23993682861328, + "learning_rate": 2.4652212616555367e-06, + "loss": 4.6537, + "step": 13680 + }, + { + "epoch": 2.684386033738721, + "grad_norm": 20.293107986450195, + "learning_rate": 2.4618857742000463e-06, + "loss": 4.622, + "step": 13685 + }, + { + "epoch": 2.6853668105139272, + "grad_norm": 21.977144241333008, + "learning_rate": 2.458551807646409e-06, + "loss": 4.631, + "step": 13690 + }, + { + "epoch": 2.686347587289133, + "grad_norm": 40.22378158569336, + "learning_rate": 2.4552193639924167e-06, + "loss": 4.6723, + "step": 13695 + }, + { + "epoch": 2.687328364064339, + "grad_norm": 44.497806549072266, + "learning_rate": 2.451888445234955e-06, + "loss": 5.0051, + "step": 13700 + }, + { + "epoch": 2.6883091408395448, + "grad_norm": 18.855010986328125, + "learning_rate": 2.4485590533699977e-06, + "loss": 4.3175, + "step": 13705 + }, + { + "epoch": 2.689289917614751, + "grad_norm": 14.9886474609375, + "learning_rate": 2.4452311903925953e-06, + "loss": 4.6321, + "step": 13710 + }, + { + "epoch": 2.690270694389957, + "grad_norm": 19.263511657714844, + "learning_rate": 2.44190485829689e-06, + "loss": 4.4146, + "step": 13715 + }, + { + "epoch": 2.6912514711651627, + "grad_norm": 15.527201652526855, + "learning_rate": 2.4385800590761017e-06, + "loss": 4.4363, + "step": 13720 + }, + { + "epoch": 2.692232247940369, + "grad_norm": 29.534744262695312, + "learning_rate": 2.435256794722536e-06, + "loss": 4.7091, + "step": 13725 + }, + { + "epoch": 2.6932130247155746, + "grad_norm": 33.34679412841797, + "learning_rate": 2.4319350672275743e-06, + "loss": 4.456, + "step": 13730 + }, + { + "epoch": 2.6941938014907807, + "grad_norm": 18.027267456054688, + "learning_rate": 2.428614878581678e-06, + "loss": 4.5147, + "step": 13735 + }, + { + "epoch": 2.695174578265987, + "grad_norm": 17.677387237548828, + "learning_rate": 2.4252962307743922e-06, + "loss": 4.5624, + "step": 13740 + }, + { + "epoch": 2.6961553550411925, + "grad_norm": 16.738374710083008, + "learning_rate": 2.4219791257943287e-06, + "loss": 4.4275, + "step": 13745 + }, + { + "epoch": 2.6971361318163987, + "grad_norm": 19.49000358581543, + "learning_rate": 2.4186635656291834e-06, + "loss": 4.2897, + "step": 13750 + }, + { + "epoch": 2.6981169085916044, + "grad_norm": 26.61591911315918, + "learning_rate": 2.4153495522657246e-06, + "loss": 4.5153, + "step": 13755 + }, + { + "epoch": 2.6990976853668105, + "grad_norm": 20.397048950195312, + "learning_rate": 2.412037087689788e-06, + "loss": 4.31, + "step": 13760 + }, + { + "epoch": 2.7000784621420166, + "grad_norm": 18.144649505615234, + "learning_rate": 2.4087261738862907e-06, + "loss": 4.5384, + "step": 13765 + }, + { + "epoch": 2.7010592389172223, + "grad_norm": 18.39889144897461, + "learning_rate": 2.405416812839211e-06, + "loss": 4.4709, + "step": 13770 + }, + { + "epoch": 2.7020400156924285, + "grad_norm": 19.8008975982666, + "learning_rate": 2.4021090065316026e-06, + "loss": 4.2814, + "step": 13775 + }, + { + "epoch": 2.703020792467634, + "grad_norm": 12.667327880859375, + "learning_rate": 2.3988027569455895e-06, + "loss": 4.6457, + "step": 13780 + }, + { + "epoch": 2.7040015692428403, + "grad_norm": 23.41883087158203, + "learning_rate": 2.3954980660623545e-06, + "loss": 4.4892, + "step": 13785 + }, + { + "epoch": 2.7049823460180464, + "grad_norm": 23.43563461303711, + "learning_rate": 2.392194935862156e-06, + "loss": 4.7607, + "step": 13790 + }, + { + "epoch": 2.705963122793252, + "grad_norm": 17.52957534790039, + "learning_rate": 2.3888933683243105e-06, + "loss": 4.3509, + "step": 13795 + }, + { + "epoch": 2.7069438995684583, + "grad_norm": 21.38890266418457, + "learning_rate": 2.3855933654271986e-06, + "loss": 4.6536, + "step": 13800 + }, + { + "epoch": 2.707924676343664, + "grad_norm": 21.12211799621582, + "learning_rate": 2.382294929148268e-06, + "loss": 4.5384, + "step": 13805 + }, + { + "epoch": 2.70890545311887, + "grad_norm": 12.217236518859863, + "learning_rate": 2.3789980614640212e-06, + "loss": 4.6123, + "step": 13810 + }, + { + "epoch": 2.7098862298940762, + "grad_norm": 28.40160369873047, + "learning_rate": 2.375702764350029e-06, + "loss": 4.6543, + "step": 13815 + }, + { + "epoch": 2.710867006669282, + "grad_norm": 18.181427001953125, + "learning_rate": 2.3724090397809112e-06, + "loss": 4.3407, + "step": 13820 + }, + { + "epoch": 2.711847783444488, + "grad_norm": 12.905070304870605, + "learning_rate": 2.369116889730353e-06, + "loss": 4.4545, + "step": 13825 + }, + { + "epoch": 2.7128285602196938, + "grad_norm": 14.678855895996094, + "learning_rate": 2.3658263161710948e-06, + "loss": 4.5105, + "step": 13830 + }, + { + "epoch": 2.7138093369949, + "grad_norm": 25.800718307495117, + "learning_rate": 2.3625373210749277e-06, + "loss": 4.1325, + "step": 13835 + }, + { + "epoch": 2.714790113770106, + "grad_norm": 25.922931671142578, + "learning_rate": 2.359249906412704e-06, + "loss": 4.5417, + "step": 13840 + }, + { + "epoch": 2.7157708905453117, + "grad_norm": 16.528545379638672, + "learning_rate": 2.3559640741543212e-06, + "loss": 4.3443, + "step": 13845 + }, + { + "epoch": 2.716751667320518, + "grad_norm": 26.857633590698242, + "learning_rate": 2.3526798262687337e-06, + "loss": 4.4824, + "step": 13850 + }, + { + "epoch": 2.7177324440957236, + "grad_norm": 16.661218643188477, + "learning_rate": 2.3493971647239495e-06, + "loss": 4.4816, + "step": 13855 + }, + { + "epoch": 2.7187132208709297, + "grad_norm": 17.91668128967285, + "learning_rate": 2.346116091487016e-06, + "loss": 4.8708, + "step": 13860 + }, + { + "epoch": 2.719693997646136, + "grad_norm": 18.95901870727539, + "learning_rate": 2.3428366085240394e-06, + "loss": 4.7818, + "step": 13865 + }, + { + "epoch": 2.720674774421342, + "grad_norm": 14.655415534973145, + "learning_rate": 2.3395587178001667e-06, + "loss": 4.5327, + "step": 13870 + }, + { + "epoch": 2.7216555511965477, + "grad_norm": 22.418703079223633, + "learning_rate": 2.33628242127959e-06, + "loss": 4.3016, + "step": 13875 + }, + { + "epoch": 2.7226363279717534, + "grad_norm": 29.357666015625, + "learning_rate": 2.333007720925552e-06, + "loss": 4.1335, + "step": 13880 + }, + { + "epoch": 2.7236171047469595, + "grad_norm": 25.820316314697266, + "learning_rate": 2.3297346187003327e-06, + "loss": 4.8931, + "step": 13885 + }, + { + "epoch": 2.7245978815221656, + "grad_norm": 66.62987518310547, + "learning_rate": 2.3264631165652608e-06, + "loss": 4.781, + "step": 13890 + }, + { + "epoch": 2.725578658297372, + "grad_norm": 12.275607109069824, + "learning_rate": 2.323193216480698e-06, + "loss": 4.3785, + "step": 13895 + }, + { + "epoch": 2.7265594350725775, + "grad_norm": 32.049034118652344, + "learning_rate": 2.319924920406054e-06, + "loss": 4.359, + "step": 13900 + }, + { + "epoch": 2.727540211847783, + "grad_norm": 45.356746673583984, + "learning_rate": 2.3166582302997744e-06, + "loss": 4.5512, + "step": 13905 + }, + { + "epoch": 2.7285209886229893, + "grad_norm": 13.335826873779297, + "learning_rate": 2.3133931481193383e-06, + "loss": 4.8334, + "step": 13910 + }, + { + "epoch": 2.7295017653981954, + "grad_norm": 18.16646957397461, + "learning_rate": 2.31012967582127e-06, + "loss": 4.5381, + "step": 13915 + }, + { + "epoch": 2.7304825421734016, + "grad_norm": 37.283790588378906, + "learning_rate": 2.3068678153611195e-06, + "loss": 4.7861, + "step": 13920 + }, + { + "epoch": 2.7314633189486073, + "grad_norm": 25.240339279174805, + "learning_rate": 2.303607568693478e-06, + "loss": 4.5222, + "step": 13925 + }, + { + "epoch": 2.7324440957238134, + "grad_norm": 23.09556770324707, + "learning_rate": 2.3003489377719682e-06, + "loss": 4.3437, + "step": 13930 + }, + { + "epoch": 2.733424872499019, + "grad_norm": 22.769893646240234, + "learning_rate": 2.2970919245492406e-06, + "loss": 4.6166, + "step": 13935 + }, + { + "epoch": 2.7344056492742252, + "grad_norm": 10.76894760131836, + "learning_rate": 2.293836530976984e-06, + "loss": 4.4213, + "step": 13940 + }, + { + "epoch": 2.7353864260494314, + "grad_norm": 27.2360897064209, + "learning_rate": 2.290582759005908e-06, + "loss": 4.5921, + "step": 13945 + }, + { + "epoch": 2.736367202824637, + "grad_norm": 36.54833984375, + "learning_rate": 2.2873306105857546e-06, + "loss": 4.7721, + "step": 13950 + }, + { + "epoch": 2.737347979599843, + "grad_norm": 28.067516326904297, + "learning_rate": 2.2840800876652963e-06, + "loss": 4.3667, + "step": 13955 + }, + { + "epoch": 2.738328756375049, + "grad_norm": 26.357559204101562, + "learning_rate": 2.280831192192324e-06, + "loss": 4.6859, + "step": 13960 + }, + { + "epoch": 2.739309533150255, + "grad_norm": 37.89749526977539, + "learning_rate": 2.2775839261136607e-06, + "loss": 4.519, + "step": 13965 + }, + { + "epoch": 2.740290309925461, + "grad_norm": 29.265758514404297, + "learning_rate": 2.274338291375147e-06, + "loss": 4.7713, + "step": 13970 + }, + { + "epoch": 2.741271086700667, + "grad_norm": 24.310449600219727, + "learning_rate": 2.271094289921651e-06, + "loss": 4.5785, + "step": 13975 + }, + { + "epoch": 2.742251863475873, + "grad_norm": 24.709442138671875, + "learning_rate": 2.2678519236970612e-06, + "loss": 4.3531, + "step": 13980 + }, + { + "epoch": 2.7432326402510787, + "grad_norm": 21.423248291015625, + "learning_rate": 2.2646111946442813e-06, + "loss": 4.3444, + "step": 13985 + }, + { + "epoch": 2.744213417026285, + "grad_norm": 28.040691375732422, + "learning_rate": 2.261372104705241e-06, + "loss": 4.2849, + "step": 13990 + }, + { + "epoch": 2.745194193801491, + "grad_norm": 22.47325325012207, + "learning_rate": 2.2581346558208817e-06, + "loss": 4.3244, + "step": 13995 + }, + { + "epoch": 2.7461749705766967, + "grad_norm": 16.90743064880371, + "learning_rate": 2.2548988499311647e-06, + "loss": 4.3961, + "step": 14000 + }, + { + "epoch": 2.747155747351903, + "grad_norm": 19.77780532836914, + "learning_rate": 2.2516646889750694e-06, + "loss": 4.693, + "step": 14005 + }, + { + "epoch": 2.7481365241271085, + "grad_norm": 19.31178855895996, + "learning_rate": 2.2484321748905835e-06, + "loss": 4.2515, + "step": 14010 + }, + { + "epoch": 2.7491173009023147, + "grad_norm": 20.418745040893555, + "learning_rate": 2.245201309614709e-06, + "loss": 4.0248, + "step": 14015 + }, + { + "epoch": 2.750098077677521, + "grad_norm": 11.354876518249512, + "learning_rate": 2.241972095083466e-06, + "loss": 4.429, + "step": 14020 + }, + { + "epoch": 2.7510788544527265, + "grad_norm": 19.0582332611084, + "learning_rate": 2.238744533231877e-06, + "loss": 4.5341, + "step": 14025 + }, + { + "epoch": 2.7510788544527265, + "eval_loss": 4.857458114624023, + "eval_runtime": 7.7687, + "eval_samples_per_second": 26.903, + "eval_steps_per_second": 13.516, + "step": 14025 + }, + { + "epoch": 2.7520596312279326, + "grad_norm": 24.781587600708008, + "learning_rate": 2.235518625993981e-06, + "loss": 4.8467, + "step": 14030 + }, + { + "epoch": 2.7530404080031383, + "grad_norm": 20.93712615966797, + "learning_rate": 2.2322943753028204e-06, + "loss": 4.8466, + "step": 14035 + }, + { + "epoch": 2.7540211847783445, + "grad_norm": 58.71204376220703, + "learning_rate": 2.22907178309045e-06, + "loss": 5.1757, + "step": 14040 + }, + { + "epoch": 2.7550019615535506, + "grad_norm": 18.296762466430664, + "learning_rate": 2.2258508512879246e-06, + "loss": 4.4446, + "step": 14045 + }, + { + "epoch": 2.7559827383287563, + "grad_norm": 25.872365951538086, + "learning_rate": 2.2226315818253097e-06, + "loss": 4.7809, + "step": 14050 + }, + { + "epoch": 2.7569635151039624, + "grad_norm": 24.24050521850586, + "learning_rate": 2.219413976631674e-06, + "loss": 4.4556, + "step": 14055 + }, + { + "epoch": 2.757944291879168, + "grad_norm": 23.216476440429688, + "learning_rate": 2.2161980376350837e-06, + "loss": 4.6814, + "step": 14060 + }, + { + "epoch": 2.7589250686543743, + "grad_norm": 18.804798126220703, + "learning_rate": 2.2129837667626147e-06, + "loss": 4.3724, + "step": 14065 + }, + { + "epoch": 2.7599058454295804, + "grad_norm": 23.965618133544922, + "learning_rate": 2.2097711659403344e-06, + "loss": 4.7329, + "step": 14070 + }, + { + "epoch": 2.760886622204786, + "grad_norm": 17.891071319580078, + "learning_rate": 2.2065602370933153e-06, + "loss": 4.8861, + "step": 14075 + }, + { + "epoch": 2.7618673989799922, + "grad_norm": 19.460275650024414, + "learning_rate": 2.20335098214563e-06, + "loss": 4.8461, + "step": 14080 + }, + { + "epoch": 2.762848175755198, + "grad_norm": 20.419872283935547, + "learning_rate": 2.2001434030203423e-06, + "loss": 4.5978, + "step": 14085 + }, + { + "epoch": 2.763828952530404, + "grad_norm": 23.175350189208984, + "learning_rate": 2.1969375016395138e-06, + "loss": 4.3628, + "step": 14090 + }, + { + "epoch": 2.76480972930561, + "grad_norm": 15.233471870422363, + "learning_rate": 2.1937332799241993e-06, + "loss": 4.4651, + "step": 14095 + }, + { + "epoch": 2.765790506080816, + "grad_norm": 21.109416961669922, + "learning_rate": 2.190530739794452e-06, + "loss": 4.4845, + "step": 14100 + }, + { + "epoch": 2.766771282856022, + "grad_norm": 26.169544219970703, + "learning_rate": 2.187329883169315e-06, + "loss": 4.5825, + "step": 14105 + }, + { + "epoch": 2.7677520596312277, + "grad_norm": 27.972253799438477, + "learning_rate": 2.184130711966819e-06, + "loss": 4.7661, + "step": 14110 + }, + { + "epoch": 2.768732836406434, + "grad_norm": 15.088165283203125, + "learning_rate": 2.180933228103992e-06, + "loss": 4.3559, + "step": 14115 + }, + { + "epoch": 2.76971361318164, + "grad_norm": 10.408885955810547, + "learning_rate": 2.177737433496842e-06, + "loss": 4.3335, + "step": 14120 + }, + { + "epoch": 2.7706943899568457, + "grad_norm": 12.056416511535645, + "learning_rate": 2.1745433300603714e-06, + "loss": 4.2495, + "step": 14125 + }, + { + "epoch": 2.771675166732052, + "grad_norm": 19.54852294921875, + "learning_rate": 2.1713509197085698e-06, + "loss": 4.4882, + "step": 14130 + }, + { + "epoch": 2.7726559435072575, + "grad_norm": 25.85139274597168, + "learning_rate": 2.1681602043544057e-06, + "loss": 4.8348, + "step": 14135 + }, + { + "epoch": 2.7736367202824637, + "grad_norm": 11.395781517028809, + "learning_rate": 2.164971185909839e-06, + "loss": 4.3054, + "step": 14140 + }, + { + "epoch": 2.77461749705767, + "grad_norm": 24.38686752319336, + "learning_rate": 2.1617838662858075e-06, + "loss": 4.1594, + "step": 14145 + }, + { + "epoch": 2.7755982738328755, + "grad_norm": 28.35773468017578, + "learning_rate": 2.158598247392236e-06, + "loss": 4.5292, + "step": 14150 + }, + { + "epoch": 2.7765790506080816, + "grad_norm": 17.550443649291992, + "learning_rate": 2.1554143311380237e-06, + "loss": 4.8225, + "step": 14155 + }, + { + "epoch": 2.7775598273832873, + "grad_norm": 26.152284622192383, + "learning_rate": 2.1522321194310577e-06, + "loss": 4.8608, + "step": 14160 + }, + { + "epoch": 2.7785406041584935, + "grad_norm": 19.523468017578125, + "learning_rate": 2.1490516141781957e-06, + "loss": 4.7656, + "step": 14165 + }, + { + "epoch": 2.7795213809336996, + "grad_norm": 15.315962791442871, + "learning_rate": 2.1458728172852765e-06, + "loss": 4.4612, + "step": 14170 + }, + { + "epoch": 2.7805021577089053, + "grad_norm": 19.123991012573242, + "learning_rate": 2.142695730657116e-06, + "loss": 4.499, + "step": 14175 + }, + { + "epoch": 2.7814829344841114, + "grad_norm": 47.76284408569336, + "learning_rate": 2.139520356197506e-06, + "loss": 4.7392, + "step": 14180 + }, + { + "epoch": 2.782463711259317, + "grad_norm": 24.149187088012695, + "learning_rate": 2.1363466958092077e-06, + "loss": 4.4477, + "step": 14185 + }, + { + "epoch": 2.7834444880345233, + "grad_norm": 22.08119773864746, + "learning_rate": 2.1331747513939615e-06, + "loss": 4.5059, + "step": 14190 + }, + { + "epoch": 2.7844252648097294, + "grad_norm": 20.25182342529297, + "learning_rate": 2.1300045248524724e-06, + "loss": 4.7203, + "step": 14195 + }, + { + "epoch": 2.7854060415849355, + "grad_norm": 10.127427101135254, + "learning_rate": 2.126836018084422e-06, + "loss": 4.4238, + "step": 14200 + }, + { + "epoch": 2.7863868183601412, + "grad_norm": 14.273734092712402, + "learning_rate": 2.123669232988461e-06, + "loss": 4.4544, + "step": 14205 + }, + { + "epoch": 2.787367595135347, + "grad_norm": 37.149620056152344, + "learning_rate": 2.120504171462203e-06, + "loss": 4.4163, + "step": 14210 + }, + { + "epoch": 2.788348371910553, + "grad_norm": 24.374950408935547, + "learning_rate": 2.1173408354022357e-06, + "loss": 4.4589, + "step": 14215 + }, + { + "epoch": 2.789329148685759, + "grad_norm": 15.419386863708496, + "learning_rate": 2.114179226704106e-06, + "loss": 4.3327, + "step": 14220 + }, + { + "epoch": 2.7903099254609653, + "grad_norm": 21.48487663269043, + "learning_rate": 2.1110193472623335e-06, + "loss": 4.6511, + "step": 14225 + }, + { + "epoch": 2.791290702236171, + "grad_norm": 12.925050735473633, + "learning_rate": 2.1078611989703934e-06, + "loss": 4.3783, + "step": 14230 + }, + { + "epoch": 2.7922714790113767, + "grad_norm": 18.62735939025879, + "learning_rate": 2.1047047837207315e-06, + "loss": 4.2144, + "step": 14235 + }, + { + "epoch": 2.793252255786583, + "grad_norm": 23.844316482543945, + "learning_rate": 2.1015501034047486e-06, + "loss": 4.7369, + "step": 14240 + }, + { + "epoch": 2.794233032561789, + "grad_norm": 24.961200714111328, + "learning_rate": 2.0983971599128072e-06, + "loss": 4.6055, + "step": 14245 + }, + { + "epoch": 2.795213809336995, + "grad_norm": 16.179439544677734, + "learning_rate": 2.0952459551342325e-06, + "loss": 4.5461, + "step": 14250 + }, + { + "epoch": 2.796194586112201, + "grad_norm": 20.85818862915039, + "learning_rate": 2.0920964909573065e-06, + "loss": 4.8779, + "step": 14255 + }, + { + "epoch": 2.797175362887407, + "grad_norm": 13.407557487487793, + "learning_rate": 2.0889487692692644e-06, + "loss": 4.6044, + "step": 14260 + }, + { + "epoch": 2.7981561396626127, + "grad_norm": 20.780235290527344, + "learning_rate": 2.0858027919563032e-06, + "loss": 4.7772, + "step": 14265 + }, + { + "epoch": 2.799136916437819, + "grad_norm": 14.837874412536621, + "learning_rate": 2.0826585609035686e-06, + "loss": 4.0348, + "step": 14270 + }, + { + "epoch": 2.800117693213025, + "grad_norm": 24.567312240600586, + "learning_rate": 2.0795160779951645e-06, + "loss": 4.277, + "step": 14275 + }, + { + "epoch": 2.8010984699882306, + "grad_norm": 21.53330421447754, + "learning_rate": 2.076375345114147e-06, + "loss": 4.4838, + "step": 14280 + }, + { + "epoch": 2.802079246763437, + "grad_norm": 32.00398635864258, + "learning_rate": 2.0732363641425197e-06, + "loss": 4.5141, + "step": 14285 + }, + { + "epoch": 2.8030600235386425, + "grad_norm": 17.834571838378906, + "learning_rate": 2.070099136961241e-06, + "loss": 4.417, + "step": 14290 + }, + { + "epoch": 2.8040408003138486, + "grad_norm": 10.907025337219238, + "learning_rate": 2.066963665450214e-06, + "loss": 4.8046, + "step": 14295 + }, + { + "epoch": 2.8050215770890548, + "grad_norm": 36.39023971557617, + "learning_rate": 2.063829951488295e-06, + "loss": 4.9091, + "step": 14300 + }, + { + "epoch": 2.8060023538642604, + "grad_norm": 33.004146575927734, + "learning_rate": 2.0606979969532826e-06, + "loss": 4.57, + "step": 14305 + }, + { + "epoch": 2.8069831306394666, + "grad_norm": 14.840770721435547, + "learning_rate": 2.0575678037219205e-06, + "loss": 4.4305, + "step": 14310 + }, + { + "epoch": 2.8079639074146723, + "grad_norm": 14.304911613464355, + "learning_rate": 2.0544393736699033e-06, + "loss": 4.2868, + "step": 14315 + }, + { + "epoch": 2.8089446841898784, + "grad_norm": 16.990671157836914, + "learning_rate": 2.051312708671861e-06, + "loss": 4.448, + "step": 14320 + }, + { + "epoch": 2.8099254609650846, + "grad_norm": 33.899173736572266, + "learning_rate": 2.048187810601372e-06, + "loss": 4.4279, + "step": 14325 + }, + { + "epoch": 2.8109062377402902, + "grad_norm": 23.93379020690918, + "learning_rate": 2.0450646813309555e-06, + "loss": 4.7488, + "step": 14330 + }, + { + "epoch": 2.8118870145154964, + "grad_norm": 27.46114730834961, + "learning_rate": 2.0419433227320653e-06, + "loss": 4.3003, + "step": 14335 + }, + { + "epoch": 2.812867791290702, + "grad_norm": 26.478914260864258, + "learning_rate": 2.0388237366751005e-06, + "loss": 4.8474, + "step": 14340 + }, + { + "epoch": 2.813848568065908, + "grad_norm": 26.579689025878906, + "learning_rate": 2.035705925029394e-06, + "loss": 4.7607, + "step": 14345 + }, + { + "epoch": 2.8148293448411144, + "grad_norm": 13.559803009033203, + "learning_rate": 2.0325898896632178e-06, + "loss": 4.5509, + "step": 14350 + }, + { + "epoch": 2.81581012161632, + "grad_norm": 24.959630966186523, + "learning_rate": 2.0294756324437804e-06, + "loss": 4.2184, + "step": 14355 + }, + { + "epoch": 2.816790898391526, + "grad_norm": 14.203490257263184, + "learning_rate": 2.026363155237219e-06, + "loss": 4.4857, + "step": 14360 + }, + { + "epoch": 2.817771675166732, + "grad_norm": 14.8135347366333, + "learning_rate": 2.0232524599086116e-06, + "loss": 4.4352, + "step": 14365 + }, + { + "epoch": 2.818752451941938, + "grad_norm": 19.16025161743164, + "learning_rate": 2.0201435483219627e-06, + "loss": 4.4608, + "step": 14370 + }, + { + "epoch": 2.819733228717144, + "grad_norm": 11.188132286071777, + "learning_rate": 2.0170364223402126e-06, + "loss": 4.3011, + "step": 14375 + }, + { + "epoch": 2.82071400549235, + "grad_norm": 15.542959213256836, + "learning_rate": 2.0139310838252283e-06, + "loss": 4.668, + "step": 14380 + }, + { + "epoch": 2.821694782267556, + "grad_norm": 15.383172988891602, + "learning_rate": 2.0108275346378052e-06, + "loss": 4.6553, + "step": 14385 + }, + { + "epoch": 2.8226755590427617, + "grad_norm": 32.74557113647461, + "learning_rate": 2.0077257766376707e-06, + "loss": 4.7629, + "step": 14390 + }, + { + "epoch": 2.823656335817968, + "grad_norm": 17.852758407592773, + "learning_rate": 2.0046258116834725e-06, + "loss": 4.6995, + "step": 14395 + }, + { + "epoch": 2.824637112593174, + "grad_norm": 17.616985321044922, + "learning_rate": 2.00152764163279e-06, + "loss": 4.4477, + "step": 14400 + }, + { + "epoch": 2.8256178893683797, + "grad_norm": 22.82828712463379, + "learning_rate": 1.9984312683421265e-06, + "loss": 4.4233, + "step": 14405 + }, + { + "epoch": 2.826598666143586, + "grad_norm": 20.8966121673584, + "learning_rate": 1.9953366936669023e-06, + "loss": 4.3212, + "step": 14410 + }, + { + "epoch": 2.8275794429187915, + "grad_norm": 13.764612197875977, + "learning_rate": 1.9922439194614686e-06, + "loss": 4.5841, + "step": 14415 + }, + { + "epoch": 2.8285602196939976, + "grad_norm": 10.778817176818848, + "learning_rate": 1.9891529475790894e-06, + "loss": 4.5126, + "step": 14420 + }, + { + "epoch": 2.8295409964692038, + "grad_norm": 26.92489242553711, + "learning_rate": 1.986063779871955e-06, + "loss": 4.4893, + "step": 14425 + }, + { + "epoch": 2.8305217732444095, + "grad_norm": 22.98575782775879, + "learning_rate": 1.9829764181911738e-06, + "loss": 4.5547, + "step": 14430 + }, + { + "epoch": 2.8315025500196156, + "grad_norm": 23.186853408813477, + "learning_rate": 1.979890864386767e-06, + "loss": 4.5995, + "step": 14435 + }, + { + "epoch": 2.8324833267948213, + "grad_norm": 14.08784294128418, + "learning_rate": 1.97680712030768e-06, + "loss": 4.9109, + "step": 14440 + }, + { + "epoch": 2.8334641035700274, + "grad_norm": 16.856956481933594, + "learning_rate": 1.9737251878017678e-06, + "loss": 4.7937, + "step": 14445 + }, + { + "epoch": 2.8344448803452336, + "grad_norm": 20.808732986450195, + "learning_rate": 1.970645068715799e-06, + "loss": 4.646, + "step": 14450 + }, + { + "epoch": 2.8354256571204393, + "grad_norm": 40.77276611328125, + "learning_rate": 1.967566764895464e-06, + "loss": 4.9738, + "step": 14455 + }, + { + "epoch": 2.8364064338956454, + "grad_norm": 16.21761131286621, + "learning_rate": 1.964490278185354e-06, + "loss": 4.4132, + "step": 14460 + }, + { + "epoch": 2.837387210670851, + "grad_norm": 11.041531562805176, + "learning_rate": 1.961415610428983e-06, + "loss": 4.6614, + "step": 14465 + }, + { + "epoch": 2.8383679874460572, + "grad_norm": 18.46826934814453, + "learning_rate": 1.958342763468764e-06, + "loss": 4.3458, + "step": 14470 + }, + { + "epoch": 2.8393487642212634, + "grad_norm": 25.5513973236084, + "learning_rate": 1.955271739146026e-06, + "loss": 4.454, + "step": 14475 + }, + { + "epoch": 2.840329540996469, + "grad_norm": 21.196474075317383, + "learning_rate": 1.952202539301007e-06, + "loss": 4.6721, + "step": 14480 + }, + { + "epoch": 2.841310317771675, + "grad_norm": 14.46927547454834, + "learning_rate": 1.949135165772844e-06, + "loss": 4.5196, + "step": 14485 + }, + { + "epoch": 2.842291094546881, + "grad_norm": 16.33852767944336, + "learning_rate": 1.9460696203995884e-06, + "loss": 4.3415, + "step": 14490 + }, + { + "epoch": 2.843271871322087, + "grad_norm": 22.176271438598633, + "learning_rate": 1.9430059050181883e-06, + "loss": 4.2941, + "step": 14495 + }, + { + "epoch": 2.844252648097293, + "grad_norm": 14.859818458557129, + "learning_rate": 1.9399440214645003e-06, + "loss": 4.5169, + "step": 14500 + }, + { + "epoch": 2.845233424872499, + "grad_norm": 13.2625732421875, + "learning_rate": 1.936883971573285e-06, + "loss": 4.424, + "step": 14505 + }, + { + "epoch": 2.846214201647705, + "grad_norm": 15.930765151977539, + "learning_rate": 1.9338257571781973e-06, + "loss": 4.386, + "step": 14510 + }, + { + "epoch": 2.8471949784229107, + "grad_norm": 16.069883346557617, + "learning_rate": 1.9307693801117983e-06, + "loss": 4.5049, + "step": 14515 + }, + { + "epoch": 2.848175755198117, + "grad_norm": 17.39498519897461, + "learning_rate": 1.9277148422055457e-06, + "loss": 4.5397, + "step": 14520 + }, + { + "epoch": 2.849156531973323, + "grad_norm": 15.79026985168457, + "learning_rate": 1.924662145289793e-06, + "loss": 4.4386, + "step": 14525 + }, + { + "epoch": 2.850137308748529, + "grad_norm": 28.315719604492188, + "learning_rate": 1.921611291193797e-06, + "loss": 4.6099, + "step": 14530 + }, + { + "epoch": 2.851118085523735, + "grad_norm": 19.82541847229004, + "learning_rate": 1.9185622817457024e-06, + "loss": 4.8542, + "step": 14535 + }, + { + "epoch": 2.8520988622989405, + "grad_norm": 19.061132431030273, + "learning_rate": 1.915515118772555e-06, + "loss": 4.5148, + "step": 14540 + }, + { + "epoch": 2.8530796390741466, + "grad_norm": 26.26479148864746, + "learning_rate": 1.912469804100289e-06, + "loss": 4.3535, + "step": 14545 + }, + { + "epoch": 2.8540604158493528, + "grad_norm": 23.275548934936523, + "learning_rate": 1.9094263395537353e-06, + "loss": 4.6812, + "step": 14550 + }, + { + "epoch": 2.855041192624559, + "grad_norm": 16.343244552612305, + "learning_rate": 1.9063847269566154e-06, + "loss": 4.2992, + "step": 14555 + }, + { + "epoch": 2.8560219693997646, + "grad_norm": 19.14699363708496, + "learning_rate": 1.903344968131537e-06, + "loss": 4.8218, + "step": 14560 + }, + { + "epoch": 2.8570027461749707, + "grad_norm": 15.184374809265137, + "learning_rate": 1.9003070649000033e-06, + "loss": 4.5549, + "step": 14565 + }, + { + "epoch": 2.8579835229501764, + "grad_norm": 32.97745895385742, + "learning_rate": 1.897271019082399e-06, + "loss": 4.6135, + "step": 14570 + }, + { + "epoch": 2.8589642997253826, + "grad_norm": 25.775196075439453, + "learning_rate": 1.894236832498001e-06, + "loss": 4.4366, + "step": 14575 + }, + { + "epoch": 2.8599450765005887, + "grad_norm": 9.466419219970703, + "learning_rate": 1.8912045069649709e-06, + "loss": 4.6671, + "step": 14580 + }, + { + "epoch": 2.8609258532757944, + "grad_norm": 12.89758014678955, + "learning_rate": 1.888174044300352e-06, + "loss": 4.631, + "step": 14585 + }, + { + "epoch": 2.8619066300510005, + "grad_norm": 25.096372604370117, + "learning_rate": 1.8851454463200769e-06, + "loss": 4.9087, + "step": 14590 + }, + { + "epoch": 2.8628874068262062, + "grad_norm": 21.342893600463867, + "learning_rate": 1.8821187148389557e-06, + "loss": 4.2392, + "step": 14595 + }, + { + "epoch": 2.8638681836014124, + "grad_norm": 14.787396430969238, + "learning_rate": 1.8790938516706802e-06, + "loss": 4.9076, + "step": 14600 + }, + { + "epoch": 2.8648489603766185, + "grad_norm": 25.85067367553711, + "learning_rate": 1.8760708586278287e-06, + "loss": 4.4382, + "step": 14605 + }, + { + "epoch": 2.865829737151824, + "grad_norm": 29.25151824951172, + "learning_rate": 1.8730497375218504e-06, + "loss": 4.4079, + "step": 14610 + }, + { + "epoch": 2.8668105139270303, + "grad_norm": 22.0438289642334, + "learning_rate": 1.87003049016308e-06, + "loss": 4.7012, + "step": 14615 + }, + { + "epoch": 2.867791290702236, + "grad_norm": 52.987953186035156, + "learning_rate": 1.8670131183607242e-06, + "loss": 4.2756, + "step": 14620 + }, + { + "epoch": 2.868772067477442, + "grad_norm": 18.41710090637207, + "learning_rate": 1.863997623922869e-06, + "loss": 4.7143, + "step": 14625 + }, + { + "epoch": 2.8697528442526483, + "grad_norm": 22.505746841430664, + "learning_rate": 1.8609840086564769e-06, + "loss": 4.4031, + "step": 14630 + }, + { + "epoch": 2.870733621027854, + "grad_norm": 35.385902404785156, + "learning_rate": 1.8579722743673773e-06, + "loss": 4.3598, + "step": 14635 + }, + { + "epoch": 2.87171439780306, + "grad_norm": 23.900920867919922, + "learning_rate": 1.8549624228602815e-06, + "loss": 4.5235, + "step": 14640 + }, + { + "epoch": 2.872695174578266, + "grad_norm": 22.860639572143555, + "learning_rate": 1.8519544559387642e-06, + "loss": 4.6174, + "step": 14645 + }, + { + "epoch": 2.873675951353472, + "grad_norm": 28.040868759155273, + "learning_rate": 1.8489483754052767e-06, + "loss": 4.4914, + "step": 14650 + }, + { + "epoch": 2.874656728128678, + "grad_norm": 21.25215721130371, + "learning_rate": 1.8459441830611402e-06, + "loss": 4.4763, + "step": 14655 + }, + { + "epoch": 2.875637504903884, + "grad_norm": 19.2554931640625, + "learning_rate": 1.8429418807065403e-06, + "loss": 4.5422, + "step": 14660 + }, + { + "epoch": 2.87661828167909, + "grad_norm": 17.228635787963867, + "learning_rate": 1.83994147014053e-06, + "loss": 4.385, + "step": 14665 + }, + { + "epoch": 2.8775990584542956, + "grad_norm": 27.80978775024414, + "learning_rate": 1.8369429531610339e-06, + "loss": 4.4902, + "step": 14670 + }, + { + "epoch": 2.878579835229502, + "grad_norm": 12.32872486114502, + "learning_rate": 1.8339463315648365e-06, + "loss": 4.3144, + "step": 14675 + }, + { + "epoch": 2.879560612004708, + "grad_norm": 18.406604766845703, + "learning_rate": 1.8309516071475909e-06, + "loss": 4.0658, + "step": 14680 + }, + { + "epoch": 2.8805413887799136, + "grad_norm": 21.1339168548584, + "learning_rate": 1.8279587817038086e-06, + "loss": 4.4005, + "step": 14685 + }, + { + "epoch": 2.8815221655551198, + "grad_norm": 17.803857803344727, + "learning_rate": 1.8249678570268697e-06, + "loss": 4.5032, + "step": 14690 + }, + { + "epoch": 2.8825029423303254, + "grad_norm": 32.61869812011719, + "learning_rate": 1.8219788349090067e-06, + "loss": 4.7084, + "step": 14695 + }, + { + "epoch": 2.8834837191055316, + "grad_norm": 23.566926956176758, + "learning_rate": 1.8189917171413196e-06, + "loss": 4.2739, + "step": 14700 + }, + { + "epoch": 2.8844644958807377, + "grad_norm": 19.303241729736328, + "learning_rate": 1.816006505513766e-06, + "loss": 4.4533, + "step": 14705 + }, + { + "epoch": 2.8854452726559434, + "grad_norm": 15.478785514831543, + "learning_rate": 1.8130232018151562e-06, + "loss": 4.9631, + "step": 14710 + }, + { + "epoch": 2.8864260494311496, + "grad_norm": 17.744274139404297, + "learning_rate": 1.8100418078331638e-06, + "loss": 4.298, + "step": 14715 + }, + { + "epoch": 2.8874068262063552, + "grad_norm": 26.17151641845703, + "learning_rate": 1.8070623253543118e-06, + "loss": 4.6553, + "step": 14720 + }, + { + "epoch": 2.8883876029815614, + "grad_norm": 16.9974308013916, + "learning_rate": 1.8040847561639834e-06, + "loss": 4.2575, + "step": 14725 + }, + { + "epoch": 2.8893683797567675, + "grad_norm": 16.582048416137695, + "learning_rate": 1.8011091020464138e-06, + "loss": 4.8634, + "step": 14730 + }, + { + "epoch": 2.890349156531973, + "grad_norm": 14.952964782714844, + "learning_rate": 1.7981353647846883e-06, + "loss": 4.4459, + "step": 14735 + }, + { + "epoch": 2.8913299333071794, + "grad_norm": 26.1202335357666, + "learning_rate": 1.7951635461607453e-06, + "loss": 4.4072, + "step": 14740 + }, + { + "epoch": 2.892310710082385, + "grad_norm": 27.30790138244629, + "learning_rate": 1.792193647955371e-06, + "loss": 4.7133, + "step": 14745 + }, + { + "epoch": 2.893291486857591, + "grad_norm": 14.769981384277344, + "learning_rate": 1.7892256719482053e-06, + "loss": 4.2343, + "step": 14750 + }, + { + "epoch": 2.8942722636327973, + "grad_norm": 21.78799819946289, + "learning_rate": 1.7862596199177351e-06, + "loss": 4.8337, + "step": 14755 + }, + { + "epoch": 2.895253040408003, + "grad_norm": 16.13930320739746, + "learning_rate": 1.783295493641291e-06, + "loss": 4.5237, + "step": 14760 + }, + { + "epoch": 2.896233817183209, + "grad_norm": 23.837434768676758, + "learning_rate": 1.7803332948950542e-06, + "loss": 4.5499, + "step": 14765 + }, + { + "epoch": 2.897214593958415, + "grad_norm": 20.946630477905273, + "learning_rate": 1.777373025454046e-06, + "loss": 4.4247, + "step": 14770 + }, + { + "epoch": 2.898195370733621, + "grad_norm": 41.92950439453125, + "learning_rate": 1.7744146870921357e-06, + "loss": 4.4211, + "step": 14775 + }, + { + "epoch": 2.899176147508827, + "grad_norm": 35.671695709228516, + "learning_rate": 1.7714582815820358e-06, + "loss": 4.3364, + "step": 14780 + }, + { + "epoch": 2.900156924284033, + "grad_norm": 15.399882316589355, + "learning_rate": 1.7685038106952952e-06, + "loss": 4.7295, + "step": 14785 + }, + { + "epoch": 2.901137701059239, + "grad_norm": 20.41010856628418, + "learning_rate": 1.7655512762023108e-06, + "loss": 4.4719, + "step": 14790 + }, + { + "epoch": 2.9021184778344447, + "grad_norm": 16.088794708251953, + "learning_rate": 1.7626006798723121e-06, + "loss": 4.2059, + "step": 14795 + }, + { + "epoch": 2.903099254609651, + "grad_norm": 25.732389450073242, + "learning_rate": 1.7596520234733739e-06, + "loss": 4.5793, + "step": 14800 + }, + { + "epoch": 2.904080031384857, + "grad_norm": 12.215232849121094, + "learning_rate": 1.7567053087724018e-06, + "loss": 4.7218, + "step": 14805 + }, + { + "epoch": 2.9050608081600626, + "grad_norm": 19.739543914794922, + "learning_rate": 1.7537605375351446e-06, + "loss": 4.4933, + "step": 14810 + }, + { + "epoch": 2.9060415849352688, + "grad_norm": 14.414162635803223, + "learning_rate": 1.7508177115261815e-06, + "loss": 4.9669, + "step": 14815 + }, + { + "epoch": 2.9070223617104745, + "grad_norm": 14.16154956817627, + "learning_rate": 1.7478768325089269e-06, + "loss": 4.6426, + "step": 14820 + }, + { + "epoch": 2.9080031384856806, + "grad_norm": 21.966236114501953, + "learning_rate": 1.7449379022456297e-06, + "loss": 4.4094, + "step": 14825 + }, + { + "epoch": 2.9089839152608867, + "grad_norm": 24.579914093017578, + "learning_rate": 1.7420009224973743e-06, + "loss": 4.5039, + "step": 14830 + }, + { + "epoch": 2.9099646920360924, + "grad_norm": 29.428955078125, + "learning_rate": 1.739065895024068e-06, + "loss": 4.3724, + "step": 14835 + }, + { + "epoch": 2.9109454688112986, + "grad_norm": 15.026422500610352, + "learning_rate": 1.736132821584457e-06, + "loss": 4.3833, + "step": 14840 + }, + { + "epoch": 2.9119262455865043, + "grad_norm": 35.190284729003906, + "learning_rate": 1.7332017039361094e-06, + "loss": 4.6888, + "step": 14845 + }, + { + "epoch": 2.9129070223617104, + "grad_norm": 31.889781951904297, + "learning_rate": 1.7302725438354256e-06, + "loss": 4.0796, + "step": 14850 + }, + { + "epoch": 2.9138877991369165, + "grad_norm": 19.924861907958984, + "learning_rate": 1.7273453430376347e-06, + "loss": 4.7306, + "step": 14855 + }, + { + "epoch": 2.9148685759121227, + "grad_norm": 29.968568801879883, + "learning_rate": 1.7244201032967844e-06, + "loss": 4.3262, + "step": 14860 + }, + { + "epoch": 2.9158493526873284, + "grad_norm": 16.26209831237793, + "learning_rate": 1.7214968263657561e-06, + "loss": 4.4238, + "step": 14865 + }, + { + "epoch": 2.916830129462534, + "grad_norm": 12.584505081176758, + "learning_rate": 1.7185755139962473e-06, + "loss": 4.6434, + "step": 14870 + }, + { + "epoch": 2.91781090623774, + "grad_norm": 13.820115089416504, + "learning_rate": 1.7156561679387851e-06, + "loss": 4.7432, + "step": 14875 + }, + { + "epoch": 2.9187916830129463, + "grad_norm": 23.35183334350586, + "learning_rate": 1.7127387899427118e-06, + "loss": 4.4374, + "step": 14880 + }, + { + "epoch": 2.9197724597881525, + "grad_norm": 10.54651165008545, + "learning_rate": 1.7098233817561966e-06, + "loss": 4.5436, + "step": 14885 + }, + { + "epoch": 2.920753236563358, + "grad_norm": 13.422380447387695, + "learning_rate": 1.7069099451262245e-06, + "loss": 4.7536, + "step": 14890 + }, + { + "epoch": 2.9217340133385643, + "grad_norm": 16.435741424560547, + "learning_rate": 1.703998481798597e-06, + "loss": 4.6658, + "step": 14895 + }, + { + "epoch": 2.92271479011377, + "grad_norm": 38.003475189208984, + "learning_rate": 1.7010889935179398e-06, + "loss": 4.7723, + "step": 14900 + }, + { + "epoch": 2.923695566888976, + "grad_norm": 18.305591583251953, + "learning_rate": 1.698181482027691e-06, + "loss": 4.6293, + "step": 14905 + }, + { + "epoch": 2.9246763436641823, + "grad_norm": 19.372703552246094, + "learning_rate": 1.6952759490701021e-06, + "loss": 4.5238, + "step": 14910 + }, + { + "epoch": 2.925657120439388, + "grad_norm": 15.01822280883789, + "learning_rate": 1.6923723963862455e-06, + "loss": 4.6348, + "step": 14915 + }, + { + "epoch": 2.926637897214594, + "grad_norm": 13.797202110290527, + "learning_rate": 1.689470825715998e-06, + "loss": 4.3862, + "step": 14920 + }, + { + "epoch": 2.9276186739898, + "grad_norm": 13.342180252075195, + "learning_rate": 1.686571238798057e-06, + "loss": 4.4863, + "step": 14925 + }, + { + "epoch": 2.928599450765006, + "grad_norm": 14.439830780029297, + "learning_rate": 1.6836736373699286e-06, + "loss": 4.6578, + "step": 14930 + }, + { + "epoch": 2.929580227540212, + "grad_norm": 14.736735343933105, + "learning_rate": 1.680778023167926e-06, + "loss": 4.4495, + "step": 14935 + }, + { + "epoch": 2.9305610043154178, + "grad_norm": 19.791828155517578, + "learning_rate": 1.677884397927176e-06, + "loss": 4.5876, + "step": 14940 + }, + { + "epoch": 2.931541781090624, + "grad_norm": 27.638660430908203, + "learning_rate": 1.6749927633816093e-06, + "loss": 4.9591, + "step": 14945 + }, + { + "epoch": 2.9325225578658296, + "grad_norm": 28.315677642822266, + "learning_rate": 1.6721031212639688e-06, + "loss": 4.6651, + "step": 14950 + }, + { + "epoch": 2.9335033346410357, + "grad_norm": 22.39848518371582, + "learning_rate": 1.6692154733057996e-06, + "loss": 4.3891, + "step": 14955 + }, + { + "epoch": 2.934484111416242, + "grad_norm": 23.636980056762695, + "learning_rate": 1.6663298212374508e-06, + "loss": 4.3512, + "step": 14960 + }, + { + "epoch": 2.9354648881914476, + "grad_norm": 17.69251251220703, + "learning_rate": 1.6634461667880807e-06, + "loss": 4.4783, + "step": 14965 + }, + { + "epoch": 2.9364456649666537, + "grad_norm": 16.90591812133789, + "learning_rate": 1.660564511685645e-06, + "loss": 4.7497, + "step": 14970 + }, + { + "epoch": 2.9374264417418594, + "grad_norm": 34.280216217041016, + "learning_rate": 1.6576848576569054e-06, + "loss": 4.5117, + "step": 14975 + }, + { + "epoch": 2.9384072185170655, + "grad_norm": 23.570716857910156, + "learning_rate": 1.654807206427424e-06, + "loss": 4.4814, + "step": 14980 + }, + { + "epoch": 2.9393879952922717, + "grad_norm": 13.759195327758789, + "learning_rate": 1.651931559721559e-06, + "loss": 4.3734, + "step": 14985 + }, + { + "epoch": 2.9403687720674774, + "grad_norm": 24.15625, + "learning_rate": 1.6490579192624734e-06, + "loss": 4.4103, + "step": 14990 + }, + { + "epoch": 2.9413495488426835, + "grad_norm": 26.735837936401367, + "learning_rate": 1.6461862867721218e-06, + "loss": 4.4619, + "step": 14995 + }, + { + "epoch": 2.942330325617889, + "grad_norm": 22.7193603515625, + "learning_rate": 1.6433166639712594e-06, + "loss": 4.5971, + "step": 15000 + }, + { + "epoch": 2.9433111023930953, + "grad_norm": 42.432960510253906, + "learning_rate": 1.6404490525794392e-06, + "loss": 4.5656, + "step": 15005 + }, + { + "epoch": 2.9442918791683015, + "grad_norm": 17.02939796447754, + "learning_rate": 1.6375834543150015e-06, + "loss": 5.0169, + "step": 15010 + }, + { + "epoch": 2.945272655943507, + "grad_norm": 35.580562591552734, + "learning_rate": 1.6347198708950884e-06, + "loss": 4.3484, + "step": 15015 + }, + { + "epoch": 2.9462534327187133, + "grad_norm": 28.986095428466797, + "learning_rate": 1.6318583040356285e-06, + "loss": 4.8381, + "step": 15020 + }, + { + "epoch": 2.947234209493919, + "grad_norm": 22.669565200805664, + "learning_rate": 1.6289987554513475e-06, + "loss": 4.2845, + "step": 15025 + }, + { + "epoch": 2.948214986269125, + "grad_norm": 17.930286407470703, + "learning_rate": 1.6261412268557564e-06, + "loss": 4.7354, + "step": 15030 + }, + { + "epoch": 2.9491957630443313, + "grad_norm": 16.47120475769043, + "learning_rate": 1.6232857199611579e-06, + "loss": 4.3747, + "step": 15035 + }, + { + "epoch": 2.950176539819537, + "grad_norm": 25.326810836791992, + "learning_rate": 1.6204322364786456e-06, + "loss": 4.6454, + "step": 15040 + }, + { + "epoch": 2.951157316594743, + "grad_norm": 30.15776252746582, + "learning_rate": 1.6175807781180964e-06, + "loss": 4.4318, + "step": 15045 + }, + { + "epoch": 2.952138093369949, + "grad_norm": 16.590038299560547, + "learning_rate": 1.6147313465881758e-06, + "loss": 4.4618, + "step": 15050 + }, + { + "epoch": 2.953118870145155, + "grad_norm": 17.120126724243164, + "learning_rate": 1.6118839435963386e-06, + "loss": 4.6643, + "step": 15055 + }, + { + "epoch": 2.954099646920361, + "grad_norm": 18.091629028320312, + "learning_rate": 1.6090385708488148e-06, + "loss": 4.6134, + "step": 15060 + }, + { + "epoch": 2.955080423695567, + "grad_norm": 27.78541374206543, + "learning_rate": 1.6061952300506285e-06, + "loss": 4.6667, + "step": 15065 + }, + { + "epoch": 2.956061200470773, + "grad_norm": 18.00626564025879, + "learning_rate": 1.6033539229055762e-06, + "loss": 4.5732, + "step": 15070 + }, + { + "epoch": 2.9570419772459786, + "grad_norm": 23.326290130615234, + "learning_rate": 1.6005146511162428e-06, + "loss": 4.4773, + "step": 15075 + }, + { + "epoch": 2.9580227540211848, + "grad_norm": 23.181758880615234, + "learning_rate": 1.5976774163839937e-06, + "loss": 4.5546, + "step": 15080 + }, + { + "epoch": 2.959003530796391, + "grad_norm": 33.70425796508789, + "learning_rate": 1.5948422204089664e-06, + "loss": 4.795, + "step": 15085 + }, + { + "epoch": 2.9599843075715966, + "grad_norm": 23.166748046875, + "learning_rate": 1.5920090648900866e-06, + "loss": 4.4982, + "step": 15090 + }, + { + "epoch": 2.9609650843468027, + "grad_norm": 17.901594161987305, + "learning_rate": 1.5891779515250494e-06, + "loss": 4.5362, + "step": 15095 + }, + { + "epoch": 2.9619458611220084, + "grad_norm": 15.403538703918457, + "learning_rate": 1.586348882010328e-06, + "loss": 4.4604, + "step": 15100 + }, + { + "epoch": 2.9629266378972146, + "grad_norm": 12.828426361083984, + "learning_rate": 1.583521858041175e-06, + "loss": 4.4609, + "step": 15105 + }, + { + "epoch": 2.9639074146724207, + "grad_norm": 34.96916961669922, + "learning_rate": 1.580696881311611e-06, + "loss": 4.6684, + "step": 15110 + }, + { + "epoch": 2.9648881914476264, + "grad_norm": 19.698801040649414, + "learning_rate": 1.5778739535144366e-06, + "loss": 4.7109, + "step": 15115 + }, + { + "epoch": 2.9658689682228325, + "grad_norm": 17.187973022460938, + "learning_rate": 1.5750530763412181e-06, + "loss": 4.3427, + "step": 15120 + }, + { + "epoch": 2.966849744998038, + "grad_norm": 16.712276458740234, + "learning_rate": 1.572234251482297e-06, + "loss": 4.5441, + "step": 15125 + }, + { + "epoch": 2.9678305217732444, + "grad_norm": 22.001623153686523, + "learning_rate": 1.5694174806267854e-06, + "loss": 4.4557, + "step": 15130 + }, + { + "epoch": 2.9688112985484505, + "grad_norm": 25.948938369750977, + "learning_rate": 1.566602765462561e-06, + "loss": 4.1687, + "step": 15135 + }, + { + "epoch": 2.969792075323656, + "grad_norm": 12.910621643066406, + "learning_rate": 1.5637901076762747e-06, + "loss": 4.6866, + "step": 15140 + }, + { + "epoch": 2.9707728520988623, + "grad_norm": 12.790865898132324, + "learning_rate": 1.560979508953338e-06, + "loss": 4.5784, + "step": 15145 + }, + { + "epoch": 2.971753628874068, + "grad_norm": 13.090932846069336, + "learning_rate": 1.5581709709779346e-06, + "loss": 4.3458, + "step": 15150 + }, + { + "epoch": 2.972734405649274, + "grad_norm": 18.293384552001953, + "learning_rate": 1.5553644954330122e-06, + "loss": 4.4609, + "step": 15155 + }, + { + "epoch": 2.9737151824244803, + "grad_norm": 21.8541202545166, + "learning_rate": 1.5525600840002785e-06, + "loss": 4.6097, + "step": 15160 + }, + { + "epoch": 2.974695959199686, + "grad_norm": 25.977567672729492, + "learning_rate": 1.549757738360211e-06, + "loss": 4.7444, + "step": 15165 + }, + { + "epoch": 2.975676735974892, + "grad_norm": 16.419042587280273, + "learning_rate": 1.546957460192043e-06, + "loss": 4.3684, + "step": 15170 + }, + { + "epoch": 2.976657512750098, + "grad_norm": 36.12578201293945, + "learning_rate": 1.5441592511737701e-06, + "loss": 4.4426, + "step": 15175 + }, + { + "epoch": 2.977638289525304, + "grad_norm": 44.24177932739258, + "learning_rate": 1.5413631129821544e-06, + "loss": 4.286, + "step": 15180 + }, + { + "epoch": 2.97861906630051, + "grad_norm": 15.310659408569336, + "learning_rate": 1.5385690472927067e-06, + "loss": 4.2994, + "step": 15185 + }, + { + "epoch": 2.9795998430757162, + "grad_norm": 17.0074405670166, + "learning_rate": 1.5357770557797064e-06, + "loss": 4.5799, + "step": 15190 + }, + { + "epoch": 2.980580619850922, + "grad_norm": 10.577085494995117, + "learning_rate": 1.5329871401161806e-06, + "loss": 4.3414, + "step": 15195 + }, + { + "epoch": 2.9815613966261276, + "grad_norm": 13.469268798828125, + "learning_rate": 1.5301993019739186e-06, + "loss": 3.8734, + "step": 15200 + }, + { + "epoch": 2.9825421734013338, + "grad_norm": 13.40593433380127, + "learning_rate": 1.5274135430234654e-06, + "loss": 4.4555, + "step": 15205 + }, + { + "epoch": 2.98352295017654, + "grad_norm": 34.490821838378906, + "learning_rate": 1.5246298649341146e-06, + "loss": 4.7537, + "step": 15210 + }, + { + "epoch": 2.984503726951746, + "grad_norm": 12.969808578491211, + "learning_rate": 1.5218482693739183e-06, + "loss": 5.0805, + "step": 15215 + }, + { + "epoch": 2.9854845037269517, + "grad_norm": 15.976425170898438, + "learning_rate": 1.5190687580096762e-06, + "loss": 4.5393, + "step": 15220 + }, + { + "epoch": 2.986465280502158, + "grad_norm": 30.466705322265625, + "learning_rate": 1.5162913325069428e-06, + "loss": 4.4184, + "step": 15225 + }, + { + "epoch": 2.9874460572773636, + "grad_norm": 17.97763442993164, + "learning_rate": 1.5135159945300232e-06, + "loss": 4.3463, + "step": 15230 + }, + { + "epoch": 2.9884268340525697, + "grad_norm": 23.519624710083008, + "learning_rate": 1.5107427457419654e-06, + "loss": 4.3618, + "step": 15235 + }, + { + "epoch": 2.989407610827776, + "grad_norm": 20.899688720703125, + "learning_rate": 1.5079715878045737e-06, + "loss": 4.6069, + "step": 15240 + }, + { + "epoch": 2.9903883876029815, + "grad_norm": 21.9375, + "learning_rate": 1.5052025223783944e-06, + "loss": 4.5154, + "step": 15245 + }, + { + "epoch": 2.9913691643781877, + "grad_norm": 11.83838176727295, + "learning_rate": 1.502435551122719e-06, + "loss": 4.493, + "step": 15250 + }, + { + "epoch": 2.9923499411533934, + "grad_norm": 18.852176666259766, + "learning_rate": 1.4996706756955892e-06, + "loss": 4.5604, + "step": 15255 + }, + { + "epoch": 2.9933307179285995, + "grad_norm": 13.880756378173828, + "learning_rate": 1.496907897753785e-06, + "loss": 4.6919, + "step": 15260 + }, + { + "epoch": 2.9943114947038056, + "grad_norm": 21.89354705810547, + "learning_rate": 1.4941472189528356e-06, + "loss": 4.6131, + "step": 15265 + }, + { + "epoch": 2.9952922714790113, + "grad_norm": 16.394149780273438, + "learning_rate": 1.4913886409470062e-06, + "loss": 4.3678, + "step": 15270 + }, + { + "epoch": 2.9962730482542175, + "grad_norm": 21.523550033569336, + "learning_rate": 1.488632165389307e-06, + "loss": 4.5263, + "step": 15275 + }, + { + "epoch": 2.997253825029423, + "grad_norm": 16.209943771362305, + "learning_rate": 1.48587779393149e-06, + "loss": 4.3487, + "step": 15280 + }, + { + "epoch": 2.9982346018046293, + "grad_norm": 17.297212600708008, + "learning_rate": 1.4831255282240397e-06, + "loss": 4.755, + "step": 15285 + }, + { + "epoch": 2.9992153785798354, + "grad_norm": 15.209367752075195, + "learning_rate": 1.4803753699161866e-06, + "loss": 4.4553, + "step": 15290 + }, + { + "epoch": 3.000196155355041, + "grad_norm": 13.265889167785645, + "learning_rate": 1.4776273206558911e-06, + "loss": 4.9951, + "step": 15295 + }, + { + "epoch": 3.0011769321302473, + "grad_norm": 28.354883193969727, + "learning_rate": 1.4748813820898554e-06, + "loss": 4.4493, + "step": 15300 + }, + { + "epoch": 3.0011769321302473, + "eval_loss": 4.852348804473877, + "eval_runtime": 7.8034, + "eval_samples_per_second": 26.783, + "eval_steps_per_second": 13.456, + "step": 15300 + }, + { + "epoch": 3.002157708905453, + "grad_norm": 24.23794174194336, + "learning_rate": 1.4721375558635164e-06, + "loss": 4.474, + "step": 15305 + }, + { + "epoch": 3.003138485680659, + "grad_norm": 11.600415229797363, + "learning_rate": 1.4693958436210426e-06, + "loss": 4.6386, + "step": 15310 + }, + { + "epoch": 3.0041192624558652, + "grad_norm": 30.11103057861328, + "learning_rate": 1.466656247005334e-06, + "loss": 4.722, + "step": 15315 + }, + { + "epoch": 3.005100039231071, + "grad_norm": 24.502426147460938, + "learning_rate": 1.4639187676580301e-06, + "loss": 4.5352, + "step": 15320 + }, + { + "epoch": 3.006080816006277, + "grad_norm": 11.984843254089355, + "learning_rate": 1.4611834072194948e-06, + "loss": 4.6266, + "step": 15325 + }, + { + "epoch": 3.0070615927814828, + "grad_norm": 15.542444229125977, + "learning_rate": 1.4584501673288259e-06, + "loss": 4.2322, + "step": 15330 + }, + { + "epoch": 3.008042369556689, + "grad_norm": 17.654617309570312, + "learning_rate": 1.4557190496238483e-06, + "loss": 4.5381, + "step": 15335 + }, + { + "epoch": 3.009023146331895, + "grad_norm": 29.028491973876953, + "learning_rate": 1.452990055741118e-06, + "loss": 4.1393, + "step": 15340 + }, + { + "epoch": 3.0100039231071007, + "grad_norm": 21.569608688354492, + "learning_rate": 1.4502631873159146e-06, + "loss": 4.378, + "step": 15345 + }, + { + "epoch": 3.010984699882307, + "grad_norm": 22.8667049407959, + "learning_rate": 1.4475384459822477e-06, + "loss": 4.4697, + "step": 15350 + }, + { + "epoch": 3.0119654766575126, + "grad_norm": 22.211538314819336, + "learning_rate": 1.444815833372852e-06, + "loss": 4.3405, + "step": 15355 + }, + { + "epoch": 3.0129462534327187, + "grad_norm": 19.6219539642334, + "learning_rate": 1.442095351119182e-06, + "loss": 4.4564, + "step": 15360 + }, + { + "epoch": 3.013927030207925, + "grad_norm": 28.267377853393555, + "learning_rate": 1.4393770008514235e-06, + "loss": 4.2524, + "step": 15365 + }, + { + "epoch": 3.0149078069831305, + "grad_norm": 12.272148132324219, + "learning_rate": 1.436660784198476e-06, + "loss": 4.3088, + "step": 15370 + }, + { + "epoch": 3.0158885837583367, + "grad_norm": 14.716875076293945, + "learning_rate": 1.4339467027879661e-06, + "loss": 4.7513, + "step": 15375 + }, + { + "epoch": 3.0168693605335424, + "grad_norm": 28.98792266845703, + "learning_rate": 1.4312347582462427e-06, + "loss": 4.2431, + "step": 15380 + }, + { + "epoch": 3.0178501373087485, + "grad_norm": 16.454082489013672, + "learning_rate": 1.428524952198368e-06, + "loss": 4.5905, + "step": 15385 + }, + { + "epoch": 3.0188309140839547, + "grad_norm": 22.37074851989746, + "learning_rate": 1.4258172862681268e-06, + "loss": 4.6151, + "step": 15390 + }, + { + "epoch": 3.0198116908591603, + "grad_norm": 29.454204559326172, + "learning_rate": 1.4231117620780188e-06, + "loss": 4.6649, + "step": 15395 + }, + { + "epoch": 3.0207924676343665, + "grad_norm": 21.081298828125, + "learning_rate": 1.4204083812492636e-06, + "loss": 4.4562, + "step": 15400 + }, + { + "epoch": 3.021773244409572, + "grad_norm": 14.012701988220215, + "learning_rate": 1.4177071454017966e-06, + "loss": 4.3996, + "step": 15405 + }, + { + "epoch": 3.0227540211847783, + "grad_norm": 23.03716278076172, + "learning_rate": 1.415008056154263e-06, + "loss": 4.4021, + "step": 15410 + }, + { + "epoch": 3.0237347979599845, + "grad_norm": 23.56743621826172, + "learning_rate": 1.4123111151240283e-06, + "loss": 4.7541, + "step": 15415 + }, + { + "epoch": 3.02471557473519, + "grad_norm": 14.281283378601074, + "learning_rate": 1.4096163239271638e-06, + "loss": 4.3043, + "step": 15420 + }, + { + "epoch": 3.0256963515103963, + "grad_norm": 34.98441696166992, + "learning_rate": 1.4069236841784584e-06, + "loss": 4.308, + "step": 15425 + }, + { + "epoch": 3.026677128285602, + "grad_norm": 12.55133056640625, + "learning_rate": 1.4042331974914103e-06, + "loss": 4.2404, + "step": 15430 + }, + { + "epoch": 3.027657905060808, + "grad_norm": 15.474343299865723, + "learning_rate": 1.4015448654782243e-06, + "loss": 4.756, + "step": 15435 + }, + { + "epoch": 3.0286386818360143, + "grad_norm": 23.840938568115234, + "learning_rate": 1.398858689749819e-06, + "loss": 4.5373, + "step": 15440 + }, + { + "epoch": 3.02961945861122, + "grad_norm": 21.256649017333984, + "learning_rate": 1.3961746719158158e-06, + "loss": 4.5571, + "step": 15445 + }, + { + "epoch": 3.030600235386426, + "grad_norm": 18.67331886291504, + "learning_rate": 1.3934928135845488e-06, + "loss": 4.4685, + "step": 15450 + }, + { + "epoch": 3.0315810121616322, + "grad_norm": 29.247690200805664, + "learning_rate": 1.3908131163630513e-06, + "loss": 4.6533, + "step": 15455 + }, + { + "epoch": 3.032561788936838, + "grad_norm": 18.554914474487305, + "learning_rate": 1.3881355818570691e-06, + "loss": 4.444, + "step": 15460 + }, + { + "epoch": 3.033542565712044, + "grad_norm": 42.42098617553711, + "learning_rate": 1.3854602116710459e-06, + "loss": 4.7042, + "step": 15465 + }, + { + "epoch": 3.0345233424872498, + "grad_norm": 38.945499420166016, + "learning_rate": 1.3827870074081296e-06, + "loss": 4.3739, + "step": 15470 + }, + { + "epoch": 3.035504119262456, + "grad_norm": 24.96856117248535, + "learning_rate": 1.3801159706701727e-06, + "loss": 4.1805, + "step": 15475 + }, + { + "epoch": 3.036484896037662, + "grad_norm": 15.618642807006836, + "learning_rate": 1.3774471030577298e-06, + "loss": 4.443, + "step": 15480 + }, + { + "epoch": 3.0374656728128677, + "grad_norm": 30.052515029907227, + "learning_rate": 1.3747804061700497e-06, + "loss": 4.2959, + "step": 15485 + }, + { + "epoch": 3.038446449588074, + "grad_norm": 28.82718849182129, + "learning_rate": 1.3721158816050872e-06, + "loss": 4.6018, + "step": 15490 + }, + { + "epoch": 3.0394272263632796, + "grad_norm": 17.50002098083496, + "learning_rate": 1.3694535309594903e-06, + "loss": 4.5142, + "step": 15495 + }, + { + "epoch": 3.0404080031384857, + "grad_norm": 28.0140438079834, + "learning_rate": 1.3667933558286067e-06, + "loss": 4.2015, + "step": 15500 + }, + { + "epoch": 3.041388779913692, + "grad_norm": 11.1800537109375, + "learning_rate": 1.3641353578064825e-06, + "loss": 4.4448, + "step": 15505 + }, + { + "epoch": 3.0423695566888975, + "grad_norm": 24.721168518066406, + "learning_rate": 1.3614795384858538e-06, + "loss": 4.4718, + "step": 15510 + }, + { + "epoch": 3.0433503334641037, + "grad_norm": 13.32351303100586, + "learning_rate": 1.3588258994581572e-06, + "loss": 4.8924, + "step": 15515 + }, + { + "epoch": 3.0443311102393094, + "grad_norm": 29.64592933654785, + "learning_rate": 1.3561744423135164e-06, + "loss": 4.4433, + "step": 15520 + }, + { + "epoch": 3.0453118870145155, + "grad_norm": 36.95175552368164, + "learning_rate": 1.3535251686407553e-06, + "loss": 4.1702, + "step": 15525 + }, + { + "epoch": 3.0462926637897216, + "grad_norm": 19.097549438476562, + "learning_rate": 1.3508780800273818e-06, + "loss": 4.1353, + "step": 15530 + }, + { + "epoch": 3.0472734405649273, + "grad_norm": 41.65642547607422, + "learning_rate": 1.3482331780596003e-06, + "loss": 4.7218, + "step": 15535 + }, + { + "epoch": 3.0482542173401335, + "grad_norm": 16.86134910583496, + "learning_rate": 1.3455904643223022e-06, + "loss": 4.5133, + "step": 15540 + }, + { + "epoch": 3.049234994115339, + "grad_norm": 13.893656730651855, + "learning_rate": 1.3429499403990658e-06, + "loss": 4.2403, + "step": 15545 + }, + { + "epoch": 3.0502157708905453, + "grad_norm": 15.907454490661621, + "learning_rate": 1.3403116078721606e-06, + "loss": 4.9011, + "step": 15550 + }, + { + "epoch": 3.0511965476657514, + "grad_norm": 14.247295379638672, + "learning_rate": 1.337675468322544e-06, + "loss": 4.5309, + "step": 15555 + }, + { + "epoch": 3.052177324440957, + "grad_norm": 24.694496154785156, + "learning_rate": 1.3350415233298542e-06, + "loss": 4.5389, + "step": 15560 + }, + { + "epoch": 3.0531581012161633, + "grad_norm": 33.52488327026367, + "learning_rate": 1.332409774472419e-06, + "loss": 5.0962, + "step": 15565 + }, + { + "epoch": 3.054138877991369, + "grad_norm": 16.590829849243164, + "learning_rate": 1.3297802233272473e-06, + "loss": 4.4915, + "step": 15570 + }, + { + "epoch": 3.055119654766575, + "grad_norm": 15.594505310058594, + "learning_rate": 1.3271528714700321e-06, + "loss": 4.5788, + "step": 15575 + }, + { + "epoch": 3.0561004315417812, + "grad_norm": 23.571426391601562, + "learning_rate": 1.3245277204751511e-06, + "loss": 4.4476, + "step": 15580 + }, + { + "epoch": 3.057081208316987, + "grad_norm": 11.0546236038208, + "learning_rate": 1.3219047719156575e-06, + "loss": 4.5721, + "step": 15585 + }, + { + "epoch": 3.058061985092193, + "grad_norm": 14.763998031616211, + "learning_rate": 1.3192840273632907e-06, + "loss": 4.628, + "step": 15590 + }, + { + "epoch": 3.0590427618673988, + "grad_norm": 20.179122924804688, + "learning_rate": 1.3166654883884643e-06, + "loss": 4.4939, + "step": 15595 + }, + { + "epoch": 3.060023538642605, + "grad_norm": 26.991134643554688, + "learning_rate": 1.314049156560276e-06, + "loss": 4.7653, + "step": 15600 + }, + { + "epoch": 3.061004315417811, + "grad_norm": 16.699296951293945, + "learning_rate": 1.3114350334464948e-06, + "loss": 4.3669, + "step": 15605 + }, + { + "epoch": 3.0619850921930167, + "grad_norm": 13.49559497833252, + "learning_rate": 1.308823120613568e-06, + "loss": 4.212, + "step": 15610 + }, + { + "epoch": 3.062965868968223, + "grad_norm": 26.69697380065918, + "learning_rate": 1.3062134196266235e-06, + "loss": 4.5457, + "step": 15615 + }, + { + "epoch": 3.063946645743429, + "grad_norm": 30.913908004760742, + "learning_rate": 1.303605932049456e-06, + "loss": 4.6255, + "step": 15620 + }, + { + "epoch": 3.0649274225186347, + "grad_norm": 26.240177154541016, + "learning_rate": 1.3010006594445384e-06, + "loss": 4.6354, + "step": 15625 + }, + { + "epoch": 3.065908199293841, + "grad_norm": 17.10197639465332, + "learning_rate": 1.2983976033730179e-06, + "loss": 4.5503, + "step": 15630 + }, + { + "epoch": 3.0668889760690465, + "grad_norm": 20.739166259765625, + "learning_rate": 1.2957967653947078e-06, + "loss": 4.4489, + "step": 15635 + }, + { + "epoch": 3.0678697528442527, + "grad_norm": 16.324705123901367, + "learning_rate": 1.2931981470680988e-06, + "loss": 4.414, + "step": 15640 + }, + { + "epoch": 3.068850529619459, + "grad_norm": 23.186613082885742, + "learning_rate": 1.2906017499503454e-06, + "loss": 4.4988, + "step": 15645 + }, + { + "epoch": 3.0698313063946645, + "grad_norm": 22.5463924407959, + "learning_rate": 1.288007575597275e-06, + "loss": 4.3282, + "step": 15650 + }, + { + "epoch": 3.0708120831698706, + "grad_norm": 12.31671142578125, + "learning_rate": 1.2854156255633837e-06, + "loss": 4.2331, + "step": 15655 + }, + { + "epoch": 3.0717928599450763, + "grad_norm": 21.361194610595703, + "learning_rate": 1.2828259014018308e-06, + "loss": 4.4753, + "step": 15660 + }, + { + "epoch": 3.0727736367202825, + "grad_norm": 11.575060844421387, + "learning_rate": 1.2802384046644468e-06, + "loss": 4.2673, + "step": 15665 + }, + { + "epoch": 3.0737544134954886, + "grad_norm": 13.329882621765137, + "learning_rate": 1.2776531369017215e-06, + "loss": 4.4678, + "step": 15670 + }, + { + "epoch": 3.0747351902706943, + "grad_norm": 27.24415397644043, + "learning_rate": 1.275070099662815e-06, + "loss": 4.4223, + "step": 15675 + }, + { + "epoch": 3.0757159670459004, + "grad_norm": 21.532888412475586, + "learning_rate": 1.272489294495548e-06, + "loss": 4.4301, + "step": 15680 + }, + { + "epoch": 3.076696743821106, + "grad_norm": 17.741239547729492, + "learning_rate": 1.2699107229464008e-06, + "loss": 4.4361, + "step": 15685 + }, + { + "epoch": 3.0776775205963123, + "grad_norm": 18.234464645385742, + "learning_rate": 1.2673343865605225e-06, + "loss": 4.3779, + "step": 15690 + }, + { + "epoch": 3.0786582973715184, + "grad_norm": 29.617433547973633, + "learning_rate": 1.264760286881715e-06, + "loss": 4.6656, + "step": 15695 + }, + { + "epoch": 3.079639074146724, + "grad_norm": 9.898058891296387, + "learning_rate": 1.2621884254524452e-06, + "loss": 4.8513, + "step": 15700 + }, + { + "epoch": 3.0806198509219302, + "grad_norm": 24.25059700012207, + "learning_rate": 1.2596188038138385e-06, + "loss": 4.5291, + "step": 15705 + }, + { + "epoch": 3.081600627697136, + "grad_norm": 19.24030113220215, + "learning_rate": 1.2570514235056735e-06, + "loss": 4.4476, + "step": 15710 + }, + { + "epoch": 3.082581404472342, + "grad_norm": 14.411766052246094, + "learning_rate": 1.254486286066393e-06, + "loss": 4.7423, + "step": 15715 + }, + { + "epoch": 3.083562181247548, + "grad_norm": 19.341766357421875, + "learning_rate": 1.2519233930330877e-06, + "loss": 4.4205, + "step": 15720 + }, + { + "epoch": 3.084542958022754, + "grad_norm": 26.251022338867188, + "learning_rate": 1.2493627459415096e-06, + "loss": 4.3927, + "step": 15725 + }, + { + "epoch": 3.08552373479796, + "grad_norm": 32.140380859375, + "learning_rate": 1.2468043463260649e-06, + "loss": 4.6213, + "step": 15730 + }, + { + "epoch": 3.0865045115731657, + "grad_norm": 21.708030700683594, + "learning_rate": 1.2442481957198066e-06, + "loss": 4.256, + "step": 15735 + }, + { + "epoch": 3.087485288348372, + "grad_norm": 20.06285285949707, + "learning_rate": 1.2416942956544486e-06, + "loss": 4.4784, + "step": 15740 + }, + { + "epoch": 3.088466065123578, + "grad_norm": 17.960481643676758, + "learning_rate": 1.2391426476603496e-06, + "loss": 4.4885, + "step": 15745 + }, + { + "epoch": 3.0894468418987837, + "grad_norm": 19.873380661010742, + "learning_rate": 1.236593253266521e-06, + "loss": 4.5705, + "step": 15750 + }, + { + "epoch": 3.09042761867399, + "grad_norm": 17.436277389526367, + "learning_rate": 1.2340461140006255e-06, + "loss": 4.1984, + "step": 15755 + }, + { + "epoch": 3.0914083954491955, + "grad_norm": 19.593332290649414, + "learning_rate": 1.2315012313889708e-06, + "loss": 4.3993, + "step": 15760 + }, + { + "epoch": 3.0923891722244017, + "grad_norm": 29.571025848388672, + "learning_rate": 1.2289586069565174e-06, + "loss": 4.6506, + "step": 15765 + }, + { + "epoch": 3.093369948999608, + "grad_norm": 15.669720649719238, + "learning_rate": 1.2264182422268673e-06, + "loss": 4.4139, + "step": 15770 + }, + { + "epoch": 3.0943507257748135, + "grad_norm": 12.525907516479492, + "learning_rate": 1.2238801387222716e-06, + "loss": 4.2745, + "step": 15775 + }, + { + "epoch": 3.0953315025500197, + "grad_norm": 31.15891456604004, + "learning_rate": 1.221344297963627e-06, + "loss": 4.3835, + "step": 15780 + }, + { + "epoch": 3.096312279325226, + "grad_norm": 14.809552192687988, + "learning_rate": 1.2188107214704714e-06, + "loss": 4.3162, + "step": 15785 + }, + { + "epoch": 3.0972930561004315, + "grad_norm": 17.6668758392334, + "learning_rate": 1.2162794107609888e-06, + "loss": 4.1695, + "step": 15790 + }, + { + "epoch": 3.0982738328756376, + "grad_norm": 16.730649948120117, + "learning_rate": 1.213750367352003e-06, + "loss": 4.2216, + "step": 15795 + }, + { + "epoch": 3.0992546096508433, + "grad_norm": 19.196990966796875, + "learning_rate": 1.2112235927589805e-06, + "loss": 4.4758, + "step": 15800 + }, + { + "epoch": 3.1002353864260495, + "grad_norm": 18.599266052246094, + "learning_rate": 1.2086990884960304e-06, + "loss": 4.4006, + "step": 15805 + }, + { + "epoch": 3.1012161632012556, + "grad_norm": 17.576534271240234, + "learning_rate": 1.2061768560758957e-06, + "loss": 4.4374, + "step": 15810 + }, + { + "epoch": 3.1021969399764613, + "grad_norm": 21.37404441833496, + "learning_rate": 1.2036568970099643e-06, + "loss": 4.4077, + "step": 15815 + }, + { + "epoch": 3.1031777167516674, + "grad_norm": 10.35575008392334, + "learning_rate": 1.2011392128082583e-06, + "loss": 4.3657, + "step": 15820 + }, + { + "epoch": 3.104158493526873, + "grad_norm": 16.843027114868164, + "learning_rate": 1.1986238049794352e-06, + "loss": 4.5244, + "step": 15825 + }, + { + "epoch": 3.1051392703020793, + "grad_norm": 8.886418342590332, + "learning_rate": 1.1961106750307945e-06, + "loss": 4.2819, + "step": 15830 + }, + { + "epoch": 3.1061200470772854, + "grad_norm": 18.96690559387207, + "learning_rate": 1.1935998244682624e-06, + "loss": 4.6573, + "step": 15835 + }, + { + "epoch": 3.107100823852491, + "grad_norm": 20.823482513427734, + "learning_rate": 1.1910912547964076e-06, + "loss": 4.5941, + "step": 15840 + }, + { + "epoch": 3.1080816006276972, + "grad_norm": 11.613541603088379, + "learning_rate": 1.1885849675184252e-06, + "loss": 4.3887, + "step": 15845 + }, + { + "epoch": 3.109062377402903, + "grad_norm": 12.627098083496094, + "learning_rate": 1.186080964136147e-06, + "loss": 4.5249, + "step": 15850 + }, + { + "epoch": 3.110043154178109, + "grad_norm": 14.541468620300293, + "learning_rate": 1.1835792461500357e-06, + "loss": 4.2469, + "step": 15855 + }, + { + "epoch": 3.111023930953315, + "grad_norm": 15.33710765838623, + "learning_rate": 1.1810798150591813e-06, + "loss": 4.3271, + "step": 15860 + }, + { + "epoch": 3.112004707728521, + "grad_norm": 13.666189193725586, + "learning_rate": 1.1785826723613081e-06, + "loss": 4.2686, + "step": 15865 + }, + { + "epoch": 3.112985484503727, + "grad_norm": 18.96344566345215, + "learning_rate": 1.1760878195527642e-06, + "loss": 4.3628, + "step": 15870 + }, + { + "epoch": 3.1139662612789327, + "grad_norm": 14.437033653259277, + "learning_rate": 1.1735952581285299e-06, + "loss": 4.4556, + "step": 15875 + }, + { + "epoch": 3.114947038054139, + "grad_norm": 20.044998168945312, + "learning_rate": 1.1711049895822114e-06, + "loss": 4.3907, + "step": 15880 + }, + { + "epoch": 3.115927814829345, + "grad_norm": 26.939075469970703, + "learning_rate": 1.1686170154060379e-06, + "loss": 4.3439, + "step": 15885 + }, + { + "epoch": 3.1169085916045507, + "grad_norm": 32.64443588256836, + "learning_rate": 1.1661313370908689e-06, + "loss": 4.3217, + "step": 15890 + }, + { + "epoch": 3.117889368379757, + "grad_norm": 17.43419075012207, + "learning_rate": 1.1636479561261832e-06, + "loss": 4.7089, + "step": 15895 + }, + { + "epoch": 3.1188701451549625, + "grad_norm": 17.65254783630371, + "learning_rate": 1.1611668740000848e-06, + "loss": 4.3605, + "step": 15900 + }, + { + "epoch": 3.1198509219301687, + "grad_norm": 19.919321060180664, + "learning_rate": 1.1586880921993022e-06, + "loss": 4.3343, + "step": 15905 + }, + { + "epoch": 3.120831698705375, + "grad_norm": 31.183734893798828, + "learning_rate": 1.156211612209182e-06, + "loss": 4.1166, + "step": 15910 + }, + { + "epoch": 3.1218124754805805, + "grad_norm": 20.650432586669922, + "learning_rate": 1.1537374355136954e-06, + "loss": 4.5356, + "step": 15915 + }, + { + "epoch": 3.1227932522557866, + "grad_norm": 25.45415687561035, + "learning_rate": 1.1512655635954284e-06, + "loss": 4.3773, + "step": 15920 + }, + { + "epoch": 3.1237740290309928, + "grad_norm": 30.577579498291016, + "learning_rate": 1.1487959979355906e-06, + "loss": 4.1834, + "step": 15925 + }, + { + "epoch": 3.1247548058061985, + "grad_norm": 21.581764221191406, + "learning_rate": 1.1463287400140089e-06, + "loss": 4.4372, + "step": 15930 + }, + { + "epoch": 3.1257355825814046, + "grad_norm": 24.099647521972656, + "learning_rate": 1.1438637913091238e-06, + "loss": 4.4539, + "step": 15935 + }, + { + "epoch": 3.1267163593566103, + "grad_norm": 14.903850555419922, + "learning_rate": 1.1414011532979975e-06, + "loss": 4.3897, + "step": 15940 + }, + { + "epoch": 3.1276971361318164, + "grad_norm": 21.976409912109375, + "learning_rate": 1.1389408274563013e-06, + "loss": 4.6149, + "step": 15945 + }, + { + "epoch": 3.1286779129070226, + "grad_norm": 26.018178939819336, + "learning_rate": 1.1364828152583252e-06, + "loss": 4.7381, + "step": 15950 + }, + { + "epoch": 3.1296586896822283, + "grad_norm": 19.22270965576172, + "learning_rate": 1.1340271181769746e-06, + "loss": 4.3429, + "step": 15955 + }, + { + "epoch": 3.1306394664574344, + "grad_norm": 24.88014030456543, + "learning_rate": 1.1315737376837627e-06, + "loss": 4.3767, + "step": 15960 + }, + { + "epoch": 3.13162024323264, + "grad_norm": 15.811385154724121, + "learning_rate": 1.129122675248816e-06, + "loss": 4.2861, + "step": 15965 + }, + { + "epoch": 3.1326010200078462, + "grad_norm": 17.602039337158203, + "learning_rate": 1.1266739323408743e-06, + "loss": 4.6436, + "step": 15970 + }, + { + "epoch": 3.1335817967830524, + "grad_norm": 17.281982421875, + "learning_rate": 1.124227510427286e-06, + "loss": 4.6336, + "step": 15975 + }, + { + "epoch": 3.134562573558258, + "grad_norm": 10.94583511352539, + "learning_rate": 1.1217834109740061e-06, + "loss": 3.9648, + "step": 15980 + }, + { + "epoch": 3.135543350333464, + "grad_norm": 29.885208129882812, + "learning_rate": 1.1193416354456022e-06, + "loss": 4.4361, + "step": 15985 + }, + { + "epoch": 3.13652412710867, + "grad_norm": 30.1024112701416, + "learning_rate": 1.1169021853052491e-06, + "loss": 4.9684, + "step": 15990 + }, + { + "epoch": 3.137504903883876, + "grad_norm": 18.619029998779297, + "learning_rate": 1.114465062014724e-06, + "loss": 4.654, + "step": 15995 + }, + { + "epoch": 3.138485680659082, + "grad_norm": 17.20469093322754, + "learning_rate": 1.1120302670344153e-06, + "loss": 4.5966, + "step": 16000 + }, + { + "epoch": 3.139466457434288, + "grad_norm": 18.83102035522461, + "learning_rate": 1.1095978018233094e-06, + "loss": 4.4428, + "step": 16005 + }, + { + "epoch": 3.140447234209494, + "grad_norm": 23.325239181518555, + "learning_rate": 1.1071676678390036e-06, + "loss": 4.3781, + "step": 16010 + }, + { + "epoch": 3.1414280109846997, + "grad_norm": 25.824480056762695, + "learning_rate": 1.1047398665376956e-06, + "loss": 4.5236, + "step": 16015 + }, + { + "epoch": 3.142408787759906, + "grad_norm": 21.332595825195312, + "learning_rate": 1.102314399374183e-06, + "loss": 4.5056, + "step": 16020 + }, + { + "epoch": 3.143389564535112, + "grad_norm": 21.27334213256836, + "learning_rate": 1.0998912678018685e-06, + "loss": 4.3762, + "step": 16025 + }, + { + "epoch": 3.1443703413103177, + "grad_norm": 18.111543655395508, + "learning_rate": 1.0974704732727514e-06, + "loss": 4.5658, + "step": 16030 + }, + { + "epoch": 3.145351118085524, + "grad_norm": 15.349037170410156, + "learning_rate": 1.0950520172374352e-06, + "loss": 4.2495, + "step": 16035 + }, + { + "epoch": 3.1463318948607295, + "grad_norm": 13.66447639465332, + "learning_rate": 1.0926359011451182e-06, + "loss": 4.8234, + "step": 16040 + }, + { + "epoch": 3.1473126716359356, + "grad_norm": 24.965930938720703, + "learning_rate": 1.0902221264435964e-06, + "loss": 4.3959, + "step": 16045 + }, + { + "epoch": 3.148293448411142, + "grad_norm": 14.090888977050781, + "learning_rate": 1.0878106945792676e-06, + "loss": 4.2983, + "step": 16050 + }, + { + "epoch": 3.1492742251863475, + "grad_norm": 22.15886878967285, + "learning_rate": 1.0854016069971184e-06, + "loss": 4.4799, + "step": 16055 + }, + { + "epoch": 3.1502550019615536, + "grad_norm": 38.32559585571289, + "learning_rate": 1.0829948651407374e-06, + "loss": 4.4311, + "step": 16060 + }, + { + "epoch": 3.1512357787367593, + "grad_norm": 40.12593460083008, + "learning_rate": 1.0805904704523057e-06, + "loss": 4.4074, + "step": 16065 + }, + { + "epoch": 3.1522165555119654, + "grad_norm": 13.718966484069824, + "learning_rate": 1.0781884243725937e-06, + "loss": 4.3726, + "step": 16070 + }, + { + "epoch": 3.1531973322871716, + "grad_norm": 14.995040893554688, + "learning_rate": 1.0757887283409718e-06, + "loss": 4.3585, + "step": 16075 + }, + { + "epoch": 3.1541781090623773, + "grad_norm": 17.867856979370117, + "learning_rate": 1.0733913837953942e-06, + "loss": 4.4938, + "step": 16080 + }, + { + "epoch": 3.1551588858375834, + "grad_norm": 30.474164962768555, + "learning_rate": 1.0709963921724115e-06, + "loss": 4.4126, + "step": 16085 + }, + { + "epoch": 3.156139662612789, + "grad_norm": 25.54427719116211, + "learning_rate": 1.0686037549071648e-06, + "loss": 4.7638, + "step": 16090 + }, + { + "epoch": 3.1571204393879952, + "grad_norm": 23.775362014770508, + "learning_rate": 1.06621347343338e-06, + "loss": 4.4845, + "step": 16095 + }, + { + "epoch": 3.1581012161632014, + "grad_norm": 23.316543579101562, + "learning_rate": 1.063825549183376e-06, + "loss": 4.5359, + "step": 16100 + }, + { + "epoch": 3.159081992938407, + "grad_norm": 24.66636085510254, + "learning_rate": 1.0614399835880545e-06, + "loss": 4.537, + "step": 16105 + }, + { + "epoch": 3.160062769713613, + "grad_norm": 16.905139923095703, + "learning_rate": 1.059056778076909e-06, + "loss": 4.4013, + "step": 16110 + }, + { + "epoch": 3.161043546488819, + "grad_norm": 22.527822494506836, + "learning_rate": 1.0566759340780153e-06, + "loss": 4.7759, + "step": 16115 + }, + { + "epoch": 3.162024323264025, + "grad_norm": 25.074085235595703, + "learning_rate": 1.0542974530180327e-06, + "loss": 4.5525, + "step": 16120 + }, + { + "epoch": 3.163005100039231, + "grad_norm": 22.77043342590332, + "learning_rate": 1.0519213363222102e-06, + "loss": 4.4541, + "step": 16125 + }, + { + "epoch": 3.163985876814437, + "grad_norm": 21.752971649169922, + "learning_rate": 1.0495475854143738e-06, + "loss": 4.0442, + "step": 16130 + }, + { + "epoch": 3.164966653589643, + "grad_norm": 33.8589973449707, + "learning_rate": 1.0471762017169362e-06, + "loss": 4.3213, + "step": 16135 + }, + { + "epoch": 3.165947430364849, + "grad_norm": 12.692889213562012, + "learning_rate": 1.0448071866508914e-06, + "loss": 4.8471, + "step": 16140 + }, + { + "epoch": 3.166928207140055, + "grad_norm": 22.040273666381836, + "learning_rate": 1.0424405416358096e-06, + "loss": 4.3255, + "step": 16145 + }, + { + "epoch": 3.167908983915261, + "grad_norm": 32.375091552734375, + "learning_rate": 1.0400762680898474e-06, + "loss": 4.4573, + "step": 16150 + }, + { + "epoch": 3.1688897606904667, + "grad_norm": 16.68021583557129, + "learning_rate": 1.037714367429734e-06, + "loss": 4.3776, + "step": 16155 + }, + { + "epoch": 3.169870537465673, + "grad_norm": 27.631816864013672, + "learning_rate": 1.0353548410707815e-06, + "loss": 4.7017, + "step": 16160 + }, + { + "epoch": 3.170851314240879, + "grad_norm": 21.348148345947266, + "learning_rate": 1.0329976904268773e-06, + "loss": 4.6062, + "step": 16165 + }, + { + "epoch": 3.1718320910160847, + "grad_norm": 23.80921745300293, + "learning_rate": 1.0306429169104841e-06, + "loss": 4.4043, + "step": 16170 + }, + { + "epoch": 3.172812867791291, + "grad_norm": 20.834857940673828, + "learning_rate": 1.0282905219326438e-06, + "loss": 4.5575, + "step": 16175 + }, + { + "epoch": 3.1737936445664965, + "grad_norm": 20.40863609313965, + "learning_rate": 1.0259405069029672e-06, + "loss": 4.3255, + "step": 16180 + }, + { + "epoch": 3.1747744213417026, + "grad_norm": 12.205023765563965, + "learning_rate": 1.0235928732296458e-06, + "loss": 4.2396, + "step": 16185 + }, + { + "epoch": 3.1757551981169088, + "grad_norm": 19.171253204345703, + "learning_rate": 1.021247622319439e-06, + "loss": 4.7084, + "step": 16190 + }, + { + "epoch": 3.1767359748921145, + "grad_norm": 16.924537658691406, + "learning_rate": 1.0189047555776787e-06, + "loss": 4.2925, + "step": 16195 + }, + { + "epoch": 3.1777167516673206, + "grad_norm": 15.352378845214844, + "learning_rate": 1.0165642744082726e-06, + "loss": 4.4434, + "step": 16200 + }, + { + "epoch": 3.1786975284425263, + "grad_norm": 19.03781509399414, + "learning_rate": 1.0142261802136931e-06, + "loss": 4.5886, + "step": 16205 + }, + { + "epoch": 3.1796783052177324, + "grad_norm": 18.974306106567383, + "learning_rate": 1.0118904743949865e-06, + "loss": 4.8805, + "step": 16210 + }, + { + "epoch": 3.1806590819929386, + "grad_norm": 23.474401473999023, + "learning_rate": 1.0095571583517665e-06, + "loss": 4.3292, + "step": 16215 + }, + { + "epoch": 3.1816398587681443, + "grad_norm": 17.590652465820312, + "learning_rate": 1.0072262334822142e-06, + "loss": 4.3978, + "step": 16220 + }, + { + "epoch": 3.1826206355433504, + "grad_norm": 21.562280654907227, + "learning_rate": 1.0048977011830791e-06, + "loss": 4.1862, + "step": 16225 + }, + { + "epoch": 3.183601412318556, + "grad_norm": 35.704036712646484, + "learning_rate": 1.0025715628496752e-06, + "loss": 4.7851, + "step": 16230 + }, + { + "epoch": 3.1845821890937622, + "grad_norm": 11.571843147277832, + "learning_rate": 1.000247819875883e-06, + "loss": 4.6119, + "step": 16235 + }, + { + "epoch": 3.1855629658689684, + "grad_norm": 39.009552001953125, + "learning_rate": 9.9792647365415e-07, + "loss": 4.5718, + "step": 16240 + }, + { + "epoch": 3.186543742644174, + "grad_norm": 17.692380905151367, + "learning_rate": 9.956075255754822e-07, + "loss": 4.449, + "step": 16245 + }, + { + "epoch": 3.18752451941938, + "grad_norm": 14.382763862609863, + "learning_rate": 9.932909770294542e-07, + "loss": 4.4584, + "step": 16250 + }, + { + "epoch": 3.1885052961945863, + "grad_norm": 17.71790313720703, + "learning_rate": 9.909768294041989e-07, + "loss": 4.5061, + "step": 16255 + }, + { + "epoch": 3.189486072969792, + "grad_norm": 24.063451766967773, + "learning_rate": 9.886650840864104e-07, + "loss": 4.3325, + "step": 16260 + }, + { + "epoch": 3.190466849744998, + "grad_norm": 14.177840232849121, + "learning_rate": 9.863557424613473e-07, + "loss": 4.5417, + "step": 16265 + }, + { + "epoch": 3.191447626520204, + "grad_norm": 32.26413345336914, + "learning_rate": 9.840488059128228e-07, + "loss": 4.4526, + "step": 16270 + }, + { + "epoch": 3.19242840329541, + "grad_norm": 30.41230010986328, + "learning_rate": 9.817442758232132e-07, + "loss": 4.8116, + "step": 16275 + }, + { + "epoch": 3.193409180070616, + "grad_norm": 22.243484497070312, + "learning_rate": 9.79442153573449e-07, + "loss": 4.3898, + "step": 16280 + }, + { + "epoch": 3.194389956845822, + "grad_norm": 22.45796775817871, + "learning_rate": 9.771424405430196e-07, + "loss": 4.5671, + "step": 16285 + }, + { + "epoch": 3.195370733621028, + "grad_norm": 17.344751358032227, + "learning_rate": 9.748451381099743e-07, + "loss": 4.6885, + "step": 16290 + }, + { + "epoch": 3.1963515103962337, + "grad_norm": 20.54185676574707, + "learning_rate": 9.725502476509102e-07, + "loss": 4.5413, + "step": 16295 + }, + { + "epoch": 3.19733228717144, + "grad_norm": 19.294301986694336, + "learning_rate": 9.702577705409872e-07, + "loss": 4.2378, + "step": 16300 + }, + { + "epoch": 3.198313063946646, + "grad_norm": 31.30463218688965, + "learning_rate": 9.679677081539112e-07, + "loss": 4.6142, + "step": 16305 + }, + { + "epoch": 3.1992938407218516, + "grad_norm": 27.021711349487305, + "learning_rate": 9.656800618619478e-07, + "loss": 4.2349, + "step": 16310 + }, + { + "epoch": 3.2002746174970578, + "grad_norm": 20.498126983642578, + "learning_rate": 9.633948330359122e-07, + "loss": 4.4625, + "step": 16315 + }, + { + "epoch": 3.2012553942722635, + "grad_norm": 16.401073455810547, + "learning_rate": 9.611120230451698e-07, + "loss": 4.6958, + "step": 16320 + }, + { + "epoch": 3.2022361710474696, + "grad_norm": 21.446414947509766, + "learning_rate": 9.588316332576392e-07, + "loss": 4.8353, + "step": 16325 + }, + { + "epoch": 3.2032169478226757, + "grad_norm": 30.655405044555664, + "learning_rate": 9.56553665039786e-07, + "loss": 4.0624, + "step": 16330 + }, + { + "epoch": 3.2041977245978814, + "grad_norm": 9.359758377075195, + "learning_rate": 9.542781197566254e-07, + "loss": 4.4915, + "step": 16335 + }, + { + "epoch": 3.2051785013730876, + "grad_norm": 13.281656265258789, + "learning_rate": 9.52004998771724e-07, + "loss": 4.5648, + "step": 16340 + }, + { + "epoch": 3.2061592781482933, + "grad_norm": 16.86419677734375, + "learning_rate": 9.497343034471896e-07, + "loss": 4.3901, + "step": 16345 + }, + { + "epoch": 3.2071400549234994, + "grad_norm": 13.59611701965332, + "learning_rate": 9.474660351436832e-07, + "loss": 4.4795, + "step": 16350 + }, + { + "epoch": 3.2081208316987055, + "grad_norm": 19.933231353759766, + "learning_rate": 9.452001952204049e-07, + "loss": 4.3226, + "step": 16355 + }, + { + "epoch": 3.2091016084739112, + "grad_norm": 10.386981010437012, + "learning_rate": 9.429367850351051e-07, + "loss": 4.2176, + "step": 16360 + }, + { + "epoch": 3.2100823852491174, + "grad_norm": 14.15022087097168, + "learning_rate": 9.406758059440774e-07, + "loss": 4.3851, + "step": 16365 + }, + { + "epoch": 3.211063162024323, + "grad_norm": 24.58717918395996, + "learning_rate": 9.384172593021534e-07, + "loss": 4.1824, + "step": 16370 + }, + { + "epoch": 3.212043938799529, + "grad_norm": 18.0743408203125, + "learning_rate": 9.361611464627152e-07, + "loss": 4.4787, + "step": 16375 + }, + { + "epoch": 3.2130247155747353, + "grad_norm": 11.677950859069824, + "learning_rate": 9.339074687776789e-07, + "loss": 4.3985, + "step": 16380 + }, + { + "epoch": 3.214005492349941, + "grad_norm": 11.800333023071289, + "learning_rate": 9.316562275975066e-07, + "loss": 4.7217, + "step": 16385 + }, + { + "epoch": 3.214986269125147, + "grad_norm": 14.704316139221191, + "learning_rate": 9.294074242711993e-07, + "loss": 4.4862, + "step": 16390 + }, + { + "epoch": 3.215967045900353, + "grad_norm": 17.01054573059082, + "learning_rate": 9.271610601462955e-07, + "loss": 4.533, + "step": 16395 + }, + { + "epoch": 3.216947822675559, + "grad_norm": 16.018836975097656, + "learning_rate": 9.249171365688714e-07, + "loss": 4.3827, + "step": 16400 + }, + { + "epoch": 3.217928599450765, + "grad_norm": 24.968673706054688, + "learning_rate": 9.226756548835458e-07, + "loss": 4.4041, + "step": 16405 + }, + { + "epoch": 3.218909376225971, + "grad_norm": 27.278209686279297, + "learning_rate": 9.204366164334677e-07, + "loss": 4.3926, + "step": 16410 + }, + { + "epoch": 3.219890153001177, + "grad_norm": 15.485451698303223, + "learning_rate": 9.182000225603282e-07, + "loss": 4.4093, + "step": 16415 + }, + { + "epoch": 3.2208709297763827, + "grad_norm": 14.306288719177246, + "learning_rate": 9.159658746043476e-07, + "loss": 4.6559, + "step": 16420 + }, + { + "epoch": 3.221851706551589, + "grad_norm": 19.840612411499023, + "learning_rate": 9.137341739042859e-07, + "loss": 4.4419, + "step": 16425 + }, + { + "epoch": 3.222832483326795, + "grad_norm": 36.60581588745117, + "learning_rate": 9.115049217974325e-07, + "loss": 3.9703, + "step": 16430 + }, + { + "epoch": 3.2238132601020006, + "grad_norm": 39.210731506347656, + "learning_rate": 9.092781196196121e-07, + "loss": 4.9126, + "step": 16435 + }, + { + "epoch": 3.224794036877207, + "grad_norm": 14.3563814163208, + "learning_rate": 9.070537687051817e-07, + "loss": 4.6904, + "step": 16440 + }, + { + "epoch": 3.2257748136524125, + "grad_norm": 32.390159606933594, + "learning_rate": 9.048318703870263e-07, + "loss": 4.4831, + "step": 16445 + }, + { + "epoch": 3.2267555904276186, + "grad_norm": 20.18743133544922, + "learning_rate": 9.026124259965647e-07, + "loss": 4.1097, + "step": 16450 + }, + { + "epoch": 3.2277363672028248, + "grad_norm": 14.164064407348633, + "learning_rate": 9.003954368637424e-07, + "loss": 4.4878, + "step": 16455 + }, + { + "epoch": 3.2287171439780304, + "grad_norm": 17.929447174072266, + "learning_rate": 8.981809043170353e-07, + "loss": 4.4077, + "step": 16460 + }, + { + "epoch": 3.2296979207532366, + "grad_norm": 14.398148536682129, + "learning_rate": 8.959688296834491e-07, + "loss": 4.4003, + "step": 16465 + }, + { + "epoch": 3.2306786975284427, + "grad_norm": 20.427139282226562, + "learning_rate": 8.937592142885126e-07, + "loss": 4.7461, + "step": 16470 + }, + { + "epoch": 3.2316594743036484, + "grad_norm": 17.9531192779541, + "learning_rate": 8.915520594562821e-07, + "loss": 4.4267, + "step": 16475 + }, + { + "epoch": 3.2326402510788546, + "grad_norm": 10.198659896850586, + "learning_rate": 8.893473665093427e-07, + "loss": 4.7172, + "step": 16480 + }, + { + "epoch": 3.2336210278540602, + "grad_norm": 25.5524845123291, + "learning_rate": 8.871451367687994e-07, + "loss": 4.4695, + "step": 16485 + }, + { + "epoch": 3.2346018046292664, + "grad_norm": 24.992595672607422, + "learning_rate": 8.849453715542855e-07, + "loss": 4.5502, + "step": 16490 + }, + { + "epoch": 3.2355825814044725, + "grad_norm": 13.642231941223145, + "learning_rate": 8.827480721839538e-07, + "loss": 4.4869, + "step": 16495 + }, + { + "epoch": 3.236563358179678, + "grad_norm": 15.115335464477539, + "learning_rate": 8.805532399744837e-07, + "loss": 4.375, + "step": 16500 + }, + { + "epoch": 3.2375441349548844, + "grad_norm": 17.352785110473633, + "learning_rate": 8.783608762410712e-07, + "loss": 4.4238, + "step": 16505 + }, + { + "epoch": 3.23852491173009, + "grad_norm": 26.560272216796875, + "learning_rate": 8.761709822974368e-07, + "loss": 4.4159, + "step": 16510 + }, + { + "epoch": 3.239505688505296, + "grad_norm": 15.408249855041504, + "learning_rate": 8.739835594558216e-07, + "loss": 4.538, + "step": 16515 + }, + { + "epoch": 3.2404864652805023, + "grad_norm": 15.455544471740723, + "learning_rate": 8.71798609026982e-07, + "loss": 4.6661, + "step": 16520 + }, + { + "epoch": 3.241467242055708, + "grad_norm": 12.074155807495117, + "learning_rate": 8.696161323201974e-07, + "loss": 4.4038, + "step": 16525 + }, + { + "epoch": 3.242448018830914, + "grad_norm": 18.908735275268555, + "learning_rate": 8.674361306432599e-07, + "loss": 4.458, + "step": 16530 + }, + { + "epoch": 3.24342879560612, + "grad_norm": 10.406591415405273, + "learning_rate": 8.652586053024836e-07, + "loss": 4.5516, + "step": 16535 + }, + { + "epoch": 3.244409572381326, + "grad_norm": 32.47367477416992, + "learning_rate": 8.630835576026963e-07, + "loss": 4.1534, + "step": 16540 + }, + { + "epoch": 3.245390349156532, + "grad_norm": 14.95496940612793, + "learning_rate": 8.609109888472411e-07, + "loss": 4.2641, + "step": 16545 + }, + { + "epoch": 3.246371125931738, + "grad_norm": 30.883024215698242, + "learning_rate": 8.587409003379754e-07, + "loss": 4.4005, + "step": 16550 + }, + { + "epoch": 3.247351902706944, + "grad_norm": 17.348379135131836, + "learning_rate": 8.565732933752702e-07, + "loss": 4.626, + "step": 16555 + }, + { + "epoch": 3.24833267948215, + "grad_norm": 15.369430541992188, + "learning_rate": 8.544081692580097e-07, + "loss": 4.3092, + "step": 16560 + }, + { + "epoch": 3.249313456257356, + "grad_norm": 16.423633575439453, + "learning_rate": 8.522455292835935e-07, + "loss": 4.8843, + "step": 16565 + }, + { + "epoch": 3.250294233032562, + "grad_norm": 28.41481590270996, + "learning_rate": 8.50085374747927e-07, + "loss": 4.6737, + "step": 16570 + }, + { + "epoch": 3.2512750098077676, + "grad_norm": 22.398956298828125, + "learning_rate": 8.479277069454312e-07, + "loss": 4.5422, + "step": 16575 + }, + { + "epoch": 3.2512750098077676, + "eval_loss": 4.852513313293457, + "eval_runtime": 7.885, + "eval_samples_per_second": 26.506, + "eval_steps_per_second": 13.317, + "step": 16575 + }, + { + "epoch": 3.2522557865829738, + "grad_norm": 15.183259963989258, + "learning_rate": 8.457725271690326e-07, + "loss": 4.2779, + "step": 16580 + }, + { + "epoch": 3.25323656335818, + "grad_norm": 22.774782180786133, + "learning_rate": 8.436198367101705e-07, + "loss": 4.3453, + "step": 16585 + }, + { + "epoch": 3.2542173401333856, + "grad_norm": 27.975404739379883, + "learning_rate": 8.414696368587922e-07, + "loss": 4.4088, + "step": 16590 + }, + { + "epoch": 3.2551981169085917, + "grad_norm": 18.01176643371582, + "learning_rate": 8.393219289033489e-07, + "loss": 4.5194, + "step": 16595 + }, + { + "epoch": 3.2561788936837974, + "grad_norm": 33.312408447265625, + "learning_rate": 8.37176714130804e-07, + "loss": 4.2223, + "step": 16600 + }, + { + "epoch": 3.2571596704590036, + "grad_norm": 14.787341117858887, + "learning_rate": 8.350339938266211e-07, + "loss": 4.8151, + "step": 16605 + }, + { + "epoch": 3.2581404472342097, + "grad_norm": 23.211856842041016, + "learning_rate": 8.328937692747757e-07, + "loss": 4.7171, + "step": 16610 + }, + { + "epoch": 3.2591212240094154, + "grad_norm": 14.314786911010742, + "learning_rate": 8.307560417577404e-07, + "loss": 4.4659, + "step": 16615 + }, + { + "epoch": 3.2601020007846215, + "grad_norm": 15.22576904296875, + "learning_rate": 8.286208125564982e-07, + "loss": 4.2322, + "step": 16620 + }, + { + "epoch": 3.2610827775598272, + "grad_norm": 15.027915954589844, + "learning_rate": 8.264880829505312e-07, + "loss": 4.5284, + "step": 16625 + }, + { + "epoch": 3.2620635543350334, + "grad_norm": 32.40642547607422, + "learning_rate": 8.243578542178227e-07, + "loss": 4.5117, + "step": 16630 + }, + { + "epoch": 3.2630443311102395, + "grad_norm": 17.12091064453125, + "learning_rate": 8.222301276348615e-07, + "loss": 4.616, + "step": 16635 + }, + { + "epoch": 3.264025107885445, + "grad_norm": 18.07270622253418, + "learning_rate": 8.201049044766352e-07, + "loss": 4.2342, + "step": 16640 + }, + { + "epoch": 3.2650058846606513, + "grad_norm": 13.647133827209473, + "learning_rate": 8.179821860166288e-07, + "loss": 4.4643, + "step": 16645 + }, + { + "epoch": 3.265986661435857, + "grad_norm": 13.987504005432129, + "learning_rate": 8.158619735268314e-07, + "loss": 4.3447, + "step": 16650 + }, + { + "epoch": 3.266967438211063, + "grad_norm": 18.850154876708984, + "learning_rate": 8.137442682777241e-07, + "loss": 4.0809, + "step": 16655 + }, + { + "epoch": 3.2679482149862693, + "grad_norm": 20.752836227416992, + "learning_rate": 8.116290715382919e-07, + "loss": 4.5345, + "step": 16660 + }, + { + "epoch": 3.268928991761475, + "grad_norm": 11.551191329956055, + "learning_rate": 8.095163845760134e-07, + "loss": 4.3558, + "step": 16665 + }, + { + "epoch": 3.269909768536681, + "grad_norm": 23.9580135345459, + "learning_rate": 8.074062086568629e-07, + "loss": 4.5156, + "step": 16670 + }, + { + "epoch": 3.270890545311887, + "grad_norm": 15.155923843383789, + "learning_rate": 8.052985450453121e-07, + "loss": 4.5786, + "step": 16675 + }, + { + "epoch": 3.271871322087093, + "grad_norm": 19.49130630493164, + "learning_rate": 8.031933950043242e-07, + "loss": 4.3139, + "step": 16680 + }, + { + "epoch": 3.272852098862299, + "grad_norm": 20.867582321166992, + "learning_rate": 8.010907597953604e-07, + "loss": 4.2326, + "step": 16685 + }, + { + "epoch": 3.273832875637505, + "grad_norm": 18.114887237548828, + "learning_rate": 7.989906406783709e-07, + "loss": 4.4595, + "step": 16690 + }, + { + "epoch": 3.274813652412711, + "grad_norm": 13.96712875366211, + "learning_rate": 7.96893038911799e-07, + "loss": 4.456, + "step": 16695 + }, + { + "epoch": 3.2757944291879166, + "grad_norm": 10.919675827026367, + "learning_rate": 7.947979557525832e-07, + "loss": 4.3077, + "step": 16700 + }, + { + "epoch": 3.2767752059631228, + "grad_norm": 20.295543670654297, + "learning_rate": 7.927053924561473e-07, + "loss": 4.2028, + "step": 16705 + }, + { + "epoch": 3.277755982738329, + "grad_norm": 20.351930618286133, + "learning_rate": 7.906153502764085e-07, + "loss": 4.6729, + "step": 16710 + }, + { + "epoch": 3.2787367595135346, + "grad_norm": 12.80075454711914, + "learning_rate": 7.885278304657745e-07, + "loss": 4.2542, + "step": 16715 + }, + { + "epoch": 3.2797175362887407, + "grad_norm": 19.2437744140625, + "learning_rate": 7.864428342751368e-07, + "loss": 4.4505, + "step": 16720 + }, + { + "epoch": 3.2806983130639464, + "grad_norm": 17.746822357177734, + "learning_rate": 7.843603629538804e-07, + "loss": 4.5905, + "step": 16725 + }, + { + "epoch": 3.2816790898391526, + "grad_norm": 18.832889556884766, + "learning_rate": 7.822804177498716e-07, + "loss": 4.5141, + "step": 16730 + }, + { + "epoch": 3.2826598666143587, + "grad_norm": 12.651679992675781, + "learning_rate": 7.802029999094674e-07, + "loss": 4.41, + "step": 16735 + }, + { + "epoch": 3.2836406433895644, + "grad_norm": 18.479333877563477, + "learning_rate": 7.781281106775101e-07, + "loss": 4.3111, + "step": 16740 + }, + { + "epoch": 3.2846214201647705, + "grad_norm": 17.73377227783203, + "learning_rate": 7.760557512973227e-07, + "loss": 4.71, + "step": 16745 + }, + { + "epoch": 3.2856021969399762, + "grad_norm": 24.678346633911133, + "learning_rate": 7.739859230107177e-07, + "loss": 4.3451, + "step": 16750 + }, + { + "epoch": 3.2865829737151824, + "grad_norm": 10.567961692810059, + "learning_rate": 7.719186270579853e-07, + "loss": 4.5535, + "step": 16755 + }, + { + "epoch": 3.2875637504903885, + "grad_norm": 18.80795669555664, + "learning_rate": 7.698538646779047e-07, + "loss": 4.211, + "step": 16760 + }, + { + "epoch": 3.288544527265594, + "grad_norm": 11.36734676361084, + "learning_rate": 7.67791637107731e-07, + "loss": 4.1832, + "step": 16765 + }, + { + "epoch": 3.2895253040408003, + "grad_norm": 13.719120025634766, + "learning_rate": 7.657319455832024e-07, + "loss": 4.39, + "step": 16770 + }, + { + "epoch": 3.290506080816006, + "grad_norm": 21.944128036499023, + "learning_rate": 7.6367479133854e-07, + "loss": 4.3722, + "step": 16775 + }, + { + "epoch": 3.291486857591212, + "grad_norm": 18.50696563720703, + "learning_rate": 7.616201756064401e-07, + "loss": 4.7951, + "step": 16780 + }, + { + "epoch": 3.2924676343664183, + "grad_norm": 18.831857681274414, + "learning_rate": 7.59568099618081e-07, + "loss": 4.3203, + "step": 16785 + }, + { + "epoch": 3.293448411141624, + "grad_norm": 10.791632652282715, + "learning_rate": 7.575185646031197e-07, + "loss": 4.4587, + "step": 16790 + }, + { + "epoch": 3.29442918791683, + "grad_norm": 21.668577194213867, + "learning_rate": 7.554715717896866e-07, + "loss": 4.6222, + "step": 16795 + }, + { + "epoch": 3.2954099646920363, + "grad_norm": 16.101211547851562, + "learning_rate": 7.534271224043932e-07, + "loss": 4.4963, + "step": 16800 + }, + { + "epoch": 3.296390741467242, + "grad_norm": 18.882286071777344, + "learning_rate": 7.513852176723236e-07, + "loss": 4.4336, + "step": 16805 + }, + { + "epoch": 3.297371518242448, + "grad_norm": 21.985626220703125, + "learning_rate": 7.493458588170389e-07, + "loss": 4.1264, + "step": 16810 + }, + { + "epoch": 3.298352295017654, + "grad_norm": 20.74184226989746, + "learning_rate": 7.473090470605754e-07, + "loss": 4.3489, + "step": 16815 + }, + { + "epoch": 3.29933307179286, + "grad_norm": 22.413835525512695, + "learning_rate": 7.452747836234392e-07, + "loss": 4.3452, + "step": 16820 + }, + { + "epoch": 3.300313848568066, + "grad_norm": 13.036279678344727, + "learning_rate": 7.432430697246157e-07, + "loss": 4.4649, + "step": 16825 + }, + { + "epoch": 3.301294625343272, + "grad_norm": 14.902352333068848, + "learning_rate": 7.412139065815555e-07, + "loss": 4.4924, + "step": 16830 + }, + { + "epoch": 3.302275402118478, + "grad_norm": 24.96709632873535, + "learning_rate": 7.39187295410187e-07, + "loss": 4.5216, + "step": 16835 + }, + { + "epoch": 3.3032561788936836, + "grad_norm": 27.07962989807129, + "learning_rate": 7.371632374249049e-07, + "loss": 4.7339, + "step": 16840 + }, + { + "epoch": 3.3042369556688898, + "grad_norm": 29.041719436645508, + "learning_rate": 7.351417338385746e-07, + "loss": 4.4904, + "step": 16845 + }, + { + "epoch": 3.305217732444096, + "grad_norm": 19.445049285888672, + "learning_rate": 7.33122785862535e-07, + "loss": 4.3771, + "step": 16850 + }, + { + "epoch": 3.3061985092193016, + "grad_norm": 21.5811767578125, + "learning_rate": 7.311063947065871e-07, + "loss": 4.4102, + "step": 16855 + }, + { + "epoch": 3.3071792859945077, + "grad_norm": 21.47755241394043, + "learning_rate": 7.290925615790051e-07, + "loss": 4.3652, + "step": 16860 + }, + { + "epoch": 3.308160062769714, + "grad_norm": 14.804069519042969, + "learning_rate": 7.270812876865291e-07, + "loss": 4.36, + "step": 16865 + }, + { + "epoch": 3.3091408395449196, + "grad_norm": 26.158729553222656, + "learning_rate": 7.250725742343629e-07, + "loss": 4.8634, + "step": 16870 + }, + { + "epoch": 3.3101216163201257, + "grad_norm": 31.665578842163086, + "learning_rate": 7.230664224261801e-07, + "loss": 4.2634, + "step": 16875 + }, + { + "epoch": 3.3111023930953314, + "grad_norm": 23.756200790405273, + "learning_rate": 7.210628334641156e-07, + "loss": 4.5788, + "step": 16880 + }, + { + "epoch": 3.3120831698705375, + "grad_norm": 34.05301284790039, + "learning_rate": 7.190618085487705e-07, + "loss": 4.1559, + "step": 16885 + }, + { + "epoch": 3.3130639466457437, + "grad_norm": 20.167665481567383, + "learning_rate": 7.170633488792111e-07, + "loss": 4.5557, + "step": 16890 + }, + { + "epoch": 3.3140447234209494, + "grad_norm": 12.421319961547852, + "learning_rate": 7.150674556529624e-07, + "loss": 4.5346, + "step": 16895 + }, + { + "epoch": 3.3150255001961555, + "grad_norm": 19.413768768310547, + "learning_rate": 7.13074130066016e-07, + "loss": 4.3397, + "step": 16900 + }, + { + "epoch": 3.316006276971361, + "grad_norm": 17.019386291503906, + "learning_rate": 7.110833733128214e-07, + "loss": 4.7594, + "step": 16905 + }, + { + "epoch": 3.3169870537465673, + "grad_norm": 14.768471717834473, + "learning_rate": 7.090951865862888e-07, + "loss": 4.3822, + "step": 16910 + }, + { + "epoch": 3.3179678305217735, + "grad_norm": 30.920869827270508, + "learning_rate": 7.071095710777925e-07, + "loss": 4.5661, + "step": 16915 + }, + { + "epoch": 3.318948607296979, + "grad_norm": 29.130586624145508, + "learning_rate": 7.051265279771602e-07, + "loss": 4.4697, + "step": 16920 + }, + { + "epoch": 3.3199293840721853, + "grad_norm": 43.41074752807617, + "learning_rate": 7.03146058472684e-07, + "loss": 4.8567, + "step": 16925 + }, + { + "epoch": 3.320910160847391, + "grad_norm": 28.771547317504883, + "learning_rate": 7.011681637511092e-07, + "loss": 4.5718, + "step": 16930 + }, + { + "epoch": 3.321890937622597, + "grad_norm": 13.533172607421875, + "learning_rate": 6.991928449976398e-07, + "loss": 4.276, + "step": 16935 + }, + { + "epoch": 3.3228717143978033, + "grad_norm": 14.783927917480469, + "learning_rate": 6.972201033959386e-07, + "loss": 4.3144, + "step": 16940 + }, + { + "epoch": 3.323852491173009, + "grad_norm": 14.02553939819336, + "learning_rate": 6.952499401281199e-07, + "loss": 4.3646, + "step": 16945 + }, + { + "epoch": 3.324833267948215, + "grad_norm": 35.986019134521484, + "learning_rate": 6.932823563747559e-07, + "loss": 4.5013, + "step": 16950 + }, + { + "epoch": 3.325814044723421, + "grad_norm": 19.50735092163086, + "learning_rate": 6.91317353314872e-07, + "loss": 4.3468, + "step": 16955 + }, + { + "epoch": 3.326794821498627, + "grad_norm": 10.819270133972168, + "learning_rate": 6.893549321259468e-07, + "loss": 4.3793, + "step": 16960 + }, + { + "epoch": 3.327775598273833, + "grad_norm": 31.2229061126709, + "learning_rate": 6.873950939839147e-07, + "loss": 4.2508, + "step": 16965 + }, + { + "epoch": 3.3287563750490388, + "grad_norm": 22.09214973449707, + "learning_rate": 6.854378400631573e-07, + "loss": 4.7499, + "step": 16970 + }, + { + "epoch": 3.329737151824245, + "grad_norm": 27.418867111206055, + "learning_rate": 6.834831715365125e-07, + "loss": 4.3997, + "step": 16975 + }, + { + "epoch": 3.3307179285994506, + "grad_norm": 31.877626419067383, + "learning_rate": 6.815310895752658e-07, + "loss": 4.2885, + "step": 16980 + }, + { + "epoch": 3.3316987053746567, + "grad_norm": 28.25274658203125, + "learning_rate": 6.79581595349153e-07, + "loss": 4.523, + "step": 16985 + }, + { + "epoch": 3.332679482149863, + "grad_norm": 16.39922523498535, + "learning_rate": 6.776346900263614e-07, + "loss": 4.5928, + "step": 16990 + }, + { + "epoch": 3.3336602589250686, + "grad_norm": 45.208892822265625, + "learning_rate": 6.756903747735244e-07, + "loss": 4.4122, + "step": 16995 + }, + { + "epoch": 3.3346410357002747, + "grad_norm": 12.759726524353027, + "learning_rate": 6.737486507557262e-07, + "loss": 4.3968, + "step": 17000 + }, + { + "epoch": 3.3356218124754804, + "grad_norm": 20.541330337524414, + "learning_rate": 6.718095191364943e-07, + "loss": 4.2077, + "step": 17005 + }, + { + "epoch": 3.3366025892506865, + "grad_norm": 18.055675506591797, + "learning_rate": 6.698729810778065e-07, + "loss": 4.5941, + "step": 17010 + }, + { + "epoch": 3.3375833660258927, + "grad_norm": 32.806358337402344, + "learning_rate": 6.679390377400868e-07, + "loss": 4.4608, + "step": 17015 + }, + { + "epoch": 3.3385641428010984, + "grad_norm": 14.002641677856445, + "learning_rate": 6.660076902821994e-07, + "loss": 4.4344, + "step": 17020 + }, + { + "epoch": 3.3395449195763045, + "grad_norm": 12.99300765991211, + "learning_rate": 6.640789398614588e-07, + "loss": 4.5186, + "step": 17025 + }, + { + "epoch": 3.34052569635151, + "grad_norm": 9.83618450164795, + "learning_rate": 6.621527876336187e-07, + "loss": 4.6571, + "step": 17030 + }, + { + "epoch": 3.3415064731267163, + "grad_norm": 24.592845916748047, + "learning_rate": 6.602292347528794e-07, + "loss": 4.2534, + "step": 17035 + }, + { + "epoch": 3.3424872499019225, + "grad_norm": 28.41790008544922, + "learning_rate": 6.583082823718823e-07, + "loss": 4.2271, + "step": 17040 + }, + { + "epoch": 3.343468026677128, + "grad_norm": 10.874235153198242, + "learning_rate": 6.563899316417099e-07, + "loss": 4.5524, + "step": 17045 + }, + { + "epoch": 3.3444488034523343, + "grad_norm": 29.61422348022461, + "learning_rate": 6.544741837118851e-07, + "loss": 4.5697, + "step": 17050 + }, + { + "epoch": 3.34542958022754, + "grad_norm": 39.208351135253906, + "learning_rate": 6.525610397303739e-07, + "loss": 4.6328, + "step": 17055 + }, + { + "epoch": 3.346410357002746, + "grad_norm": 16.006139755249023, + "learning_rate": 6.506505008435787e-07, + "loss": 4.6253, + "step": 17060 + }, + { + "epoch": 3.3473911337779523, + "grad_norm": 13.404540061950684, + "learning_rate": 6.48742568196345e-07, + "loss": 4.4215, + "step": 17065 + }, + { + "epoch": 3.348371910553158, + "grad_norm": 17.115400314331055, + "learning_rate": 6.468372429319503e-07, + "loss": 4.3132, + "step": 17070 + }, + { + "epoch": 3.349352687328364, + "grad_norm": 25.940824508666992, + "learning_rate": 6.44934526192117e-07, + "loss": 4.6653, + "step": 17075 + }, + { + "epoch": 3.35033346410357, + "grad_norm": 17.652284622192383, + "learning_rate": 6.43034419116998e-07, + "loss": 4.4474, + "step": 17080 + }, + { + "epoch": 3.351314240878776, + "grad_norm": 12.598196983337402, + "learning_rate": 6.411369228451858e-07, + "loss": 4.3505, + "step": 17085 + }, + { + "epoch": 3.352295017653982, + "grad_norm": 24.629087448120117, + "learning_rate": 6.392420385137104e-07, + "loss": 4.1629, + "step": 17090 + }, + { + "epoch": 3.3532757944291878, + "grad_norm": 13.39428997039795, + "learning_rate": 6.373497672580309e-07, + "loss": 4.3679, + "step": 17095 + }, + { + "epoch": 3.354256571204394, + "grad_norm": 18.239906311035156, + "learning_rate": 6.354601102120462e-07, + "loss": 4.5945, + "step": 17100 + }, + { + "epoch": 3.3552373479795996, + "grad_norm": 12.23827838897705, + "learning_rate": 6.335730685080838e-07, + "loss": 4.432, + "step": 17105 + }, + { + "epoch": 3.3562181247548057, + "grad_norm": 20.083877563476562, + "learning_rate": 6.316886432769081e-07, + "loss": 4.3537, + "step": 17110 + }, + { + "epoch": 3.357198901530012, + "grad_norm": 11.51676082611084, + "learning_rate": 6.29806835647715e-07, + "loss": 4.5358, + "step": 17115 + }, + { + "epoch": 3.3581796783052176, + "grad_norm": 17.398361206054688, + "learning_rate": 6.279276467481299e-07, + "loss": 4.4557, + "step": 17120 + }, + { + "epoch": 3.3591604550804237, + "grad_norm": 33.21310806274414, + "learning_rate": 6.260510777042089e-07, + "loss": 4.3373, + "step": 17125 + }, + { + "epoch": 3.36014123185563, + "grad_norm": 13.65185260772705, + "learning_rate": 6.24177129640442e-07, + "loss": 4.2739, + "step": 17130 + }, + { + "epoch": 3.3611220086308355, + "grad_norm": 20.5279541015625, + "learning_rate": 6.22305803679743e-07, + "loss": 4.54, + "step": 17135 + }, + { + "epoch": 3.3621027854060417, + "grad_norm": 24.423192977905273, + "learning_rate": 6.204371009434595e-07, + "loss": 4.5603, + "step": 17140 + }, + { + "epoch": 3.3630835621812474, + "grad_norm": 25.014257431030273, + "learning_rate": 6.185710225513641e-07, + "loss": 4.3737, + "step": 17145 + }, + { + "epoch": 3.3640643389564535, + "grad_norm": 18.271961212158203, + "learning_rate": 6.16707569621659e-07, + "loss": 4.2319, + "step": 17150 + }, + { + "epoch": 3.3650451157316597, + "grad_norm": 17.02702522277832, + "learning_rate": 6.148467432709704e-07, + "loss": 4.1899, + "step": 17155 + }, + { + "epoch": 3.3660258925068653, + "grad_norm": 14.603148460388184, + "learning_rate": 6.129885446143536e-07, + "loss": 4.3499, + "step": 17160 + }, + { + "epoch": 3.3670066692820715, + "grad_norm": 14.893680572509766, + "learning_rate": 6.111329747652884e-07, + "loss": 4.7695, + "step": 17165 + }, + { + "epoch": 3.367987446057277, + "grad_norm": 22.7106876373291, + "learning_rate": 6.092800348356765e-07, + "loss": 4.4914, + "step": 17170 + }, + { + "epoch": 3.3689682228324833, + "grad_norm": 31.128007888793945, + "learning_rate": 6.074297259358492e-07, + "loss": 4.3197, + "step": 17175 + }, + { + "epoch": 3.3699489996076895, + "grad_norm": 17.588899612426758, + "learning_rate": 6.055820491745557e-07, + "loss": 4.7125, + "step": 17180 + }, + { + "epoch": 3.370929776382895, + "grad_norm": 11.689655303955078, + "learning_rate": 6.037370056589709e-07, + "loss": 4.4528, + "step": 17185 + }, + { + "epoch": 3.3719105531581013, + "grad_norm": 10.81857967376709, + "learning_rate": 6.01894596494692e-07, + "loss": 4.5021, + "step": 17190 + }, + { + "epoch": 3.3728913299333074, + "grad_norm": 18.195873260498047, + "learning_rate": 6.000548227857372e-07, + "loss": 4.4054, + "step": 17195 + }, + { + "epoch": 3.373872106708513, + "grad_norm": 15.955674171447754, + "learning_rate": 5.982176856345445e-07, + "loss": 4.3278, + "step": 17200 + }, + { + "epoch": 3.3748528834837193, + "grad_norm": 24.263078689575195, + "learning_rate": 5.963831861419711e-07, + "loss": 4.7147, + "step": 17205 + }, + { + "epoch": 3.375833660258925, + "grad_norm": 24.826202392578125, + "learning_rate": 5.945513254072971e-07, + "loss": 4.4723, + "step": 17210 + }, + { + "epoch": 3.376814437034131, + "grad_norm": 36.65044021606445, + "learning_rate": 5.9272210452822e-07, + "loss": 4.6737, + "step": 17215 + }, + { + "epoch": 3.3777952138093372, + "grad_norm": 30.517133712768555, + "learning_rate": 5.90895524600853e-07, + "loss": 4.4498, + "step": 17220 + }, + { + "epoch": 3.378775990584543, + "grad_norm": 17.615612030029297, + "learning_rate": 5.890715867197305e-07, + "loss": 4.2525, + "step": 17225 + }, + { + "epoch": 3.379756767359749, + "grad_norm": 24.215167999267578, + "learning_rate": 5.872502919778006e-07, + "loss": 4.5694, + "step": 17230 + }, + { + "epoch": 3.3807375441349548, + "grad_norm": 18.129798889160156, + "learning_rate": 5.85431641466429e-07, + "loss": 4.2075, + "step": 17235 + }, + { + "epoch": 3.381718320910161, + "grad_norm": 14.993080139160156, + "learning_rate": 5.836156362753987e-07, + "loss": 4.5913, + "step": 17240 + }, + { + "epoch": 3.382699097685367, + "grad_norm": 20.029644012451172, + "learning_rate": 5.818022774929033e-07, + "loss": 4.4598, + "step": 17245 + }, + { + "epoch": 3.3836798744605727, + "grad_norm": 24.22998809814453, + "learning_rate": 5.799915662055544e-07, + "loss": 4.3082, + "step": 17250 + }, + { + "epoch": 3.384660651235779, + "grad_norm": 13.657512664794922, + "learning_rate": 5.781835034983746e-07, + "loss": 4.3457, + "step": 17255 + }, + { + "epoch": 3.3856414280109846, + "grad_norm": 17.667686462402344, + "learning_rate": 5.763780904548022e-07, + "loss": 4.619, + "step": 17260 + }, + { + "epoch": 3.3866222047861907, + "grad_norm": 27.960433959960938, + "learning_rate": 5.745753281566841e-07, + "loss": 4.7395, + "step": 17265 + }, + { + "epoch": 3.387602981561397, + "grad_norm": 15.462547302246094, + "learning_rate": 5.727752176842827e-07, + "loss": 4.609, + "step": 17270 + }, + { + "epoch": 3.3885837583366025, + "grad_norm": 12.116942405700684, + "learning_rate": 5.70977760116268e-07, + "loss": 4.4295, + "step": 17275 + }, + { + "epoch": 3.3895645351118087, + "grad_norm": 15.605005264282227, + "learning_rate": 5.691829565297219e-07, + "loss": 4.3967, + "step": 17280 + }, + { + "epoch": 3.3905453118870144, + "grad_norm": 17.77362632751465, + "learning_rate": 5.673908080001356e-07, + "loss": 3.9033, + "step": 17285 + }, + { + "epoch": 3.3915260886622205, + "grad_norm": 13.116978645324707, + "learning_rate": 5.656013156014118e-07, + "loss": 4.5672, + "step": 17290 + }, + { + "epoch": 3.3925068654374266, + "grad_norm": 16.185178756713867, + "learning_rate": 5.63814480405856e-07, + "loss": 4.6599, + "step": 17295 + }, + { + "epoch": 3.3934876422126323, + "grad_norm": 23.99867820739746, + "learning_rate": 5.620303034841879e-07, + "loss": 4.5349, + "step": 17300 + }, + { + "epoch": 3.3944684189878385, + "grad_norm": 17.248830795288086, + "learning_rate": 5.602487859055283e-07, + "loss": 4.1866, + "step": 17305 + }, + { + "epoch": 3.395449195763044, + "grad_norm": 20.530193328857422, + "learning_rate": 5.58469928737409e-07, + "loss": 4.362, + "step": 17310 + }, + { + "epoch": 3.3964299725382503, + "grad_norm": 23.010272979736328, + "learning_rate": 5.566937330457667e-07, + "loss": 4.0736, + "step": 17315 + }, + { + "epoch": 3.3974107493134564, + "grad_norm": 14.467041015625, + "learning_rate": 5.549201998949399e-07, + "loss": 4.6956, + "step": 17320 + }, + { + "epoch": 3.398391526088662, + "grad_norm": 25.341157913208008, + "learning_rate": 5.531493303476775e-07, + "loss": 4.6609, + "step": 17325 + }, + { + "epoch": 3.3993723028638683, + "grad_norm": 16.466978073120117, + "learning_rate": 5.513811254651258e-07, + "loss": 4.2494, + "step": 17330 + }, + { + "epoch": 3.400353079639074, + "grad_norm": 21.759614944458008, + "learning_rate": 5.496155863068409e-07, + "loss": 4.4691, + "step": 17335 + }, + { + "epoch": 3.40133385641428, + "grad_norm": 31.557613372802734, + "learning_rate": 5.47852713930776e-07, + "loss": 4.4447, + "step": 17340 + }, + { + "epoch": 3.4023146331894862, + "grad_norm": 20.74547004699707, + "learning_rate": 5.460925093932879e-07, + "loss": 4.8401, + "step": 17345 + }, + { + "epoch": 3.403295409964692, + "grad_norm": 22.440385818481445, + "learning_rate": 5.443349737491377e-07, + "loss": 4.5203, + "step": 17350 + }, + { + "epoch": 3.404276186739898, + "grad_norm": 15.865456581115723, + "learning_rate": 5.425801080514831e-07, + "loss": 4.6614, + "step": 17355 + }, + { + "epoch": 3.4052569635151038, + "grad_norm": 24.033784866333008, + "learning_rate": 5.408279133518846e-07, + "loss": 4.6595, + "step": 17360 + }, + { + "epoch": 3.40623774029031, + "grad_norm": 16.923505783081055, + "learning_rate": 5.390783907003017e-07, + "loss": 4.5436, + "step": 17365 + }, + { + "epoch": 3.407218517065516, + "grad_norm": 17.031034469604492, + "learning_rate": 5.373315411450908e-07, + "loss": 4.2987, + "step": 17370 + }, + { + "epoch": 3.4081992938407217, + "grad_norm": 21.693077087402344, + "learning_rate": 5.355873657330107e-07, + "loss": 4.4216, + "step": 17375 + }, + { + "epoch": 3.409180070615928, + "grad_norm": 16.699193954467773, + "learning_rate": 5.338458655092122e-07, + "loss": 4.3596, + "step": 17380 + }, + { + "epoch": 3.4101608473911336, + "grad_norm": 21.562950134277344, + "learning_rate": 5.321070415172469e-07, + "loss": 3.9197, + "step": 17385 + }, + { + "epoch": 3.4111416241663397, + "grad_norm": 17.56022834777832, + "learning_rate": 5.303708947990638e-07, + "loss": 4.3206, + "step": 17390 + }, + { + "epoch": 3.412122400941546, + "grad_norm": 20.220117568969727, + "learning_rate": 5.286374263950034e-07, + "loss": 4.2082, + "step": 17395 + }, + { + "epoch": 3.4131031777167515, + "grad_norm": 20.73967933654785, + "learning_rate": 5.269066373438048e-07, + "loss": 4.5537, + "step": 17400 + }, + { + "epoch": 3.4140839544919577, + "grad_norm": 18.557260513305664, + "learning_rate": 5.251785286825994e-07, + "loss": 4.4783, + "step": 17405 + }, + { + "epoch": 3.4150647312671634, + "grad_norm": 26.5538387298584, + "learning_rate": 5.23453101446914e-07, + "loss": 4.317, + "step": 17410 + }, + { + "epoch": 3.4160455080423695, + "grad_norm": 14.261948585510254, + "learning_rate": 5.217303566706683e-07, + "loss": 4.465, + "step": 17415 + }, + { + "epoch": 3.4170262848175756, + "grad_norm": 16.286413192749023, + "learning_rate": 5.20010295386173e-07, + "loss": 4.1083, + "step": 17420 + }, + { + "epoch": 3.4180070615927813, + "grad_norm": 25.792701721191406, + "learning_rate": 5.182929186241331e-07, + "loss": 4.3977, + "step": 17425 + }, + { + "epoch": 3.4189878383679875, + "grad_norm": 30.001115798950195, + "learning_rate": 5.165782274136433e-07, + "loss": 4.4344, + "step": 17430 + }, + { + "epoch": 3.419968615143193, + "grad_norm": 15.186819076538086, + "learning_rate": 5.148662227821899e-07, + "loss": 4.4817, + "step": 17435 + }, + { + "epoch": 3.4209493919183993, + "grad_norm": 14.81648063659668, + "learning_rate": 5.131569057556496e-07, + "loss": 4.3782, + "step": 17440 + }, + { + "epoch": 3.4219301686936054, + "grad_norm": 24.624345779418945, + "learning_rate": 5.114502773582875e-07, + "loss": 4.6731, + "step": 17445 + }, + { + "epoch": 3.422910945468811, + "grad_norm": 19.26167106628418, + "learning_rate": 5.097463386127593e-07, + "loss": 4.5661, + "step": 17450 + }, + { + "epoch": 3.4238917222440173, + "grad_norm": 16.40976333618164, + "learning_rate": 5.080450905401057e-07, + "loss": 4.3338, + "step": 17455 + }, + { + "epoch": 3.4248724990192234, + "grad_norm": 20.465471267700195, + "learning_rate": 5.063465341597589e-07, + "loss": 4.4763, + "step": 17460 + }, + { + "epoch": 3.425853275794429, + "grad_norm": 13.48697280883789, + "learning_rate": 5.046506704895376e-07, + "loss": 4.3597, + "step": 17465 + }, + { + "epoch": 3.4268340525696352, + "grad_norm": 33.226158142089844, + "learning_rate": 5.029575005456439e-07, + "loss": 4.4754, + "step": 17470 + }, + { + "epoch": 3.427814829344841, + "grad_norm": 13.726212501525879, + "learning_rate": 5.012670253426699e-07, + "loss": 4.4491, + "step": 17475 + }, + { + "epoch": 3.428795606120047, + "grad_norm": 16.772018432617188, + "learning_rate": 4.995792458935877e-07, + "loss": 4.7095, + "step": 17480 + }, + { + "epoch": 3.429776382895253, + "grad_norm": 12.425341606140137, + "learning_rate": 4.978941632097612e-07, + "loss": 4.7728, + "step": 17485 + }, + { + "epoch": 3.430757159670459, + "grad_norm": 13.08029842376709, + "learning_rate": 4.962117783009313e-07, + "loss": 4.5698, + "step": 17490 + }, + { + "epoch": 3.431737936445665, + "grad_norm": 13.78771686553955, + "learning_rate": 4.945320921752255e-07, + "loss": 4.3103, + "step": 17495 + }, + { + "epoch": 3.4327187132208707, + "grad_norm": 17.305667877197266, + "learning_rate": 4.928551058391556e-07, + "loss": 4.458, + "step": 17500 + }, + { + "epoch": 3.433699489996077, + "grad_norm": 13.464914321899414, + "learning_rate": 4.911808202976121e-07, + "loss": 4.3716, + "step": 17505 + }, + { + "epoch": 3.434680266771283, + "grad_norm": 18.66707992553711, + "learning_rate": 4.895092365538701e-07, + "loss": 4.2377, + "step": 17510 + }, + { + "epoch": 3.4356610435464887, + "grad_norm": 16.312175750732422, + "learning_rate": 4.878403556095851e-07, + "loss": 4.6405, + "step": 17515 + }, + { + "epoch": 3.436641820321695, + "grad_norm": 18.651382446289062, + "learning_rate": 4.86174178464791e-07, + "loss": 4.835, + "step": 17520 + }, + { + "epoch": 3.437622597096901, + "grad_norm": 18.005874633789062, + "learning_rate": 4.845107061179049e-07, + "loss": 4.3222, + "step": 17525 + }, + { + "epoch": 3.4386033738721067, + "grad_norm": 21.741695404052734, + "learning_rate": 4.828499395657194e-07, + "loss": 4.4938, + "step": 17530 + }, + { + "epoch": 3.439584150647313, + "grad_norm": 27.118772506713867, + "learning_rate": 4.811918798034082e-07, + "loss": 4.2959, + "step": 17535 + }, + { + "epoch": 3.4405649274225185, + "grad_norm": 15.861380577087402, + "learning_rate": 4.79536527824524e-07, + "loss": 4.3856, + "step": 17540 + }, + { + "epoch": 3.4415457041977247, + "grad_norm": 17.277454376220703, + "learning_rate": 4.778838846209927e-07, + "loss": 4.6898, + "step": 17545 + }, + { + "epoch": 3.442526480972931, + "grad_norm": 12.770515441894531, + "learning_rate": 4.7623395118312154e-07, + "loss": 4.3914, + "step": 17550 + }, + { + "epoch": 3.4435072577481365, + "grad_norm": 18.324600219726562, + "learning_rate": 4.745867284995914e-07, + "loss": 4.308, + "step": 17555 + }, + { + "epoch": 3.4444880345233426, + "grad_norm": 49.0174560546875, + "learning_rate": 4.7294221755745885e-07, + "loss": 4.8907, + "step": 17560 + }, + { + "epoch": 3.4454688112985483, + "grad_norm": 23.207433700561523, + "learning_rate": 4.7130041934215777e-07, + "loss": 4.3127, + "step": 17565 + }, + { + "epoch": 3.4464495880737545, + "grad_norm": 17.527206420898438, + "learning_rate": 4.6966133483749346e-07, + "loss": 4.4622, + "step": 17570 + }, + { + "epoch": 3.4474303648489606, + "grad_norm": 14.135499000549316, + "learning_rate": 4.680249650256474e-07, + "loss": 4.2042, + "step": 17575 + }, + { + "epoch": 3.4484111416241663, + "grad_norm": 53.29972457885742, + "learning_rate": 4.663913108871726e-07, + "loss": 4.5112, + "step": 17580 + }, + { + "epoch": 3.4493919183993724, + "grad_norm": 24.076627731323242, + "learning_rate": 4.647603734009964e-07, + "loss": 4.4795, + "step": 17585 + }, + { + "epoch": 3.450372695174578, + "grad_norm": 19.697158813476562, + "learning_rate": 4.6313215354441885e-07, + "loss": 4.6016, + "step": 17590 + }, + { + "epoch": 3.4513534719497843, + "grad_norm": 19.25453758239746, + "learning_rate": 4.6150665229310774e-07, + "loss": 4.3914, + "step": 17595 + }, + { + "epoch": 3.4523342487249904, + "grad_norm": 12.775351524353027, + "learning_rate": 4.598838706211062e-07, + "loss": 4.7117, + "step": 17600 + }, + { + "epoch": 3.453315025500196, + "grad_norm": 25.01823616027832, + "learning_rate": 4.5826380950082403e-07, + "loss": 4.2695, + "step": 17605 + }, + { + "epoch": 3.4542958022754022, + "grad_norm": 16.992021560668945, + "learning_rate": 4.5664646990304375e-07, + "loss": 4.516, + "step": 17610 + }, + { + "epoch": 3.455276579050608, + "grad_norm": 13.946948051452637, + "learning_rate": 4.550318527969161e-07, + "loss": 4.306, + "step": 17615 + }, + { + "epoch": 3.456257355825814, + "grad_norm": 49.40144348144531, + "learning_rate": 4.534199591499594e-07, + "loss": 4.7331, + "step": 17620 + }, + { + "epoch": 3.45723813260102, + "grad_norm": 25.051666259765625, + "learning_rate": 4.5181078992806215e-07, + "loss": 4.3483, + "step": 17625 + }, + { + "epoch": 3.458218909376226, + "grad_norm": 22.993051528930664, + "learning_rate": 4.502043460954786e-07, + "loss": 4.1843, + "step": 17630 + }, + { + "epoch": 3.459199686151432, + "grad_norm": 31.63532829284668, + "learning_rate": 4.486006286148287e-07, + "loss": 4.4431, + "step": 17635 + }, + { + "epoch": 3.4601804629266377, + "grad_norm": 17.333829879760742, + "learning_rate": 4.4699963844710203e-07, + "loss": 5.1192, + "step": 17640 + }, + { + "epoch": 3.461161239701844, + "grad_norm": 14.513242721557617, + "learning_rate": 4.454013765516507e-07, + "loss": 4.7808, + "step": 17645 + }, + { + "epoch": 3.46214201647705, + "grad_norm": 11.146553039550781, + "learning_rate": 4.438058438861953e-07, + "loss": 4.5906, + "step": 17650 + }, + { + "epoch": 3.4631227932522557, + "grad_norm": 17.965736389160156, + "learning_rate": 4.4221304140681707e-07, + "loss": 4.5514, + "step": 17655 + }, + { + "epoch": 3.464103570027462, + "grad_norm": 16.620882034301758, + "learning_rate": 4.406229700679643e-07, + "loss": 4.7881, + "step": 17660 + }, + { + "epoch": 3.4650843468026675, + "grad_norm": 19.04009246826172, + "learning_rate": 4.390356308224486e-07, + "loss": 4.6736, + "step": 17665 + }, + { + "epoch": 3.4660651235778737, + "grad_norm": 21.045454025268555, + "learning_rate": 4.3745102462144197e-07, + "loss": 4.4628, + "step": 17670 + }, + { + "epoch": 3.46704590035308, + "grad_norm": 15.176902770996094, + "learning_rate": 4.35869152414482e-07, + "loss": 4.3495, + "step": 17675 + }, + { + "epoch": 3.4680266771282855, + "grad_norm": 16.883756637573242, + "learning_rate": 4.342900151494639e-07, + "loss": 4.4987, + "step": 17680 + }, + { + "epoch": 3.4690074539034916, + "grad_norm": 17.017446517944336, + "learning_rate": 4.327136137726479e-07, + "loss": 4.2579, + "step": 17685 + }, + { + "epoch": 3.4699882306786973, + "grad_norm": 36.79636001586914, + "learning_rate": 4.3113994922865443e-07, + "loss": 4.5016, + "step": 17690 + }, + { + "epoch": 3.4709690074539035, + "grad_norm": 22.775876998901367, + "learning_rate": 4.2956902246046093e-07, + "loss": 4.446, + "step": 17695 + }, + { + "epoch": 3.4719497842291096, + "grad_norm": 17.92118263244629, + "learning_rate": 4.2800083440940663e-07, + "loss": 4.582, + "step": 17700 + }, + { + "epoch": 3.4729305610043153, + "grad_norm": 25.144628524780273, + "learning_rate": 4.264353860151904e-07, + "loss": 4.6917, + "step": 17705 + }, + { + "epoch": 3.4739113377795214, + "grad_norm": 24.989463806152344, + "learning_rate": 4.248726782158663e-07, + "loss": 4.7562, + "step": 17710 + }, + { + "epoch": 3.474892114554727, + "grad_norm": 21.577003479003906, + "learning_rate": 4.233127119478497e-07, + "loss": 4.3303, + "step": 17715 + }, + { + "epoch": 3.4758728913299333, + "grad_norm": 18.959033966064453, + "learning_rate": 4.217554881459107e-07, + "loss": 4.4397, + "step": 17720 + }, + { + "epoch": 3.4768536681051394, + "grad_norm": 20.839256286621094, + "learning_rate": 4.202010077431784e-07, + "loss": 4.6779, + "step": 17725 + }, + { + "epoch": 3.477834444880345, + "grad_norm": 15.40798282623291, + "learning_rate": 4.1864927167113434e-07, + "loss": 4.0281, + "step": 17730 + }, + { + "epoch": 3.4788152216555512, + "grad_norm": 12.977888107299805, + "learning_rate": 4.171002808596192e-07, + "loss": 4.7644, + "step": 17735 + }, + { + "epoch": 3.479795998430757, + "grad_norm": 18.18838119506836, + "learning_rate": 4.155540362368277e-07, + "loss": 4.2649, + "step": 17740 + }, + { + "epoch": 3.480776775205963, + "grad_norm": 22.027528762817383, + "learning_rate": 4.140105387293064e-07, + "loss": 4.7378, + "step": 17745 + }, + { + "epoch": 3.481757551981169, + "grad_norm": 19.1549015045166, + "learning_rate": 4.1246978926196057e-07, + "loss": 4.4474, + "step": 17750 + }, + { + "epoch": 3.482738328756375, + "grad_norm": 20.467920303344727, + "learning_rate": 4.1093178875804384e-07, + "loss": 4.4719, + "step": 17755 + }, + { + "epoch": 3.483719105531581, + "grad_norm": 13.859403610229492, + "learning_rate": 4.093965381391651e-07, + "loss": 4.6655, + "step": 17760 + }, + { + "epoch": 3.4846998823067867, + "grad_norm": 26.066524505615234, + "learning_rate": 4.078640383252869e-07, + "loss": 4.3725, + "step": 17765 + }, + { + "epoch": 3.485680659081993, + "grad_norm": 22.628177642822266, + "learning_rate": 4.0633429023472004e-07, + "loss": 4.506, + "step": 17770 + }, + { + "epoch": 3.486661435857199, + "grad_norm": 27.378026962280273, + "learning_rate": 4.048072947841275e-07, + "loss": 4.366, + "step": 17775 + }, + { + "epoch": 3.4876422126324047, + "grad_norm": 19.4333553314209, + "learning_rate": 4.0328305288852454e-07, + "loss": 4.5974, + "step": 17780 + }, + { + "epoch": 3.488622989407611, + "grad_norm": 10.120104789733887, + "learning_rate": 4.0176156546127443e-07, + "loss": 4.6727, + "step": 17785 + }, + { + "epoch": 3.489603766182817, + "grad_norm": 31.34845733642578, + "learning_rate": 4.0024283341409233e-07, + "loss": 4.2804, + "step": 17790 + }, + { + "epoch": 3.4905845429580227, + "grad_norm": 18.09394645690918, + "learning_rate": 3.98726857657038e-07, + "loss": 4.4176, + "step": 17795 + }, + { + "epoch": 3.491565319733229, + "grad_norm": 25.27265739440918, + "learning_rate": 3.972136390985248e-07, + "loss": 4.6796, + "step": 17800 + }, + { + "epoch": 3.4925460965084345, + "grad_norm": 24.39628028869629, + "learning_rate": 3.9570317864530916e-07, + "loss": 4.0912, + "step": 17805 + }, + { + "epoch": 3.4935268732836406, + "grad_norm": 36.115509033203125, + "learning_rate": 3.941954772024981e-07, + "loss": 4.6049, + "step": 17810 + }, + { + "epoch": 3.494507650058847, + "grad_norm": 28.326950073242188, + "learning_rate": 3.9269053567354497e-07, + "loss": 5.1064, + "step": 17815 + }, + { + "epoch": 3.4954884268340525, + "grad_norm": 13.671051025390625, + "learning_rate": 3.9118835496024685e-07, + "loss": 4.6874, + "step": 17820 + }, + { + "epoch": 3.4964692036092586, + "grad_norm": 22.13825035095215, + "learning_rate": 3.896889359627498e-07, + "loss": 4.3539, + "step": 17825 + }, + { + "epoch": 3.4974499803844643, + "grad_norm": 20.894668579101562, + "learning_rate": 3.8819227957954173e-07, + "loss": 4.6068, + "step": 17830 + }, + { + "epoch": 3.4984307571596704, + "grad_norm": 21.378633499145508, + "learning_rate": 3.866983867074575e-07, + "loss": 4.5642, + "step": 17835 + }, + { + "epoch": 3.4994115339348766, + "grad_norm": 25.99394416809082, + "learning_rate": 3.852072582416766e-07, + "loss": 4.5547, + "step": 17840 + }, + { + "epoch": 3.5003923107100823, + "grad_norm": 18.724058151245117, + "learning_rate": 3.8371889507571925e-07, + "loss": 4.501, + "step": 17845 + }, + { + "epoch": 3.5013730874852884, + "grad_norm": 23.53910255432129, + "learning_rate": 3.8223329810145035e-07, + "loss": 4.3075, + "step": 17850 + }, + { + "epoch": 3.5013730874852884, + "eval_loss": 4.851827621459961, + "eval_runtime": 7.6055, + "eval_samples_per_second": 27.48, + "eval_steps_per_second": 13.806, + "step": 17850 + }, + { + "epoch": 3.5023538642604946, + "grad_norm": 21.512908935546875, + "learning_rate": 3.8075046820907623e-07, + "loss": 4.3834, + "step": 17855 + }, + { + "epoch": 3.5033346410357002, + "grad_norm": 13.204446792602539, + "learning_rate": 3.7927040628714663e-07, + "loss": 4.4252, + "step": 17860 + }, + { + "epoch": 3.5043154178109064, + "grad_norm": 22.651100158691406, + "learning_rate": 3.777931132225526e-07, + "loss": 4.1096, + "step": 17865 + }, + { + "epoch": 3.505296194586112, + "grad_norm": 18.523298263549805, + "learning_rate": 3.763185899005234e-07, + "loss": 4.5935, + "step": 17870 + }, + { + "epoch": 3.506276971361318, + "grad_norm": 21.02290916442871, + "learning_rate": 3.7484683720463264e-07, + "loss": 4.4319, + "step": 17875 + }, + { + "epoch": 3.5072577481365244, + "grad_norm": 19.69526481628418, + "learning_rate": 3.7337785601679e-07, + "loss": 4.6678, + "step": 17880 + }, + { + "epoch": 3.50823852491173, + "grad_norm": 17.148012161254883, + "learning_rate": 3.7191164721724573e-07, + "loss": 4.4583, + "step": 17885 + }, + { + "epoch": 3.509219301686936, + "grad_norm": 17.189929962158203, + "learning_rate": 3.704482116845909e-07, + "loss": 4.8074, + "step": 17890 + }, + { + "epoch": 3.510200078462142, + "grad_norm": 23.94347381591797, + "learning_rate": 3.6898755029575016e-07, + "loss": 4.4356, + "step": 17895 + }, + { + "epoch": 3.511180855237348, + "grad_norm": 25.62436866760254, + "learning_rate": 3.675296639259912e-07, + "loss": 4.4871, + "step": 17900 + }, + { + "epoch": 3.512161632012554, + "grad_norm": 47.62681198120117, + "learning_rate": 3.6607455344891464e-07, + "loss": 4.6938, + "step": 17905 + }, + { + "epoch": 3.51314240878776, + "grad_norm": 12.18612003326416, + "learning_rate": 3.646222197364596e-07, + "loss": 4.1505, + "step": 17910 + }, + { + "epoch": 3.514123185562966, + "grad_norm": 37.44921875, + "learning_rate": 3.631726636589006e-07, + "loss": 4.2759, + "step": 17915 + }, + { + "epoch": 3.5151039623381717, + "grad_norm": 10.986493110656738, + "learning_rate": 3.6172588608484936e-07, + "loss": 4.4109, + "step": 17920 + }, + { + "epoch": 3.516084739113378, + "grad_norm": 29.93390464782715, + "learning_rate": 3.602818878812503e-07, + "loss": 4.3751, + "step": 17925 + }, + { + "epoch": 3.517065515888584, + "grad_norm": 14.553876876831055, + "learning_rate": 3.5884066991338283e-07, + "loss": 4.3612, + "step": 17930 + }, + { + "epoch": 3.5180462926637897, + "grad_norm": 18.669994354248047, + "learning_rate": 3.574022330448612e-07, + "loss": 4.2946, + "step": 17935 + }, + { + "epoch": 3.519027069438996, + "grad_norm": 38.16789627075195, + "learning_rate": 3.559665781376348e-07, + "loss": 4.5239, + "step": 17940 + }, + { + "epoch": 3.5200078462142015, + "grad_norm": 25.857666015625, + "learning_rate": 3.5453370605198213e-07, + "loss": 4.3706, + "step": 17945 + }, + { + "epoch": 3.5209886229894076, + "grad_norm": 15.636646270751953, + "learning_rate": 3.531036176465175e-07, + "loss": 4.3487, + "step": 17950 + }, + { + "epoch": 3.5219693997646138, + "grad_norm": 24.652542114257812, + "learning_rate": 3.516763137781842e-07, + "loss": 4.7284, + "step": 17955 + }, + { + "epoch": 3.5229501765398195, + "grad_norm": 25.52812957763672, + "learning_rate": 3.5025179530225995e-07, + "loss": 4.369, + "step": 17960 + }, + { + "epoch": 3.5239309533150256, + "grad_norm": 24.099510192871094, + "learning_rate": 3.4883006307235233e-07, + "loss": 4.1875, + "step": 17965 + }, + { + "epoch": 3.5249117300902313, + "grad_norm": 20.995121002197266, + "learning_rate": 3.474111179403977e-07, + "loss": 4.4124, + "step": 17970 + }, + { + "epoch": 3.5258925068654374, + "grad_norm": 18.973020553588867, + "learning_rate": 3.4599496075666484e-07, + "loss": 4.6197, + "step": 17975 + }, + { + "epoch": 3.5268732836406436, + "grad_norm": 37.977508544921875, + "learning_rate": 3.445815923697499e-07, + "loss": 4.5785, + "step": 17980 + }, + { + "epoch": 3.5278540604158493, + "grad_norm": 10.61478328704834, + "learning_rate": 3.431710136265792e-07, + "loss": 4.2225, + "step": 17985 + }, + { + "epoch": 3.5288348371910554, + "grad_norm": 21.45575714111328, + "learning_rate": 3.4176322537240736e-07, + "loss": 4.3511, + "step": 17990 + }, + { + "epoch": 3.529815613966261, + "grad_norm": 27.8227596282959, + "learning_rate": 3.40358228450815e-07, + "loss": 4.5644, + "step": 17995 + }, + { + "epoch": 3.5307963907414672, + "grad_norm": 38.02301788330078, + "learning_rate": 3.3895602370371374e-07, + "loss": 4.4346, + "step": 18000 + }, + { + "epoch": 3.5317771675166734, + "grad_norm": 9.561601638793945, + "learning_rate": 3.3755661197133747e-07, + "loss": 4.3879, + "step": 18005 + }, + { + "epoch": 3.532757944291879, + "grad_norm": 14.51889419555664, + "learning_rate": 3.361599940922505e-07, + "loss": 4.3403, + "step": 18010 + }, + { + "epoch": 3.533738721067085, + "grad_norm": 31.724550247192383, + "learning_rate": 3.3476617090334174e-07, + "loss": 4.2, + "step": 18015 + }, + { + "epoch": 3.534719497842291, + "grad_norm": 35.23511505126953, + "learning_rate": 3.3337514323982356e-07, + "loss": 4.2134, + "step": 18020 + }, + { + "epoch": 3.535700274617497, + "grad_norm": 21.88624382019043, + "learning_rate": 3.3198691193523593e-07, + "loss": 4.7372, + "step": 18025 + }, + { + "epoch": 3.536681051392703, + "grad_norm": 22.539533615112305, + "learning_rate": 3.3060147782144114e-07, + "loss": 4.2245, + "step": 18030 + }, + { + "epoch": 3.537661828167909, + "grad_norm": 24.725444793701172, + "learning_rate": 3.2921884172862686e-07, + "loss": 4.6021, + "step": 18035 + }, + { + "epoch": 3.538642604943115, + "grad_norm": 39.704078674316406, + "learning_rate": 3.278390044853036e-07, + "loss": 4.605, + "step": 18040 + }, + { + "epoch": 3.5396233817183207, + "grad_norm": 17.120359420776367, + "learning_rate": 3.264619669183033e-07, + "loss": 4.4304, + "step": 18045 + }, + { + "epoch": 3.540604158493527, + "grad_norm": 20.995365142822266, + "learning_rate": 3.250877298527827e-07, + "loss": 4.2342, + "step": 18050 + }, + { + "epoch": 3.541584935268733, + "grad_norm": 17.42166519165039, + "learning_rate": 3.237162941122185e-07, + "loss": 4.5169, + "step": 18055 + }, + { + "epoch": 3.5425657120439387, + "grad_norm": 31.418668746948242, + "learning_rate": 3.2234766051841006e-07, + "loss": 4.2783, + "step": 18060 + }, + { + "epoch": 3.543546488819145, + "grad_norm": 15.284781455993652, + "learning_rate": 3.209818298914763e-07, + "loss": 4.4126, + "step": 18065 + }, + { + "epoch": 3.5445272655943505, + "grad_norm": 14.738502502441406, + "learning_rate": 3.196188030498576e-07, + "loss": 4.184, + "step": 18070 + }, + { + "epoch": 3.5455080423695566, + "grad_norm": 15.123030662536621, + "learning_rate": 3.182585808103139e-07, + "loss": 4.4825, + "step": 18075 + }, + { + "epoch": 3.5464888191447628, + "grad_norm": 27.406064987182617, + "learning_rate": 3.1690116398792435e-07, + "loss": 4.2695, + "step": 18080 + }, + { + "epoch": 3.5474695959199685, + "grad_norm": 34.00102233886719, + "learning_rate": 3.155465533960872e-07, + "loss": 4.3696, + "step": 18085 + }, + { + "epoch": 3.5484503726951746, + "grad_norm": 23.0728759765625, + "learning_rate": 3.1419474984652034e-07, + "loss": 4.3947, + "step": 18090 + }, + { + "epoch": 3.5494311494703803, + "grad_norm": 23.59056854248047, + "learning_rate": 3.128457541492569e-07, + "loss": 4.5822, + "step": 18095 + }, + { + "epoch": 3.5504119262455864, + "grad_norm": 35.71723175048828, + "learning_rate": 3.1149956711265027e-07, + "loss": 4.5142, + "step": 18100 + }, + { + "epoch": 3.5513927030207926, + "grad_norm": 12.386441230773926, + "learning_rate": 3.101561895433686e-07, + "loss": 4.4894, + "step": 18105 + }, + { + "epoch": 3.5523734797959983, + "grad_norm": 16.826030731201172, + "learning_rate": 3.0881562224639726e-07, + "loss": 4.2603, + "step": 18110 + }, + { + "epoch": 3.5533542565712044, + "grad_norm": 21.321094512939453, + "learning_rate": 3.074778660250394e-07, + "loss": 4.4259, + "step": 18115 + }, + { + "epoch": 3.55433503334641, + "grad_norm": 27.676136016845703, + "learning_rate": 3.0614292168091086e-07, + "loss": 4.4941, + "step": 18120 + }, + { + "epoch": 3.5553158101216162, + "grad_norm": 20.773574829101562, + "learning_rate": 3.0481079001394465e-07, + "loss": 4.3518, + "step": 18125 + }, + { + "epoch": 3.5562965868968224, + "grad_norm": 19.71353530883789, + "learning_rate": 3.034814718223861e-07, + "loss": 4.2774, + "step": 18130 + }, + { + "epoch": 3.5572773636720285, + "grad_norm": 14.87898063659668, + "learning_rate": 3.0215496790279853e-07, + "loss": 4.5793, + "step": 18135 + }, + { + "epoch": 3.558258140447234, + "grad_norm": 11.980874061584473, + "learning_rate": 3.0083127905005447e-07, + "loss": 4.448, + "step": 18140 + }, + { + "epoch": 3.55923891722244, + "grad_norm": 15.427719116210938, + "learning_rate": 2.995104060573417e-07, + "loss": 4.3879, + "step": 18145 + }, + { + "epoch": 3.560219693997646, + "grad_norm": 37.64731979370117, + "learning_rate": 2.9819234971616154e-07, + "loss": 4.5402, + "step": 18150 + }, + { + "epoch": 3.561200470772852, + "grad_norm": 32.419612884521484, + "learning_rate": 2.968771108163249e-07, + "loss": 4.3902, + "step": 18155 + }, + { + "epoch": 3.5621812475480583, + "grad_norm": 20.03985023498535, + "learning_rate": 2.9556469014595744e-07, + "loss": 4.5519, + "step": 18160 + }, + { + "epoch": 3.563162024323264, + "grad_norm": 31.856380462646484, + "learning_rate": 2.9425508849149464e-07, + "loss": 4.5121, + "step": 18165 + }, + { + "epoch": 3.56414280109847, + "grad_norm": 11.391762733459473, + "learning_rate": 2.92948306637682e-07, + "loss": 4.2529, + "step": 18170 + }, + { + "epoch": 3.565123577873676, + "grad_norm": 18.532207489013672, + "learning_rate": 2.916443453675766e-07, + "loss": 4.4126, + "step": 18175 + }, + { + "epoch": 3.566104354648882, + "grad_norm": 20.73578643798828, + "learning_rate": 2.903432054625438e-07, + "loss": 4.2476, + "step": 18180 + }, + { + "epoch": 3.567085131424088, + "grad_norm": 18.467668533325195, + "learning_rate": 2.8904488770226003e-07, + "loss": 4.5179, + "step": 18185 + }, + { + "epoch": 3.568065908199294, + "grad_norm": 15.97587776184082, + "learning_rate": 2.877493928647107e-07, + "loss": 4.2937, + "step": 18190 + }, + { + "epoch": 3.5690466849745, + "grad_norm": 16.574209213256836, + "learning_rate": 2.8645672172618766e-07, + "loss": 4.6197, + "step": 18195 + }, + { + "epoch": 3.5700274617497056, + "grad_norm": 21.534997940063477, + "learning_rate": 2.8516687506129294e-07, + "loss": 4.3969, + "step": 18200 + }, + { + "epoch": 3.571008238524912, + "grad_norm": 21.43780517578125, + "learning_rate": 2.8387985364293493e-07, + "loss": 4.6026, + "step": 18205 + }, + { + "epoch": 3.571989015300118, + "grad_norm": 15.082340240478516, + "learning_rate": 2.8259565824232784e-07, + "loss": 4.4005, + "step": 18210 + }, + { + "epoch": 3.5729697920753236, + "grad_norm": 32.8424186706543, + "learning_rate": 2.8131428962899557e-07, + "loss": 4.2716, + "step": 18215 + }, + { + "epoch": 3.5739505688505298, + "grad_norm": 14.402861595153809, + "learning_rate": 2.80035748570765e-07, + "loss": 4.2866, + "step": 18220 + }, + { + "epoch": 3.5749313456257354, + "grad_norm": 27.641376495361328, + "learning_rate": 2.7876003583377165e-07, + "loss": 4.5308, + "step": 18225 + }, + { + "epoch": 3.5759121224009416, + "grad_norm": 21.16995620727539, + "learning_rate": 2.7748715218245346e-07, + "loss": 4.6729, + "step": 18230 + }, + { + "epoch": 3.5768928991761477, + "grad_norm": 17.894859313964844, + "learning_rate": 2.762170983795542e-07, + "loss": 4.5135, + "step": 18235 + }, + { + "epoch": 3.5778736759513534, + "grad_norm": 13.897953987121582, + "learning_rate": 2.749498751861229e-07, + "loss": 4.2009, + "step": 18240 + }, + { + "epoch": 3.5788544527265596, + "grad_norm": 20.38204002380371, + "learning_rate": 2.73685483361511e-07, + "loss": 4.7626, + "step": 18245 + }, + { + "epoch": 3.5798352295017652, + "grad_norm": 29.557910919189453, + "learning_rate": 2.7242392366337465e-07, + "loss": 4.2005, + "step": 18250 + }, + { + "epoch": 3.5808160062769714, + "grad_norm": 16.029287338256836, + "learning_rate": 2.711651968476708e-07, + "loss": 4.3986, + "step": 18255 + }, + { + "epoch": 3.5817967830521775, + "grad_norm": 20.355484008789062, + "learning_rate": 2.6990930366866065e-07, + "loss": 4.1751, + "step": 18260 + }, + { + "epoch": 3.582777559827383, + "grad_norm": 10.150146484375, + "learning_rate": 2.686562448789082e-07, + "loss": 4.3862, + "step": 18265 + }, + { + "epoch": 3.5837583366025894, + "grad_norm": 40.30896759033203, + "learning_rate": 2.674060212292756e-07, + "loss": 4.4452, + "step": 18270 + }, + { + "epoch": 3.584739113377795, + "grad_norm": 20.16090965270996, + "learning_rate": 2.661586334689309e-07, + "loss": 4.4507, + "step": 18275 + }, + { + "epoch": 3.585719890153001, + "grad_norm": 21.508737564086914, + "learning_rate": 2.6491408234533834e-07, + "loss": 4.0909, + "step": 18280 + }, + { + "epoch": 3.5867006669282073, + "grad_norm": 17.113956451416016, + "learning_rate": 2.6367236860426414e-07, + "loss": 4.4327, + "step": 18285 + }, + { + "epoch": 3.587681443703413, + "grad_norm": 11.977952003479004, + "learning_rate": 2.624334929897754e-07, + "loss": 4.3403, + "step": 18290 + }, + { + "epoch": 3.588662220478619, + "grad_norm": 17.363544464111328, + "learning_rate": 2.611974562442365e-07, + "loss": 4.3587, + "step": 18295 + }, + { + "epoch": 3.589642997253825, + "grad_norm": 41.32529830932617, + "learning_rate": 2.599642591083129e-07, + "loss": 4.3573, + "step": 18300 + }, + { + "epoch": 3.590623774029031, + "grad_norm": 16.15401840209961, + "learning_rate": 2.587339023209662e-07, + "loss": 4.3301, + "step": 18305 + }, + { + "epoch": 3.591604550804237, + "grad_norm": 51.65797424316406, + "learning_rate": 2.575063866194577e-07, + "loss": 4.4128, + "step": 18310 + }, + { + "epoch": 3.592585327579443, + "grad_norm": 21.297771453857422, + "learning_rate": 2.5628171273934635e-07, + "loss": 4.3364, + "step": 18315 + }, + { + "epoch": 3.593566104354649, + "grad_norm": 27.42387580871582, + "learning_rate": 2.550598814144861e-07, + "loss": 4.3562, + "step": 18320 + }, + { + "epoch": 3.5945468811298547, + "grad_norm": 48.64586639404297, + "learning_rate": 2.538408933770303e-07, + "loss": 3.9871, + "step": 18325 + }, + { + "epoch": 3.595527657905061, + "grad_norm": 19.293163299560547, + "learning_rate": 2.5262474935742574e-07, + "loss": 4.2953, + "step": 18330 + }, + { + "epoch": 3.596508434680267, + "grad_norm": 21.172189712524414, + "learning_rate": 2.514114500844178e-07, + "loss": 4.5203, + "step": 18335 + }, + { + "epoch": 3.5974892114554726, + "grad_norm": 16.0252685546875, + "learning_rate": 2.5020099628504603e-07, + "loss": 4.3533, + "step": 18340 + }, + { + "epoch": 3.5984699882306788, + "grad_norm": 27.545793533325195, + "learning_rate": 2.4899338868464404e-07, + "loss": 4.684, + "step": 18345 + }, + { + "epoch": 3.5994507650058845, + "grad_norm": 16.568904876708984, + "learning_rate": 2.4778862800684034e-07, + "loss": 4.3269, + "step": 18350 + }, + { + "epoch": 3.6004315417810906, + "grad_norm": 27.713424682617188, + "learning_rate": 2.4658671497355847e-07, + "loss": 4.4722, + "step": 18355 + }, + { + "epoch": 3.6014123185562967, + "grad_norm": 24.068340301513672, + "learning_rate": 2.4538765030501455e-07, + "loss": 4.6103, + "step": 18360 + }, + { + "epoch": 3.6023930953315024, + "grad_norm": 19.014244079589844, + "learning_rate": 2.441914347197194e-07, + "loss": 4.4829, + "step": 18365 + }, + { + "epoch": 3.6033738721067086, + "grad_norm": 25.846216201782227, + "learning_rate": 2.429980689344735e-07, + "loss": 4.2449, + "step": 18370 + }, + { + "epoch": 3.6043546488819143, + "grad_norm": 17.586687088012695, + "learning_rate": 2.4180755366437324e-07, + "loss": 4.3699, + "step": 18375 + }, + { + "epoch": 3.6053354256571204, + "grad_norm": 17.855295181274414, + "learning_rate": 2.406198896228046e-07, + "loss": 4.4747, + "step": 18380 + }, + { + "epoch": 3.6063162024323265, + "grad_norm": 19.772993087768555, + "learning_rate": 2.3943507752144546e-07, + "loss": 4.7973, + "step": 18385 + }, + { + "epoch": 3.6072969792075322, + "grad_norm": 9.745254516601562, + "learning_rate": 2.382531180702663e-07, + "loss": 4.494, + "step": 18390 + }, + { + "epoch": 3.6082777559827384, + "grad_norm": 25.192859649658203, + "learning_rate": 2.3707401197752556e-07, + "loss": 4.536, + "step": 18395 + }, + { + "epoch": 3.609258532757944, + "grad_norm": 22.296049118041992, + "learning_rate": 2.3589775994977416e-07, + "loss": 4.4789, + "step": 18400 + }, + { + "epoch": 3.61023930953315, + "grad_norm": 10.856245040893555, + "learning_rate": 2.3472436269185105e-07, + "loss": 4.4375, + "step": 18405 + }, + { + "epoch": 3.6112200863083563, + "grad_norm": 13.569968223571777, + "learning_rate": 2.3355382090688605e-07, + "loss": 4.1242, + "step": 18410 + }, + { + "epoch": 3.612200863083562, + "grad_norm": 30.028282165527344, + "learning_rate": 2.323861352962975e-07, + "loss": 4.7917, + "step": 18415 + }, + { + "epoch": 3.613181639858768, + "grad_norm": 21.72928237915039, + "learning_rate": 2.3122130655979124e-07, + "loss": 4.6841, + "step": 18420 + }, + { + "epoch": 3.614162416633974, + "grad_norm": 24.678524017333984, + "learning_rate": 2.3005933539536118e-07, + "loss": 4.2326, + "step": 18425 + }, + { + "epoch": 3.61514319340918, + "grad_norm": 30.336259841918945, + "learning_rate": 2.28900222499292e-07, + "loss": 3.8564, + "step": 18430 + }, + { + "epoch": 3.616123970184386, + "grad_norm": 21.32823944091797, + "learning_rate": 2.277439685661509e-07, + "loss": 4.8797, + "step": 18435 + }, + { + "epoch": 3.617104746959592, + "grad_norm": 29.761159896850586, + "learning_rate": 2.2659057428879584e-07, + "loss": 4.8908, + "step": 18440 + }, + { + "epoch": 3.618085523734798, + "grad_norm": 14.10708236694336, + "learning_rate": 2.2544004035836897e-07, + "loss": 4.4972, + "step": 18445 + }, + { + "epoch": 3.6190663005100037, + "grad_norm": 16.236936569213867, + "learning_rate": 2.2429236746429938e-07, + "loss": 4.4356, + "step": 18450 + }, + { + "epoch": 3.62004707728521, + "grad_norm": 11.860494613647461, + "learning_rate": 2.231475562943014e-07, + "loss": 4.5097, + "step": 18455 + }, + { + "epoch": 3.621027854060416, + "grad_norm": 29.234317779541016, + "learning_rate": 2.2200560753437462e-07, + "loss": 4.6355, + "step": 18460 + }, + { + "epoch": 3.622008630835622, + "grad_norm": 29.146080017089844, + "learning_rate": 2.2086652186880386e-07, + "loss": 4.8315, + "step": 18465 + }, + { + "epoch": 3.6229894076108278, + "grad_norm": 22.21369171142578, + "learning_rate": 2.1973029998015703e-07, + "loss": 4.7367, + "step": 18470 + }, + { + "epoch": 3.6239701843860335, + "grad_norm": 12.365029335021973, + "learning_rate": 2.1859694254928844e-07, + "loss": 4.5659, + "step": 18475 + }, + { + "epoch": 3.6249509611612396, + "grad_norm": 14.387511253356934, + "learning_rate": 2.1746645025533198e-07, + "loss": 4.2935, + "step": 18480 + }, + { + "epoch": 3.6259317379364457, + "grad_norm": 24.00560188293457, + "learning_rate": 2.1633882377570913e-07, + "loss": 4.4385, + "step": 18485 + }, + { + "epoch": 3.626912514711652, + "grad_norm": 20.93753433227539, + "learning_rate": 2.1521406378612164e-07, + "loss": 4.5372, + "step": 18490 + }, + { + "epoch": 3.6278932914868576, + "grad_norm": 35.840152740478516, + "learning_rate": 2.1409217096055311e-07, + "loss": 4.5068, + "step": 18495 + }, + { + "epoch": 3.6288740682620637, + "grad_norm": 19.95467185974121, + "learning_rate": 2.1297314597127082e-07, + "loss": 4.4501, + "step": 18500 + }, + { + "epoch": 3.6298548450372694, + "grad_norm": 15.141322135925293, + "learning_rate": 2.118569894888217e-07, + "loss": 4.7295, + "step": 18505 + }, + { + "epoch": 3.6308356218124755, + "grad_norm": 19.499128341674805, + "learning_rate": 2.1074370218203522e-07, + "loss": 4.6502, + "step": 18510 + }, + { + "epoch": 3.6318163985876817, + "grad_norm": 24.709644317626953, + "learning_rate": 2.0963328471802213e-07, + "loss": 4.678, + "step": 18515 + }, + { + "epoch": 3.6327971753628874, + "grad_norm": 12.919239044189453, + "learning_rate": 2.0852573776217078e-07, + "loss": 4.3808, + "step": 18520 + }, + { + "epoch": 3.6337779521380935, + "grad_norm": 27.91121482849121, + "learning_rate": 2.0742106197815304e-07, + "loss": 4.6819, + "step": 18525 + }, + { + "epoch": 3.634758728913299, + "grad_norm": 26.32551383972168, + "learning_rate": 2.0631925802791608e-07, + "loss": 4.8111, + "step": 18530 + }, + { + "epoch": 3.6357395056885053, + "grad_norm": 10.567716598510742, + "learning_rate": 2.0522032657169012e-07, + "loss": 4.6578, + "step": 18535 + }, + { + "epoch": 3.6367202824637115, + "grad_norm": 17.105424880981445, + "learning_rate": 2.0412426826798283e-07, + "loss": 4.5804, + "step": 18540 + }, + { + "epoch": 3.637701059238917, + "grad_norm": 22.463489532470703, + "learning_rate": 2.0303108377357827e-07, + "loss": 4.5606, + "step": 18545 + }, + { + "epoch": 3.6386818360141233, + "grad_norm": 19.65961265563965, + "learning_rate": 2.0194077374354248e-07, + "loss": 4.297, + "step": 18550 + }, + { + "epoch": 3.639662612789329, + "grad_norm": 12.41259765625, + "learning_rate": 2.0085333883121393e-07, + "loss": 4.2794, + "step": 18555 + }, + { + "epoch": 3.640643389564535, + "grad_norm": 14.857327461242676, + "learning_rate": 1.9976877968821306e-07, + "loss": 4.2883, + "step": 18560 + }, + { + "epoch": 3.6416241663397413, + "grad_norm": 34.63252258300781, + "learning_rate": 1.9868709696443334e-07, + "loss": 4.5432, + "step": 18565 + }, + { + "epoch": 3.642604943114947, + "grad_norm": 17.150300979614258, + "learning_rate": 1.9760829130804794e-07, + "loss": 4.344, + "step": 18570 + }, + { + "epoch": 3.643585719890153, + "grad_norm": 15.355920791625977, + "learning_rate": 1.9653236336550363e-07, + "loss": 4.5685, + "step": 18575 + }, + { + "epoch": 3.644566496665359, + "grad_norm": 13.861881256103516, + "learning_rate": 1.954593137815225e-07, + "loss": 4.301, + "step": 18580 + }, + { + "epoch": 3.645547273440565, + "grad_norm": 52.94959259033203, + "learning_rate": 1.943891431991035e-07, + "loss": 4.2231, + "step": 18585 + }, + { + "epoch": 3.646528050215771, + "grad_norm": 15.73333740234375, + "learning_rate": 1.9332185225952092e-07, + "loss": 4.49, + "step": 18590 + }, + { + "epoch": 3.647508826990977, + "grad_norm": 23.316686630249023, + "learning_rate": 1.922574416023204e-07, + "loss": 4.7615, + "step": 18595 + }, + { + "epoch": 3.648489603766183, + "grad_norm": 15.769940376281738, + "learning_rate": 1.9119591186532506e-07, + "loss": 4.3419, + "step": 18600 + }, + { + "epoch": 3.6494703805413886, + "grad_norm": 19.157127380371094, + "learning_rate": 1.9013726368462937e-07, + "loss": 4.437, + "step": 18605 + }, + { + "epoch": 3.6504511573165948, + "grad_norm": 20.292354583740234, + "learning_rate": 1.8908149769460204e-07, + "loss": 4.4277, + "step": 18610 + }, + { + "epoch": 3.651431934091801, + "grad_norm": 13.844761848449707, + "learning_rate": 1.880286145278848e-07, + "loss": 4.3454, + "step": 18615 + }, + { + "epoch": 3.6524127108670066, + "grad_norm": 16.793840408325195, + "learning_rate": 1.8697861481539182e-07, + "loss": 4.2819, + "step": 18620 + }, + { + "epoch": 3.6533934876422127, + "grad_norm": 14.200879096984863, + "learning_rate": 1.8593149918630927e-07, + "loss": 4.5943, + "step": 18625 + }, + { + "epoch": 3.6543742644174184, + "grad_norm": 15.40435791015625, + "learning_rate": 1.848872682680941e-07, + "loss": 4.6043, + "step": 18630 + }, + { + "epoch": 3.6553550411926246, + "grad_norm": 19.25347328186035, + "learning_rate": 1.8384592268647806e-07, + "loss": 4.5762, + "step": 18635 + }, + { + "epoch": 3.6563358179678307, + "grad_norm": 22.764402389526367, + "learning_rate": 1.828074630654597e-07, + "loss": 4.8733, + "step": 18640 + }, + { + "epoch": 3.6573165947430364, + "grad_norm": 17.222026824951172, + "learning_rate": 1.8177189002731021e-07, + "loss": 4.3624, + "step": 18645 + }, + { + "epoch": 3.6582973715182425, + "grad_norm": 15.655689239501953, + "learning_rate": 1.8073920419257208e-07, + "loss": 4.3877, + "step": 18650 + }, + { + "epoch": 3.659278148293448, + "grad_norm": 15.489755630493164, + "learning_rate": 1.7970940618005528e-07, + "loss": 4.3674, + "step": 18655 + }, + { + "epoch": 3.6602589250686544, + "grad_norm": 21.11366844177246, + "learning_rate": 1.7868249660684123e-07, + "loss": 4.5259, + "step": 18660 + }, + { + "epoch": 3.6612397018438605, + "grad_norm": 23.487411499023438, + "learning_rate": 1.7765847608828047e-07, + "loss": 4.3651, + "step": 18665 + }, + { + "epoch": 3.662220478619066, + "grad_norm": 25.111196517944336, + "learning_rate": 1.7663734523799104e-07, + "loss": 4.5454, + "step": 18670 + }, + { + "epoch": 3.6632012553942723, + "grad_norm": 44.54734420776367, + "learning_rate": 1.7561910466786125e-07, + "loss": 4.9847, + "step": 18675 + }, + { + "epoch": 3.664182032169478, + "grad_norm": 24.96506118774414, + "learning_rate": 1.7460375498804527e-07, + "loss": 4.2462, + "step": 18680 + }, + { + "epoch": 3.665162808944684, + "grad_norm": 14.811649322509766, + "learning_rate": 1.7359129680696696e-07, + "loss": 4.7993, + "step": 18685 + }, + { + "epoch": 3.6661435857198903, + "grad_norm": 20.218128204345703, + "learning_rate": 1.7258173073131658e-07, + "loss": 4.2195, + "step": 18690 + }, + { + "epoch": 3.667124362495096, + "grad_norm": 27.616926193237305, + "learning_rate": 1.715750573660513e-07, + "loss": 4.3696, + "step": 18695 + }, + { + "epoch": 3.668105139270302, + "grad_norm": 19.38068962097168, + "learning_rate": 1.7057127731439526e-07, + "loss": 4.362, + "step": 18700 + }, + { + "epoch": 3.669085916045508, + "grad_norm": 15.382796287536621, + "learning_rate": 1.6957039117783848e-07, + "loss": 4.6597, + "step": 18705 + }, + { + "epoch": 3.670066692820714, + "grad_norm": 14.528923034667969, + "learning_rate": 1.6857239955613724e-07, + "loss": 4.5035, + "step": 18710 + }, + { + "epoch": 3.67104746959592, + "grad_norm": 26.649324417114258, + "learning_rate": 1.6757730304731378e-07, + "loss": 4.6962, + "step": 18715 + }, + { + "epoch": 3.672028246371126, + "grad_norm": 33.269126892089844, + "learning_rate": 1.6658510224765333e-07, + "loss": 4.4128, + "step": 18720 + }, + { + "epoch": 3.673009023146332, + "grad_norm": 17.99794578552246, + "learning_rate": 1.655957977517092e-07, + "loss": 4.7475, + "step": 18725 + }, + { + "epoch": 3.6739897999215376, + "grad_norm": 13.681337356567383, + "learning_rate": 1.646093901522966e-07, + "loss": 4.6715, + "step": 18730 + }, + { + "epoch": 3.6749705766967438, + "grad_norm": 22.046131134033203, + "learning_rate": 1.6362588004049606e-07, + "loss": 4.3841, + "step": 18735 + }, + { + "epoch": 3.67595135347195, + "grad_norm": 13.013222694396973, + "learning_rate": 1.6264526800565228e-07, + "loss": 4.026, + "step": 18740 + }, + { + "epoch": 3.6769321302471556, + "grad_norm": 15.13515853881836, + "learning_rate": 1.616675546353713e-07, + "loss": 4.3175, + "step": 18745 + }, + { + "epoch": 3.6779129070223617, + "grad_norm": 14.422954559326172, + "learning_rate": 1.6069274051552453e-07, + "loss": 4.3712, + "step": 18750 + }, + { + "epoch": 3.6788936837975674, + "grad_norm": 22.292251586914062, + "learning_rate": 1.597208262302441e-07, + "loss": 4.7974, + "step": 18755 + }, + { + "epoch": 3.6798744605727736, + "grad_norm": 23.049213409423828, + "learning_rate": 1.5875181236192638e-07, + "loss": 4.3128, + "step": 18760 + }, + { + "epoch": 3.6808552373479797, + "grad_norm": 14.684503555297852, + "learning_rate": 1.5778569949122914e-07, + "loss": 4.3054, + "step": 18765 + }, + { + "epoch": 3.6818360141231854, + "grad_norm": 24.637575149536133, + "learning_rate": 1.5682248819707036e-07, + "loss": 4.4834, + "step": 18770 + }, + { + "epoch": 3.6828167908983915, + "grad_norm": 24.20187759399414, + "learning_rate": 1.558621790566317e-07, + "loss": 4.3936, + "step": 18775 + }, + { + "epoch": 3.6837975676735972, + "grad_norm": 12.726591110229492, + "learning_rate": 1.549047726453534e-07, + "loss": 4.504, + "step": 18780 + }, + { + "epoch": 3.6847783444488034, + "grad_norm": 20.728734970092773, + "learning_rate": 1.5395026953693826e-07, + "loss": 4.2898, + "step": 18785 + }, + { + "epoch": 3.6857591212240095, + "grad_norm": 20.017847061157227, + "learning_rate": 1.5299867030334815e-07, + "loss": 4.4186, + "step": 18790 + }, + { + "epoch": 3.6867398979992156, + "grad_norm": 13.246440887451172, + "learning_rate": 1.5204997551480527e-07, + "loss": 4.5196, + "step": 18795 + }, + { + "epoch": 3.6877206747744213, + "grad_norm": 19.194934844970703, + "learning_rate": 1.5110418573979157e-07, + "loss": 4.4459, + "step": 18800 + }, + { + "epoch": 3.688701451549627, + "grad_norm": 14.851069450378418, + "learning_rate": 1.501613015450476e-07, + "loss": 4.4218, + "step": 18805 + }, + { + "epoch": 3.689682228324833, + "grad_norm": 14.522515296936035, + "learning_rate": 1.492213234955736e-07, + "loss": 4.6519, + "step": 18810 + }, + { + "epoch": 3.6906630051000393, + "grad_norm": 16.660293579101562, + "learning_rate": 1.482842521546285e-07, + "loss": 4.3321, + "step": 18815 + }, + { + "epoch": 3.6916437818752454, + "grad_norm": 19.738510131835938, + "learning_rate": 1.4735008808372808e-07, + "loss": 4.32, + "step": 18820 + }, + { + "epoch": 3.692624558650451, + "grad_norm": 35.67677307128906, + "learning_rate": 1.4641883184264794e-07, + "loss": 4.2388, + "step": 18825 + }, + { + "epoch": 3.6936053354256573, + "grad_norm": 22.168577194213867, + "learning_rate": 1.454904839894189e-07, + "loss": 4.2093, + "step": 18830 + }, + { + "epoch": 3.694586112200863, + "grad_norm": 13.524019241333008, + "learning_rate": 1.4456504508033152e-07, + "loss": 4.4022, + "step": 18835 + }, + { + "epoch": 3.695566888976069, + "grad_norm": 15.70751667022705, + "learning_rate": 1.4364251566993225e-07, + "loss": 4.4863, + "step": 18840 + }, + { + "epoch": 3.6965476657512752, + "grad_norm": 11.819734573364258, + "learning_rate": 1.4272289631102276e-07, + "loss": 4.261, + "step": 18845 + }, + { + "epoch": 3.697528442526481, + "grad_norm": 15.136698722839355, + "learning_rate": 1.418061875546628e-07, + "loss": 4.4006, + "step": 18850 + }, + { + "epoch": 3.698509219301687, + "grad_norm": 13.050637245178223, + "learning_rate": 1.408923899501674e-07, + "loss": 4.5681, + "step": 18855 + }, + { + "epoch": 3.6994899960768928, + "grad_norm": 18.412694931030273, + "learning_rate": 1.3998150404510635e-07, + "loss": 4.6765, + "step": 18860 + }, + { + "epoch": 3.700470772852099, + "grad_norm": 20.788856506347656, + "learning_rate": 1.390735303853069e-07, + "loss": 4.1471, + "step": 18865 + }, + { + "epoch": 3.701451549627305, + "grad_norm": 18.045942306518555, + "learning_rate": 1.381684695148472e-07, + "loss": 4.3271, + "step": 18870 + }, + { + "epoch": 3.7024323264025107, + "grad_norm": 14.031227111816406, + "learning_rate": 1.3726632197606503e-07, + "loss": 4.4308, + "step": 18875 + }, + { + "epoch": 3.703413103177717, + "grad_norm": 26.201160430908203, + "learning_rate": 1.3636708830954803e-07, + "loss": 4.1403, + "step": 18880 + }, + { + "epoch": 3.7043938799529226, + "grad_norm": 30.57729721069336, + "learning_rate": 1.354707690541407e-07, + "loss": 4.4129, + "step": 18885 + }, + { + "epoch": 3.7053746567281287, + "grad_norm": 23.976205825805664, + "learning_rate": 1.3457736474693949e-07, + "loss": 4.3689, + "step": 18890 + }, + { + "epoch": 3.706355433503335, + "grad_norm": 24.748432159423828, + "learning_rate": 1.336868759232951e-07, + "loss": 4.29, + "step": 18895 + }, + { + "epoch": 3.7073362102785405, + "grad_norm": 19.802711486816406, + "learning_rate": 1.3279930311681123e-07, + "loss": 4.6155, + "step": 18900 + }, + { + "epoch": 3.7083169870537467, + "grad_norm": 14.154813766479492, + "learning_rate": 1.3191464685934241e-07, + "loss": 4.4138, + "step": 18905 + }, + { + "epoch": 3.7092977638289524, + "grad_norm": 15.545702934265137, + "learning_rate": 1.3103290768099796e-07, + "loss": 4.571, + "step": 18910 + }, + { + "epoch": 3.7102785406041585, + "grad_norm": 56.756019592285156, + "learning_rate": 1.30154086110138e-07, + "loss": 4.1372, + "step": 18915 + }, + { + "epoch": 3.7112593173793647, + "grad_norm": 19.387264251708984, + "learning_rate": 1.2927818267337468e-07, + "loss": 4.5391, + "step": 18920 + }, + { + "epoch": 3.7122400941545703, + "grad_norm": 24.630348205566406, + "learning_rate": 1.2840519789557137e-07, + "loss": 4.2267, + "step": 18925 + }, + { + "epoch": 3.7132208709297765, + "grad_norm": 24.16886329650879, + "learning_rate": 1.275351322998425e-07, + "loss": 4.6744, + "step": 18930 + }, + { + "epoch": 3.714201647704982, + "grad_norm": 21.653339385986328, + "learning_rate": 1.2666798640755206e-07, + "loss": 4.3333, + "step": 18935 + }, + { + "epoch": 3.7151824244801883, + "grad_norm": 17.944013595581055, + "learning_rate": 1.2580376073831723e-07, + "loss": 4.4333, + "step": 18940 + }, + { + "epoch": 3.7161632012553945, + "grad_norm": 20.188127517700195, + "learning_rate": 1.249424558100032e-07, + "loss": 4.264, + "step": 18945 + }, + { + "epoch": 3.7171439780306, + "grad_norm": 25.705101013183594, + "learning_rate": 1.2408407213872543e-07, + "loss": 4.4369, + "step": 18950 + }, + { + "epoch": 3.7181247548058063, + "grad_norm": 21.701995849609375, + "learning_rate": 1.2322861023884859e-07, + "loss": 4.365, + "step": 18955 + }, + { + "epoch": 3.719105531581012, + "grad_norm": 12.588302612304688, + "learning_rate": 1.2237607062298762e-07, + "loss": 4.0742, + "step": 18960 + }, + { + "epoch": 3.720086308356218, + "grad_norm": 24.353504180908203, + "learning_rate": 1.215264538020061e-07, + "loss": 4.7479, + "step": 18965 + }, + { + "epoch": 3.7210670851314243, + "grad_norm": 11.862863540649414, + "learning_rate": 1.206797602850146e-07, + "loss": 4.3784, + "step": 18970 + }, + { + "epoch": 3.72204786190663, + "grad_norm": 23.318727493286133, + "learning_rate": 1.198359905793739e-07, + "loss": 4.3121, + "step": 18975 + }, + { + "epoch": 3.723028638681836, + "grad_norm": 13.879663467407227, + "learning_rate": 1.1899514519069177e-07, + "loss": 4.0607, + "step": 18980 + }, + { + "epoch": 3.724009415457042, + "grad_norm": 14.94577693939209, + "learning_rate": 1.1815722462282409e-07, + "loss": 4.4307, + "step": 18985 + }, + { + "epoch": 3.724990192232248, + "grad_norm": 37.08789825439453, + "learning_rate": 1.1732222937787419e-07, + "loss": 4.5394, + "step": 18990 + }, + { + "epoch": 3.725970969007454, + "grad_norm": 21.196096420288086, + "learning_rate": 1.1649015995619184e-07, + "loss": 4.6473, + "step": 18995 + }, + { + "epoch": 3.7269517457826598, + "grad_norm": 16.924158096313477, + "learning_rate": 1.156610168563732e-07, + "loss": 4.4108, + "step": 19000 + }, + { + "epoch": 3.727932522557866, + "grad_norm": 35.096519470214844, + "learning_rate": 1.1483480057526364e-07, + "loss": 4.546, + "step": 19005 + }, + { + "epoch": 3.7289132993330716, + "grad_norm": 10.689122200012207, + "learning_rate": 1.1401151160795044e-07, + "loss": 4.3182, + "step": 19010 + }, + { + "epoch": 3.7298940761082777, + "grad_norm": 22.799142837524414, + "learning_rate": 1.1319115044777063e-07, + "loss": 4.3949, + "step": 19015 + }, + { + "epoch": 3.730874852883484, + "grad_norm": 14.333352088928223, + "learning_rate": 1.1237371758630488e-07, + "loss": 4.5137, + "step": 19020 + }, + { + "epoch": 3.7318556296586896, + "grad_norm": 40.326988220214844, + "learning_rate": 1.1155921351337917e-07, + "loss": 4.5281, + "step": 19025 + }, + { + "epoch": 3.7328364064338957, + "grad_norm": 21.95504379272461, + "learning_rate": 1.1074763871706473e-07, + "loss": 4.2508, + "step": 19030 + }, + { + "epoch": 3.7338171832091014, + "grad_norm": 18.045551300048828, + "learning_rate": 1.0993899368367811e-07, + "loss": 4.2403, + "step": 19035 + }, + { + "epoch": 3.7347979599843075, + "grad_norm": 11.443171501159668, + "learning_rate": 1.0913327889777948e-07, + "loss": 4.351, + "step": 19040 + }, + { + "epoch": 3.7357787367595137, + "grad_norm": 13.363077163696289, + "learning_rate": 1.0833049484217373e-07, + "loss": 4.344, + "step": 19045 + }, + { + "epoch": 3.7367595135347194, + "grad_norm": 18.82840919494629, + "learning_rate": 1.0753064199790886e-07, + "loss": 4.2728, + "step": 19050 + }, + { + "epoch": 3.7377402903099255, + "grad_norm": 16.456222534179688, + "learning_rate": 1.0673372084427646e-07, + "loss": 4.3274, + "step": 19055 + }, + { + "epoch": 3.738721067085131, + "grad_norm": 15.155776023864746, + "learning_rate": 1.0593973185881179e-07, + "loss": 4.5719, + "step": 19060 + }, + { + "epoch": 3.7397018438603373, + "grad_norm": 14.27761173248291, + "learning_rate": 1.0514867551729424e-07, + "loss": 4.2692, + "step": 19065 + }, + { + "epoch": 3.7406826206355435, + "grad_norm": 12.729808807373047, + "learning_rate": 1.0436055229374408e-07, + "loss": 4.3371, + "step": 19070 + }, + { + "epoch": 3.741663397410749, + "grad_norm": 21.104854583740234, + "learning_rate": 1.0357536266042356e-07, + "loss": 4.3169, + "step": 19075 + }, + { + "epoch": 3.7426441741859553, + "grad_norm": 14.363003730773926, + "learning_rate": 1.0279310708783907e-07, + "loss": 4.3119, + "step": 19080 + }, + { + "epoch": 3.743624950961161, + "grad_norm": 25.326114654541016, + "learning_rate": 1.0201378604473677e-07, + "loss": 4.7418, + "step": 19085 + }, + { + "epoch": 3.744605727736367, + "grad_norm": 14.007926940917969, + "learning_rate": 1.01237399998107e-07, + "loss": 4.2135, + "step": 19090 + }, + { + "epoch": 3.7455865045115733, + "grad_norm": 15.097603797912598, + "learning_rate": 1.0046394941317816e-07, + "loss": 4.6373, + "step": 19095 + }, + { + "epoch": 3.7465672812867794, + "grad_norm": 18.339405059814453, + "learning_rate": 9.969343475342285e-08, + "loss": 4.8545, + "step": 19100 + }, + { + "epoch": 3.747548058061985, + "grad_norm": 10.937397956848145, + "learning_rate": 9.89258564805512e-08, + "loss": 4.4624, + "step": 19105 + }, + { + "epoch": 3.748528834837191, + "grad_norm": 27.80323600769043, + "learning_rate": 9.816121505451692e-08, + "loss": 4.6097, + "step": 19110 + }, + { + "epoch": 3.749509611612397, + "grad_norm": 16.348608016967773, + "learning_rate": 9.73995109335113e-08, + "loss": 4.2309, + "step": 19115 + }, + { + "epoch": 3.750490388387603, + "grad_norm": 19.31072998046875, + "learning_rate": 9.664074457396699e-08, + "loss": 4.4816, + "step": 19120 + }, + { + "epoch": 3.751471165162809, + "grad_norm": 26.25499153137207, + "learning_rate": 9.588491643055642e-08, + "loss": 4.4299, + "step": 19125 + }, + { + "epoch": 3.751471165162809, + "eval_loss": 4.851955413818359, + "eval_runtime": 7.7483, + "eval_samples_per_second": 26.974, + "eval_steps_per_second": 13.551, + "step": 19125 + }, + { + "epoch": 3.752451941938015, + "grad_norm": 26.895282745361328, + "learning_rate": 9.513202695618951e-08, + "loss": 4.6251, + "step": 19130 + }, + { + "epoch": 3.7534327187132206, + "grad_norm": 13.861442565917969, + "learning_rate": 9.438207660201759e-08, + "loss": 4.6765, + "step": 19135 + }, + { + "epoch": 3.7544134954884267, + "grad_norm": 38.26374053955078, + "learning_rate": 9.363506581742953e-08, + "loss": 4.5282, + "step": 19140 + }, + { + "epoch": 3.755394272263633, + "grad_norm": 22.872318267822266, + "learning_rate": 9.289099505005339e-08, + "loss": 4.2434, + "step": 19145 + }, + { + "epoch": 3.756375049038839, + "grad_norm": 30.124589920043945, + "learning_rate": 9.214986474575471e-08, + "loss": 4.2806, + "step": 19150 + }, + { + "epoch": 3.7573558258140447, + "grad_norm": 30.88665771484375, + "learning_rate": 9.14116753486366e-08, + "loss": 4.4856, + "step": 19155 + }, + { + "epoch": 3.758336602589251, + "grad_norm": 18.82733917236328, + "learning_rate": 9.067642730104132e-08, + "loss": 4.4474, + "step": 19160 + }, + { + "epoch": 3.7593173793644565, + "grad_norm": 20.41982078552246, + "learning_rate": 8.994412104354865e-08, + "loss": 4.322, + "step": 19165 + }, + { + "epoch": 3.7602981561396627, + "grad_norm": 15.03927993774414, + "learning_rate": 8.921475701497373e-08, + "loss": 4.3169, + "step": 19170 + }, + { + "epoch": 3.761278932914869, + "grad_norm": 14.643938064575195, + "learning_rate": 8.84883356523708e-08, + "loss": 4.4695, + "step": 19175 + }, + { + "epoch": 3.7622597096900745, + "grad_norm": 14.6721830368042, + "learning_rate": 8.776485739102947e-08, + "loss": 4.3335, + "step": 19180 + }, + { + "epoch": 3.7632404864652806, + "grad_norm": 17.07347869873047, + "learning_rate": 8.70443226644757e-08, + "loss": 4.7187, + "step": 19185 + }, + { + "epoch": 3.7642212632404863, + "grad_norm": 11.451022148132324, + "learning_rate": 8.632673190447305e-08, + "loss": 4.2598, + "step": 19190 + }, + { + "epoch": 3.7652020400156925, + "grad_norm": 24.953075408935547, + "learning_rate": 8.561208554101863e-08, + "loss": 4.3177, + "step": 19195 + }, + { + "epoch": 3.7661828167908986, + "grad_norm": 11.898823738098145, + "learning_rate": 8.490038400234767e-08, + "loss": 4.4786, + "step": 19200 + }, + { + "epoch": 3.7671635935661043, + "grad_norm": 29.196277618408203, + "learning_rate": 8.41916277149285e-08, + "loss": 4.3394, + "step": 19205 + }, + { + "epoch": 3.7681443703413104, + "grad_norm": 20.188243865966797, + "learning_rate": 8.348581710346692e-08, + "loss": 4.3111, + "step": 19210 + }, + { + "epoch": 3.769125147116516, + "grad_norm": 16.19799041748047, + "learning_rate": 8.27829525909013e-08, + "loss": 4.7131, + "step": 19215 + }, + { + "epoch": 3.7701059238917223, + "grad_norm": 16.04700469970703, + "learning_rate": 8.208303459840694e-08, + "loss": 4.2501, + "step": 19220 + }, + { + "epoch": 3.7710867006669284, + "grad_norm": 24.354026794433594, + "learning_rate": 8.138606354539114e-08, + "loss": 4.506, + "step": 19225 + }, + { + "epoch": 3.772067477442134, + "grad_norm": 13.918210983276367, + "learning_rate": 8.069203984949648e-08, + "loss": 4.5426, + "step": 19230 + }, + { + "epoch": 3.7730482542173402, + "grad_norm": 15.953398704528809, + "learning_rate": 8.000096392660029e-08, + "loss": 4.4152, + "step": 19235 + }, + { + "epoch": 3.774029030992546, + "grad_norm": 22.89542579650879, + "learning_rate": 7.931283619081187e-08, + "loss": 4.4485, + "step": 19240 + }, + { + "epoch": 3.775009807767752, + "grad_norm": 14.230191230773926, + "learning_rate": 7.862765705447528e-08, + "loss": 4.6742, + "step": 19245 + }, + { + "epoch": 3.775990584542958, + "grad_norm": 32.547611236572266, + "learning_rate": 7.794542692816654e-08, + "loss": 4.6502, + "step": 19250 + }, + { + "epoch": 3.776971361318164, + "grad_norm": 19.713741302490234, + "learning_rate": 7.726614622069528e-08, + "loss": 4.4383, + "step": 19255 + }, + { + "epoch": 3.77795213809337, + "grad_norm": 23.55909538269043, + "learning_rate": 7.658981533910315e-08, + "loss": 4.8089, + "step": 19260 + }, + { + "epoch": 3.7789329148685757, + "grad_norm": 18.59007453918457, + "learning_rate": 7.591643468866594e-08, + "loss": 4.4116, + "step": 19265 + }, + { + "epoch": 3.779913691643782, + "grad_norm": 28.624298095703125, + "learning_rate": 7.524600467288923e-08, + "loss": 4.6207, + "step": 19270 + }, + { + "epoch": 3.780894468418988, + "grad_norm": 35.744075775146484, + "learning_rate": 7.457852569351165e-08, + "loss": 4.4529, + "step": 19275 + }, + { + "epoch": 3.7818752451941937, + "grad_norm": 14.48076057434082, + "learning_rate": 7.391399815050326e-08, + "loss": 4.3982, + "step": 19280 + }, + { + "epoch": 3.7828560219694, + "grad_norm": 15.289562225341797, + "learning_rate": 7.32524224420661e-08, + "loss": 3.776, + "step": 19285 + }, + { + "epoch": 3.7838367987446055, + "grad_norm": 16.431413650512695, + "learning_rate": 7.259379896463248e-08, + "loss": 5.1205, + "step": 19290 + }, + { + "epoch": 3.7848175755198117, + "grad_norm": 22.791608810424805, + "learning_rate": 7.193812811286615e-08, + "loss": 4.7257, + "step": 19295 + }, + { + "epoch": 3.785798352295018, + "grad_norm": 18.99640464782715, + "learning_rate": 7.12854102796623e-08, + "loss": 4.5685, + "step": 19300 + }, + { + "epoch": 3.7867791290702235, + "grad_norm": 14.632157325744629, + "learning_rate": 7.063564585614524e-08, + "loss": 4.588, + "step": 19305 + }, + { + "epoch": 3.7877599058454297, + "grad_norm": 23.26076316833496, + "learning_rate": 6.998883523167021e-08, + "loss": 4.6019, + "step": 19310 + }, + { + "epoch": 3.7887406826206353, + "grad_norm": 15.142358779907227, + "learning_rate": 6.934497879382218e-08, + "loss": 4.5146, + "step": 19315 + }, + { + "epoch": 3.7897214593958415, + "grad_norm": 24.590253829956055, + "learning_rate": 6.870407692841696e-08, + "loss": 4.2335, + "step": 19320 + }, + { + "epoch": 3.7907022361710476, + "grad_norm": 16.87588882446289, + "learning_rate": 6.806613001949846e-08, + "loss": 4.5, + "step": 19325 + }, + { + "epoch": 3.7916830129462533, + "grad_norm": 15.2261323928833, + "learning_rate": 6.74311384493398e-08, + "loss": 4.4496, + "step": 19330 + }, + { + "epoch": 3.7926637897214595, + "grad_norm": 25.586700439453125, + "learning_rate": 6.679910259844491e-08, + "loss": 5.0063, + "step": 19335 + }, + { + "epoch": 3.793644566496665, + "grad_norm": 22.457977294921875, + "learning_rate": 6.617002284554585e-08, + "loss": 4.1973, + "step": 19340 + }, + { + "epoch": 3.7946253432718713, + "grad_norm": 22.65077018737793, + "learning_rate": 6.554389956760166e-08, + "loss": 4.6877, + "step": 19345 + }, + { + "epoch": 3.7956061200470774, + "grad_norm": 23.340280532836914, + "learning_rate": 6.492073313980274e-08, + "loss": 4.4651, + "step": 19350 + }, + { + "epoch": 3.796586896822283, + "grad_norm": 15.831352233886719, + "learning_rate": 6.430052393556485e-08, + "loss": 4.063, + "step": 19355 + }, + { + "epoch": 3.7975676735974893, + "grad_norm": 21.76120948791504, + "learning_rate": 6.368327232653349e-08, + "loss": 4.4149, + "step": 19360 + }, + { + "epoch": 3.798548450372695, + "grad_norm": 20.37674331665039, + "learning_rate": 6.306897868258167e-08, + "loss": 4.5668, + "step": 19365 + }, + { + "epoch": 3.799529227147901, + "grad_norm": 30.422216415405273, + "learning_rate": 6.245764337180827e-08, + "loss": 4.455, + "step": 19370 + }, + { + "epoch": 3.8005100039231072, + "grad_norm": 27.67301368713379, + "learning_rate": 6.184926676054192e-08, + "loss": 4.6488, + "step": 19375 + }, + { + "epoch": 3.801490780698313, + "grad_norm": 15.223337173461914, + "learning_rate": 6.124384921333714e-08, + "loss": 4.5511, + "step": 19380 + }, + { + "epoch": 3.802471557473519, + "grad_norm": 27.290067672729492, + "learning_rate": 6.064139109297485e-08, + "loss": 4.3251, + "step": 19385 + }, + { + "epoch": 3.8034523342487248, + "grad_norm": 12.322364807128906, + "learning_rate": 6.004189276046346e-08, + "loss": 4.451, + "step": 19390 + }, + { + "epoch": 3.804433111023931, + "grad_norm": 20.460012435913086, + "learning_rate": 5.944535457503731e-08, + "loss": 4.3789, + "step": 19395 + }, + { + "epoch": 3.805413887799137, + "grad_norm": 14.362495422363281, + "learning_rate": 5.885177689415711e-08, + "loss": 4.476, + "step": 19400 + }, + { + "epoch": 3.8063946645743427, + "grad_norm": 16.157991409301758, + "learning_rate": 5.826116007350946e-08, + "loss": 4.6647, + "step": 19405 + }, + { + "epoch": 3.807375441349549, + "grad_norm": 21.604549407958984, + "learning_rate": 5.7673504467006816e-08, + "loss": 4.5372, + "step": 19410 + }, + { + "epoch": 3.8083562181247546, + "grad_norm": 26.53864097595215, + "learning_rate": 5.708881042678749e-08, + "loss": 4.4941, + "step": 19415 + }, + { + "epoch": 3.8093369948999607, + "grad_norm": 36.335411071777344, + "learning_rate": 5.650707830321456e-08, + "loss": 4.4285, + "step": 19420 + }, + { + "epoch": 3.810317771675167, + "grad_norm": 17.700517654418945, + "learning_rate": 5.59283084448764e-08, + "loss": 4.6249, + "step": 19425 + }, + { + "epoch": 3.811298548450373, + "grad_norm": 16.858427047729492, + "learning_rate": 5.5352501198586705e-08, + "loss": 4.4317, + "step": 19430 + }, + { + "epoch": 3.8122793252255787, + "grad_norm": 11.746870994567871, + "learning_rate": 5.477965690938392e-08, + "loss": 4.3457, + "step": 19435 + }, + { + "epoch": 3.8132601020007844, + "grad_norm": 15.386157035827637, + "learning_rate": 5.420977592053067e-08, + "loss": 4.2697, + "step": 19440 + }, + { + "epoch": 3.8142408787759905, + "grad_norm": 19.786962509155273, + "learning_rate": 5.36428585735127e-08, + "loss": 4.7254, + "step": 19445 + }, + { + "epoch": 3.8152216555511966, + "grad_norm": 12.420428276062012, + "learning_rate": 5.307890520804271e-08, + "loss": 4.4631, + "step": 19450 + }, + { + "epoch": 3.8162024323264028, + "grad_norm": 19.548391342163086, + "learning_rate": 5.251791616205537e-08, + "loss": 4.55, + "step": 19455 + }, + { + "epoch": 3.8171832091016085, + "grad_norm": 12.915349960327148, + "learning_rate": 5.1959891771708456e-08, + "loss": 4.5048, + "step": 19460 + }, + { + "epoch": 3.8181639858768146, + "grad_norm": 24.07160758972168, + "learning_rate": 5.14048323713856e-08, + "loss": 4.723, + "step": 19465 + }, + { + "epoch": 3.8191447626520203, + "grad_norm": 15.310680389404297, + "learning_rate": 5.085273829369186e-08, + "loss": 3.9662, + "step": 19470 + }, + { + "epoch": 3.8201255394272264, + "grad_norm": 18.540067672729492, + "learning_rate": 5.0303609869455375e-08, + "loss": 4.6253, + "step": 19475 + }, + { + "epoch": 3.8211063162024326, + "grad_norm": 12.179683685302734, + "learning_rate": 4.975744742772848e-08, + "loss": 4.5506, + "step": 19480 + }, + { + "epoch": 3.8220870929776383, + "grad_norm": 20.053329467773438, + "learning_rate": 4.9214251295784385e-08, + "loss": 4.436, + "step": 19485 + }, + { + "epoch": 3.8230678697528444, + "grad_norm": 18.177942276000977, + "learning_rate": 4.8674021799121064e-08, + "loss": 4.3544, + "step": 19490 + }, + { + "epoch": 3.82404864652805, + "grad_norm": 18.189285278320312, + "learning_rate": 4.813675926145678e-08, + "loss": 4.4312, + "step": 19495 + }, + { + "epoch": 3.8250294233032562, + "grad_norm": 40.68314743041992, + "learning_rate": 4.760246400473345e-08, + "loss": 4.597, + "step": 19500 + }, + { + "epoch": 3.8260102000784624, + "grad_norm": 24.781036376953125, + "learning_rate": 4.707113634911387e-08, + "loss": 4.3725, + "step": 19505 + }, + { + "epoch": 3.826990976853668, + "grad_norm": 14.774175643920898, + "learning_rate": 4.654277661298223e-08, + "loss": 4.2771, + "step": 19510 + }, + { + "epoch": 3.827971753628874, + "grad_norm": 21.735511779785156, + "learning_rate": 4.60173851129464e-08, + "loss": 4.4238, + "step": 19515 + }, + { + "epoch": 3.82895253040408, + "grad_norm": 26.12150764465332, + "learning_rate": 4.549496216383287e-08, + "loss": 4.87, + "step": 19520 + }, + { + "epoch": 3.829933307179286, + "grad_norm": 21.527135848999023, + "learning_rate": 4.497550807869122e-08, + "loss": 4.2331, + "step": 19525 + }, + { + "epoch": 3.830914083954492, + "grad_norm": 17.46333122253418, + "learning_rate": 4.44590231687908e-08, + "loss": 4.7262, + "step": 19530 + }, + { + "epoch": 3.831894860729698, + "grad_norm": 13.620034217834473, + "learning_rate": 4.394550774362349e-08, + "loss": 4.7126, + "step": 19535 + }, + { + "epoch": 3.832875637504904, + "grad_norm": 13.067037582397461, + "learning_rate": 4.343496211089981e-08, + "loss": 4.4245, + "step": 19540 + }, + { + "epoch": 3.8338564142801097, + "grad_norm": 33.16718292236328, + "learning_rate": 4.292738657655171e-08, + "loss": 4.3331, + "step": 19545 + }, + { + "epoch": 3.834837191055316, + "grad_norm": 15.34744644165039, + "learning_rate": 4.242278144473144e-08, + "loss": 4.2482, + "step": 19550 + }, + { + "epoch": 3.835817967830522, + "grad_norm": 14.823765754699707, + "learning_rate": 4.192114701781047e-08, + "loss": 4.3858, + "step": 19555 + }, + { + "epoch": 3.8367987446057277, + "grad_norm": 18.384389877319336, + "learning_rate": 4.142248359638168e-08, + "loss": 4.483, + "step": 19560 + }, + { + "epoch": 3.837779521380934, + "grad_norm": 13.538766860961914, + "learning_rate": 4.092679147925604e-08, + "loss": 4.599, + "step": 19565 + }, + { + "epoch": 3.8387602981561395, + "grad_norm": 16.762392044067383, + "learning_rate": 4.043407096346486e-08, + "loss": 4.2274, + "step": 19570 + }, + { + "epoch": 3.8397410749313456, + "grad_norm": 31.2176570892334, + "learning_rate": 3.99443223442586e-08, + "loss": 4.4158, + "step": 19575 + }, + { + "epoch": 3.840721851706552, + "grad_norm": 23.36688804626465, + "learning_rate": 3.945754591510698e-08, + "loss": 4.5466, + "step": 19580 + }, + { + "epoch": 3.8417026284817575, + "grad_norm": 22.09506607055664, + "learning_rate": 3.8973741967698874e-08, + "loss": 4.5284, + "step": 19585 + }, + { + "epoch": 3.8426834052569636, + "grad_norm": 31.000293731689453, + "learning_rate": 3.849291079194184e-08, + "loss": 4.895, + "step": 19590 + }, + { + "epoch": 3.8436641820321693, + "grad_norm": 20.989912033081055, + "learning_rate": 3.8015052675961505e-08, + "loss": 4.5427, + "step": 19595 + }, + { + "epoch": 3.8446449588073754, + "grad_norm": 16.181804656982422, + "learning_rate": 3.754016790610271e-08, + "loss": 4.3438, + "step": 19600 + }, + { + "epoch": 3.8456257355825816, + "grad_norm": 18.582027435302734, + "learning_rate": 3.706825676692838e-08, + "loss": 4.3791, + "step": 19605 + }, + { + "epoch": 3.8466065123577873, + "grad_norm": 42.60034942626953, + "learning_rate": 3.659931954121954e-08, + "loss": 4.6263, + "step": 19610 + }, + { + "epoch": 3.8475872891329934, + "grad_norm": 15.503507614135742, + "learning_rate": 3.613335650997585e-08, + "loss": 4.6023, + "step": 19615 + }, + { + "epoch": 3.848568065908199, + "grad_norm": 16.451189041137695, + "learning_rate": 3.56703679524123e-08, + "loss": 4.2281, + "step": 19620 + }, + { + "epoch": 3.8495488426834052, + "grad_norm": 18.993877410888672, + "learning_rate": 3.52103541459653e-08, + "loss": 4.5004, + "step": 19625 + }, + { + "epoch": 3.8505296194586114, + "grad_norm": 14.785061836242676, + "learning_rate": 3.4753315366284904e-08, + "loss": 4.1803, + "step": 19630 + }, + { + "epoch": 3.851510396233817, + "grad_norm": 19.010860443115234, + "learning_rate": 3.429925188724148e-08, + "loss": 4.3486, + "step": 19635 + }, + { + "epoch": 3.852491173009023, + "grad_norm": 20.439987182617188, + "learning_rate": 3.384816398092128e-08, + "loss": 4.5895, + "step": 19640 + }, + { + "epoch": 3.853471949784229, + "grad_norm": 20.579715728759766, + "learning_rate": 3.3400051917626964e-08, + "loss": 4.752, + "step": 19645 + }, + { + "epoch": 3.854452726559435, + "grad_norm": 35.6832389831543, + "learning_rate": 3.295491596587874e-08, + "loss": 4.3648, + "step": 19650 + }, + { + "epoch": 3.855433503334641, + "grad_norm": 24.63140296936035, + "learning_rate": 3.251275639241269e-08, + "loss": 4.9887, + "step": 19655 + }, + { + "epoch": 3.856414280109847, + "grad_norm": 20.618412017822266, + "learning_rate": 3.2073573462182984e-08, + "loss": 4.5265, + "step": 19660 + }, + { + "epoch": 3.857395056885053, + "grad_norm": 25.951919555664062, + "learning_rate": 3.1637367438358544e-08, + "loss": 4.9724, + "step": 19665 + }, + { + "epoch": 3.8583758336602587, + "grad_norm": 23.954572677612305, + "learning_rate": 3.120413858232474e-08, + "loss": 4.7667, + "step": 19670 + }, + { + "epoch": 3.859356610435465, + "grad_norm": 13.720315933227539, + "learning_rate": 3.07738871536839e-08, + "loss": 4.3292, + "step": 19675 + }, + { + "epoch": 3.860337387210671, + "grad_norm": 15.496363639831543, + "learning_rate": 3.034661341025258e-08, + "loss": 4.6116, + "step": 19680 + }, + { + "epoch": 3.8613181639858767, + "grad_norm": 60.01427459716797, + "learning_rate": 2.9922317608064856e-08, + "loss": 4.4719, + "step": 19685 + }, + { + "epoch": 3.862298940761083, + "grad_norm": 15.243767738342285, + "learning_rate": 2.9501000001369018e-08, + "loss": 4.3308, + "step": 19690 + }, + { + "epoch": 3.8632797175362885, + "grad_norm": 29.687528610229492, + "learning_rate": 2.9082660842628674e-08, + "loss": 4.7989, + "step": 19695 + }, + { + "epoch": 3.8642604943114947, + "grad_norm": 35.74703598022461, + "learning_rate": 2.8667300382523855e-08, + "loss": 4.586, + "step": 19700 + }, + { + "epoch": 3.865241271086701, + "grad_norm": 22.265159606933594, + "learning_rate": 2.82549188699488e-08, + "loss": 4.8159, + "step": 19705 + }, + { + "epoch": 3.8662220478619065, + "grad_norm": 16.983205795288086, + "learning_rate": 2.7845516552013064e-08, + "loss": 4.362, + "step": 19710 + }, + { + "epoch": 3.8672028246371126, + "grad_norm": 15.669620513916016, + "learning_rate": 2.7439093674040406e-08, + "loss": 4.1492, + "step": 19715 + }, + { + "epoch": 3.8681836014123183, + "grad_norm": 11.486489295959473, + "learning_rate": 2.7035650479570463e-08, + "loss": 4.6597, + "step": 19720 + }, + { + "epoch": 3.8691643781875245, + "grad_norm": 19.21707534790039, + "learning_rate": 2.6635187210355408e-08, + "loss": 4.4448, + "step": 19725 + }, + { + "epoch": 3.8701451549627306, + "grad_norm": 28.177640914916992, + "learning_rate": 2.6237704106363282e-08, + "loss": 4.4331, + "step": 19730 + }, + { + "epoch": 3.8711259317379363, + "grad_norm": 19.105148315429688, + "learning_rate": 2.584320140577634e-08, + "loss": 4.6519, + "step": 19735 + }, + { + "epoch": 3.8721067085131424, + "grad_norm": 17.474411010742188, + "learning_rate": 2.5451679344989934e-08, + "loss": 4.5222, + "step": 19740 + }, + { + "epoch": 3.873087485288348, + "grad_norm": 14.374054908752441, + "learning_rate": 2.506313815861472e-08, + "loss": 4.4389, + "step": 19745 + }, + { + "epoch": 3.8740682620635543, + "grad_norm": 29.551166534423828, + "learning_rate": 2.467757807947335e-08, + "loss": 4.3835, + "step": 19750 + }, + { + "epoch": 3.8750490388387604, + "grad_norm": 14.76496696472168, + "learning_rate": 2.4294999338604352e-08, + "loss": 4.2703, + "step": 19755 + }, + { + "epoch": 3.8760298156139665, + "grad_norm": 27.193767547607422, + "learning_rate": 2.391540216525712e-08, + "loss": 4.7443, + "step": 19760 + }, + { + "epoch": 3.8770105923891722, + "grad_norm": 33.37945556640625, + "learning_rate": 2.3538786786896918e-08, + "loss": 4.8142, + "step": 19765 + }, + { + "epoch": 3.877991369164378, + "grad_norm": 36.48703384399414, + "learning_rate": 2.316515342920045e-08, + "loss": 4.3292, + "step": 19770 + }, + { + "epoch": 3.878972145939584, + "grad_norm": 13.171440124511719, + "learning_rate": 2.279450231605862e-08, + "loss": 4.277, + "step": 19775 + }, + { + "epoch": 3.87995292271479, + "grad_norm": 22.090845108032227, + "learning_rate": 2.2426833669574875e-08, + "loss": 4.2819, + "step": 19780 + }, + { + "epoch": 3.8809336994899963, + "grad_norm": 23.142976760864258, + "learning_rate": 2.2062147710065208e-08, + "loss": 4.5399, + "step": 19785 + }, + { + "epoch": 3.881914476265202, + "grad_norm": 21.87250328063965, + "learning_rate": 2.170044465605925e-08, + "loss": 4.4368, + "step": 19790 + }, + { + "epoch": 3.882895253040408, + "grad_norm": 22.828344345092773, + "learning_rate": 2.1341724724298073e-08, + "loss": 4.2221, + "step": 19795 + }, + { + "epoch": 3.883876029815614, + "grad_norm": 18.955265045166016, + "learning_rate": 2.0985988129735847e-08, + "loss": 4.3943, + "step": 19800 + }, + { + "epoch": 3.88485680659082, + "grad_norm": 14.534862518310547, + "learning_rate": 2.063323508553816e-08, + "loss": 4.3378, + "step": 19805 + }, + { + "epoch": 3.885837583366026, + "grad_norm": 22.451967239379883, + "learning_rate": 2.028346580308427e-08, + "loss": 4.3979, + "step": 19810 + }, + { + "epoch": 3.886818360141232, + "grad_norm": 22.964393615722656, + "learning_rate": 1.99366804919654e-08, + "loss": 4.3687, + "step": 19815 + }, + { + "epoch": 3.887799136916438, + "grad_norm": 15.007269859313965, + "learning_rate": 1.9592879359981998e-08, + "loss": 4.5002, + "step": 19820 + }, + { + "epoch": 3.8887799136916437, + "grad_norm": 22.878572463989258, + "learning_rate": 1.925206261315038e-08, + "loss": 4.9794, + "step": 19825 + }, + { + "epoch": 3.88976069046685, + "grad_norm": 32.9472770690918, + "learning_rate": 1.8914230455695514e-08, + "loss": 4.5072, + "step": 19830 + }, + { + "epoch": 3.890741467242056, + "grad_norm": 41.51423645019531, + "learning_rate": 1.8579383090054915e-08, + "loss": 4.4305, + "step": 19835 + }, + { + "epoch": 3.8917222440172616, + "grad_norm": 14.0291748046875, + "learning_rate": 1.8247520716878075e-08, + "loss": 4.2225, + "step": 19840 + }, + { + "epoch": 3.8927030207924678, + "grad_norm": 12.759921073913574, + "learning_rate": 1.7918643535024816e-08, + "loss": 4.5276, + "step": 19845 + }, + { + "epoch": 3.8936837975676735, + "grad_norm": 21.33737564086914, + "learning_rate": 1.7592751741566384e-08, + "loss": 4.448, + "step": 19850 + }, + { + "epoch": 3.8946645743428796, + "grad_norm": 26.02752685546875, + "learning_rate": 1.726984553178601e-08, + "loss": 4.4764, + "step": 19855 + }, + { + "epoch": 3.8956453511180857, + "grad_norm": 15.944296836853027, + "learning_rate": 1.69499250991767e-08, + "loss": 4.5847, + "step": 19860 + }, + { + "epoch": 3.8966261278932914, + "grad_norm": 21.915067672729492, + "learning_rate": 1.663299063544288e-08, + "loss": 4.9598, + "step": 19865 + }, + { + "epoch": 3.8976069046684976, + "grad_norm": 26.551939010620117, + "learning_rate": 1.6319042330500413e-08, + "loss": 4.7225, + "step": 19870 + }, + { + "epoch": 3.8985876814437033, + "grad_norm": 24.626413345336914, + "learning_rate": 1.600808037247381e-08, + "loss": 4.1449, + "step": 19875 + }, + { + "epoch": 3.8995684582189094, + "grad_norm": 9.839727401733398, + "learning_rate": 1.570010494769958e-08, + "loss": 4.6128, + "step": 19880 + }, + { + "epoch": 3.9005492349941155, + "grad_norm": 34.44369888305664, + "learning_rate": 1.5395116240725093e-08, + "loss": 4.5166, + "step": 19885 + }, + { + "epoch": 3.9015300117693212, + "grad_norm": 19.511125564575195, + "learning_rate": 1.5093114434306388e-08, + "loss": 4.49, + "step": 19890 + }, + { + "epoch": 3.9025107885445274, + "grad_norm": 12.833151817321777, + "learning_rate": 1.4794099709410925e-08, + "loss": 4.4542, + "step": 19895 + }, + { + "epoch": 3.903491565319733, + "grad_norm": 12.988509178161621, + "learning_rate": 1.4498072245216488e-08, + "loss": 4.5994, + "step": 19900 + }, + { + "epoch": 3.904472342094939, + "grad_norm": 24.291364669799805, + "learning_rate": 1.420503221910896e-08, + "loss": 4.2495, + "step": 19905 + }, + { + "epoch": 3.9054531188701453, + "grad_norm": 31.334964752197266, + "learning_rate": 1.3914979806685659e-08, + "loss": 4.5936, + "step": 19910 + }, + { + "epoch": 3.906433895645351, + "grad_norm": 38.430110931396484, + "learning_rate": 1.3627915181753659e-08, + "loss": 4.568, + "step": 19915 + }, + { + "epoch": 3.907414672420557, + "grad_norm": 23.217266082763672, + "learning_rate": 1.3343838516329255e-08, + "loss": 4.4528, + "step": 19920 + }, + { + "epoch": 3.908395449195763, + "grad_norm": 18.199256896972656, + "learning_rate": 1.3062749980637946e-08, + "loss": 4.5258, + "step": 19925 + }, + { + "epoch": 3.909376225970969, + "grad_norm": 16.116588592529297, + "learning_rate": 1.2784649743115551e-08, + "loss": 4.4565, + "step": 19930 + }, + { + "epoch": 3.910357002746175, + "grad_norm": 26.552534103393555, + "learning_rate": 1.2509537970406549e-08, + "loss": 4.6301, + "step": 19935 + }, + { + "epoch": 3.911337779521381, + "grad_norm": 15.603949546813965, + "learning_rate": 1.2237414827364624e-08, + "loss": 4.4856, + "step": 19940 + }, + { + "epoch": 3.912318556296587, + "grad_norm": 10.514763832092285, + "learning_rate": 1.1968280477052673e-08, + "loss": 4.4509, + "step": 19945 + }, + { + "epoch": 3.9132993330717927, + "grad_norm": 17.823230743408203, + "learning_rate": 1.170213508074336e-08, + "loss": 4.0671, + "step": 19950 + }, + { + "epoch": 3.914280109846999, + "grad_norm": 23.3159122467041, + "learning_rate": 1.1438978797916888e-08, + "loss": 4.7467, + "step": 19955 + }, + { + "epoch": 3.915260886622205, + "grad_norm": 33.341434478759766, + "learning_rate": 1.1178811786263787e-08, + "loss": 4.4565, + "step": 19960 + }, + { + "epoch": 3.9162416633974106, + "grad_norm": 20.232297897338867, + "learning_rate": 1.0921634201682685e-08, + "loss": 4.4115, + "step": 19965 + }, + { + "epoch": 3.917222440172617, + "grad_norm": 24.05611801147461, + "learning_rate": 1.0667446198280307e-08, + "loss": 4.2384, + "step": 19970 + }, + { + "epoch": 3.9182032169478225, + "grad_norm": 9.7948637008667, + "learning_rate": 1.0416247928373147e-08, + "loss": 4.2928, + "step": 19975 + }, + { + "epoch": 3.9191839937230286, + "grad_norm": 18.230249404907227, + "learning_rate": 1.0168039542485242e-08, + "loss": 4.7602, + "step": 19980 + }, + { + "epoch": 3.9201647704982348, + "grad_norm": 22.710430145263672, + "learning_rate": 9.922821189348731e-09, + "loss": 4.8469, + "step": 19985 + }, + { + "epoch": 3.9211455472734404, + "grad_norm": 19.150442123413086, + "learning_rate": 9.680593015905515e-09, + "loss": 4.3717, + "step": 19990 + }, + { + "epoch": 3.9221263240486466, + "grad_norm": 14.786493301391602, + "learning_rate": 9.44135516730449e-09, + "loss": 4.3758, + "step": 19995 + }, + { + "epoch": 3.9231071008238523, + "grad_norm": 16.019426345825195, + "learning_rate": 9.205107786902646e-09, + "loss": 4.5163, + "step": 20000 + }, + { + "epoch": 3.9240878775990584, + "grad_norm": 22.1610107421875, + "learning_rate": 8.971851016265631e-09, + "loss": 4.4827, + "step": 20005 + }, + { + "epoch": 3.9250686543742646, + "grad_norm": 22.63068199157715, + "learning_rate": 8.741584995167195e-09, + "loss": 4.2595, + "step": 20010 + }, + { + "epoch": 3.9260494311494702, + "grad_norm": 11.803055763244629, + "learning_rate": 8.514309861588077e-09, + "loss": 4.436, + "step": 20015 + }, + { + "epoch": 3.9270302079246764, + "grad_norm": 30.385862350463867, + "learning_rate": 8.290025751716558e-09, + "loss": 4.6062, + "step": 20020 + }, + { + "epoch": 3.928010984699882, + "grad_norm": 18.616378784179688, + "learning_rate": 8.068732799950685e-09, + "loss": 4.5166, + "step": 20025 + }, + { + "epoch": 3.928991761475088, + "grad_norm": 11.325064659118652, + "learning_rate": 7.850431138893833e-09, + "loss": 4.3665, + "step": 20030 + }, + { + "epoch": 3.9299725382502944, + "grad_norm": 20.41812515258789, + "learning_rate": 7.635120899358029e-09, + "loss": 4.3084, + "step": 20035 + }, + { + "epoch": 3.9309533150255, + "grad_norm": 25.305400848388672, + "learning_rate": 7.422802210362845e-09, + "loss": 4.364, + "step": 20040 + }, + { + "epoch": 3.931934091800706, + "grad_norm": 15.921910285949707, + "learning_rate": 7.213475199134845e-09, + "loss": 4.3632, + "step": 20045 + }, + { + "epoch": 3.932914868575912, + "grad_norm": 17.5665283203125, + "learning_rate": 7.007139991108136e-09, + "loss": 4.1237, + "step": 20050 + }, + { + "epoch": 3.933895645351118, + "grad_norm": 15.96419620513916, + "learning_rate": 6.8037967099232604e-09, + "loss": 4.4997, + "step": 20055 + }, + { + "epoch": 3.934876422126324, + "grad_norm": 38.77703094482422, + "learning_rate": 6.60344547742997e-09, + "loss": 4.9041, + "step": 20060 + }, + { + "epoch": 3.93585719890153, + "grad_norm": 16.583595275878906, + "learning_rate": 6.406086413682233e-09, + "loss": 4.3876, + "step": 20065 + }, + { + "epoch": 3.936837975676736, + "grad_norm": 24.678234100341797, + "learning_rate": 6.211719636943781e-09, + "loss": 4.18, + "step": 20070 + }, + { + "epoch": 3.9378187524519417, + "grad_norm": 12.968330383300781, + "learning_rate": 6.020345263683114e-09, + "loss": 4.8268, + "step": 20075 + }, + { + "epoch": 3.938799529227148, + "grad_norm": 17.95810317993164, + "learning_rate": 5.83196340857739e-09, + "loss": 4.5699, + "step": 20080 + }, + { + "epoch": 3.939780306002354, + "grad_norm": 22.08363914489746, + "learning_rate": 5.646574184509646e-09, + "loss": 4.9704, + "step": 20085 + }, + { + "epoch": 3.94076108277756, + "grad_norm": 13.76091480255127, + "learning_rate": 5.464177702568796e-09, + "loss": 4.4924, + "step": 20090 + }, + { + "epoch": 3.941741859552766, + "grad_norm": 19.462295532226562, + "learning_rate": 5.2847740720529674e-09, + "loss": 4.3258, + "step": 20095 + }, + { + "epoch": 3.9427226363279715, + "grad_norm": 11.40530014038086, + "learning_rate": 5.108363400463945e-09, + "loss": 4.581, + "step": 20100 + }, + { + "epoch": 3.9437034131031776, + "grad_norm": 16.985126495361328, + "learning_rate": 4.9349457935121695e-09, + "loss": 4.7181, + "step": 20105 + }, + { + "epoch": 3.9446841898783838, + "grad_norm": 36.759437561035156, + "learning_rate": 4.764521355113405e-09, + "loss": 4.2881, + "step": 20110 + }, + { + "epoch": 3.94566496665359, + "grad_norm": 22.106002807617188, + "learning_rate": 4.597090187390407e-09, + "loss": 4.3794, + "step": 20115 + }, + { + "epoch": 3.9466457434287956, + "grad_norm": 32.104759216308594, + "learning_rate": 4.43265239067292e-09, + "loss": 4.6535, + "step": 20120 + }, + { + "epoch": 3.9476265202040017, + "grad_norm": 21.282289505004883, + "learning_rate": 4.2712080634949024e-09, + "loss": 4.3232, + "step": 20125 + }, + { + "epoch": 3.9486072969792074, + "grad_norm": 25.346208572387695, + "learning_rate": 4.112757302598414e-09, + "loss": 4.4755, + "step": 20130 + }, + { + "epoch": 3.9495880737544136, + "grad_norm": 22.805927276611328, + "learning_rate": 3.957300202931391e-09, + "loss": 4.6681, + "step": 20135 + }, + { + "epoch": 3.9505688505296197, + "grad_norm": 27.887910842895508, + "learning_rate": 3.804836857647654e-09, + "loss": 4.5999, + "step": 20140 + }, + { + "epoch": 3.9515496273048254, + "grad_norm": 22.663606643676758, + "learning_rate": 3.655367358106343e-09, + "loss": 4.4651, + "step": 20145 + }, + { + "epoch": 3.9525304040800315, + "grad_norm": 11.637248992919922, + "learning_rate": 3.5088917938741473e-09, + "loss": 4.259, + "step": 20150 + }, + { + "epoch": 3.9535111808552372, + "grad_norm": 25.06681251525879, + "learning_rate": 3.365410252723078e-09, + "loss": 4.4804, + "step": 20155 + }, + { + "epoch": 3.9544919576304434, + "grad_norm": 34.771358489990234, + "learning_rate": 3.2249228206299165e-09, + "loss": 4.5491, + "step": 20160 + }, + { + "epoch": 3.9554727344056495, + "grad_norm": 14.006319999694824, + "learning_rate": 3.087429581778434e-09, + "loss": 4.258, + "step": 20165 + }, + { + "epoch": 3.956453511180855, + "grad_norm": 22.197072982788086, + "learning_rate": 2.952930618558836e-09, + "loss": 4.6398, + "step": 20170 + }, + { + "epoch": 3.9574342879560613, + "grad_norm": 11.157323837280273, + "learning_rate": 2.8214260115655424e-09, + "loss": 4.6351, + "step": 20175 + }, + { + "epoch": 3.958415064731267, + "grad_norm": 14.1424560546875, + "learning_rate": 2.6929158395999634e-09, + "loss": 4.5076, + "step": 20180 + }, + { + "epoch": 3.959395841506473, + "grad_norm": 24.90903663635254, + "learning_rate": 2.567400179667723e-09, + "loss": 4.3885, + "step": 20185 + }, + { + "epoch": 3.9603766182816793, + "grad_norm": 16.46721076965332, + "learning_rate": 2.444879106982545e-09, + "loss": 4.8088, + "step": 20190 + }, + { + "epoch": 3.961357395056885, + "grad_norm": 32.02701187133789, + "learning_rate": 2.325352694960148e-09, + "loss": 4.7701, + "step": 20195 + }, + { + "epoch": 3.962338171832091, + "grad_norm": 22.428773880004883, + "learning_rate": 2.2088210152254596e-09, + "loss": 4.4474, + "step": 20200 + }, + { + "epoch": 3.963318948607297, + "grad_norm": 27.974700927734375, + "learning_rate": 2.0952841376065124e-09, + "loss": 4.6453, + "step": 20205 + }, + { + "epoch": 3.964299725382503, + "grad_norm": 29.08708381652832, + "learning_rate": 1.984742130137218e-09, + "loss": 4.4979, + "step": 20210 + }, + { + "epoch": 3.965280502157709, + "grad_norm": 12.308250427246094, + "learning_rate": 1.8771950590573686e-09, + "loss": 4.4553, + "step": 20215 + }, + { + "epoch": 3.966261278932915, + "grad_norm": 18.78961944580078, + "learning_rate": 1.7726429888120788e-09, + "loss": 4.308, + "step": 20220 + }, + { + "epoch": 3.967242055708121, + "grad_norm": 11.963844299316406, + "learning_rate": 1.6710859820512348e-09, + "loss": 4.4348, + "step": 20225 + }, + { + "epoch": 3.9682228324833266, + "grad_norm": 19.877851486206055, + "learning_rate": 1.5725240996306013e-09, + "loss": 4.1762, + "step": 20230 + }, + { + "epoch": 3.9692036092585328, + "grad_norm": 30.30404281616211, + "learning_rate": 1.4769574006107124e-09, + "loss": 4.9034, + "step": 20235 + }, + { + "epoch": 3.970184386033739, + "grad_norm": 22.604032516479492, + "learning_rate": 1.3843859422574269e-09, + "loss": 4.3639, + "step": 20240 + }, + { + "epoch": 3.9711651628089446, + "grad_norm": 22.842365264892578, + "learning_rate": 1.294809780042483e-09, + "loss": 4.1495, + "step": 20245 + }, + { + "epoch": 3.9721459395841507, + "grad_norm": 25.694583892822266, + "learning_rate": 1.2082289676412784e-09, + "loss": 4.9004, + "step": 20250 + }, + { + "epoch": 3.9731267163593564, + "grad_norm": 30.319358825683594, + "learning_rate": 1.1246435569362002e-09, + "loss": 4.5087, + "step": 20255 + }, + { + "epoch": 3.9741074931345626, + "grad_norm": 23.130138397216797, + "learning_rate": 1.0440535980132948e-09, + "loss": 4.0759, + "step": 20260 + }, + { + "epoch": 3.9750882699097687, + "grad_norm": 26.70267677307129, + "learning_rate": 9.664591391639333e-10, + "loss": 4.2694, + "step": 20265 + }, + { + "epoch": 3.9760690466849744, + "grad_norm": 21.76305389404297, + "learning_rate": 8.918602268848109e-10, + "loss": 4.3865, + "step": 20270 + }, + { + "epoch": 3.9770498234601805, + "grad_norm": 12.993515968322754, + "learning_rate": 8.202569058773924e-10, + "loss": 4.225, + "step": 20275 + }, + { + "epoch": 3.9780306002353862, + "grad_norm": 17.48309326171875, + "learning_rate": 7.516492190479118e-10, + "loss": 4.3941, + "step": 20280 + }, + { + "epoch": 3.9790113770105924, + "grad_norm": 15.2409029006958, + "learning_rate": 6.860372075084831e-10, + "loss": 4.4058, + "step": 20285 + }, + { + "epoch": 3.9799921537857985, + "grad_norm": 12.336922645568848, + "learning_rate": 6.234209105754341e-10, + "loss": 4.6124, + "step": 20290 + }, + { + "epoch": 3.980972930561004, + "grad_norm": 19.70476722717285, + "learning_rate": 5.63800365769307e-10, + "loss": 4.2905, + "step": 20295 + }, + { + "epoch": 3.9819537073362103, + "grad_norm": 18.989845275878906, + "learning_rate": 5.071756088165236e-10, + "loss": 4.5843, + "step": 20300 + }, + { + "epoch": 3.982934484111416, + "grad_norm": 20.936487197875977, + "learning_rate": 4.535466736488303e-10, + "loss": 4.546, + "step": 20305 + }, + { + "epoch": 3.983915260886622, + "grad_norm": 11.952984809875488, + "learning_rate": 4.029135924005223e-10, + "loss": 4.3485, + "step": 20310 + }, + { + "epoch": 3.9848960376618283, + "grad_norm": 12.227151870727539, + "learning_rate": 3.5527639541399486e-10, + "loss": 4.4553, + "step": 20315 + }, + { + "epoch": 3.985876814437034, + "grad_norm": 38.21388244628906, + "learning_rate": 3.1063511123308187e-10, + "loss": 4.319, + "step": 20320 + }, + { + "epoch": 3.98685759121224, + "grad_norm": 25.145599365234375, + "learning_rate": 2.689897666091623e-10, + "loss": 4.609, + "step": 20325 + }, + { + "epoch": 3.987838367987446, + "grad_norm": 48.00320053100586, + "learning_rate": 2.3034038649616398e-10, + "loss": 4.4196, + "step": 20330 + }, + { + "epoch": 3.988819144762652, + "grad_norm": 34.582763671875, + "learning_rate": 1.9468699405444936e-10, + "loss": 4.4329, + "step": 20335 + }, + { + "epoch": 3.989799921537858, + "grad_norm": 19.579330444335938, + "learning_rate": 1.6202961064804013e-10, + "loss": 4.5891, + "step": 20340 + }, + { + "epoch": 3.990780698313064, + "grad_norm": 31.714948654174805, + "learning_rate": 1.3236825584628242e-10, + "loss": 4.4694, + "step": 20345 + }, + { + "epoch": 3.99176147508827, + "grad_norm": 24.47707176208496, + "learning_rate": 1.0570294742329179e-10, + "loss": 4.6973, + "step": 20350 + }, + { + "epoch": 3.9927422518634756, + "grad_norm": 21.718971252441406, + "learning_rate": 8.203370135684286e-11, + "loss": 4.7068, + "step": 20355 + }, + { + "epoch": 3.993723028638682, + "grad_norm": 22.66058921813965, + "learning_rate": 6.136053183058987e-11, + "loss": 4.4124, + "step": 20360 + }, + { + "epoch": 3.994703805413888, + "grad_norm": 28.243078231811523, + "learning_rate": 4.368345123295648e-11, + "loss": 4.3122, + "step": 20365 + }, + { + "epoch": 3.9956845821890936, + "grad_norm": 12.159256935119629, + "learning_rate": 2.900247015547031e-11, + "loss": 4.3095, + "step": 20370 + }, + { + "epoch": 3.9966653589642998, + "grad_norm": 17.810564041137695, + "learning_rate": 1.731759739553862e-11, + "loss": 4.7029, + "step": 20375 + }, + { + "epoch": 3.9976461357395054, + "grad_norm": 13.572352409362793, + "learning_rate": 8.628839955893143e-12, + "loss": 4.5484, + "step": 20380 + }, + { + "epoch": 3.9986269125147116, + "grad_norm": 18.063383102416992, + "learning_rate": 2.936203042369634e-12, + "loss": 4.3011, + "step": 20385 + }, + { + "epoch": 3.9996076892899177, + "grad_norm": 15.351741790771484, + "learning_rate": 2.3969006557322816e-13, + "loss": 4.4443, + "step": 20390 + } + ], + "logging_steps": 5, + "max_steps": 20392, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 102, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.058850979271475e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}