{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 8795, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.6929430365562439, "learning_rate": 0.0001999998405083484, "loss": 1.6838, "step": 5 }, { "epoch": 0.0, "grad_norm": 0.5281903743743896, "learning_rate": 0.00019999936203390236, "loss": 1.4915, "step": 10 }, { "epoch": 0.0, "grad_norm": 0.6337839961051941, "learning_rate": 0.0001999985645781881, "loss": 1.2662, "step": 15 }, { "epoch": 0.0, "grad_norm": 0.6382477879524231, "learning_rate": 0.00019999744814374942, "loss": 1.1492, "step": 20 }, { "epoch": 0.0, "grad_norm": 0.6016966104507446, "learning_rate": 0.0001999960127341475, "loss": 1.189, "step": 25 }, { "epoch": 0.0, "grad_norm": 0.48733243346214294, "learning_rate": 0.00019999425835396113, "loss": 1.2111, "step": 30 }, { "epoch": 0.0, "grad_norm": 0.4674977660179138, "learning_rate": 0.0001999921850087864, "loss": 1.0462, "step": 35 }, { "epoch": 0.0, "grad_norm": 0.48517295718193054, "learning_rate": 0.00019998979270523704, "loss": 0.9653, "step": 40 }, { "epoch": 0.01, "grad_norm": 0.5423499345779419, "learning_rate": 0.000199987081450944, "loss": 1.0769, "step": 45 }, { "epoch": 0.01, "grad_norm": 0.5383784174919128, "learning_rate": 0.0001999840512545558, "loss": 1.0471, "step": 50 }, { "epoch": 0.01, "grad_norm": 0.37535467743873596, "learning_rate": 0.00019998070212573824, "loss": 1.1045, "step": 55 }, { "epoch": 0.01, "grad_norm": 0.5334364175796509, "learning_rate": 0.00019997703407517443, "loss": 0.9794, "step": 60 }, { "epoch": 0.01, "grad_norm": 0.5327732563018799, "learning_rate": 0.0001999730471145649, "loss": 1.0664, "step": 65 }, { "epoch": 0.01, "grad_norm": 0.4103855788707733, "learning_rate": 0.0001999687412566274, "loss": 1.0529, "step": 70 }, { "epoch": 0.01, "grad_norm": 0.5592668652534485, "learning_rate": 0.00019996411651509684, "loss": 0.9661, "step": 75 }, { "epoch": 0.01, "grad_norm": 0.5078962445259094, "learning_rate": 0.0001999591729047254, "loss": 1.0836, "step": 80 }, { "epoch": 0.01, "grad_norm": 0.5035337209701538, "learning_rate": 0.0001999539104412824, "loss": 1.0011, "step": 85 }, { "epoch": 0.01, "grad_norm": 0.5704492330551147, "learning_rate": 0.00019994832914155416, "loss": 0.9957, "step": 90 }, { "epoch": 0.01, "grad_norm": 0.4775836169719696, "learning_rate": 0.00019994242902334416, "loss": 1.0125, "step": 95 }, { "epoch": 0.01, "grad_norm": 0.4531150460243225, "learning_rate": 0.00019993621010547277, "loss": 0.9085, "step": 100 }, { "epoch": 0.01, "grad_norm": 0.5194448232650757, "learning_rate": 0.00019992967240777727, "loss": 0.9871, "step": 105 }, { "epoch": 0.01, "grad_norm": 0.44902467727661133, "learning_rate": 0.00019992281595111185, "loss": 0.912, "step": 110 }, { "epoch": 0.01, "grad_norm": 0.5913822650909424, "learning_rate": 0.00019991564075734744, "loss": 0.941, "step": 115 }, { "epoch": 0.01, "grad_norm": 0.524722158908844, "learning_rate": 0.00019990814684937174, "loss": 1.0098, "step": 120 }, { "epoch": 0.01, "grad_norm": 0.6472988724708557, "learning_rate": 0.00019990033425108905, "loss": 0.9765, "step": 125 }, { "epoch": 0.01, "grad_norm": 0.5326634049415588, "learning_rate": 0.00019989220298742026, "loss": 0.9342, "step": 130 }, { "epoch": 0.02, "grad_norm": 0.5234887599945068, "learning_rate": 0.00019988375308430275, "loss": 0.9872, "step": 135 }, { "epoch": 0.02, "grad_norm": 0.49476635456085205, "learning_rate": 0.00019987498456869025, "loss": 0.9218, "step": 140 }, { "epoch": 0.02, "grad_norm": 0.4328886568546295, "learning_rate": 0.00019986589746855295, "loss": 1.0531, "step": 145 }, { "epoch": 0.02, "grad_norm": 0.5224207639694214, "learning_rate": 0.0001998564918128771, "loss": 0.9544, "step": 150 }, { "epoch": 0.02, "grad_norm": 0.4767338037490845, "learning_rate": 0.0001998467676316652, "loss": 0.9498, "step": 155 }, { "epoch": 0.02, "grad_norm": 0.4608069658279419, "learning_rate": 0.00019983672495593578, "loss": 0.8847, "step": 160 }, { "epoch": 0.02, "grad_norm": 0.4687401056289673, "learning_rate": 0.00019982636381772327, "loss": 0.938, "step": 165 }, { "epoch": 0.02, "grad_norm": 0.5468807220458984, "learning_rate": 0.000199815684250078, "loss": 0.9296, "step": 170 }, { "epoch": 0.02, "grad_norm": 0.5462723970413208, "learning_rate": 0.00019980468628706604, "loss": 0.9583, "step": 175 }, { "epoch": 0.02, "grad_norm": 0.5237535834312439, "learning_rate": 0.00019979336996376893, "loss": 0.9683, "step": 180 }, { "epoch": 0.02, "grad_norm": 0.5200393199920654, "learning_rate": 0.000199781735316284, "loss": 0.9164, "step": 185 }, { "epoch": 0.02, "grad_norm": 0.480398565530777, "learning_rate": 0.00019976978238172373, "loss": 0.9312, "step": 190 }, { "epoch": 0.02, "grad_norm": 0.49409154057502747, "learning_rate": 0.000199757511198216, "loss": 0.9962, "step": 195 }, { "epoch": 0.02, "grad_norm": 0.5906654000282288, "learning_rate": 0.00019974492180490388, "loss": 0.9277, "step": 200 }, { "epoch": 0.02, "grad_norm": 0.5397025346755981, "learning_rate": 0.00019973201424194542, "loss": 0.8953, "step": 205 }, { "epoch": 0.02, "grad_norm": 0.5493289232254028, "learning_rate": 0.00019971878855051358, "loss": 0.9364, "step": 210 }, { "epoch": 0.02, "grad_norm": 0.5614472031593323, "learning_rate": 0.0001997052447727961, "loss": 0.9419, "step": 215 }, { "epoch": 0.03, "grad_norm": 0.5108090043067932, "learning_rate": 0.0001996913829519954, "loss": 0.9203, "step": 220 }, { "epoch": 0.03, "grad_norm": 0.5096657872200012, "learning_rate": 0.0001996772031323283, "loss": 0.8614, "step": 225 }, { "epoch": 0.03, "grad_norm": 0.541383683681488, "learning_rate": 0.00019966270535902618, "loss": 0.9603, "step": 230 }, { "epoch": 0.03, "grad_norm": 0.5045916438102722, "learning_rate": 0.00019964788967833438, "loss": 0.9289, "step": 235 }, { "epoch": 0.03, "grad_norm": 0.5646150708198547, "learning_rate": 0.00019963275613751256, "loss": 0.8575, "step": 240 }, { "epoch": 0.03, "grad_norm": 0.4511927664279938, "learning_rate": 0.0001996173047848341, "loss": 0.9126, "step": 245 }, { "epoch": 0.03, "grad_norm": 0.5300033688545227, "learning_rate": 0.0001996015356695863, "loss": 0.9346, "step": 250 }, { "epoch": 0.03, "grad_norm": 0.5646623969078064, "learning_rate": 0.00019958544884207, "loss": 0.8818, "step": 255 }, { "epoch": 0.03, "grad_norm": 0.4728727638721466, "learning_rate": 0.00019956904435359943, "loss": 0.9771, "step": 260 }, { "epoch": 0.03, "grad_norm": 0.49879685044288635, "learning_rate": 0.00019955232225650225, "loss": 0.8741, "step": 265 }, { "epoch": 0.03, "grad_norm": 0.47209540009498596, "learning_rate": 0.00019953528260411912, "loss": 0.8946, "step": 270 }, { "epoch": 0.03, "grad_norm": 0.5027173161506653, "learning_rate": 0.00019951792545080369, "loss": 0.8586, "step": 275 }, { "epoch": 0.03, "grad_norm": 0.5433563590049744, "learning_rate": 0.00019950025085192232, "loss": 0.9146, "step": 280 }, { "epoch": 0.03, "grad_norm": 0.493032306432724, "learning_rate": 0.00019948225886385414, "loss": 0.9083, "step": 285 }, { "epoch": 0.03, "grad_norm": 0.5513988733291626, "learning_rate": 0.00019946394954399054, "loss": 0.8455, "step": 290 }, { "epoch": 0.03, "grad_norm": 0.588005542755127, "learning_rate": 0.00019944532295073516, "loss": 1.0188, "step": 295 }, { "epoch": 0.03, "grad_norm": 0.5429700016975403, "learning_rate": 0.00019942637914350378, "loss": 0.9102, "step": 300 }, { "epoch": 0.03, "grad_norm": 0.6686177849769592, "learning_rate": 0.00019940711818272394, "loss": 1.0241, "step": 305 }, { "epoch": 0.04, "grad_norm": 0.5708988308906555, "learning_rate": 0.00019938754012983488, "loss": 0.9757, "step": 310 }, { "epoch": 0.04, "grad_norm": 0.5050585865974426, "learning_rate": 0.0001993676450472874, "loss": 1.0048, "step": 315 }, { "epoch": 0.04, "grad_norm": 0.5606969594955444, "learning_rate": 0.00019934743299854338, "loss": 0.9737, "step": 320 }, { "epoch": 0.04, "grad_norm": 0.5190075635910034, "learning_rate": 0.00019932690404807598, "loss": 0.9677, "step": 325 }, { "epoch": 0.04, "grad_norm": 0.5396589040756226, "learning_rate": 0.00019930605826136904, "loss": 0.9119, "step": 330 }, { "epoch": 0.04, "grad_norm": 0.5063149333000183, "learning_rate": 0.0001992848957049172, "loss": 0.9627, "step": 335 }, { "epoch": 0.04, "grad_norm": 0.5704573392868042, "learning_rate": 0.00019926341644622544, "loss": 0.9782, "step": 340 }, { "epoch": 0.04, "grad_norm": 0.542542040348053, "learning_rate": 0.00019924162055380903, "loss": 0.9253, "step": 345 }, { "epoch": 0.04, "grad_norm": 0.4711320102214813, "learning_rate": 0.00019921950809719324, "loss": 0.9039, "step": 350 }, { "epoch": 0.04, "grad_norm": 0.5174899697303772, "learning_rate": 0.00019919707914691311, "loss": 0.8753, "step": 355 }, { "epoch": 0.04, "grad_norm": 0.5371158719062805, "learning_rate": 0.0001991743337745132, "loss": 0.9531, "step": 360 }, { "epoch": 0.04, "grad_norm": 0.49162808060646057, "learning_rate": 0.00019915127205254751, "loss": 0.9007, "step": 365 }, { "epoch": 0.04, "grad_norm": 0.5364812016487122, "learning_rate": 0.00019912789405457905, "loss": 0.8619, "step": 370 }, { "epoch": 0.04, "grad_norm": 0.5108892321586609, "learning_rate": 0.00019910419985517977, "loss": 0.9384, "step": 375 }, { "epoch": 0.04, "grad_norm": 0.6169962286949158, "learning_rate": 0.00019908018952993016, "loss": 1.0248, "step": 380 }, { "epoch": 0.04, "grad_norm": 0.5570533275604248, "learning_rate": 0.00019905586315541917, "loss": 0.9526, "step": 385 }, { "epoch": 0.04, "grad_norm": 0.5810699462890625, "learning_rate": 0.00019903122080924387, "loss": 0.9722, "step": 390 }, { "epoch": 0.04, "grad_norm": 0.4718081057071686, "learning_rate": 0.00019900626257000922, "loss": 0.8801, "step": 395 }, { "epoch": 0.05, "grad_norm": 0.6052469611167908, "learning_rate": 0.00019898098851732786, "loss": 0.8618, "step": 400 }, { "epoch": 0.05, "grad_norm": 0.491576611995697, "learning_rate": 0.0001989553987318198, "loss": 0.8347, "step": 405 }, { "epoch": 0.05, "grad_norm": 0.4725711941719055, "learning_rate": 0.00019892949329511212, "loss": 0.8782, "step": 410 }, { "epoch": 0.05, "grad_norm": 0.4833574891090393, "learning_rate": 0.00019890327228983893, "loss": 0.9227, "step": 415 }, { "epoch": 0.05, "grad_norm": 0.4949423670768738, "learning_rate": 0.0001988767357996408, "loss": 0.9076, "step": 420 }, { "epoch": 0.05, "grad_norm": 0.4801913797855377, "learning_rate": 0.0001988498839091647, "loss": 0.8838, "step": 425 }, { "epoch": 0.05, "grad_norm": 0.5127063393592834, "learning_rate": 0.00019882271670406372, "loss": 0.7314, "step": 430 }, { "epoch": 0.05, "grad_norm": 0.524604320526123, "learning_rate": 0.00019879523427099665, "loss": 0.8783, "step": 435 }, { "epoch": 0.05, "grad_norm": 0.6428730487823486, "learning_rate": 0.00019876743669762793, "loss": 0.977, "step": 440 }, { "epoch": 0.05, "grad_norm": 0.5448722243309021, "learning_rate": 0.00019873932407262715, "loss": 0.9373, "step": 445 }, { "epoch": 0.05, "grad_norm": 0.5859642028808594, "learning_rate": 0.00019871089648566885, "loss": 0.9717, "step": 450 }, { "epoch": 0.05, "grad_norm": 0.5077968835830688, "learning_rate": 0.00019868215402743235, "loss": 0.8836, "step": 455 }, { "epoch": 0.05, "grad_norm": 0.5130951404571533, "learning_rate": 0.00019865309678960123, "loss": 1.012, "step": 460 }, { "epoch": 0.05, "grad_norm": 0.5490102767944336, "learning_rate": 0.0001986237248648633, "loss": 0.8743, "step": 465 }, { "epoch": 0.05, "grad_norm": 0.5080798864364624, "learning_rate": 0.00019859403834691003, "loss": 0.9109, "step": 470 }, { "epoch": 0.05, "grad_norm": 0.5114970207214355, "learning_rate": 0.0001985640373304365, "loss": 0.9453, "step": 475 }, { "epoch": 0.05, "grad_norm": 0.500872790813446, "learning_rate": 0.0001985337219111409, "loss": 0.9275, "step": 480 }, { "epoch": 0.06, "grad_norm": 0.4827217757701874, "learning_rate": 0.00019850309218572438, "loss": 0.9394, "step": 485 }, { "epoch": 0.06, "grad_norm": 0.5236514210700989, "learning_rate": 0.00019847214825189066, "loss": 0.9253, "step": 490 }, { "epoch": 0.06, "grad_norm": 0.49005958437919617, "learning_rate": 0.0001984408902083457, "loss": 0.8693, "step": 495 }, { "epoch": 0.06, "grad_norm": 0.5737020373344421, "learning_rate": 0.00019840931815479746, "loss": 0.9076, "step": 500 }, { "epoch": 0.06, "grad_norm": 0.5236185193061829, "learning_rate": 0.00019837743219195552, "loss": 0.9429, "step": 505 }, { "epoch": 0.06, "grad_norm": 0.5625522136688232, "learning_rate": 0.00019834523242153078, "loss": 0.9305, "step": 510 }, { "epoch": 0.06, "grad_norm": 0.5392923951148987, "learning_rate": 0.0001983127189462351, "loss": 0.8803, "step": 515 }, { "epoch": 0.06, "grad_norm": 0.5529817938804626, "learning_rate": 0.00019827989186978103, "loss": 0.9182, "step": 520 }, { "epoch": 0.06, "grad_norm": 0.5279693007469177, "learning_rate": 0.00019824675129688152, "loss": 0.9022, "step": 525 }, { "epoch": 0.06, "grad_norm": 0.6373478174209595, "learning_rate": 0.00019821329733324942, "loss": 0.9551, "step": 530 }, { "epoch": 0.06, "grad_norm": 0.5089979767799377, "learning_rate": 0.00019817953008559734, "loss": 0.8277, "step": 535 }, { "epoch": 0.06, "grad_norm": 0.5426749587059021, "learning_rate": 0.00019814544966163708, "loss": 1.012, "step": 540 }, { "epoch": 0.06, "grad_norm": 0.5283740758895874, "learning_rate": 0.0001981110561700796, "loss": 0.8224, "step": 545 }, { "epoch": 0.06, "grad_norm": 0.4337727725505829, "learning_rate": 0.00019807634972063428, "loss": 0.858, "step": 550 }, { "epoch": 0.06, "grad_norm": 0.4819512963294983, "learning_rate": 0.000198041330424009, "loss": 0.7776, "step": 555 }, { "epoch": 0.06, "grad_norm": 0.550744891166687, "learning_rate": 0.00019800599839190941, "loss": 0.8895, "step": 560 }, { "epoch": 0.06, "grad_norm": 0.4993669092655182, "learning_rate": 0.0001979703537370388, "loss": 0.9043, "step": 565 }, { "epoch": 0.06, "grad_norm": 0.618319571018219, "learning_rate": 0.00019793439657309772, "loss": 0.8229, "step": 570 }, { "epoch": 0.07, "grad_norm": 0.5560276508331299, "learning_rate": 0.00019789812701478346, "loss": 0.9208, "step": 575 }, { "epoch": 0.07, "grad_norm": 0.5499486327171326, "learning_rate": 0.00019786154517778987, "loss": 0.8309, "step": 580 }, { "epoch": 0.07, "grad_norm": 0.8018636703491211, "learning_rate": 0.00019782465117880693, "loss": 0.9529, "step": 585 }, { "epoch": 0.07, "grad_norm": 0.5488550066947937, "learning_rate": 0.0001977874451355203, "loss": 0.7879, "step": 590 }, { "epoch": 0.07, "grad_norm": 0.5369092226028442, "learning_rate": 0.00019774992716661106, "loss": 0.8819, "step": 595 }, { "epoch": 0.07, "grad_norm": 0.5121927857398987, "learning_rate": 0.00019771209739175523, "loss": 0.8949, "step": 600 }, { "epoch": 0.07, "grad_norm": 0.5077289938926697, "learning_rate": 0.00019767395593162353, "loss": 0.9174, "step": 605 }, { "epoch": 0.07, "grad_norm": 0.6287967562675476, "learning_rate": 0.00019763550290788085, "loss": 0.8388, "step": 610 }, { "epoch": 0.07, "grad_norm": 0.5971408486366272, "learning_rate": 0.0001975967384431859, "loss": 0.8899, "step": 615 }, { "epoch": 0.07, "grad_norm": 0.5257498025894165, "learning_rate": 0.00019755766266119085, "loss": 1.0072, "step": 620 }, { "epoch": 0.07, "grad_norm": 0.5270594954490662, "learning_rate": 0.00019751827568654089, "loss": 0.9276, "step": 625 }, { "epoch": 0.07, "grad_norm": 0.5067614912986755, "learning_rate": 0.00019747857764487395, "loss": 0.8488, "step": 630 }, { "epoch": 0.07, "grad_norm": 0.5074208378791809, "learning_rate": 0.0001974385686628201, "loss": 0.7905, "step": 635 }, { "epoch": 0.07, "grad_norm": 0.5764243602752686, "learning_rate": 0.00019739824886800134, "loss": 0.8907, "step": 640 }, { "epoch": 0.07, "grad_norm": 0.6378028392791748, "learning_rate": 0.00019735761838903106, "loss": 0.9187, "step": 645 }, { "epoch": 0.07, "grad_norm": 0.8019906282424927, "learning_rate": 0.00019731667735551375, "loss": 0.9371, "step": 650 }, { "epoch": 0.07, "grad_norm": 0.5774128437042236, "learning_rate": 0.00019727542589804444, "loss": 0.9012, "step": 655 }, { "epoch": 0.08, "grad_norm": 0.6190884709358215, "learning_rate": 0.00019723386414820842, "loss": 0.941, "step": 660 }, { "epoch": 0.08, "grad_norm": 0.526430070400238, "learning_rate": 0.00019719199223858068, "loss": 0.77, "step": 665 }, { "epoch": 0.08, "grad_norm": 0.4773986041545868, "learning_rate": 0.00019714981030272567, "loss": 0.8433, "step": 670 }, { "epoch": 0.08, "grad_norm": 0.5461030006408691, "learning_rate": 0.00019710731847519665, "loss": 0.8035, "step": 675 }, { "epoch": 0.08, "grad_norm": 0.6205869317054749, "learning_rate": 0.00019706451689153556, "loss": 0.9234, "step": 680 }, { "epoch": 0.08, "grad_norm": 0.5920631885528564, "learning_rate": 0.00019702140568827222, "loss": 0.884, "step": 685 }, { "epoch": 0.08, "grad_norm": 0.5703709721565247, "learning_rate": 0.0001969779850029242, "loss": 0.833, "step": 690 }, { "epoch": 0.08, "grad_norm": 0.6153601408004761, "learning_rate": 0.00019693425497399627, "loss": 0.9611, "step": 695 }, { "epoch": 0.08, "grad_norm": 0.4773414433002472, "learning_rate": 0.00019689021574097987, "loss": 0.8674, "step": 700 }, { "epoch": 0.08, "grad_norm": 0.5657919645309448, "learning_rate": 0.00019684586744435283, "loss": 0.9631, "step": 705 }, { "epoch": 0.08, "grad_norm": 0.5606719255447388, "learning_rate": 0.0001968012102255788, "loss": 0.8937, "step": 710 }, { "epoch": 0.08, "grad_norm": 0.510236382484436, "learning_rate": 0.00019675624422710682, "loss": 0.8685, "step": 715 }, { "epoch": 0.08, "grad_norm": 0.6055552363395691, "learning_rate": 0.000196710969592371, "loss": 0.9136, "step": 720 }, { "epoch": 0.08, "grad_norm": 0.52703857421875, "learning_rate": 0.0001966653864657898, "loss": 0.8531, "step": 725 }, { "epoch": 0.08, "grad_norm": 0.5948337316513062, "learning_rate": 0.00019661949499276578, "loss": 0.9152, "step": 730 }, { "epoch": 0.08, "grad_norm": 0.5188962817192078, "learning_rate": 0.00019657329531968512, "loss": 0.8868, "step": 735 }, { "epoch": 0.08, "grad_norm": 0.5457183718681335, "learning_rate": 0.000196526787593917, "loss": 0.8716, "step": 740 }, { "epoch": 0.08, "grad_norm": 0.5485244393348694, "learning_rate": 0.0001964799719638134, "loss": 0.8099, "step": 745 }, { "epoch": 0.09, "grad_norm": 0.5232064723968506, "learning_rate": 0.00019643284857870822, "loss": 0.8874, "step": 750 }, { "epoch": 0.09, "grad_norm": 0.5872951149940491, "learning_rate": 0.00019638541758891734, "loss": 0.7815, "step": 755 }, { "epoch": 0.09, "grad_norm": 0.5641792416572571, "learning_rate": 0.0001963376791457376, "loss": 0.8868, "step": 760 }, { "epoch": 0.09, "grad_norm": 1.058152198791504, "learning_rate": 0.0001962896334014467, "loss": 0.8355, "step": 765 }, { "epoch": 0.09, "grad_norm": 0.5059372782707214, "learning_rate": 0.00019624128050930252, "loss": 0.7938, "step": 770 }, { "epoch": 0.09, "grad_norm": 0.5510187745094299, "learning_rate": 0.00019619262062354275, "loss": 0.8468, "step": 775 }, { "epoch": 0.09, "grad_norm": 0.4972304105758667, "learning_rate": 0.00019614365389938426, "loss": 0.8065, "step": 780 }, { "epoch": 0.09, "grad_norm": 0.7013474106788635, "learning_rate": 0.00019609438049302273, "loss": 0.91, "step": 785 }, { "epoch": 0.09, "grad_norm": 0.5787790417671204, "learning_rate": 0.00019604480056163213, "loss": 0.9695, "step": 790 }, { "epoch": 0.09, "grad_norm": 0.45666056871414185, "learning_rate": 0.00019599491426336413, "loss": 0.9191, "step": 795 }, { "epoch": 0.09, "grad_norm": 0.5539554953575134, "learning_rate": 0.00019594472175734774, "loss": 0.9278, "step": 800 }, { "epoch": 0.09, "grad_norm": 0.5579492449760437, "learning_rate": 0.0001958942232036886, "loss": 0.9074, "step": 805 }, { "epoch": 0.09, "grad_norm": 0.48329582810401917, "learning_rate": 0.00019584341876346874, "loss": 0.773, "step": 810 }, { "epoch": 0.09, "grad_norm": 0.5779743790626526, "learning_rate": 0.0001957923085987458, "loss": 0.7915, "step": 815 }, { "epoch": 0.09, "grad_norm": 0.5726707577705383, "learning_rate": 0.0001957408928725527, "loss": 0.8683, "step": 820 }, { "epoch": 0.09, "grad_norm": 0.5140533447265625, "learning_rate": 0.00019568917174889693, "loss": 0.8599, "step": 825 }, { "epoch": 0.09, "grad_norm": 0.5206133723258972, "learning_rate": 0.00019563714539276036, "loss": 0.8629, "step": 830 }, { "epoch": 0.09, "grad_norm": 0.6327289342880249, "learning_rate": 0.0001955848139700983, "loss": 0.8577, "step": 835 }, { "epoch": 0.1, "grad_norm": 0.5038033127784729, "learning_rate": 0.00019553217764783928, "loss": 0.8652, "step": 840 }, { "epoch": 0.1, "grad_norm": 0.5202915072441101, "learning_rate": 0.0001954792365938844, "loss": 0.9021, "step": 845 }, { "epoch": 0.1, "grad_norm": 0.5957011580467224, "learning_rate": 0.00019542599097710676, "loss": 0.8404, "step": 850 }, { "epoch": 0.1, "grad_norm": 0.6331242918968201, "learning_rate": 0.00019537244096735096, "loss": 1.0052, "step": 855 }, { "epoch": 0.1, "grad_norm": 0.43145543336868286, "learning_rate": 0.00019531858673543266, "loss": 0.8814, "step": 860 }, { "epoch": 0.1, "grad_norm": 0.5859599709510803, "learning_rate": 0.0001952644284531378, "loss": 0.8131, "step": 865 }, { "epoch": 0.1, "grad_norm": 0.5870697498321533, "learning_rate": 0.00019520996629322228, "loss": 0.8458, "step": 870 }, { "epoch": 0.1, "grad_norm": 0.4881855845451355, "learning_rate": 0.00019515520042941132, "loss": 0.808, "step": 875 }, { "epoch": 0.1, "grad_norm": 0.5687686204910278, "learning_rate": 0.00019510013103639883, "loss": 0.8764, "step": 880 }, { "epoch": 0.1, "grad_norm": 0.49425217509269714, "learning_rate": 0.00019504475828984705, "loss": 0.9531, "step": 885 }, { "epoch": 0.1, "grad_norm": 0.6400182247161865, "learning_rate": 0.00019498908236638572, "loss": 0.9817, "step": 890 }, { "epoch": 0.1, "grad_norm": 0.50630122423172, "learning_rate": 0.0001949331034436118, "loss": 0.8061, "step": 895 }, { "epoch": 0.1, "grad_norm": 0.6477893590927124, "learning_rate": 0.00019487682170008866, "loss": 0.8433, "step": 900 }, { "epoch": 0.1, "grad_norm": 0.5080977082252502, "learning_rate": 0.0001948202373153457, "loss": 0.753, "step": 905 }, { "epoch": 0.1, "grad_norm": 0.6288018226623535, "learning_rate": 0.00019476335046987763, "loss": 0.8719, "step": 910 }, { "epoch": 0.1, "grad_norm": 0.5954068899154663, "learning_rate": 0.00019470616134514406, "loss": 0.9141, "step": 915 }, { "epoch": 0.1, "grad_norm": 0.9248217940330505, "learning_rate": 0.00019464867012356865, "loss": 0.8477, "step": 920 }, { "epoch": 0.11, "grad_norm": 0.5251208543777466, "learning_rate": 0.00019459087698853883, "loss": 0.805, "step": 925 }, { "epoch": 0.11, "grad_norm": 0.5578038692474365, "learning_rate": 0.0001945327821244051, "loss": 0.9431, "step": 930 }, { "epoch": 0.11, "grad_norm": 0.5071999430656433, "learning_rate": 0.0001944743857164803, "loss": 0.8176, "step": 935 }, { "epoch": 0.11, "grad_norm": 0.5080613493919373, "learning_rate": 0.00019441568795103932, "loss": 0.9004, "step": 940 }, { "epoch": 0.11, "grad_norm": 0.5936622619628906, "learning_rate": 0.00019435668901531813, "loss": 0.9215, "step": 945 }, { "epoch": 0.11, "grad_norm": 0.5861743092536926, "learning_rate": 0.00019429738909751353, "loss": 0.8413, "step": 950 }, { "epoch": 0.11, "grad_norm": 0.561229407787323, "learning_rate": 0.00019423778838678236, "loss": 0.8808, "step": 955 }, { "epoch": 0.11, "grad_norm": 0.6140787601470947, "learning_rate": 0.00019417788707324095, "loss": 0.8038, "step": 960 }, { "epoch": 0.11, "grad_norm": 0.4852558672428131, "learning_rate": 0.00019411768534796444, "loss": 0.7745, "step": 965 }, { "epoch": 0.11, "grad_norm": 0.6961116194725037, "learning_rate": 0.00019405718340298632, "loss": 0.9561, "step": 970 }, { "epoch": 0.11, "grad_norm": 0.5008209943771362, "learning_rate": 0.00019399638143129767, "loss": 0.8497, "step": 975 }, { "epoch": 0.11, "grad_norm": 0.6141425967216492, "learning_rate": 0.00019393527962684664, "loss": 0.8182, "step": 980 }, { "epoch": 0.11, "grad_norm": 0.7052502036094666, "learning_rate": 0.0001938738781845378, "loss": 0.8945, "step": 985 }, { "epoch": 0.11, "grad_norm": 0.5405805110931396, "learning_rate": 0.00019381217730023146, "loss": 0.9554, "step": 990 }, { "epoch": 0.11, "grad_norm": 0.6514587998390198, "learning_rate": 0.00019375017717074318, "loss": 0.811, "step": 995 }, { "epoch": 0.11, "grad_norm": 0.5553276538848877, "learning_rate": 0.000193687877993843, "loss": 0.8157, "step": 1000 }, { "epoch": 0.11, "grad_norm": 0.5811892151832581, "learning_rate": 0.00019362527996825488, "loss": 0.8352, "step": 1005 }, { "epoch": 0.11, "grad_norm": 0.4772842526435852, "learning_rate": 0.00019356238329365613, "loss": 0.8563, "step": 1010 }, { "epoch": 0.12, "grad_norm": 0.6040914058685303, "learning_rate": 0.00019349918817067655, "loss": 0.8884, "step": 1015 }, { "epoch": 0.12, "grad_norm": 0.638569712638855, "learning_rate": 0.0001934356948008981, "loss": 0.938, "step": 1020 }, { "epoch": 0.12, "grad_norm": 0.47892439365386963, "learning_rate": 0.00019337190338685397, "loss": 0.9725, "step": 1025 }, { "epoch": 0.12, "grad_norm": 0.5647065043449402, "learning_rate": 0.0001933078141320282, "loss": 0.8063, "step": 1030 }, { "epoch": 0.12, "grad_norm": 0.4954369068145752, "learning_rate": 0.0001932434272408547, "loss": 0.8535, "step": 1035 }, { "epoch": 0.12, "grad_norm": 0.655193567276001, "learning_rate": 0.00019317874291871704, "loss": 0.876, "step": 1040 }, { "epoch": 0.12, "grad_norm": 0.6297211050987244, "learning_rate": 0.0001931137613719473, "loss": 0.9351, "step": 1045 }, { "epoch": 0.12, "grad_norm": 0.5889206528663635, "learning_rate": 0.0001930484828078258, "loss": 0.918, "step": 1050 }, { "epoch": 0.12, "grad_norm": 0.5167111158370972, "learning_rate": 0.00019298290743458027, "loss": 1.1183, "step": 1055 }, { "epoch": 0.12, "grad_norm": 0.44684621691703796, "learning_rate": 0.0001929170354613852, "loss": 0.8091, "step": 1060 }, { "epoch": 0.12, "grad_norm": 0.5272998213768005, "learning_rate": 0.00019285086709836116, "loss": 0.8537, "step": 1065 }, { "epoch": 0.12, "grad_norm": 0.5813104510307312, "learning_rate": 0.0001927844025565742, "loss": 0.8221, "step": 1070 }, { "epoch": 0.12, "grad_norm": 0.507824718952179, "learning_rate": 0.00019271764204803512, "loss": 0.9199, "step": 1075 }, { "epoch": 0.12, "grad_norm": 0.5733903050422668, "learning_rate": 0.00019265058578569878, "loss": 0.7459, "step": 1080 }, { "epoch": 0.12, "grad_norm": 0.6304961442947388, "learning_rate": 0.00019258323398346346, "loss": 0.8584, "step": 1085 }, { "epoch": 0.12, "grad_norm": 0.5296971797943115, "learning_rate": 0.00019251558685617014, "loss": 0.8552, "step": 1090 }, { "epoch": 0.12, "grad_norm": 0.5370798707008362, "learning_rate": 0.00019244764461960191, "loss": 0.9183, "step": 1095 }, { "epoch": 0.13, "grad_norm": 0.5226066708564758, "learning_rate": 0.00019237940749048318, "loss": 0.883, "step": 1100 }, { "epoch": 0.13, "grad_norm": 0.6162168383598328, "learning_rate": 0.00019231087568647893, "loss": 0.924, "step": 1105 }, { "epoch": 0.13, "grad_norm": 0.6270351409912109, "learning_rate": 0.00019224204942619417, "loss": 0.9861, "step": 1110 }, { "epoch": 0.13, "grad_norm": 0.6001037955284119, "learning_rate": 0.00019217292892917325, "loss": 0.8134, "step": 1115 }, { "epoch": 0.13, "grad_norm": 0.5420514345169067, "learning_rate": 0.00019210351441589896, "loss": 0.7618, "step": 1120 }, { "epoch": 0.13, "grad_norm": 0.6640905141830444, "learning_rate": 0.000192033806107792, "loss": 0.9157, "step": 1125 }, { "epoch": 0.13, "grad_norm": 0.5504305362701416, "learning_rate": 0.00019196380422721026, "loss": 0.8995, "step": 1130 }, { "epoch": 0.13, "grad_norm": 0.6820715069770813, "learning_rate": 0.00019189350899744806, "loss": 0.9161, "step": 1135 }, { "epoch": 0.13, "grad_norm": 0.5563843250274658, "learning_rate": 0.00019182292064273544, "loss": 0.7748, "step": 1140 }, { "epoch": 0.13, "grad_norm": 0.6278269290924072, "learning_rate": 0.00019175203938823744, "loss": 0.7787, "step": 1145 }, { "epoch": 0.13, "grad_norm": 0.5549231171607971, "learning_rate": 0.00019168086546005346, "loss": 0.9038, "step": 1150 }, { "epoch": 0.13, "grad_norm": 0.5321454405784607, "learning_rate": 0.0001916093990852164, "loss": 0.7601, "step": 1155 }, { "epoch": 0.13, "grad_norm": 0.5376914143562317, "learning_rate": 0.0001915376404916921, "loss": 0.8395, "step": 1160 }, { "epoch": 0.13, "grad_norm": 0.5487528443336487, "learning_rate": 0.00019146558990837853, "loss": 0.7653, "step": 1165 }, { "epoch": 0.13, "grad_norm": 0.54075688123703, "learning_rate": 0.00019139324756510496, "loss": 0.8811, "step": 1170 }, { "epoch": 0.13, "grad_norm": 0.5763316750526428, "learning_rate": 0.00019132061369263136, "loss": 0.9043, "step": 1175 }, { "epoch": 0.13, "grad_norm": 0.5854605436325073, "learning_rate": 0.00019124768852264774, "loss": 0.9339, "step": 1180 }, { "epoch": 0.13, "grad_norm": 0.533819854259491, "learning_rate": 0.00019117447228777316, "loss": 0.8174, "step": 1185 }, { "epoch": 0.14, "grad_norm": 0.48118382692337036, "learning_rate": 0.00019110096522155523, "loss": 0.8992, "step": 1190 }, { "epoch": 0.14, "grad_norm": 0.5746036171913147, "learning_rate": 0.00019102716755846913, "loss": 0.7809, "step": 1195 }, { "epoch": 0.14, "grad_norm": 0.5010830163955688, "learning_rate": 0.00019095307953391718, "loss": 0.834, "step": 1200 }, { "epoch": 0.14, "grad_norm": 0.5111698508262634, "learning_rate": 0.00019087870138422775, "loss": 0.7903, "step": 1205 }, { "epoch": 0.14, "grad_norm": 0.5492734909057617, "learning_rate": 0.00019080403334665474, "loss": 0.8529, "step": 1210 }, { "epoch": 0.14, "grad_norm": 0.5762624144554138, "learning_rate": 0.00019072907565937674, "loss": 0.8261, "step": 1215 }, { "epoch": 0.14, "grad_norm": 0.7704640030860901, "learning_rate": 0.00019065382856149623, "loss": 0.8578, "step": 1220 }, { "epoch": 0.14, "grad_norm": 0.5472645163536072, "learning_rate": 0.0001905782922930389, "loss": 0.7221, "step": 1225 }, { "epoch": 0.14, "grad_norm": 0.4950571656227112, "learning_rate": 0.0001905024670949528, "loss": 0.8874, "step": 1230 }, { "epoch": 0.14, "grad_norm": 0.6059962511062622, "learning_rate": 0.00019042635320910768, "loss": 0.8667, "step": 1235 }, { "epoch": 0.14, "grad_norm": 0.5352611541748047, "learning_rate": 0.00019034995087829416, "loss": 0.8767, "step": 1240 }, { "epoch": 0.14, "grad_norm": 0.5850544571876526, "learning_rate": 0.00019027326034622288, "loss": 0.8335, "step": 1245 }, { "epoch": 0.14, "grad_norm": 0.6183121204376221, "learning_rate": 0.00019019628185752382, "loss": 0.866, "step": 1250 }, { "epoch": 0.14, "grad_norm": 0.5667844414710999, "learning_rate": 0.00019011901565774554, "loss": 0.8816, "step": 1255 }, { "epoch": 0.14, "grad_norm": 0.6229298114776611, "learning_rate": 0.0001900414619933543, "loss": 0.7841, "step": 1260 }, { "epoch": 0.14, "grad_norm": 0.5172335505485535, "learning_rate": 0.00018996362111173336, "loss": 0.841, "step": 1265 }, { "epoch": 0.14, "grad_norm": 0.5118042230606079, "learning_rate": 0.00018988549326118208, "loss": 0.8585, "step": 1270 }, { "epoch": 0.14, "grad_norm": 0.5746064782142639, "learning_rate": 0.0001898070786909153, "loss": 0.8849, "step": 1275 }, { "epoch": 0.15, "grad_norm": 0.5497964024543762, "learning_rate": 0.00018972837765106245, "loss": 0.8754, "step": 1280 }, { "epoch": 0.15, "grad_norm": 0.4933513402938843, "learning_rate": 0.0001896493903926666, "loss": 0.8773, "step": 1285 }, { "epoch": 0.15, "grad_norm": 0.5292490124702454, "learning_rate": 0.00018957011716768402, "loss": 0.825, "step": 1290 }, { "epoch": 0.15, "grad_norm": 0.5365048050880432, "learning_rate": 0.00018949055822898298, "loss": 0.8519, "step": 1295 }, { "epoch": 0.15, "grad_norm": 0.5695987939834595, "learning_rate": 0.00018941071383034327, "loss": 0.9198, "step": 1300 }, { "epoch": 0.15, "grad_norm": 0.5951317548751831, "learning_rate": 0.00018933058422645514, "loss": 0.8947, "step": 1305 }, { "epoch": 0.15, "grad_norm": 0.5998417139053345, "learning_rate": 0.00018925016967291872, "loss": 0.9372, "step": 1310 }, { "epoch": 0.15, "grad_norm": 0.5713790655136108, "learning_rate": 0.00018916947042624293, "loss": 0.9231, "step": 1315 }, { "epoch": 0.15, "grad_norm": 0.5592827796936035, "learning_rate": 0.00018908848674384493, "loss": 0.8792, "step": 1320 }, { "epoch": 0.15, "grad_norm": 0.6018208265304565, "learning_rate": 0.00018900721888404917, "loss": 0.8533, "step": 1325 }, { "epoch": 0.15, "grad_norm": 0.5641770958900452, "learning_rate": 0.0001889256671060865, "loss": 0.8096, "step": 1330 }, { "epoch": 0.15, "grad_norm": 0.49681103229522705, "learning_rate": 0.00018884383167009348, "loss": 0.7558, "step": 1335 }, { "epoch": 0.15, "grad_norm": 0.6027383208274841, "learning_rate": 0.0001887617128371115, "loss": 0.7801, "step": 1340 }, { "epoch": 0.15, "grad_norm": 0.5480598211288452, "learning_rate": 0.00018867931086908598, "loss": 0.9335, "step": 1345 }, { "epoch": 0.15, "grad_norm": 0.5374003648757935, "learning_rate": 0.00018859662602886538, "loss": 0.8229, "step": 1350 }, { "epoch": 0.15, "grad_norm": 0.5359901785850525, "learning_rate": 0.00018851365858020054, "loss": 0.888, "step": 1355 }, { "epoch": 0.15, "grad_norm": 0.6158584356307983, "learning_rate": 0.0001884304087877438, "loss": 0.8434, "step": 1360 }, { "epoch": 0.16, "grad_norm": 0.6495197415351868, "learning_rate": 0.00018834687691704805, "loss": 0.902, "step": 1365 }, { "epoch": 0.16, "grad_norm": 0.5516675710678101, "learning_rate": 0.000188263063234566, "loss": 0.8055, "step": 1370 }, { "epoch": 0.16, "grad_norm": 0.5624775886535645, "learning_rate": 0.00018817896800764938, "loss": 0.8212, "step": 1375 }, { "epoch": 0.16, "grad_norm": 0.47556230425834656, "learning_rate": 0.00018809459150454788, "loss": 0.7061, "step": 1380 }, { "epoch": 0.16, "grad_norm": 0.5220008492469788, "learning_rate": 0.00018800993399440845, "loss": 0.7378, "step": 1385 }, { "epoch": 0.16, "grad_norm": 0.5644752979278564, "learning_rate": 0.00018792499574727441, "loss": 0.8245, "step": 1390 }, { "epoch": 0.16, "grad_norm": 0.6211121678352356, "learning_rate": 0.0001878397770340846, "loss": 0.9034, "step": 1395 }, { "epoch": 0.16, "grad_norm": 0.5424923896789551, "learning_rate": 0.00018775427812667248, "loss": 0.9151, "step": 1400 }, { "epoch": 0.16, "grad_norm": 0.5865684151649475, "learning_rate": 0.00018766849929776532, "loss": 0.8795, "step": 1405 }, { "epoch": 0.16, "grad_norm": 0.6018354892730713, "learning_rate": 0.0001875824408209832, "loss": 0.6693, "step": 1410 }, { "epoch": 0.16, "grad_norm": 0.5678966045379639, "learning_rate": 0.0001874961029708383, "loss": 0.8994, "step": 1415 }, { "epoch": 0.16, "grad_norm": 0.4987218677997589, "learning_rate": 0.000187409486022734, "loss": 0.9152, "step": 1420 }, { "epoch": 0.16, "grad_norm": 0.5679898858070374, "learning_rate": 0.00018732259025296388, "loss": 0.7408, "step": 1425 }, { "epoch": 0.16, "grad_norm": 0.46406781673431396, "learning_rate": 0.0001872354159387109, "loss": 0.7725, "step": 1430 }, { "epoch": 0.16, "grad_norm": 0.5870163440704346, "learning_rate": 0.00018714796335804663, "loss": 0.9934, "step": 1435 }, { "epoch": 0.16, "grad_norm": 0.5287370681762695, "learning_rate": 0.00018706023278993014, "loss": 0.8331, "step": 1440 }, { "epoch": 0.16, "grad_norm": 0.6170235872268677, "learning_rate": 0.00018697222451420734, "loss": 0.8614, "step": 1445 }, { "epoch": 0.16, "grad_norm": 0.5744518041610718, "learning_rate": 0.00018688393881160993, "loss": 0.8481, "step": 1450 }, { "epoch": 0.17, "grad_norm": 0.6607945561408997, "learning_rate": 0.0001867953759637545, "loss": 0.7717, "step": 1455 }, { "epoch": 0.17, "grad_norm": 0.6847087740898132, "learning_rate": 0.00018670653625314185, "loss": 0.9084, "step": 1460 }, { "epoch": 0.17, "grad_norm": 0.5219533443450928, "learning_rate": 0.00018661741996315573, "loss": 0.7862, "step": 1465 }, { "epoch": 0.17, "grad_norm": 0.541492223739624, "learning_rate": 0.00018652802737806226, "loss": 0.7658, "step": 1470 }, { "epoch": 0.17, "grad_norm": 0.5432409048080444, "learning_rate": 0.00018643835878300887, "loss": 0.7273, "step": 1475 }, { "epoch": 0.17, "grad_norm": 0.5351329445838928, "learning_rate": 0.00018634841446402343, "loss": 0.7721, "step": 1480 }, { "epoch": 0.17, "grad_norm": 0.5327515602111816, "learning_rate": 0.0001862581947080132, "loss": 0.9053, "step": 1485 }, { "epoch": 0.17, "grad_norm": 0.5060054063796997, "learning_rate": 0.00018616769980276426, "loss": 0.8919, "step": 1490 }, { "epoch": 0.17, "grad_norm": 0.6146079897880554, "learning_rate": 0.0001860769300369402, "loss": 0.8494, "step": 1495 }, { "epoch": 0.17, "grad_norm": 0.5177258849143982, "learning_rate": 0.0001859858857000814, "loss": 0.8473, "step": 1500 }, { "epoch": 0.17, "grad_norm": 0.5560600757598877, "learning_rate": 0.0001858945670826041, "loss": 0.8193, "step": 1505 }, { "epoch": 0.17, "grad_norm": 0.6341832876205444, "learning_rate": 0.00018580297447579947, "loss": 0.8413, "step": 1510 }, { "epoch": 0.17, "grad_norm": 0.5502546429634094, "learning_rate": 0.0001857111081718326, "loss": 0.8251, "step": 1515 }, { "epoch": 0.17, "grad_norm": 0.6432173252105713, "learning_rate": 0.00018561896846374168, "loss": 0.879, "step": 1520 }, { "epoch": 0.17, "grad_norm": 0.5981735587120056, "learning_rate": 0.00018552655564543695, "loss": 0.8699, "step": 1525 }, { "epoch": 0.17, "grad_norm": 0.5953419804573059, "learning_rate": 0.00018543387001169993, "loss": 0.8163, "step": 1530 }, { "epoch": 0.17, "grad_norm": 0.5923714637756348, "learning_rate": 0.0001853409118581823, "loss": 0.9091, "step": 1535 }, { "epoch": 0.18, "grad_norm": 0.9148262739181519, "learning_rate": 0.00018524768148140504, "loss": 0.7836, "step": 1540 }, { "epoch": 0.18, "grad_norm": 0.5257108807563782, "learning_rate": 0.00018515417917875748, "loss": 0.7875, "step": 1545 }, { "epoch": 0.18, "grad_norm": 0.6084693074226379, "learning_rate": 0.00018506040524849637, "loss": 0.8409, "step": 1550 }, { "epoch": 0.18, "grad_norm": 0.49939265847206116, "learning_rate": 0.00018496635998974489, "loss": 0.8631, "step": 1555 }, { "epoch": 0.18, "grad_norm": 0.5126424431800842, "learning_rate": 0.00018487204370249167, "loss": 0.8033, "step": 1560 }, { "epoch": 0.18, "grad_norm": 0.4886222183704376, "learning_rate": 0.00018477745668758996, "loss": 0.8473, "step": 1565 }, { "epoch": 0.18, "grad_norm": 0.5652774572372437, "learning_rate": 0.00018468259924675655, "loss": 0.9041, "step": 1570 }, { "epoch": 0.18, "grad_norm": 0.5118491053581238, "learning_rate": 0.00018458747168257085, "loss": 0.8749, "step": 1575 }, { "epoch": 0.18, "grad_norm": 0.5526809692382812, "learning_rate": 0.00018449207429847384, "loss": 0.915, "step": 1580 }, { "epoch": 0.18, "grad_norm": 0.6415011882781982, "learning_rate": 0.0001843964073987673, "loss": 0.878, "step": 1585 }, { "epoch": 0.18, "grad_norm": 0.5006015300750732, "learning_rate": 0.00018430047128861266, "loss": 0.7848, "step": 1590 }, { "epoch": 0.18, "grad_norm": 0.5490383505821228, "learning_rate": 0.0001842042662740301, "loss": 0.7836, "step": 1595 }, { "epoch": 0.18, "grad_norm": 0.5240172743797302, "learning_rate": 0.00018410779266189752, "loss": 0.7642, "step": 1600 }, { "epoch": 0.18, "grad_norm": 0.6275254487991333, "learning_rate": 0.00018401105075994967, "loss": 0.8773, "step": 1605 }, { "epoch": 0.18, "grad_norm": 0.5745741724967957, "learning_rate": 0.00018391404087677704, "loss": 0.904, "step": 1610 }, { "epoch": 0.18, "grad_norm": 0.6686891913414001, "learning_rate": 0.00018381676332182497, "loss": 0.8057, "step": 1615 }, { "epoch": 0.18, "grad_norm": 0.501735508441925, "learning_rate": 0.00018371921840539264, "loss": 0.8361, "step": 1620 }, { "epoch": 0.18, "grad_norm": 0.6197385191917419, "learning_rate": 0.000183621406438632, "loss": 0.9193, "step": 1625 }, { "epoch": 0.19, "grad_norm": 0.5199822783470154, "learning_rate": 0.00018352332773354695, "loss": 0.8461, "step": 1630 }, { "epoch": 0.19, "grad_norm": 0.48266005516052246, "learning_rate": 0.00018342498260299212, "loss": 0.7336, "step": 1635 }, { "epoch": 0.19, "grad_norm": 0.7070842981338501, "learning_rate": 0.0001833263713606721, "loss": 0.9886, "step": 1640 }, { "epoch": 0.19, "grad_norm": 0.5649372339248657, "learning_rate": 0.00018322749432114028, "loss": 0.8146, "step": 1645 }, { "epoch": 0.19, "grad_norm": 0.5961683988571167, "learning_rate": 0.00018312835179979788, "loss": 0.7934, "step": 1650 }, { "epoch": 0.19, "grad_norm": 0.5285751819610596, "learning_rate": 0.00018302894411289304, "loss": 0.9225, "step": 1655 }, { "epoch": 0.19, "grad_norm": 0.5439310669898987, "learning_rate": 0.0001829292715775196, "loss": 0.9477, "step": 1660 }, { "epoch": 0.19, "grad_norm": 0.5521465539932251, "learning_rate": 0.00018282933451161643, "loss": 0.8531, "step": 1665 }, { "epoch": 0.19, "grad_norm": 0.6291643381118774, "learning_rate": 0.00018272913323396598, "loss": 0.9441, "step": 1670 }, { "epoch": 0.19, "grad_norm": 0.5861973762512207, "learning_rate": 0.00018262866806419362, "loss": 0.7543, "step": 1675 }, { "epoch": 0.19, "grad_norm": 0.6347075700759888, "learning_rate": 0.0001825279393227665, "loss": 0.8737, "step": 1680 }, { "epoch": 0.19, "grad_norm": 0.6788772344589233, "learning_rate": 0.00018242694733099245, "loss": 0.8511, "step": 1685 }, { "epoch": 0.19, "grad_norm": 0.5907087326049805, "learning_rate": 0.0001823256924110191, "loss": 0.8505, "step": 1690 }, { "epoch": 0.19, "grad_norm": 0.6607640385627747, "learning_rate": 0.0001822241748858327, "loss": 0.856, "step": 1695 }, { "epoch": 0.19, "grad_norm": 0.6193135976791382, "learning_rate": 0.0001821223950792572, "loss": 0.8215, "step": 1700 }, { "epoch": 0.19, "grad_norm": 0.6171255707740784, "learning_rate": 0.00018202035331595323, "loss": 0.7666, "step": 1705 }, { "epoch": 0.19, "grad_norm": 0.49476027488708496, "learning_rate": 0.00018191804992141695, "loss": 0.8192, "step": 1710 }, { "epoch": 0.19, "grad_norm": 0.6600732803344727, "learning_rate": 0.0001818154852219791, "loss": 0.8839, "step": 1715 }, { "epoch": 0.2, "grad_norm": 0.6551568508148193, "learning_rate": 0.00018171265954480394, "loss": 0.7813, "step": 1720 }, { "epoch": 0.2, "grad_norm": 0.539866030216217, "learning_rate": 0.00018160957321788828, "loss": 0.8957, "step": 1725 }, { "epoch": 0.2, "grad_norm": 0.5746687054634094, "learning_rate": 0.00018150622657006016, "loss": 0.9047, "step": 1730 }, { "epoch": 0.2, "grad_norm": 0.642020583152771, "learning_rate": 0.0001814026199309783, "loss": 0.7867, "step": 1735 }, { "epoch": 0.2, "grad_norm": 0.47608810663223267, "learning_rate": 0.00018129875363113044, "loss": 0.9095, "step": 1740 }, { "epoch": 0.2, "grad_norm": 0.5929521918296814, "learning_rate": 0.0001811946280018328, "loss": 0.8587, "step": 1745 }, { "epoch": 0.2, "grad_norm": 0.6232489347457886, "learning_rate": 0.00018109024337522876, "loss": 0.8188, "step": 1750 }, { "epoch": 0.2, "grad_norm": 0.47936391830444336, "learning_rate": 0.00018098560008428778, "loss": 0.8164, "step": 1755 }, { "epoch": 0.2, "grad_norm": 0.6068043112754822, "learning_rate": 0.00018088069846280456, "loss": 0.8273, "step": 1760 }, { "epoch": 0.2, "grad_norm": 0.5103864073753357, "learning_rate": 0.00018077553884539773, "loss": 0.8453, "step": 1765 }, { "epoch": 0.2, "grad_norm": 0.5663210153579712, "learning_rate": 0.0001806701215675089, "loss": 0.8745, "step": 1770 }, { "epoch": 0.2, "grad_norm": 0.6410555243492126, "learning_rate": 0.00018056444696540162, "loss": 0.8698, "step": 1775 }, { "epoch": 0.2, "grad_norm": 0.7071842551231384, "learning_rate": 0.00018045851537616016, "loss": 0.7438, "step": 1780 }, { "epoch": 0.2, "grad_norm": 0.551302969455719, "learning_rate": 0.0001803523271376887, "loss": 0.8576, "step": 1785 }, { "epoch": 0.2, "grad_norm": 0.5493254065513611, "learning_rate": 0.0001802458825887099, "loss": 0.7339, "step": 1790 }, { "epoch": 0.2, "grad_norm": 0.6084312200546265, "learning_rate": 0.00018013918206876415, "loss": 0.7465, "step": 1795 }, { "epoch": 0.2, "grad_norm": 0.52370685338974, "learning_rate": 0.00018003222591820824, "loss": 0.8574, "step": 1800 }, { "epoch": 0.21, "grad_norm": 0.46578991413116455, "learning_rate": 0.00017992501447821452, "loss": 0.7609, "step": 1805 }, { "epoch": 0.21, "grad_norm": 0.6183673143386841, "learning_rate": 0.00017981754809076952, "loss": 0.8273, "step": 1810 }, { "epoch": 0.21, "grad_norm": 0.6610841751098633, "learning_rate": 0.0001797098270986731, "loss": 0.9363, "step": 1815 }, { "epoch": 0.21, "grad_norm": 0.5758525729179382, "learning_rate": 0.00017960185184553716, "loss": 0.7438, "step": 1820 }, { "epoch": 0.21, "grad_norm": 0.5575788021087646, "learning_rate": 0.00017949362267578485, "loss": 0.8472, "step": 1825 }, { "epoch": 0.21, "grad_norm": 0.5445650815963745, "learning_rate": 0.0001793851399346491, "loss": 0.895, "step": 1830 }, { "epoch": 0.21, "grad_norm": 0.509607195854187, "learning_rate": 0.0001792764039681717, "loss": 0.8065, "step": 1835 }, { "epoch": 0.21, "grad_norm": 0.5268949270248413, "learning_rate": 0.00017916741512320227, "loss": 0.7979, "step": 1840 }, { "epoch": 0.21, "grad_norm": 0.5786017775535583, "learning_rate": 0.00017905817374739704, "loss": 0.8833, "step": 1845 }, { "epoch": 0.21, "grad_norm": 0.5071285963058472, "learning_rate": 0.0001789486801892177, "loss": 0.7521, "step": 1850 }, { "epoch": 0.21, "grad_norm": 0.620481550693512, "learning_rate": 0.0001788389347979305, "loss": 0.8354, "step": 1855 }, { "epoch": 0.21, "grad_norm": 0.5200830698013306, "learning_rate": 0.00017872893792360484, "loss": 0.9292, "step": 1860 }, { "epoch": 0.21, "grad_norm": 0.594330370426178, "learning_rate": 0.00017861868991711247, "loss": 0.9231, "step": 1865 }, { "epoch": 0.21, "grad_norm": 0.542568564414978, "learning_rate": 0.00017850819113012601, "loss": 0.6837, "step": 1870 }, { "epoch": 0.21, "grad_norm": 0.6251922845840454, "learning_rate": 0.0001783974419151182, "loss": 0.7409, "step": 1875 }, { "epoch": 0.21, "grad_norm": 0.5484223365783691, "learning_rate": 0.0001782864426253606, "loss": 0.824, "step": 1880 }, { "epoch": 0.21, "grad_norm": 0.7240644693374634, "learning_rate": 0.00017817519361492228, "loss": 0.7806, "step": 1885 }, { "epoch": 0.21, "grad_norm": 0.5933576822280884, "learning_rate": 0.00017806369523866913, "loss": 0.8518, "step": 1890 }, { "epoch": 0.22, "grad_norm": 0.6882241368293762, "learning_rate": 0.00017795194785226229, "loss": 0.9101, "step": 1895 }, { "epoch": 0.22, "grad_norm": 0.5205492973327637, "learning_rate": 0.00017783995181215728, "loss": 0.8973, "step": 1900 }, { "epoch": 0.22, "grad_norm": 0.6640832424163818, "learning_rate": 0.00017772770747560273, "loss": 0.9665, "step": 1905 }, { "epoch": 0.22, "grad_norm": 0.6470832228660583, "learning_rate": 0.00017761521520063945, "loss": 0.8718, "step": 1910 }, { "epoch": 0.22, "grad_norm": 0.704579770565033, "learning_rate": 0.0001775024753460989, "loss": 0.9091, "step": 1915 }, { "epoch": 0.22, "grad_norm": 0.7193452715873718, "learning_rate": 0.00017738948827160242, "loss": 0.7795, "step": 1920 }, { "epoch": 0.22, "grad_norm": 0.5544724464416504, "learning_rate": 0.0001772762543375599, "loss": 0.8755, "step": 1925 }, { "epoch": 0.22, "grad_norm": 0.5948014855384827, "learning_rate": 0.00017716277390516876, "loss": 0.8527, "step": 1930 }, { "epoch": 0.22, "grad_norm": 0.7045935392379761, "learning_rate": 0.00017704904733641255, "loss": 0.9894, "step": 1935 }, { "epoch": 0.22, "grad_norm": 0.5722846388816833, "learning_rate": 0.00017693507499406, "loss": 0.8367, "step": 1940 }, { "epoch": 0.22, "grad_norm": 0.5934826135635376, "learning_rate": 0.000176820857241664, "loss": 0.96, "step": 1945 }, { "epoch": 0.22, "grad_norm": 0.6156487464904785, "learning_rate": 0.00017670639444355998, "loss": 0.8629, "step": 1950 }, { "epoch": 0.22, "grad_norm": 0.6569040417671204, "learning_rate": 0.0001765916869648652, "loss": 0.8539, "step": 1955 }, { "epoch": 0.22, "grad_norm": 0.5664108395576477, "learning_rate": 0.0001764767351714774, "loss": 0.8514, "step": 1960 }, { "epoch": 0.22, "grad_norm": 0.681502640247345, "learning_rate": 0.0001763615394300735, "loss": 0.9149, "step": 1965 }, { "epoch": 0.22, "grad_norm": 0.5791369080543518, "learning_rate": 0.00017624610010810878, "loss": 0.8566, "step": 1970 }, { "epoch": 0.22, "grad_norm": 0.5944411158561707, "learning_rate": 0.00017613041757381538, "loss": 0.8279, "step": 1975 }, { "epoch": 0.23, "grad_norm": 0.5867791175842285, "learning_rate": 0.00017601449219620125, "loss": 0.8742, "step": 1980 }, { "epoch": 0.23, "grad_norm": 0.5641552805900574, "learning_rate": 0.00017589832434504902, "loss": 0.7705, "step": 1985 }, { "epoch": 0.23, "grad_norm": 0.5856841206550598, "learning_rate": 0.0001757819143909147, "loss": 0.8156, "step": 1990 }, { "epoch": 0.23, "grad_norm": 0.5216368436813354, "learning_rate": 0.00017566526270512665, "loss": 0.965, "step": 1995 }, { "epoch": 0.23, "grad_norm": 0.6632785797119141, "learning_rate": 0.0001755483696597842, "loss": 0.88, "step": 2000 }, { "epoch": 0.23, "grad_norm": 0.5716277360916138, "learning_rate": 0.0001754312356277567, "loss": 0.8717, "step": 2005 }, { "epoch": 0.23, "grad_norm": 0.5452391505241394, "learning_rate": 0.0001753138609826822, "loss": 0.7268, "step": 2010 }, { "epoch": 0.23, "grad_norm": 0.5985669493675232, "learning_rate": 0.00017519624609896615, "loss": 0.8528, "step": 2015 }, { "epoch": 0.23, "grad_norm": 0.605197548866272, "learning_rate": 0.0001750783913517804, "loss": 0.9327, "step": 2020 }, { "epoch": 0.23, "grad_norm": 0.7269711494445801, "learning_rate": 0.0001749602971170619, "loss": 0.9022, "step": 2025 }, { "epoch": 0.23, "grad_norm": 0.6030486226081848, "learning_rate": 0.00017484196377151161, "loss": 0.851, "step": 2030 }, { "epoch": 0.23, "grad_norm": 0.5089486837387085, "learning_rate": 0.00017472339169259307, "loss": 0.7455, "step": 2035 }, { "epoch": 0.23, "grad_norm": 0.5635782480239868, "learning_rate": 0.00017460458125853143, "loss": 1.0043, "step": 2040 }, { "epoch": 0.23, "grad_norm": 0.5171327590942383, "learning_rate": 0.0001744855328483122, "loss": 0.8556, "step": 2045 }, { "epoch": 0.23, "grad_norm": 0.5177988409996033, "learning_rate": 0.00017436624684167984, "loss": 0.8203, "step": 2050 }, { "epoch": 0.23, "grad_norm": 0.702873170375824, "learning_rate": 0.00017424672361913686, "loss": 0.8809, "step": 2055 }, { "epoch": 0.23, "grad_norm": 0.5576356649398804, "learning_rate": 0.00017412696356194235, "loss": 0.8858, "step": 2060 }, { "epoch": 0.23, "grad_norm": 0.6151427030563354, "learning_rate": 0.0001740069670521109, "loss": 0.9176, "step": 2065 }, { "epoch": 0.24, "grad_norm": 0.5806076526641846, "learning_rate": 0.00017388673447241138, "loss": 1.0125, "step": 2070 }, { "epoch": 0.24, "grad_norm": 0.6101603507995605, "learning_rate": 0.00017376626620636557, "loss": 0.8164, "step": 2075 }, { "epoch": 0.24, "grad_norm": 0.6350337266921997, "learning_rate": 0.00017364556263824719, "loss": 0.86, "step": 2080 }, { "epoch": 0.24, "grad_norm": 0.5311822891235352, "learning_rate": 0.00017352462415308044, "loss": 0.8738, "step": 2085 }, { "epoch": 0.24, "grad_norm": 0.5520626306533813, "learning_rate": 0.0001734034511366389, "loss": 0.8186, "step": 2090 }, { "epoch": 0.24, "grad_norm": 0.6877513527870178, "learning_rate": 0.00017328204397544424, "loss": 0.8194, "step": 2095 }, { "epoch": 0.24, "grad_norm": 0.6582801342010498, "learning_rate": 0.00017316040305676508, "loss": 0.9177, "step": 2100 }, { "epoch": 0.24, "grad_norm": 0.637827455997467, "learning_rate": 0.0001730385287686156, "loss": 0.8896, "step": 2105 }, { "epoch": 0.24, "grad_norm": 0.5763481855392456, "learning_rate": 0.00017291642149975446, "loss": 0.8431, "step": 2110 }, { "epoch": 0.24, "grad_norm": 0.5631943941116333, "learning_rate": 0.00017279408163968342, "loss": 0.8405, "step": 2115 }, { "epoch": 0.24, "grad_norm": 0.5558026432991028, "learning_rate": 0.00017267150957864623, "loss": 0.8788, "step": 2120 }, { "epoch": 0.24, "grad_norm": 0.6178169250488281, "learning_rate": 0.00017254870570762733, "loss": 0.8176, "step": 2125 }, { "epoch": 0.24, "grad_norm": 0.5574104189872742, "learning_rate": 0.0001724256704183505, "loss": 0.8623, "step": 2130 }, { "epoch": 0.24, "grad_norm": 0.6056390404701233, "learning_rate": 0.00017230240410327782, "loss": 0.8526, "step": 2135 }, { "epoch": 0.24, "grad_norm": 0.584563136100769, "learning_rate": 0.00017217890715560822, "loss": 0.8111, "step": 2140 }, { "epoch": 0.24, "grad_norm": 0.5519852042198181, "learning_rate": 0.0001720551799692764, "loss": 0.767, "step": 2145 }, { "epoch": 0.24, "grad_norm": 0.5955138206481934, "learning_rate": 0.00017193122293895138, "loss": 0.9088, "step": 2150 }, { "epoch": 0.25, "grad_norm": 0.637936532497406, "learning_rate": 0.00017180703646003535, "loss": 0.7683, "step": 2155 }, { "epoch": 0.25, "grad_norm": 0.5777458548545837, "learning_rate": 0.0001716826209286625, "loss": 0.8103, "step": 2160 }, { "epoch": 0.25, "grad_norm": 0.5864526629447937, "learning_rate": 0.0001715579767416976, "loss": 0.8297, "step": 2165 }, { "epoch": 0.25, "grad_norm": 0.5473954677581787, "learning_rate": 0.0001714331042967348, "loss": 0.8161, "step": 2170 }, { "epoch": 0.25, "grad_norm": 0.5501269102096558, "learning_rate": 0.00017130800399209632, "loss": 0.774, "step": 2175 }, { "epoch": 0.25, "grad_norm": 0.622527539730072, "learning_rate": 0.00017118267622683123, "loss": 0.9349, "step": 2180 }, { "epoch": 0.25, "grad_norm": 0.6729717254638672, "learning_rate": 0.00017105712140071426, "loss": 0.8568, "step": 2185 }, { "epoch": 0.25, "grad_norm": 0.5891861319541931, "learning_rate": 0.00017093133991424425, "loss": 0.787, "step": 2190 }, { "epoch": 0.25, "grad_norm": 0.683480978012085, "learning_rate": 0.00017080533216864318, "loss": 0.8124, "step": 2195 }, { "epoch": 0.25, "grad_norm": 0.7241565585136414, "learning_rate": 0.00017067909856585472, "loss": 0.6895, "step": 2200 }, { "epoch": 0.25, "grad_norm": 0.6606466770172119, "learning_rate": 0.00017055263950854297, "loss": 0.8192, "step": 2205 }, { "epoch": 0.25, "grad_norm": 0.5163353681564331, "learning_rate": 0.00017042595540009124, "loss": 0.8085, "step": 2210 }, { "epoch": 0.25, "grad_norm": 0.6061686277389526, "learning_rate": 0.00017029904664460065, "loss": 0.9215, "step": 2215 }, { "epoch": 0.25, "grad_norm": 0.5943601727485657, "learning_rate": 0.00017017191364688896, "loss": 0.8554, "step": 2220 }, { "epoch": 0.25, "grad_norm": 0.6568538546562195, "learning_rate": 0.00017004455681248918, "loss": 0.8472, "step": 2225 }, { "epoch": 0.25, "grad_norm": 0.554853081703186, "learning_rate": 0.0001699169765476484, "loss": 0.8969, "step": 2230 }, { "epoch": 0.25, "grad_norm": 0.4984689950942993, "learning_rate": 0.0001697891732593263, "loss": 0.8548, "step": 2235 }, { "epoch": 0.25, "grad_norm": 0.6138368248939514, "learning_rate": 0.00016966114735519406, "loss": 0.8888, "step": 2240 }, { "epoch": 0.26, "grad_norm": 0.8124855756759644, "learning_rate": 0.00016953289924363297, "loss": 0.9703, "step": 2245 }, { "epoch": 0.26, "grad_norm": 0.6956800222396851, "learning_rate": 0.00016940442933373304, "loss": 0.7584, "step": 2250 }, { "epoch": 0.26, "grad_norm": 0.7704644799232483, "learning_rate": 0.00016927573803529185, "loss": 0.7784, "step": 2255 }, { "epoch": 0.26, "grad_norm": 0.5658566355705261, "learning_rate": 0.00016914682575881314, "loss": 0.8646, "step": 2260 }, { "epoch": 0.26, "grad_norm": 0.6220143437385559, "learning_rate": 0.00016901769291550558, "loss": 0.7348, "step": 2265 }, { "epoch": 0.26, "grad_norm": 0.6346325278282166, "learning_rate": 0.00016888833991728137, "loss": 0.8734, "step": 2270 }, { "epoch": 0.26, "grad_norm": 0.5919114947319031, "learning_rate": 0.00016875876717675496, "loss": 0.781, "step": 2275 }, { "epoch": 0.26, "grad_norm": 0.5826617479324341, "learning_rate": 0.00016862897510724176, "loss": 0.8185, "step": 2280 }, { "epoch": 0.26, "grad_norm": 0.5474236607551575, "learning_rate": 0.00016849896412275683, "loss": 0.7699, "step": 2285 }, { "epoch": 0.26, "grad_norm": 0.619706928730011, "learning_rate": 0.0001683687346380135, "loss": 0.852, "step": 2290 }, { "epoch": 0.26, "grad_norm": 0.8853769302368164, "learning_rate": 0.000168238287068422, "loss": 0.8513, "step": 2295 }, { "epoch": 0.26, "grad_norm": 0.6231085062026978, "learning_rate": 0.00016810762183008845, "loss": 0.7905, "step": 2300 }, { "epoch": 0.26, "grad_norm": 0.5759189128875732, "learning_rate": 0.00016797673933981297, "loss": 0.8566, "step": 2305 }, { "epoch": 0.26, "grad_norm": 0.5591956973075867, "learning_rate": 0.000167845640015089, "loss": 0.9151, "step": 2310 }, { "epoch": 0.26, "grad_norm": 0.7117490172386169, "learning_rate": 0.00016771432427410137, "loss": 0.7662, "step": 2315 }, { "epoch": 0.26, "grad_norm": 0.5831469893455505, "learning_rate": 0.00016758279253572546, "loss": 0.8582, "step": 2320 }, { "epoch": 0.26, "grad_norm": 0.7675443291664124, "learning_rate": 0.00016745104521952552, "loss": 0.8713, "step": 2325 }, { "epoch": 0.26, "grad_norm": 0.5224565863609314, "learning_rate": 0.0001673190827457535, "loss": 0.784, "step": 2330 }, { "epoch": 0.27, "grad_norm": 0.6027007102966309, "learning_rate": 0.00016718690553534766, "loss": 0.8347, "step": 2335 }, { "epoch": 0.27, "grad_norm": 0.5675181746482849, "learning_rate": 0.0001670545140099312, "loss": 0.7134, "step": 2340 }, { "epoch": 0.27, "grad_norm": 0.6312824487686157, "learning_rate": 0.00016692190859181102, "loss": 0.8298, "step": 2345 }, { "epoch": 0.27, "grad_norm": 0.5071117281913757, "learning_rate": 0.00016678908970397624, "loss": 0.845, "step": 2350 }, { "epoch": 0.27, "grad_norm": 0.6821117997169495, "learning_rate": 0.00016665605777009697, "loss": 0.8077, "step": 2355 }, { "epoch": 0.27, "grad_norm": 0.5504366755485535, "learning_rate": 0.00016652281321452282, "loss": 0.8021, "step": 2360 }, { "epoch": 0.27, "grad_norm": 0.5457181930541992, "learning_rate": 0.0001663893564622817, "loss": 0.7769, "step": 2365 }, { "epoch": 0.27, "grad_norm": 0.6445764899253845, "learning_rate": 0.00016625568793907834, "loss": 0.7976, "step": 2370 }, { "epoch": 0.27, "grad_norm": 0.6524176001548767, "learning_rate": 0.00016612180807129304, "loss": 0.7837, "step": 2375 }, { "epoch": 0.27, "grad_norm": 0.5401653051376343, "learning_rate": 0.00016598771728598024, "loss": 0.6942, "step": 2380 }, { "epoch": 0.27, "grad_norm": 0.5231527090072632, "learning_rate": 0.00016585341601086712, "loss": 0.7986, "step": 2385 }, { "epoch": 0.27, "grad_norm": 0.6786569952964783, "learning_rate": 0.0001657189046743523, "loss": 0.8576, "step": 2390 }, { "epoch": 0.27, "grad_norm": 0.6088935732841492, "learning_rate": 0.0001655841837055046, "loss": 0.8909, "step": 2395 }, { "epoch": 0.27, "grad_norm": 0.6915789842605591, "learning_rate": 0.00016544925353406125, "loss": 0.7604, "step": 2400 }, { "epoch": 0.27, "grad_norm": 0.5768089890480042, "learning_rate": 0.0001653141145904271, "loss": 0.8903, "step": 2405 }, { "epoch": 0.27, "grad_norm": 0.6887524127960205, "learning_rate": 0.0001651787673056728, "loss": 0.8363, "step": 2410 }, { "epoch": 0.27, "grad_norm": 0.6299374103546143, "learning_rate": 0.00016504321211153355, "loss": 0.8185, "step": 2415 }, { "epoch": 0.28, "grad_norm": 0.6226490139961243, "learning_rate": 0.00016490744944040777, "loss": 0.8176, "step": 2420 }, { "epoch": 0.28, "grad_norm": 0.6668693423271179, "learning_rate": 0.00016477147972535577, "loss": 0.8422, "step": 2425 }, { "epoch": 0.28, "grad_norm": 0.7947617769241333, "learning_rate": 0.00016463530340009817, "loss": 0.8986, "step": 2430 }, { "epoch": 0.28, "grad_norm": 0.5857140421867371, "learning_rate": 0.00016449892089901477, "loss": 0.75, "step": 2435 }, { "epoch": 0.28, "grad_norm": 0.658028244972229, "learning_rate": 0.00016436233265714297, "loss": 0.8304, "step": 2440 }, { "epoch": 0.28, "grad_norm": 0.5767120122909546, "learning_rate": 0.00016422553911017642, "loss": 0.8077, "step": 2445 }, { "epoch": 0.28, "grad_norm": 0.607893705368042, "learning_rate": 0.00016408854069446374, "loss": 0.8232, "step": 2450 }, { "epoch": 0.28, "grad_norm": 0.6129177212715149, "learning_rate": 0.00016395133784700695, "loss": 0.8561, "step": 2455 }, { "epoch": 0.28, "grad_norm": 0.5294451117515564, "learning_rate": 0.00016381393100546026, "loss": 0.7024, "step": 2460 }, { "epoch": 0.28, "grad_norm": 0.5510571002960205, "learning_rate": 0.00016367632060812856, "loss": 0.8306, "step": 2465 }, { "epoch": 0.28, "grad_norm": 5.814347267150879, "learning_rate": 0.00016353850709396604, "loss": 0.8465, "step": 2470 }, { "epoch": 0.28, "grad_norm": 0.5485466718673706, "learning_rate": 0.00016340049090257476, "loss": 0.741, "step": 2475 }, { "epoch": 0.28, "grad_norm": 0.6270123720169067, "learning_rate": 0.00016326227247420337, "loss": 0.9109, "step": 2480 }, { "epoch": 0.28, "grad_norm": 0.6270929574966431, "learning_rate": 0.00016312385224974554, "loss": 0.8672, "step": 2485 }, { "epoch": 0.28, "grad_norm": 0.5627338886260986, "learning_rate": 0.0001629852306707387, "loss": 0.9262, "step": 2490 }, { "epoch": 0.28, "grad_norm": 0.6023781299591064, "learning_rate": 0.00016284640817936254, "loss": 0.7498, "step": 2495 }, { "epoch": 0.28, "grad_norm": 0.5960550308227539, "learning_rate": 0.00016270738521843763, "loss": 0.7668, "step": 2500 }, { "epoch": 0.28, "grad_norm": 0.5488481521606445, "learning_rate": 0.000162568162231424, "loss": 0.8611, "step": 2505 }, { "epoch": 0.29, "grad_norm": 0.547410786151886, "learning_rate": 0.00016242873966241974, "loss": 0.7795, "step": 2510 }, { "epoch": 0.29, "grad_norm": 0.5622233748435974, "learning_rate": 0.00016228911795615952, "loss": 0.8837, "step": 2515 }, { "epoch": 0.29, "grad_norm": 0.8012298941612244, "learning_rate": 0.00016214929755801335, "loss": 0.895, "step": 2520 }, { "epoch": 0.29, "grad_norm": 0.6051672101020813, "learning_rate": 0.00016200927891398489, "loss": 0.7937, "step": 2525 }, { "epoch": 0.29, "grad_norm": 0.6143134832382202, "learning_rate": 0.00016186906247071025, "loss": 0.8751, "step": 2530 }, { "epoch": 0.29, "grad_norm": 0.6904672980308533, "learning_rate": 0.0001617286486754565, "loss": 0.7504, "step": 2535 }, { "epoch": 0.29, "grad_norm": 0.5270385146141052, "learning_rate": 0.00016158803797612019, "loss": 0.8145, "step": 2540 }, { "epoch": 0.29, "grad_norm": 0.6666483879089355, "learning_rate": 0.00016144723082122596, "loss": 0.778, "step": 2545 }, { "epoch": 0.29, "grad_norm": 0.5860428214073181, "learning_rate": 0.0001613062276599251, "loss": 0.8231, "step": 2550 }, { "epoch": 0.29, "grad_norm": 0.6345640420913696, "learning_rate": 0.00016116502894199418, "loss": 0.8982, "step": 2555 }, { "epoch": 0.29, "grad_norm": 0.590721845626831, "learning_rate": 0.00016102363511783362, "loss": 0.833, "step": 2560 }, { "epoch": 0.29, "grad_norm": 0.5986994504928589, "learning_rate": 0.00016088204663846595, "loss": 0.8326, "step": 2565 }, { "epoch": 0.29, "grad_norm": 0.5553504824638367, "learning_rate": 0.00016074026395553487, "loss": 0.7604, "step": 2570 }, { "epoch": 0.29, "grad_norm": 0.6035515666007996, "learning_rate": 0.00016059828752130345, "loss": 0.7755, "step": 2575 }, { "epoch": 0.29, "grad_norm": 0.6856073141098022, "learning_rate": 0.0001604561177886528, "loss": 0.7277, "step": 2580 }, { "epoch": 0.29, "grad_norm": 0.5378929972648621, "learning_rate": 0.00016031375521108066, "loss": 0.8081, "step": 2585 }, { "epoch": 0.29, "grad_norm": 0.6315340399742126, "learning_rate": 0.00016017120024269986, "loss": 0.9446, "step": 2590 }, { "epoch": 0.3, "grad_norm": 0.5610125660896301, "learning_rate": 0.00016002845333823695, "loss": 0.7239, "step": 2595 }, { "epoch": 0.3, "grad_norm": 0.6176686882972717, "learning_rate": 0.00015988551495303073, "loss": 0.8375, "step": 2600 }, { "epoch": 0.3, "grad_norm": 0.5788987278938293, "learning_rate": 0.00015974238554303076, "loss": 0.8433, "step": 2605 }, { "epoch": 0.3, "grad_norm": 0.6443178057670593, "learning_rate": 0.00015959906556479596, "loss": 0.9211, "step": 2610 }, { "epoch": 0.3, "grad_norm": 0.6430835723876953, "learning_rate": 0.00015945555547549315, "loss": 0.8475, "step": 2615 }, { "epoch": 0.3, "grad_norm": 0.6127652525901794, "learning_rate": 0.00015931185573289555, "loss": 0.8167, "step": 2620 }, { "epoch": 0.3, "grad_norm": 0.5990703105926514, "learning_rate": 0.00015916796679538134, "loss": 0.7095, "step": 2625 }, { "epoch": 0.3, "grad_norm": 0.6349881887435913, "learning_rate": 0.00015902388912193222, "loss": 0.8973, "step": 2630 }, { "epoch": 0.3, "grad_norm": 0.5582308173179626, "learning_rate": 0.0001588796231721319, "loss": 0.7662, "step": 2635 }, { "epoch": 0.3, "grad_norm": 0.5647494792938232, "learning_rate": 0.00015873516940616468, "loss": 0.7906, "step": 2640 }, { "epoch": 0.3, "grad_norm": 0.630150556564331, "learning_rate": 0.00015859052828481394, "loss": 0.8536, "step": 2645 }, { "epoch": 0.3, "grad_norm": 0.6978116035461426, "learning_rate": 0.0001584457002694607, "loss": 0.7525, "step": 2650 }, { "epoch": 0.3, "grad_norm": 0.6089206337928772, "learning_rate": 0.00015830068582208217, "loss": 0.849, "step": 2655 }, { "epoch": 0.3, "grad_norm": 0.5681561231613159, "learning_rate": 0.0001581554854052502, "loss": 0.8956, "step": 2660 }, { "epoch": 0.3, "grad_norm": 0.6889328956604004, "learning_rate": 0.0001580100994821299, "loss": 0.8592, "step": 2665 }, { "epoch": 0.3, "grad_norm": 0.5611298084259033, "learning_rate": 0.0001578645285164781, "loss": 0.8436, "step": 2670 }, { "epoch": 0.3, "grad_norm": 0.6381964087486267, "learning_rate": 0.00015771877297264184, "loss": 0.7636, "step": 2675 }, { "epoch": 0.3, "grad_norm": 0.6424821615219116, "learning_rate": 0.00015757283331555697, "loss": 0.8919, "step": 2680 }, { "epoch": 0.31, "grad_norm": 0.5920271873474121, "learning_rate": 0.00015742671001074668, "loss": 0.9166, "step": 2685 }, { "epoch": 0.31, "grad_norm": 0.6440754532814026, "learning_rate": 0.00015728040352431982, "loss": 0.8435, "step": 2690 }, { "epoch": 0.31, "grad_norm": 0.6160061359405518, "learning_rate": 0.00015713391432296977, "loss": 0.7851, "step": 2695 }, { "epoch": 0.31, "grad_norm": 0.5635654926300049, "learning_rate": 0.00015698724287397254, "loss": 0.8102, "step": 2700 }, { "epoch": 0.31, "grad_norm": 0.5842839479446411, "learning_rate": 0.00015684038964518558, "loss": 0.878, "step": 2705 }, { "epoch": 0.31, "grad_norm": 0.6162332892417908, "learning_rate": 0.00015669335510504618, "loss": 0.8285, "step": 2710 }, { "epoch": 0.31, "grad_norm": 0.5869714617729187, "learning_rate": 0.00015654613972256997, "loss": 0.8949, "step": 2715 }, { "epoch": 0.31, "grad_norm": 0.6483595371246338, "learning_rate": 0.00015639874396734943, "loss": 0.739, "step": 2720 }, { "epoch": 0.31, "grad_norm": 0.6540059447288513, "learning_rate": 0.00015625116830955243, "loss": 0.9031, "step": 2725 }, { "epoch": 0.31, "grad_norm": 0.5853153467178345, "learning_rate": 0.00015610341321992068, "loss": 0.7753, "step": 2730 }, { "epoch": 0.31, "grad_norm": 0.521800696849823, "learning_rate": 0.0001559554791697682, "loss": 0.8599, "step": 2735 }, { "epoch": 0.31, "grad_norm": 0.5266906023025513, "learning_rate": 0.00015580736663097996, "loss": 0.8546, "step": 2740 }, { "epoch": 0.31, "grad_norm": 0.585526168346405, "learning_rate": 0.00015565907607601023, "loss": 0.7511, "step": 2745 }, { "epoch": 0.31, "grad_norm": 0.5523586273193359, "learning_rate": 0.00015551060797788107, "loss": 0.8193, "step": 2750 }, { "epoch": 0.31, "grad_norm": 0.6577056050300598, "learning_rate": 0.00015536196281018097, "loss": 0.8619, "step": 2755 }, { "epoch": 0.31, "grad_norm": 0.5628554224967957, "learning_rate": 0.00015521314104706318, "loss": 0.7435, "step": 2760 }, { "epoch": 0.31, "grad_norm": 0.5676989555358887, "learning_rate": 0.00015506414316324426, "loss": 0.8461, "step": 2765 }, { "epoch": 0.31, "grad_norm": 0.6753340363502502, "learning_rate": 0.0001549149696340026, "loss": 0.8576, "step": 2770 }, { "epoch": 0.32, "grad_norm": 0.6714786887168884, "learning_rate": 0.00015476562093517688, "loss": 0.8376, "step": 2775 }, { "epoch": 0.32, "grad_norm": 0.5616926550865173, "learning_rate": 0.00015461609754316446, "loss": 0.7985, "step": 2780 }, { "epoch": 0.32, "grad_norm": 0.5463203191757202, "learning_rate": 0.00015446639993492003, "loss": 0.844, "step": 2785 }, { "epoch": 0.32, "grad_norm": 0.6310843229293823, "learning_rate": 0.00015431652858795394, "loss": 0.8265, "step": 2790 }, { "epoch": 0.32, "grad_norm": 0.6531693935394287, "learning_rate": 0.00015416648398033076, "loss": 0.9024, "step": 2795 }, { "epoch": 0.32, "grad_norm": 0.4960426688194275, "learning_rate": 0.00015401626659066774, "loss": 0.8993, "step": 2800 }, { "epoch": 0.32, "grad_norm": 0.6042740345001221, "learning_rate": 0.0001538658768981333, "loss": 0.8909, "step": 2805 }, { "epoch": 0.32, "grad_norm": 0.579511284828186, "learning_rate": 0.00015371531538244546, "loss": 0.7696, "step": 2810 }, { "epoch": 0.32, "grad_norm": 0.6731118559837341, "learning_rate": 0.00015356458252387025, "loss": 0.7309, "step": 2815 }, { "epoch": 0.32, "grad_norm": 0.5301114916801453, "learning_rate": 0.00015341367880322042, "loss": 0.7494, "step": 2820 }, { "epoch": 0.32, "grad_norm": 0.7461742162704468, "learning_rate": 0.00015326260470185352, "loss": 0.8429, "step": 2825 }, { "epoch": 0.32, "grad_norm": 0.6235002875328064, "learning_rate": 0.00015311136070167075, "loss": 0.7959, "step": 2830 }, { "epoch": 0.32, "grad_norm": 0.5418492555618286, "learning_rate": 0.00015295994728511532, "loss": 0.8112, "step": 2835 }, { "epoch": 0.32, "grad_norm": 0.6243062019348145, "learning_rate": 0.0001528083649351706, "loss": 0.8134, "step": 2840 }, { "epoch": 0.32, "grad_norm": 0.5746551156044006, "learning_rate": 0.00015265661413535906, "loss": 0.8692, "step": 2845 }, { "epoch": 0.32, "grad_norm": 0.7506961226463318, "learning_rate": 0.00015250469536974042, "loss": 0.8399, "step": 2850 }, { "epoch": 0.32, "grad_norm": 0.6415050625801086, "learning_rate": 0.00015235260912291012, "loss": 0.7829, "step": 2855 }, { "epoch": 0.33, "grad_norm": 0.6047353148460388, "learning_rate": 0.00015220035587999796, "loss": 0.8918, "step": 2860 }, { "epoch": 0.33, "grad_norm": 0.573059618473053, "learning_rate": 0.00015204793612666627, "loss": 0.817, "step": 2865 }, { "epoch": 0.33, "grad_norm": 0.5482955574989319, "learning_rate": 0.00015189535034910873, "loss": 0.7738, "step": 2870 }, { "epoch": 0.33, "grad_norm": 0.5009284615516663, "learning_rate": 0.00015174259903404845, "loss": 0.7723, "step": 2875 }, { "epoch": 0.33, "grad_norm": 0.5775097012519836, "learning_rate": 0.00015158968266873658, "loss": 0.804, "step": 2880 }, { "epoch": 0.33, "grad_norm": 0.5707810521125793, "learning_rate": 0.00015143660174095081, "loss": 0.7024, "step": 2885 }, { "epoch": 0.33, "grad_norm": 0.5989297032356262, "learning_rate": 0.00015128335673899375, "loss": 0.8147, "step": 2890 }, { "epoch": 0.33, "grad_norm": 0.5856032967567444, "learning_rate": 0.00015112994815169142, "loss": 0.8361, "step": 2895 }, { "epoch": 0.33, "grad_norm": 0.6352725028991699, "learning_rate": 0.0001509763764683915, "loss": 0.8242, "step": 2900 }, { "epoch": 0.33, "grad_norm": 0.613037645816803, "learning_rate": 0.00015082264217896208, "loss": 0.9165, "step": 2905 }, { "epoch": 0.33, "grad_norm": 0.555920422077179, "learning_rate": 0.00015066874577378988, "loss": 0.8409, "step": 2910 }, { "epoch": 0.33, "grad_norm": 0.616894006729126, "learning_rate": 0.00015051468774377868, "loss": 0.7981, "step": 2915 }, { "epoch": 0.33, "grad_norm": 0.6403358578681946, "learning_rate": 0.00015036046858034796, "loss": 0.9592, "step": 2920 }, { "epoch": 0.33, "grad_norm": 0.6272974014282227, "learning_rate": 0.00015020608877543102, "loss": 0.7743, "step": 2925 }, { "epoch": 0.33, "grad_norm": 0.6145214438438416, "learning_rate": 0.00015005154882147373, "loss": 0.7876, "step": 2930 }, { "epoch": 0.33, "grad_norm": 0.5387482047080994, "learning_rate": 0.00014989684921143268, "loss": 0.8426, "step": 2935 }, { "epoch": 0.33, "grad_norm": 0.6402955651283264, "learning_rate": 0.0001497419904387738, "loss": 0.8531, "step": 2940 }, { "epoch": 0.33, "grad_norm": 0.6462345719337463, "learning_rate": 0.0001495869729974708, "loss": 0.8089, "step": 2945 }, { "epoch": 0.34, "grad_norm": 0.5510848760604858, "learning_rate": 0.00014943179738200333, "loss": 0.7983, "step": 2950 }, { "epoch": 0.34, "grad_norm": 0.6539138555526733, "learning_rate": 0.00014927646408735576, "loss": 0.7698, "step": 2955 }, { "epoch": 0.34, "grad_norm": 0.5367149710655212, "learning_rate": 0.00014912097360901533, "loss": 0.7783, "step": 2960 }, { "epoch": 0.34, "grad_norm": 0.5643497109413147, "learning_rate": 0.0001489653264429707, "loss": 0.7887, "step": 2965 }, { "epoch": 0.34, "grad_norm": 0.6941254138946533, "learning_rate": 0.0001488095230857104, "loss": 0.9146, "step": 2970 }, { "epoch": 0.34, "grad_norm": 0.5869201421737671, "learning_rate": 0.00014865356403422105, "loss": 0.8394, "step": 2975 }, { "epoch": 0.34, "grad_norm": 0.8639833331108093, "learning_rate": 0.00014849744978598603, "loss": 0.7952, "step": 2980 }, { "epoch": 0.34, "grad_norm": 0.6019383668899536, "learning_rate": 0.00014834118083898373, "loss": 0.8434, "step": 2985 }, { "epoch": 0.34, "grad_norm": 0.6566647887229919, "learning_rate": 0.00014818475769168594, "loss": 0.7786, "step": 2990 }, { "epoch": 0.34, "grad_norm": 0.6752328276634216, "learning_rate": 0.00014802818084305646, "loss": 0.7453, "step": 2995 }, { "epoch": 0.34, "grad_norm": 0.5767747759819031, "learning_rate": 0.00014787145079254925, "loss": 0.8015, "step": 3000 }, { "epoch": 0.34, "grad_norm": 0.6346129775047302, "learning_rate": 0.00014771456804010702, "loss": 0.9022, "step": 3005 }, { "epoch": 0.34, "grad_norm": 0.6062515377998352, "learning_rate": 0.0001475575330861595, "loss": 0.887, "step": 3010 }, { "epoch": 0.34, "grad_norm": 0.5885165929794312, "learning_rate": 0.00014740034643162208, "loss": 0.8497, "step": 3015 }, { "epoch": 0.34, "grad_norm": 0.5770689845085144, "learning_rate": 0.00014724300857789385, "loss": 0.7701, "step": 3020 }, { "epoch": 0.34, "grad_norm": 0.6336144804954529, "learning_rate": 0.00014708552002685633, "loss": 0.7648, "step": 3025 }, { "epoch": 0.34, "grad_norm": 0.614093005657196, "learning_rate": 0.00014692788128087175, "loss": 0.7606, "step": 3030 }, { "epoch": 0.35, "grad_norm": 0.6177241206169128, "learning_rate": 0.00014677009284278127, "loss": 0.7948, "step": 3035 }, { "epoch": 0.35, "grad_norm": 0.6105360984802246, "learning_rate": 0.00014661215521590375, "loss": 0.6969, "step": 3040 }, { "epoch": 0.35, "grad_norm": 0.5755805373191833, "learning_rate": 0.00014645406890403384, "loss": 0.912, "step": 3045 }, { "epoch": 0.35, "grad_norm": 0.6132418513298035, "learning_rate": 0.00014629583441144042, "loss": 0.8358, "step": 3050 }, { "epoch": 0.35, "grad_norm": 0.5401825308799744, "learning_rate": 0.00014613745224286524, "loss": 0.8461, "step": 3055 }, { "epoch": 0.35, "grad_norm": 0.6283666491508484, "learning_rate": 0.0001459789229035208, "loss": 0.8416, "step": 3060 }, { "epoch": 0.35, "grad_norm": 0.5837119221687317, "learning_rate": 0.00014582024689908932, "loss": 0.8082, "step": 3065 }, { "epoch": 0.35, "grad_norm": 0.6010891199111938, "learning_rate": 0.0001456614247357208, "loss": 0.7427, "step": 3070 }, { "epoch": 0.35, "grad_norm": 0.625624418258667, "learning_rate": 0.00014550245692003132, "loss": 0.8802, "step": 3075 }, { "epoch": 0.35, "grad_norm": 0.5869577527046204, "learning_rate": 0.00014534334395910171, "loss": 0.8265, "step": 3080 }, { "epoch": 0.35, "grad_norm": 0.6369906067848206, "learning_rate": 0.0001451840863604758, "loss": 0.7906, "step": 3085 }, { "epoch": 0.35, "grad_norm": 0.5773164629936218, "learning_rate": 0.00014502468463215866, "loss": 0.7897, "step": 3090 }, { "epoch": 0.35, "grad_norm": 0.646680474281311, "learning_rate": 0.00014486513928261524, "loss": 0.8279, "step": 3095 }, { "epoch": 0.35, "grad_norm": 0.6799820065498352, "learning_rate": 0.00014470545082076854, "loss": 0.8948, "step": 3100 }, { "epoch": 0.35, "grad_norm": 0.5638763904571533, "learning_rate": 0.0001445456197559981, "loss": 0.8377, "step": 3105 }, { "epoch": 0.35, "grad_norm": 0.5281423330307007, "learning_rate": 0.00014438564659813833, "loss": 0.8436, "step": 3110 }, { "epoch": 0.35, "grad_norm": 0.54918372631073, "learning_rate": 0.00014422553185747692, "loss": 0.7828, "step": 3115 }, { "epoch": 0.35, "grad_norm": 0.5749617218971252, "learning_rate": 0.00014406527604475308, "loss": 0.7934, "step": 3120 }, { "epoch": 0.36, "grad_norm": 0.5960623025894165, "learning_rate": 0.00014390487967115619, "loss": 0.8148, "step": 3125 }, { "epoch": 0.36, "grad_norm": 0.5716930031776428, "learning_rate": 0.00014374434324832385, "loss": 0.9293, "step": 3130 }, { "epoch": 0.36, "grad_norm": 0.6069075465202332, "learning_rate": 0.00014358366728834044, "loss": 0.7865, "step": 3135 }, { "epoch": 0.36, "grad_norm": 0.6393450498580933, "learning_rate": 0.0001434228523037355, "loss": 0.8179, "step": 3140 }, { "epoch": 0.36, "grad_norm": 0.6741149425506592, "learning_rate": 0.00014326189880748186, "loss": 0.867, "step": 3145 }, { "epoch": 0.36, "grad_norm": 0.5840499997138977, "learning_rate": 0.00014310080731299443, "loss": 0.8286, "step": 3150 }, { "epoch": 0.36, "grad_norm": 0.5931133031845093, "learning_rate": 0.0001429395783341281, "loss": 0.9023, "step": 3155 }, { "epoch": 0.36, "grad_norm": 0.7968230843544006, "learning_rate": 0.00014277821238517643, "loss": 0.754, "step": 3160 }, { "epoch": 0.36, "grad_norm": 0.56842041015625, "learning_rate": 0.00014261670998086986, "loss": 0.6804, "step": 3165 }, { "epoch": 0.36, "grad_norm": 0.6133948564529419, "learning_rate": 0.00014245507163637407, "loss": 0.8501, "step": 3170 }, { "epoch": 0.36, "grad_norm": 0.5714240670204163, "learning_rate": 0.00014229329786728839, "loss": 0.8027, "step": 3175 }, { "epoch": 0.36, "grad_norm": 0.8033362627029419, "learning_rate": 0.00014213138918964415, "loss": 0.8119, "step": 3180 }, { "epoch": 0.36, "grad_norm": 0.7656739950180054, "learning_rate": 0.00014196934611990296, "loss": 0.8129, "step": 3185 }, { "epoch": 0.36, "grad_norm": 0.5456616282463074, "learning_rate": 0.0001418071691749552, "loss": 0.6827, "step": 3190 }, { "epoch": 0.36, "grad_norm": 0.5840951204299927, "learning_rate": 0.00014164485887211824, "loss": 0.67, "step": 3195 }, { "epoch": 0.36, "grad_norm": 0.7157981395721436, "learning_rate": 0.0001414824157291348, "loss": 0.861, "step": 3200 }, { "epoch": 0.36, "grad_norm": 0.6238237619400024, "learning_rate": 0.00014131984026417147, "loss": 0.8524, "step": 3205 }, { "epoch": 0.36, "grad_norm": 0.7766329646110535, "learning_rate": 0.00014115713299581677, "loss": 0.7376, "step": 3210 }, { "epoch": 0.37, "grad_norm": 0.5822386741638184, "learning_rate": 0.00014099429444307973, "loss": 0.9006, "step": 3215 }, { "epoch": 0.37, "grad_norm": 0.6553748250007629, "learning_rate": 0.00014083132512538815, "loss": 0.781, "step": 3220 }, { "epoch": 0.37, "grad_norm": 0.5938107967376709, "learning_rate": 0.00014066822556258693, "loss": 0.7423, "step": 3225 }, { "epoch": 0.37, "grad_norm": 0.6933189630508423, "learning_rate": 0.00014050499627493647, "loss": 0.8366, "step": 3230 }, { "epoch": 0.37, "grad_norm": 0.5675535202026367, "learning_rate": 0.00014034163778311095, "loss": 0.8206, "step": 3235 }, { "epoch": 0.37, "grad_norm": 0.49278539419174194, "learning_rate": 0.00014017815060819665, "loss": 0.7769, "step": 3240 }, { "epoch": 0.37, "grad_norm": 0.5603547096252441, "learning_rate": 0.00014001453527169035, "loss": 0.815, "step": 3245 }, { "epoch": 0.37, "grad_norm": 0.6954237818717957, "learning_rate": 0.00013985079229549772, "loss": 0.8185, "step": 3250 }, { "epoch": 0.37, "grad_norm": 0.5948113799095154, "learning_rate": 0.00013968692220193144, "loss": 0.7734, "step": 3255 }, { "epoch": 0.37, "grad_norm": 0.5735976696014404, "learning_rate": 0.00013952292551370978, "loss": 0.7777, "step": 3260 }, { "epoch": 0.37, "grad_norm": 0.6098427176475525, "learning_rate": 0.00013935880275395482, "loss": 0.689, "step": 3265 }, { "epoch": 0.37, "grad_norm": 0.5526130199432373, "learning_rate": 0.00013919455444619074, "loss": 0.7506, "step": 3270 }, { "epoch": 0.37, "grad_norm": 0.585938572883606, "learning_rate": 0.0001390301811143422, "loss": 0.794, "step": 3275 }, { "epoch": 0.37, "grad_norm": 0.737772524356842, "learning_rate": 0.00013886568328273267, "loss": 0.8794, "step": 3280 }, { "epoch": 0.37, "grad_norm": 0.5786934494972229, "learning_rate": 0.00013870106147608282, "loss": 0.8145, "step": 3285 }, { "epoch": 0.37, "grad_norm": 0.6087589859962463, "learning_rate": 0.0001385363162195087, "loss": 0.898, "step": 3290 }, { "epoch": 0.37, "grad_norm": 0.6845444440841675, "learning_rate": 0.00013837144803852016, "loss": 0.9058, "step": 3295 }, { "epoch": 0.38, "grad_norm": 0.6652143001556396, "learning_rate": 0.00013820645745901916, "loss": 0.7903, "step": 3300 }, { "epoch": 0.38, "grad_norm": 0.5612785816192627, "learning_rate": 0.00013804134500729816, "loss": 0.8815, "step": 3305 }, { "epoch": 0.38, "grad_norm": 0.5807976126670837, "learning_rate": 0.00013787611121003824, "loss": 0.7484, "step": 3310 }, { "epoch": 0.38, "grad_norm": 0.6199260950088501, "learning_rate": 0.0001377107565943077, "loss": 0.7713, "step": 3315 }, { "epoch": 0.38, "grad_norm": 0.5932952761650085, "learning_rate": 0.00013754528168756006, "loss": 0.8288, "step": 3320 }, { "epoch": 0.38, "grad_norm": 0.5941212773323059, "learning_rate": 0.00013737968701763275, "loss": 0.8343, "step": 3325 }, { "epoch": 0.38, "grad_norm": 0.5822454690933228, "learning_rate": 0.00013721397311274505, "loss": 0.7255, "step": 3330 }, { "epoch": 0.38, "grad_norm": 0.6364741921424866, "learning_rate": 0.00013704814050149663, "loss": 0.8083, "step": 3335 }, { "epoch": 0.38, "grad_norm": 0.707168459892273, "learning_rate": 0.0001368821897128659, "loss": 0.7845, "step": 3340 }, { "epoch": 0.38, "grad_norm": 0.6789791584014893, "learning_rate": 0.0001367161212762081, "loss": 0.8041, "step": 3345 }, { "epoch": 0.38, "grad_norm": 0.6019969582557678, "learning_rate": 0.00013654993572125384, "loss": 0.7461, "step": 3350 }, { "epoch": 0.38, "grad_norm": 0.9661110639572144, "learning_rate": 0.00013638363357810734, "loss": 0.7196, "step": 3355 }, { "epoch": 0.38, "grad_norm": 0.6087985038757324, "learning_rate": 0.00013621721537724458, "loss": 0.7691, "step": 3360 }, { "epoch": 0.38, "grad_norm": 0.628359854221344, "learning_rate": 0.00013605068164951193, "loss": 0.8378, "step": 3365 }, { "epoch": 0.38, "grad_norm": 0.6273655891418457, "learning_rate": 0.00013588403292612408, "loss": 0.7873, "step": 3370 }, { "epoch": 0.38, "grad_norm": 0.6866421103477478, "learning_rate": 0.00013571726973866274, "loss": 0.7953, "step": 3375 }, { "epoch": 0.38, "grad_norm": 0.6045382022857666, "learning_rate": 0.00013555039261907453, "loss": 0.8285, "step": 3380 }, { "epoch": 0.38, "grad_norm": 0.5913712978363037, "learning_rate": 0.00013538340209966966, "loss": 0.775, "step": 3385 }, { "epoch": 0.39, "grad_norm": 0.669894278049469, "learning_rate": 0.00013521629871311995, "loss": 0.7326, "step": 3390 }, { "epoch": 0.39, "grad_norm": 0.6073742508888245, "learning_rate": 0.00013504908299245738, "loss": 0.8209, "step": 3395 }, { "epoch": 0.39, "grad_norm": 0.7493303418159485, "learning_rate": 0.00013488175547107215, "loss": 0.7391, "step": 3400 }, { "epoch": 0.39, "grad_norm": 0.5117560625076294, "learning_rate": 0.00013471431668271103, "loss": 0.7678, "step": 3405 }, { "epoch": 0.39, "grad_norm": 0.5908418297767639, "learning_rate": 0.00013454676716147593, "loss": 0.8567, "step": 3410 }, { "epoch": 0.39, "grad_norm": 0.4934523403644562, "learning_rate": 0.00013437910744182178, "loss": 0.9218, "step": 3415 }, { "epoch": 0.39, "grad_norm": 0.6765584349632263, "learning_rate": 0.0001342113380585551, "loss": 0.7872, "step": 3420 }, { "epoch": 0.39, "grad_norm": 0.5264098048210144, "learning_rate": 0.0001340434595468322, "loss": 0.7688, "step": 3425 }, { "epoch": 0.39, "grad_norm": 0.621438205242157, "learning_rate": 0.00013387547244215754, "loss": 0.8054, "step": 3430 }, { "epoch": 0.39, "grad_norm": 0.8711406588554382, "learning_rate": 0.0001337073772803819, "loss": 0.8414, "step": 3435 }, { "epoch": 0.39, "grad_norm": 0.5573263764381409, "learning_rate": 0.00013353917459770078, "loss": 0.7817, "step": 3440 }, { "epoch": 0.39, "grad_norm": 0.5504346489906311, "learning_rate": 0.00013337086493065266, "loss": 0.7979, "step": 3445 }, { "epoch": 0.39, "grad_norm": 0.6491450071334839, "learning_rate": 0.00013320244881611726, "loss": 0.8133, "step": 3450 }, { "epoch": 0.39, "grad_norm": 0.5762600898742676, "learning_rate": 0.00013303392679131393, "loss": 0.7396, "step": 3455 }, { "epoch": 0.39, "grad_norm": 0.606731116771698, "learning_rate": 0.00013286529939379968, "loss": 0.8597, "step": 3460 }, { "epoch": 0.39, "grad_norm": 0.583909273147583, "learning_rate": 0.00013269656716146785, "loss": 0.7119, "step": 3465 }, { "epoch": 0.39, "grad_norm": 0.5781696438789368, "learning_rate": 0.0001325277306325461, "loss": 0.7583, "step": 3470 }, { "epoch": 0.4, "grad_norm": 0.6569092273712158, "learning_rate": 0.00013235879034559467, "loss": 0.7816, "step": 3475 }, { "epoch": 0.4, "grad_norm": 0.5959500670433044, "learning_rate": 0.0001321897468395049, "loss": 0.7365, "step": 3480 }, { "epoch": 0.4, "grad_norm": 0.6647725701332092, "learning_rate": 0.0001320206006534974, "loss": 0.8132, "step": 3485 }, { "epoch": 0.4, "grad_norm": 0.7699481248855591, "learning_rate": 0.00013185135232712022, "loss": 0.7308, "step": 3490 }, { "epoch": 0.4, "grad_norm": 0.5117635130882263, "learning_rate": 0.00013168200240024728, "loss": 0.7804, "step": 3495 }, { "epoch": 0.4, "grad_norm": 0.6221270561218262, "learning_rate": 0.00013151255141307657, "loss": 0.8068, "step": 3500 }, { "epoch": 0.4, "grad_norm": 0.6052716374397278, "learning_rate": 0.0001313429999061284, "loss": 0.7677, "step": 3505 }, { "epoch": 0.4, "grad_norm": 0.6011590361595154, "learning_rate": 0.00013117334842024385, "loss": 0.8228, "step": 3510 }, { "epoch": 0.4, "grad_norm": 0.6260893940925598, "learning_rate": 0.0001310035974965828, "loss": 0.8167, "step": 3515 }, { "epoch": 0.4, "grad_norm": 0.5733935832977295, "learning_rate": 0.0001308337476766223, "loss": 0.8315, "step": 3520 }, { "epoch": 0.4, "grad_norm": 0.6916394233703613, "learning_rate": 0.00013066379950215498, "loss": 0.735, "step": 3525 }, { "epoch": 0.4, "grad_norm": 0.7169948816299438, "learning_rate": 0.0001304937535152871, "loss": 0.7433, "step": 3530 }, { "epoch": 0.4, "grad_norm": 0.5684279203414917, "learning_rate": 0.00013032361025843705, "loss": 0.7539, "step": 3535 }, { "epoch": 0.4, "grad_norm": 0.7120059728622437, "learning_rate": 0.0001301533702743333, "loss": 0.913, "step": 3540 }, { "epoch": 0.4, "grad_norm": 0.6004089713096619, "learning_rate": 0.000129983034106013, "loss": 0.7089, "step": 3545 }, { "epoch": 0.4, "grad_norm": 0.5413163900375366, "learning_rate": 0.00012981260229682018, "loss": 0.7601, "step": 3550 }, { "epoch": 0.4, "grad_norm": 0.5603556036949158, "learning_rate": 0.0001296420753904037, "loss": 0.8225, "step": 3555 }, { "epoch": 0.4, "grad_norm": 0.6073338389396667, "learning_rate": 0.00012947145393071608, "loss": 0.8175, "step": 3560 }, { "epoch": 0.41, "grad_norm": 0.5353379249572754, "learning_rate": 0.00012930073846201116, "loss": 0.7516, "step": 3565 }, { "epoch": 0.41, "grad_norm": 0.5886621475219727, "learning_rate": 0.00012912992952884283, "loss": 0.8678, "step": 3570 }, { "epoch": 0.41, "grad_norm": 0.5950932502746582, "learning_rate": 0.0001289590276760631, "loss": 0.7118, "step": 3575 }, { "epoch": 0.41, "grad_norm": 0.6099336743354797, "learning_rate": 0.00012878803344882028, "loss": 0.909, "step": 3580 }, { "epoch": 0.41, "grad_norm": 0.662491500377655, "learning_rate": 0.00012861694739255746, "loss": 0.8293, "step": 3585 }, { "epoch": 0.41, "grad_norm": 0.49604055285453796, "learning_rate": 0.00012844577005301054, "loss": 0.7738, "step": 3590 }, { "epoch": 0.41, "grad_norm": 0.5976107716560364, "learning_rate": 0.00012827450197620672, "loss": 0.7678, "step": 3595 }, { "epoch": 0.41, "grad_norm": 0.6851217150688171, "learning_rate": 0.00012810314370846252, "loss": 0.7946, "step": 3600 }, { "epoch": 0.41, "grad_norm": 0.6138992309570312, "learning_rate": 0.00012793169579638223, "loss": 0.7757, "step": 3605 }, { "epoch": 0.41, "grad_norm": 0.5814104080200195, "learning_rate": 0.00012776015878685604, "loss": 0.8763, "step": 3610 }, { "epoch": 0.41, "grad_norm": 0.7025929689407349, "learning_rate": 0.00012758853322705836, "loss": 0.7614, "step": 3615 }, { "epoch": 0.41, "grad_norm": 0.7081409692764282, "learning_rate": 0.00012741681966444609, "loss": 0.9598, "step": 3620 }, { "epoch": 0.41, "grad_norm": 0.5396503210067749, "learning_rate": 0.0001272450186467568, "loss": 0.7691, "step": 3625 }, { "epoch": 0.41, "grad_norm": 0.5764672160148621, "learning_rate": 0.0001270731307220071, "loss": 0.7361, "step": 3630 }, { "epoch": 0.41, "grad_norm": 0.5809180736541748, "learning_rate": 0.00012690115643849078, "loss": 0.7498, "step": 3635 }, { "epoch": 0.41, "grad_norm": 0.8152880668640137, "learning_rate": 0.000126729096344777, "loss": 0.8225, "step": 3640 }, { "epoch": 0.41, "grad_norm": 0.6310808658599854, "learning_rate": 0.0001265569509897088, "loss": 0.9078, "step": 3645 }, { "epoch": 0.42, "grad_norm": 0.540705144405365, "learning_rate": 0.00012638472092240112, "loss": 0.7529, "step": 3650 }, { "epoch": 0.42, "grad_norm": 0.5447615385055542, "learning_rate": 0.00012621240669223905, "loss": 0.8246, "step": 3655 }, { "epoch": 0.42, "grad_norm": 0.5487338900566101, "learning_rate": 0.00012604000884887634, "loss": 0.8896, "step": 3660 }, { "epoch": 0.42, "grad_norm": 0.5725840926170349, "learning_rate": 0.0001258675279422332, "loss": 0.8124, "step": 3665 }, { "epoch": 0.42, "grad_norm": 0.5746605396270752, "learning_rate": 0.00012569496452249497, "loss": 0.8167, "step": 3670 }, { "epoch": 0.42, "grad_norm": 0.6197704672813416, "learning_rate": 0.00012552231914011015, "loss": 0.7547, "step": 3675 }, { "epoch": 0.42, "grad_norm": 0.5929337739944458, "learning_rate": 0.0001253495923457887, "loss": 0.8175, "step": 3680 }, { "epoch": 0.42, "grad_norm": 0.6313364505767822, "learning_rate": 0.00012517678469050022, "loss": 0.8266, "step": 3685 }, { "epoch": 0.42, "grad_norm": 0.6353349685668945, "learning_rate": 0.00012500389672547233, "loss": 0.7939, "step": 3690 }, { "epoch": 0.42, "grad_norm": 0.7956532835960388, "learning_rate": 0.00012483092900218872, "loss": 0.7978, "step": 3695 }, { "epoch": 0.42, "grad_norm": 0.6404840350151062, "learning_rate": 0.00012465788207238754, "loss": 0.8911, "step": 3700 }, { "epoch": 0.42, "grad_norm": 0.6020888686180115, "learning_rate": 0.00012448475648805965, "loss": 0.8003, "step": 3705 }, { "epoch": 0.42, "grad_norm": 0.5578533411026001, "learning_rate": 0.0001243115528014467, "loss": 0.7608, "step": 3710 }, { "epoch": 0.42, "grad_norm": 0.5616059899330139, "learning_rate": 0.0001241382715650396, "loss": 0.8111, "step": 3715 }, { "epoch": 0.42, "grad_norm": 0.5208618640899658, "learning_rate": 0.00012396491333157653, "loss": 0.7848, "step": 3720 }, { "epoch": 0.42, "grad_norm": 0.6344425082206726, "learning_rate": 0.00012379147865404126, "loss": 0.82, "step": 3725 }, { "epoch": 0.42, "grad_norm": 0.6129960417747498, "learning_rate": 0.00012361796808566154, "loss": 0.8048, "step": 3730 }, { "epoch": 0.42, "grad_norm": 0.5904735922813416, "learning_rate": 0.00012344438217990706, "loss": 0.8064, "step": 3735 }, { "epoch": 0.43, "grad_norm": 0.6212365031242371, "learning_rate": 0.00012327072149048785, "loss": 0.7793, "step": 3740 }, { "epoch": 0.43, "grad_norm": 0.5697383284568787, "learning_rate": 0.00012309698657135264, "loss": 0.8082, "step": 3745 }, { "epoch": 0.43, "grad_norm": 0.5733943581581116, "learning_rate": 0.00012292317797668665, "loss": 0.8163, "step": 3750 }, { "epoch": 0.43, "grad_norm": 0.7434484362602234, "learning_rate": 0.00012274929626091035, "loss": 0.8446, "step": 3755 }, { "epoch": 0.43, "grad_norm": 0.5016825795173645, "learning_rate": 0.00012257534197867743, "loss": 0.731, "step": 3760 }, { "epoch": 0.43, "grad_norm": 0.6584892868995667, "learning_rate": 0.00012240131568487292, "loss": 0.7483, "step": 3765 }, { "epoch": 0.43, "grad_norm": 0.5814744234085083, "learning_rate": 0.0001222272179346117, "loss": 0.718, "step": 3770 }, { "epoch": 0.43, "grad_norm": 0.7187736630439758, "learning_rate": 0.00012205304928323649, "loss": 0.8388, "step": 3775 }, { "epoch": 0.43, "grad_norm": 0.6147611141204834, "learning_rate": 0.00012187881028631621, "loss": 0.8159, "step": 3780 }, { "epoch": 0.43, "grad_norm": 0.5736737251281738, "learning_rate": 0.0001217045014996442, "loss": 0.6625, "step": 3785 }, { "epoch": 0.43, "grad_norm": 0.6499507427215576, "learning_rate": 0.00012153012347923634, "loss": 0.8721, "step": 3790 }, { "epoch": 0.43, "grad_norm": 0.6190776824951172, "learning_rate": 0.00012135567678132942, "loss": 0.7648, "step": 3795 }, { "epoch": 0.43, "grad_norm": 0.6103724837303162, "learning_rate": 0.0001211811619623793, "loss": 0.7944, "step": 3800 }, { "epoch": 0.43, "grad_norm": 0.5723038911819458, "learning_rate": 0.00012100657957905908, "loss": 0.7289, "step": 3805 }, { "epoch": 0.43, "grad_norm": 0.5608327984809875, "learning_rate": 0.00012083193018825744, "loss": 0.8117, "step": 3810 }, { "epoch": 0.43, "grad_norm": 0.5121841430664062, "learning_rate": 0.00012065721434707677, "loss": 0.9014, "step": 3815 }, { "epoch": 0.43, "grad_norm": 0.6049484014511108, "learning_rate": 0.00012048243261283143, "loss": 0.7161, "step": 3820 }, { "epoch": 0.43, "grad_norm": 0.6480629444122314, "learning_rate": 0.00012030758554304593, "loss": 0.8718, "step": 3825 }, { "epoch": 0.44, "grad_norm": 0.6752771735191345, "learning_rate": 0.00012013267369545329, "loss": 0.8241, "step": 3830 }, { "epoch": 0.44, "grad_norm": 0.6116204261779785, "learning_rate": 0.00011995769762799307, "loss": 0.8426, "step": 3835 }, { "epoch": 0.44, "grad_norm": 0.6204956769943237, "learning_rate": 0.00011978265789880973, "loss": 0.8223, "step": 3840 }, { "epoch": 0.44, "grad_norm": 0.6253282427787781, "learning_rate": 0.00011960755506625077, "loss": 0.7238, "step": 3845 }, { "epoch": 0.44, "grad_norm": 0.7981095314025879, "learning_rate": 0.00011943238968886492, "loss": 0.7958, "step": 3850 }, { "epoch": 0.44, "grad_norm": 0.67615807056427, "learning_rate": 0.00011925716232540061, "loss": 0.8668, "step": 3855 }, { "epoch": 0.44, "grad_norm": 0.6348207592964172, "learning_rate": 0.0001190818735348038, "loss": 0.8649, "step": 3860 }, { "epoch": 0.44, "grad_norm": 0.8550826907157898, "learning_rate": 0.00011890652387621643, "loss": 0.7417, "step": 3865 }, { "epoch": 0.44, "grad_norm": 0.638140082359314, "learning_rate": 0.00011873111390897475, "loss": 0.8436, "step": 3870 }, { "epoch": 0.44, "grad_norm": 0.7016919255256653, "learning_rate": 0.00011855564419260714, "loss": 0.7805, "step": 3875 }, { "epoch": 0.44, "grad_norm": 0.6126405000686646, "learning_rate": 0.00011838011528683279, "loss": 0.8705, "step": 3880 }, { "epoch": 0.44, "grad_norm": 0.6007192730903625, "learning_rate": 0.00011820452775155957, "loss": 0.7607, "step": 3885 }, { "epoch": 0.44, "grad_norm": 0.5800730586051941, "learning_rate": 0.00011802888214688235, "loss": 0.8891, "step": 3890 }, { "epoch": 0.44, "grad_norm": 0.6240999102592468, "learning_rate": 0.00011785317903308137, "loss": 0.729, "step": 3895 }, { "epoch": 0.44, "grad_norm": 0.5820502638816833, "learning_rate": 0.00011767741897062017, "loss": 0.7398, "step": 3900 }, { "epoch": 0.44, "grad_norm": 0.7410991787910461, "learning_rate": 0.00011750160252014402, "loss": 0.7372, "step": 3905 }, { "epoch": 0.44, "grad_norm": 0.6214268207550049, "learning_rate": 0.00011732573024247804, "loss": 0.7226, "step": 3910 }, { "epoch": 0.45, "grad_norm": 0.7160118222236633, "learning_rate": 0.00011714980269862538, "loss": 0.8079, "step": 3915 }, { "epoch": 0.45, "grad_norm": 0.6754052042961121, "learning_rate": 0.00011697382044976564, "loss": 0.7681, "step": 3920 }, { "epoch": 0.45, "grad_norm": 0.6616350412368774, "learning_rate": 0.00011679778405725274, "loss": 0.8621, "step": 3925 }, { "epoch": 0.45, "grad_norm": 0.452217698097229, "learning_rate": 0.00011662169408261339, "loss": 0.7792, "step": 3930 }, { "epoch": 0.45, "grad_norm": 0.6552872061729431, "learning_rate": 0.00011644555108754517, "loss": 0.8509, "step": 3935 }, { "epoch": 0.45, "grad_norm": 0.6112470626831055, "learning_rate": 0.0001162693556339149, "loss": 0.852, "step": 3940 }, { "epoch": 0.45, "grad_norm": 0.6591911315917969, "learning_rate": 0.00011609310828375661, "loss": 0.8543, "step": 3945 }, { "epoch": 0.45, "grad_norm": 0.6253312826156616, "learning_rate": 0.00011591680959926994, "loss": 0.79, "step": 3950 }, { "epoch": 0.45, "grad_norm": 0.5114595890045166, "learning_rate": 0.00011574046014281823, "loss": 0.8251, "step": 3955 }, { "epoch": 0.45, "grad_norm": 0.5724785923957825, "learning_rate": 0.0001155640604769268, "loss": 0.8662, "step": 3960 }, { "epoch": 0.45, "grad_norm": 0.6301324963569641, "learning_rate": 0.00011538761116428118, "loss": 0.7555, "step": 3965 }, { "epoch": 0.45, "grad_norm": 0.74462890625, "learning_rate": 0.00011521111276772518, "loss": 0.9435, "step": 3970 }, { "epoch": 0.45, "grad_norm": 0.7058918476104736, "learning_rate": 0.00011503456585025918, "loss": 0.8144, "step": 3975 }, { "epoch": 0.45, "grad_norm": 0.5204983949661255, "learning_rate": 0.00011485797097503848, "loss": 0.894, "step": 3980 }, { "epoch": 0.45, "grad_norm": 0.6754153370857239, "learning_rate": 0.00011468132870537112, "loss": 0.8955, "step": 3985 }, { "epoch": 0.45, "grad_norm": 0.5927032232284546, "learning_rate": 0.00011450463960471651, "loss": 0.82, "step": 3990 }, { "epoch": 0.45, "grad_norm": 0.5471856594085693, "learning_rate": 0.00011432790423668338, "loss": 0.7896, "step": 3995 }, { "epoch": 0.45, "grad_norm": 0.5610169768333435, "learning_rate": 0.00011415112316502803, "loss": 0.6074, "step": 4000 }, { "epoch": 0.46, "grad_norm": 0.6560962200164795, "learning_rate": 0.0001139742969536526, "loss": 0.7843, "step": 4005 }, { "epoch": 0.46, "grad_norm": 0.6780028939247131, "learning_rate": 0.0001137974261666031, "loss": 0.851, "step": 4010 }, { "epoch": 0.46, "grad_norm": 0.6535429954528809, "learning_rate": 0.00011362051136806789, "loss": 0.7268, "step": 4015 }, { "epoch": 0.46, "grad_norm": 0.7623457908630371, "learning_rate": 0.0001134435531223756, "loss": 0.8387, "step": 4020 }, { "epoch": 0.46, "grad_norm": 0.5498214364051819, "learning_rate": 0.00011326655199399345, "loss": 0.7551, "step": 4025 }, { "epoch": 0.46, "grad_norm": 0.6058107614517212, "learning_rate": 0.00011308950854752558, "loss": 0.765, "step": 4030 }, { "epoch": 0.46, "grad_norm": 0.5960306525230408, "learning_rate": 0.00011291242334771095, "loss": 0.8633, "step": 4035 }, { "epoch": 0.46, "grad_norm": 0.6701599359512329, "learning_rate": 0.00011273529695942183, "loss": 0.7851, "step": 4040 }, { "epoch": 0.46, "grad_norm": 0.5869520902633667, "learning_rate": 0.00011255812994766175, "loss": 0.8756, "step": 4045 }, { "epoch": 0.46, "grad_norm": 0.6017047762870789, "learning_rate": 0.00011238092287756397, "loss": 0.6697, "step": 4050 }, { "epoch": 0.46, "grad_norm": 0.7067684531211853, "learning_rate": 0.00011220367631438942, "loss": 0.7732, "step": 4055 }, { "epoch": 0.46, "grad_norm": 0.6048575043678284, "learning_rate": 0.00011202639082352506, "loss": 0.7937, "step": 4060 }, { "epoch": 0.46, "grad_norm": 0.6676318049430847, "learning_rate": 0.00011184906697048201, "loss": 0.8011, "step": 4065 }, { "epoch": 0.46, "grad_norm": 0.5223115086555481, "learning_rate": 0.00011167170532089369, "loss": 0.7453, "step": 4070 }, { "epoch": 0.46, "grad_norm": 0.5543930530548096, "learning_rate": 0.00011149430644051424, "loss": 0.8195, "step": 4075 }, { "epoch": 0.46, "grad_norm": 0.8007245063781738, "learning_rate": 0.0001113168708952164, "loss": 0.889, "step": 4080 }, { "epoch": 0.46, "grad_norm": 0.5854726433753967, "learning_rate": 0.00011113939925098997, "loss": 0.8129, "step": 4085 }, { "epoch": 0.47, "grad_norm": 0.5896696448326111, "learning_rate": 0.00011096189207393987, "loss": 0.7341, "step": 4090 }, { "epoch": 0.47, "grad_norm": 0.6035794019699097, "learning_rate": 0.00011078434993028431, "loss": 0.7217, "step": 4095 }, { "epoch": 0.47, "grad_norm": 0.5358508825302124, "learning_rate": 0.0001106067733863531, "loss": 0.767, "step": 4100 }, { "epoch": 0.47, "grad_norm": 0.6229228973388672, "learning_rate": 0.00011042916300858583, "loss": 0.7915, "step": 4105 }, { "epoch": 0.47, "grad_norm": 0.5339498519897461, "learning_rate": 0.00011025151936352987, "loss": 0.7711, "step": 4110 }, { "epoch": 0.47, "grad_norm": 0.8428621292114258, "learning_rate": 0.00011007384301783883, "loss": 0.8321, "step": 4115 }, { "epoch": 0.47, "grad_norm": 0.6193743348121643, "learning_rate": 0.00010989613453827057, "loss": 0.7195, "step": 4120 }, { "epoch": 0.47, "grad_norm": 0.7001969814300537, "learning_rate": 0.00010971839449168543, "loss": 0.7381, "step": 4125 }, { "epoch": 0.47, "grad_norm": 0.6056939959526062, "learning_rate": 0.00010954062344504458, "loss": 0.8131, "step": 4130 }, { "epoch": 0.47, "grad_norm": 0.7164072394371033, "learning_rate": 0.00010936282196540788, "loss": 0.8643, "step": 4135 }, { "epoch": 0.47, "grad_norm": 0.6174270510673523, "learning_rate": 0.00010918499061993241, "loss": 0.7746, "step": 4140 }, { "epoch": 0.47, "grad_norm": 0.602906346321106, "learning_rate": 0.00010900712997587047, "loss": 0.8276, "step": 4145 }, { "epoch": 0.47, "grad_norm": 0.6625570058822632, "learning_rate": 0.0001088292406005678, "loss": 0.8349, "step": 4150 }, { "epoch": 0.47, "grad_norm": 0.6430180072784424, "learning_rate": 0.00010865132306146182, "loss": 0.911, "step": 4155 }, { "epoch": 0.47, "grad_norm": 0.6040902733802795, "learning_rate": 0.00010847337792607978, "loss": 0.7391, "step": 4160 }, { "epoch": 0.47, "grad_norm": 0.6236419081687927, "learning_rate": 0.00010829540576203695, "loss": 0.7698, "step": 4165 }, { "epoch": 0.47, "grad_norm": 0.704279363155365, "learning_rate": 0.00010811740713703476, "loss": 0.7582, "step": 4170 }, { "epoch": 0.47, "grad_norm": 0.5443447232246399, "learning_rate": 0.00010793938261885916, "loss": 0.6971, "step": 4175 }, { "epoch": 0.48, "grad_norm": 0.5762811303138733, "learning_rate": 0.00010776133277537865, "loss": 0.7751, "step": 4180 }, { "epoch": 0.48, "grad_norm": 0.5704807639122009, "learning_rate": 0.00010758325817454248, "loss": 0.7032, "step": 4185 }, { "epoch": 0.48, "grad_norm": 0.5983087420463562, "learning_rate": 0.0001074051593843789, "loss": 0.676, "step": 4190 }, { "epoch": 0.48, "grad_norm": 0.5941609740257263, "learning_rate": 0.00010722703697299328, "loss": 0.7671, "step": 4195 }, { "epoch": 0.48, "grad_norm": 0.6191790103912354, "learning_rate": 0.0001070488915085664, "loss": 0.7947, "step": 4200 }, { "epoch": 0.48, "grad_norm": 0.6260554194450378, "learning_rate": 0.00010687072355935257, "loss": 0.88, "step": 4205 }, { "epoch": 0.48, "grad_norm": 0.5160613059997559, "learning_rate": 0.00010669253369367775, "loss": 0.7526, "step": 4210 }, { "epoch": 0.48, "grad_norm": 0.5811184644699097, "learning_rate": 0.00010651432247993794, "loss": 0.7775, "step": 4215 }, { "epoch": 0.48, "grad_norm": 0.5606276392936707, "learning_rate": 0.00010633609048659705, "loss": 0.8119, "step": 4220 }, { "epoch": 0.48, "grad_norm": 0.6187065243721008, "learning_rate": 0.00010615783828218547, "loss": 0.8063, "step": 4225 }, { "epoch": 0.48, "grad_norm": 0.5912905931472778, "learning_rate": 0.0001059795664352979, "loss": 0.7661, "step": 4230 }, { "epoch": 0.48, "grad_norm": 0.7005530595779419, "learning_rate": 0.00010580127551459178, "loss": 0.7361, "step": 4235 }, { "epoch": 0.48, "grad_norm": 0.58009934425354, "learning_rate": 0.00010562296608878545, "loss": 0.7438, "step": 4240 }, { "epoch": 0.48, "grad_norm": 0.7159207463264465, "learning_rate": 0.00010544463872665611, "loss": 0.8424, "step": 4245 }, { "epoch": 0.48, "grad_norm": 0.5975012183189392, "learning_rate": 0.00010526629399703833, "loss": 0.8002, "step": 4250 }, { "epoch": 0.48, "grad_norm": 0.7247589230537415, "learning_rate": 0.00010508793246882202, "loss": 0.7255, "step": 4255 }, { "epoch": 0.48, "grad_norm": 0.6801584959030151, "learning_rate": 0.0001049095547109506, "loss": 0.8668, "step": 4260 }, { "epoch": 0.48, "grad_norm": 0.596948504447937, "learning_rate": 0.00010473116129241944, "loss": 0.7466, "step": 4265 }, { "epoch": 0.49, "grad_norm": 0.8162465691566467, "learning_rate": 0.0001045527527822737, "loss": 0.7843, "step": 4270 }, { "epoch": 0.49, "grad_norm": 0.6837438941001892, "learning_rate": 0.00010437432974960674, "loss": 0.8472, "step": 4275 }, { "epoch": 0.49, "grad_norm": 0.6489061713218689, "learning_rate": 0.00010419589276355826, "loss": 0.8139, "step": 4280 }, { "epoch": 0.49, "grad_norm": 0.6814378499984741, "learning_rate": 0.00010401744239331243, "loss": 0.7912, "step": 4285 }, { "epoch": 0.49, "grad_norm": 0.6457976698875427, "learning_rate": 0.00010383897920809618, "loss": 0.9099, "step": 4290 }, { "epoch": 0.49, "grad_norm": 0.547829806804657, "learning_rate": 0.00010366050377717722, "loss": 0.7389, "step": 4295 }, { "epoch": 0.49, "grad_norm": 0.6295444965362549, "learning_rate": 0.00010348201666986241, "loss": 0.8012, "step": 4300 }, { "epoch": 0.49, "grad_norm": 0.6654611825942993, "learning_rate": 0.00010330351845549578, "loss": 0.9024, "step": 4305 }, { "epoch": 0.49, "grad_norm": 0.5944251418113708, "learning_rate": 0.00010312500970345688, "loss": 0.7721, "step": 4310 }, { "epoch": 0.49, "grad_norm": 0.6546233296394348, "learning_rate": 0.0001029464909831588, "loss": 0.7065, "step": 4315 }, { "epoch": 0.49, "grad_norm": 0.6331318020820618, "learning_rate": 0.00010276796286404644, "loss": 0.7166, "step": 4320 }, { "epoch": 0.49, "grad_norm": 0.9102218151092529, "learning_rate": 0.00010258942591559475, "loss": 0.8608, "step": 4325 }, { "epoch": 0.49, "grad_norm": 0.5956624150276184, "learning_rate": 0.00010241088070730669, "loss": 0.7382, "step": 4330 }, { "epoch": 0.49, "grad_norm": 0.6455515623092651, "learning_rate": 0.00010223232780871173, "loss": 0.8699, "step": 4335 }, { "epoch": 0.49, "grad_norm": 0.6276233196258545, "learning_rate": 0.00010205376778936379, "loss": 0.8406, "step": 4340 }, { "epoch": 0.49, "grad_norm": 0.6347047090530396, "learning_rate": 0.0001018752012188395, "loss": 0.7482, "step": 4345 }, { "epoch": 0.49, "grad_norm": 0.6462671756744385, "learning_rate": 0.00010169662866673646, "loss": 0.7888, "step": 4350 }, { "epoch": 0.5, "grad_norm": 0.7501237392425537, "learning_rate": 0.00010151805070267121, "loss": 0.7205, "step": 4355 }, { "epoch": 0.5, "grad_norm": 0.6664027571678162, "learning_rate": 0.00010133946789627773, "loss": 0.7484, "step": 4360 }, { "epoch": 0.5, "grad_norm": 0.5977053642272949, "learning_rate": 0.00010116088081720527, "loss": 0.7813, "step": 4365 }, { "epoch": 0.5, "grad_norm": 0.5906143188476562, "learning_rate": 0.00010098229003511683, "loss": 0.7464, "step": 4370 }, { "epoch": 0.5, "grad_norm": 0.6777781844139099, "learning_rate": 0.00010080369611968723, "loss": 0.7871, "step": 4375 }, { "epoch": 0.5, "grad_norm": 0.6508293747901917, "learning_rate": 0.00010062509964060118, "loss": 0.7998, "step": 4380 }, { "epoch": 0.5, "grad_norm": 0.6403629779815674, "learning_rate": 0.00010044650116755165, "loss": 0.7488, "step": 4385 }, { "epoch": 0.5, "grad_norm": 0.5627826452255249, "learning_rate": 0.00010026790127023793, "loss": 0.7261, "step": 4390 }, { "epoch": 0.5, "grad_norm": 0.5952783823013306, "learning_rate": 0.0001000893005183639, "loss": 0.7672, "step": 4395 }, { "epoch": 0.5, "grad_norm": 0.5621878504753113, "learning_rate": 9.991069948163614e-05, "loss": 0.7098, "step": 4400 }, { "epoch": 0.5, "grad_norm": 0.7619518041610718, "learning_rate": 9.97320987297621e-05, "loss": 0.8246, "step": 4405 }, { "epoch": 0.5, "grad_norm": 0.6683063507080078, "learning_rate": 9.955349883244837e-05, "loss": 0.7404, "step": 4410 }, { "epoch": 0.5, "grad_norm": 0.5906397104263306, "learning_rate": 9.937490035939885e-05, "loss": 0.7627, "step": 4415 }, { "epoch": 0.5, "grad_norm": 0.6680077910423279, "learning_rate": 9.919630388031278e-05, "loss": 0.7825, "step": 4420 }, { "epoch": 0.5, "grad_norm": 0.7676994800567627, "learning_rate": 9.901770996488315e-05, "loss": 0.8636, "step": 4425 }, { "epoch": 0.5, "grad_norm": 0.7993883490562439, "learning_rate": 9.883911918279476e-05, "loss": 0.8637, "step": 4430 }, { "epoch": 0.5, "grad_norm": 0.6810579895973206, "learning_rate": 9.86605321037223e-05, "loss": 0.7691, "step": 4435 }, { "epoch": 0.5, "grad_norm": 0.6630615592002869, "learning_rate": 9.84819492973288e-05, "loss": 0.8173, "step": 4440 }, { "epoch": 0.51, "grad_norm": 0.6403682231903076, "learning_rate": 9.830337133326355e-05, "loss": 0.9035, "step": 4445 }, { "epoch": 0.51, "grad_norm": 0.5860075354576111, "learning_rate": 9.81247987811605e-05, "loss": 0.8773, "step": 4450 }, { "epoch": 0.51, "grad_norm": 0.5799443125724792, "learning_rate": 9.794623221063625e-05, "loss": 0.8335, "step": 4455 }, { "epoch": 0.51, "grad_norm": 0.6200720071792603, "learning_rate": 9.776767219128828e-05, "loss": 0.7708, "step": 4460 }, { "epoch": 0.51, "grad_norm": 0.6964035630226135, "learning_rate": 9.758911929269334e-05, "loss": 0.8915, "step": 4465 }, { "epoch": 0.51, "grad_norm": 0.5871273875236511, "learning_rate": 9.741057408440528e-05, "loss": 0.7691, "step": 4470 }, { "epoch": 0.51, "grad_norm": 0.5692684054374695, "learning_rate": 9.723203713595355e-05, "loss": 0.7232, "step": 4475 }, { "epoch": 0.51, "grad_norm": 0.6924221515655518, "learning_rate": 9.705350901684119e-05, "loss": 0.819, "step": 4480 }, { "epoch": 0.51, "grad_norm": 0.6366986036300659, "learning_rate": 9.687499029654314e-05, "loss": 0.6907, "step": 4485 }, { "epoch": 0.51, "grad_norm": 0.6119223237037659, "learning_rate": 9.669648154450425e-05, "loss": 0.8127, "step": 4490 }, { "epoch": 0.51, "grad_norm": 0.6606696248054504, "learning_rate": 9.651798333013762e-05, "loss": 0.7755, "step": 4495 }, { "epoch": 0.51, "grad_norm": 0.566644549369812, "learning_rate": 9.63394962228228e-05, "loss": 0.9107, "step": 4500 }, { "epoch": 0.51, "grad_norm": 0.6656118035316467, "learning_rate": 9.616102079190382e-05, "loss": 0.7586, "step": 4505 }, { "epoch": 0.51, "grad_norm": 0.5353782773017883, "learning_rate": 9.598255760668758e-05, "loss": 0.6815, "step": 4510 }, { "epoch": 0.51, "grad_norm": 0.7555747032165527, "learning_rate": 9.580410723644177e-05, "loss": 0.9274, "step": 4515 }, { "epoch": 0.51, "grad_norm": 0.5931302905082703, "learning_rate": 9.562567025039327e-05, "loss": 0.7938, "step": 4520 }, { "epoch": 0.51, "grad_norm": 0.6406871676445007, "learning_rate": 9.544724721772631e-05, "loss": 0.8135, "step": 4525 }, { "epoch": 0.52, "grad_norm": 0.5799956321716309, "learning_rate": 9.526883870758056e-05, "loss": 0.7286, "step": 4530 }, { "epoch": 0.52, "grad_norm": 0.6960060596466064, "learning_rate": 9.50904452890494e-05, "loss": 0.8437, "step": 4535 }, { "epoch": 0.52, "grad_norm": 0.5997190475463867, "learning_rate": 9.491206753117803e-05, "loss": 0.769, "step": 4540 }, { "epoch": 0.52, "grad_norm": 0.5737845301628113, "learning_rate": 9.473370600296169e-05, "loss": 0.7596, "step": 4545 }, { "epoch": 0.52, "grad_norm": 0.7565434575080872, "learning_rate": 9.45553612733439e-05, "loss": 0.7319, "step": 4550 }, { "epoch": 0.52, "grad_norm": 0.6088534593582153, "learning_rate": 9.437703391121456e-05, "loss": 0.7568, "step": 4555 }, { "epoch": 0.52, "grad_norm": 0.6066818833351135, "learning_rate": 9.419872448540821e-05, "loss": 0.6767, "step": 4560 }, { "epoch": 0.52, "grad_norm": 0.5788107514381409, "learning_rate": 9.402043356470215e-05, "loss": 0.7041, "step": 4565 }, { "epoch": 0.52, "grad_norm": 0.7555676102638245, "learning_rate": 9.384216171781457e-05, "loss": 0.6752, "step": 4570 }, { "epoch": 0.52, "grad_norm": 0.5866352915763855, "learning_rate": 9.366390951340297e-05, "loss": 0.8096, "step": 4575 }, { "epoch": 0.52, "grad_norm": 0.5765191912651062, "learning_rate": 9.348567752006207e-05, "loss": 0.7834, "step": 4580 }, { "epoch": 0.52, "grad_norm": 0.5642898082733154, "learning_rate": 9.330746630632224e-05, "loss": 0.8147, "step": 4585 }, { "epoch": 0.52, "grad_norm": 0.7535125017166138, "learning_rate": 9.312927644064741e-05, "loss": 0.8679, "step": 4590 }, { "epoch": 0.52, "grad_norm": 0.6822525262832642, "learning_rate": 9.295110849143361e-05, "loss": 0.8211, "step": 4595 }, { "epoch": 0.52, "grad_norm": 0.6119083762168884, "learning_rate": 9.277296302700676e-05, "loss": 0.7726, "step": 4600 }, { "epoch": 0.52, "grad_norm": 0.6004748344421387, "learning_rate": 9.259484061562113e-05, "loss": 0.7189, "step": 4605 }, { "epoch": 0.52, "grad_norm": 0.5561325550079346, "learning_rate": 9.241674182545754e-05, "loss": 0.7632, "step": 4610 }, { "epoch": 0.52, "grad_norm": 0.674069344997406, "learning_rate": 9.223866722462134e-05, "loss": 0.734, "step": 4615 }, { "epoch": 0.53, "grad_norm": 0.6486625075340271, "learning_rate": 9.206061738114086e-05, "loss": 0.7448, "step": 4620 }, { "epoch": 0.53, "grad_norm": 0.585586428642273, "learning_rate": 9.188259286296528e-05, "loss": 0.6513, "step": 4625 }, { "epoch": 0.53, "grad_norm": 0.6953662037849426, "learning_rate": 9.170459423796309e-05, "loss": 0.7195, "step": 4630 }, { "epoch": 0.53, "grad_norm": 0.6233338117599487, "learning_rate": 9.152662207392024e-05, "loss": 0.8018, "step": 4635 }, { "epoch": 0.53, "grad_norm": 0.6842007637023926, "learning_rate": 9.134867693853816e-05, "loss": 0.7632, "step": 4640 }, { "epoch": 0.53, "grad_norm": 0.6602983474731445, "learning_rate": 9.117075939943221e-05, "loss": 0.7252, "step": 4645 }, { "epoch": 0.53, "grad_norm": 0.5861719846725464, "learning_rate": 9.099287002412956e-05, "loss": 0.6509, "step": 4650 }, { "epoch": 0.53, "grad_norm": 0.6650002002716064, "learning_rate": 9.081500938006761e-05, "loss": 0.8915, "step": 4655 }, { "epoch": 0.53, "grad_norm": 0.5285857915878296, "learning_rate": 9.063717803459213e-05, "loss": 0.7512, "step": 4660 }, { "epoch": 0.53, "grad_norm": 0.5826857089996338, "learning_rate": 9.045937655495544e-05, "loss": 0.9319, "step": 4665 }, { "epoch": 0.53, "grad_norm": 0.6033085584640503, "learning_rate": 9.028160550831458e-05, "loss": 0.7979, "step": 4670 }, { "epoch": 0.53, "grad_norm": 0.7138378620147705, "learning_rate": 9.010386546172949e-05, "loss": 0.763, "step": 4675 }, { "epoch": 0.53, "grad_norm": 0.6795177459716797, "learning_rate": 8.99261569821612e-05, "loss": 0.7867, "step": 4680 }, { "epoch": 0.53, "grad_norm": 0.6258403658866882, "learning_rate": 8.974848063647015e-05, "loss": 0.7436, "step": 4685 }, { "epoch": 0.53, "grad_norm": 0.6418249607086182, "learning_rate": 8.957083699141419e-05, "loss": 0.9125, "step": 4690 }, { "epoch": 0.53, "grad_norm": 0.5499231815338135, "learning_rate": 8.939322661364689e-05, "loss": 0.7395, "step": 4695 }, { "epoch": 0.53, "grad_norm": 0.5704962015151978, "learning_rate": 8.92156500697157e-05, "loss": 0.7699, "step": 4700 }, { "epoch": 0.53, "grad_norm": 0.6621077060699463, "learning_rate": 8.903810792606018e-05, "loss": 0.7511, "step": 4705 }, { "epoch": 0.54, "grad_norm": 0.49777495861053467, "learning_rate": 8.886060074901005e-05, "loss": 0.6749, "step": 4710 }, { "epoch": 0.54, "grad_norm": 0.609890878200531, "learning_rate": 8.868312910478362e-05, "loss": 0.6744, "step": 4715 }, { "epoch": 0.54, "grad_norm": 0.6777181625366211, "learning_rate": 8.85056935594858e-05, "loss": 0.8498, "step": 4720 }, { "epoch": 0.54, "grad_norm": 0.6590617895126343, "learning_rate": 8.832829467910631e-05, "loss": 0.7196, "step": 4725 }, { "epoch": 0.54, "grad_norm": 0.7174111604690552, "learning_rate": 8.815093302951804e-05, "loss": 0.7889, "step": 4730 }, { "epoch": 0.54, "grad_norm": 0.7540302276611328, "learning_rate": 8.797360917647498e-05, "loss": 0.8266, "step": 4735 }, { "epoch": 0.54, "grad_norm": 0.6515617966651917, "learning_rate": 8.77963236856106e-05, "loss": 0.8768, "step": 4740 }, { "epoch": 0.54, "grad_norm": 0.5930551290512085, "learning_rate": 8.761907712243606e-05, "loss": 0.7096, "step": 4745 }, { "epoch": 0.54, "grad_norm": 0.6492053866386414, "learning_rate": 8.744187005233826e-05, "loss": 0.7805, "step": 4750 }, { "epoch": 0.54, "grad_norm": 0.5830654501914978, "learning_rate": 8.72647030405782e-05, "loss": 0.8618, "step": 4755 }, { "epoch": 0.54, "grad_norm": 0.5837696194648743, "learning_rate": 8.708757665228909e-05, "loss": 0.8436, "step": 4760 }, { "epoch": 0.54, "grad_norm": 0.6426911354064941, "learning_rate": 8.691049145247445e-05, "loss": 0.8142, "step": 4765 }, { "epoch": 0.54, "grad_norm": 0.5991452932357788, "learning_rate": 8.673344800600657e-05, "loss": 0.8628, "step": 4770 }, { "epoch": 0.54, "grad_norm": 0.5630876421928406, "learning_rate": 8.655644687762443e-05, "loss": 0.6788, "step": 4775 }, { "epoch": 0.54, "grad_norm": 0.692578911781311, "learning_rate": 8.637948863193214e-05, "loss": 0.8166, "step": 4780 }, { "epoch": 0.54, "grad_norm": 0.7014926075935364, "learning_rate": 8.620257383339694e-05, "loss": 0.8809, "step": 4785 }, { "epoch": 0.54, "grad_norm": 0.717907190322876, "learning_rate": 8.602570304634745e-05, "loss": 0.6915, "step": 4790 }, { "epoch": 0.55, "grad_norm": 0.6019871234893799, "learning_rate": 8.584887683497199e-05, "loss": 0.7664, "step": 4795 }, { "epoch": 0.55, "grad_norm": 0.5950486063957214, "learning_rate": 8.567209576331663e-05, "loss": 0.7824, "step": 4800 }, { "epoch": 0.55, "grad_norm": 0.5407964587211609, "learning_rate": 8.54953603952835e-05, "loss": 0.7581, "step": 4805 }, { "epoch": 0.55, "grad_norm": 0.6220264434814453, "learning_rate": 8.531867129462888e-05, "loss": 0.7908, "step": 4810 }, { "epoch": 0.55, "grad_norm": 0.6367149353027344, "learning_rate": 8.514202902496157e-05, "loss": 0.8258, "step": 4815 }, { "epoch": 0.55, "grad_norm": 0.5935396552085876, "learning_rate": 8.496543414974083e-05, "loss": 0.6965, "step": 4820 }, { "epoch": 0.55, "grad_norm": 0.6432780027389526, "learning_rate": 8.478888723227485e-05, "loss": 0.8545, "step": 4825 }, { "epoch": 0.55, "grad_norm": 0.6651439666748047, "learning_rate": 8.461238883571885e-05, "loss": 0.8435, "step": 4830 }, { "epoch": 0.55, "grad_norm": 0.6177381873130798, "learning_rate": 8.443593952307319e-05, "loss": 0.8476, "step": 4835 }, { "epoch": 0.55, "grad_norm": 0.647454559803009, "learning_rate": 8.42595398571818e-05, "loss": 0.7337, "step": 4840 }, { "epoch": 0.55, "grad_norm": 0.6528860330581665, "learning_rate": 8.408319040073011e-05, "loss": 0.8445, "step": 4845 }, { "epoch": 0.55, "grad_norm": 0.5049680471420288, "learning_rate": 8.390689171624341e-05, "loss": 0.8322, "step": 4850 }, { "epoch": 0.55, "grad_norm": 0.7274385094642639, "learning_rate": 8.373064436608512e-05, "loss": 0.9198, "step": 4855 }, { "epoch": 0.55, "grad_norm": 0.6594643592834473, "learning_rate": 8.355444891245482e-05, "loss": 0.7995, "step": 4860 }, { "epoch": 0.55, "grad_norm": 0.6585742235183716, "learning_rate": 8.337830591738664e-05, "loss": 0.7419, "step": 4865 }, { "epoch": 0.55, "grad_norm": 0.7104743719100952, "learning_rate": 8.32022159427473e-05, "loss": 0.7447, "step": 4870 }, { "epoch": 0.55, "grad_norm": 0.7039633393287659, "learning_rate": 8.302617955023437e-05, "loss": 0.8402, "step": 4875 }, { "epoch": 0.55, "grad_norm": 0.5455465912818909, "learning_rate": 8.285019730137463e-05, "loss": 0.8199, "step": 4880 }, { "epoch": 0.56, "grad_norm": 0.5330617427825928, "learning_rate": 8.2674269757522e-05, "loss": 0.6672, "step": 4885 }, { "epoch": 0.56, "grad_norm": 0.6174395084381104, "learning_rate": 8.2498397479856e-05, "loss": 0.8038, "step": 4890 }, { "epoch": 0.56, "grad_norm": 0.7968682646751404, "learning_rate": 8.232258102937987e-05, "loss": 0.7963, "step": 4895 }, { "epoch": 0.56, "grad_norm": 0.6184393763542175, "learning_rate": 8.214682096691866e-05, "loss": 0.7699, "step": 4900 }, { "epoch": 0.56, "grad_norm": 0.6042989492416382, "learning_rate": 8.197111785311768e-05, "loss": 0.7207, "step": 4905 }, { "epoch": 0.56, "grad_norm": 0.6476695537567139, "learning_rate": 8.179547224844047e-05, "loss": 0.8862, "step": 4910 }, { "epoch": 0.56, "grad_norm": 0.7073302865028381, "learning_rate": 8.161988471316723e-05, "loss": 0.6839, "step": 4915 }, { "epoch": 0.56, "grad_norm": 0.7182896733283997, "learning_rate": 8.144435580739284e-05, "loss": 0.8197, "step": 4920 }, { "epoch": 0.56, "grad_norm": 0.5961290001869202, "learning_rate": 8.126888609102528e-05, "loss": 0.7861, "step": 4925 }, { "epoch": 0.56, "grad_norm": 0.621379554271698, "learning_rate": 8.109347612378358e-05, "loss": 0.8238, "step": 4930 }, { "epoch": 0.56, "grad_norm": 0.5518468618392944, "learning_rate": 8.091812646519623e-05, "loss": 0.8338, "step": 4935 }, { "epoch": 0.56, "grad_norm": 0.715250551700592, "learning_rate": 8.074283767459941e-05, "loss": 0.8835, "step": 4940 }, { "epoch": 0.56, "grad_norm": 0.6616021394729614, "learning_rate": 8.056761031113506e-05, "loss": 0.7993, "step": 4945 }, { "epoch": 0.56, "grad_norm": 0.6410152316093445, "learning_rate": 8.03924449337493e-05, "loss": 0.7862, "step": 4950 }, { "epoch": 0.56, "grad_norm": 0.5611416101455688, "learning_rate": 8.02173421011903e-05, "loss": 0.8749, "step": 4955 }, { "epoch": 0.56, "grad_norm": 0.5541871786117554, "learning_rate": 8.004230237200694e-05, "loss": 0.6462, "step": 4960 }, { "epoch": 0.56, "grad_norm": 0.594853401184082, "learning_rate": 7.986732630454674e-05, "loss": 0.8506, "step": 4965 }, { "epoch": 0.57, "grad_norm": 0.6446029543876648, "learning_rate": 7.969241445695406e-05, "loss": 0.8062, "step": 4970 }, { "epoch": 0.57, "grad_norm": 0.5990138649940491, "learning_rate": 7.95175673871686e-05, "loss": 0.8053, "step": 4975 }, { "epoch": 0.57, "grad_norm": 0.7167456746101379, "learning_rate": 7.934278565292328e-05, "loss": 0.8316, "step": 4980 }, { "epoch": 0.57, "grad_norm": 0.8522974848747253, "learning_rate": 7.916806981174258e-05, "loss": 0.9252, "step": 4985 }, { "epoch": 0.57, "grad_norm": 0.703740119934082, "learning_rate": 7.899342042094095e-05, "loss": 0.8845, "step": 4990 }, { "epoch": 0.57, "grad_norm": 0.6203125715255737, "learning_rate": 7.88188380376207e-05, "loss": 0.6643, "step": 4995 }, { "epoch": 0.57, "grad_norm": 0.6809445023536682, "learning_rate": 7.864432321867057e-05, "loss": 0.8035, "step": 5000 }, { "epoch": 0.57, "grad_norm": 0.5899534821510315, "learning_rate": 7.846987652076372e-05, "loss": 0.8727, "step": 5005 }, { "epoch": 0.57, "grad_norm": 0.6520495414733887, "learning_rate": 7.829549850035584e-05, "loss": 0.792, "step": 5010 }, { "epoch": 0.57, "grad_norm": 0.4904904067516327, "learning_rate": 7.812118971368383e-05, "loss": 0.7416, "step": 5015 }, { "epoch": 0.57, "grad_norm": 0.6947556734085083, "learning_rate": 7.794695071676355e-05, "loss": 0.7412, "step": 5020 }, { "epoch": 0.57, "grad_norm": 0.6248174905776978, "learning_rate": 7.777278206538832e-05, "loss": 0.8281, "step": 5025 }, { "epoch": 0.57, "grad_norm": 0.6942163109779358, "learning_rate": 7.759868431512709e-05, "loss": 0.8064, "step": 5030 }, { "epoch": 0.57, "grad_norm": 0.8450883626937866, "learning_rate": 7.742465802132262e-05, "loss": 0.7073, "step": 5035 }, { "epoch": 0.57, "grad_norm": 0.6221974492073059, "learning_rate": 7.725070373908967e-05, "loss": 0.7276, "step": 5040 }, { "epoch": 0.57, "grad_norm": 0.5618094205856323, "learning_rate": 7.707682202331338e-05, "loss": 0.7289, "step": 5045 }, { "epoch": 0.57, "grad_norm": 0.5586840510368347, "learning_rate": 7.690301342864739e-05, "loss": 0.7277, "step": 5050 }, { "epoch": 0.57, "grad_norm": 0.6269596815109253, "learning_rate": 7.672927850951213e-05, "loss": 0.8247, "step": 5055 }, { "epoch": 0.58, "grad_norm": 0.5967304110527039, "learning_rate": 7.655561782009298e-05, "loss": 0.7651, "step": 5060 }, { "epoch": 0.58, "grad_norm": 0.5993629097938538, "learning_rate": 7.638203191433848e-05, "loss": 0.8346, "step": 5065 }, { "epoch": 0.58, "grad_norm": 0.6899064779281616, "learning_rate": 7.620852134595875e-05, "loss": 0.8915, "step": 5070 }, { "epoch": 0.58, "grad_norm": 0.6052946448326111, "learning_rate": 7.60350866684235e-05, "loss": 0.7582, "step": 5075 }, { "epoch": 0.58, "grad_norm": 0.6006975173950195, "learning_rate": 7.586172843496042e-05, "loss": 0.7252, "step": 5080 }, { "epoch": 0.58, "grad_norm": 0.7047126293182373, "learning_rate": 7.568844719855328e-05, "loss": 0.7539, "step": 5085 }, { "epoch": 0.58, "grad_norm": 0.529346227645874, "learning_rate": 7.551524351194039e-05, "loss": 0.7305, "step": 5090 }, { "epoch": 0.58, "grad_norm": 0.870637834072113, "learning_rate": 7.534211792761248e-05, "loss": 0.7647, "step": 5095 }, { "epoch": 0.58, "grad_norm": 0.6663516163825989, "learning_rate": 7.51690709978113e-05, "loss": 0.8185, "step": 5100 }, { "epoch": 0.58, "grad_norm": 0.688782811164856, "learning_rate": 7.49961032745277e-05, "loss": 0.7494, "step": 5105 }, { "epoch": 0.58, "grad_norm": 0.7706353068351746, "learning_rate": 7.482321530949976e-05, "loss": 0.825, "step": 5110 }, { "epoch": 0.58, "grad_norm": 0.6962013840675354, "learning_rate": 7.465040765421132e-05, "loss": 0.9311, "step": 5115 }, { "epoch": 0.58, "grad_norm": 0.6817395091056824, "learning_rate": 7.447768085988987e-05, "loss": 0.8245, "step": 5120 }, { "epoch": 0.58, "grad_norm": 0.5578342080116272, "learning_rate": 7.430503547750505e-05, "loss": 0.872, "step": 5125 }, { "epoch": 0.58, "grad_norm": 0.6477236151695251, "learning_rate": 7.413247205776683e-05, "loss": 0.7653, "step": 5130 }, { "epoch": 0.58, "grad_norm": 0.6161476373672485, "learning_rate": 7.395999115112369e-05, "loss": 0.7479, "step": 5135 }, { "epoch": 0.58, "grad_norm": 0.6497325897216797, "learning_rate": 7.378759330776093e-05, "loss": 0.704, "step": 5140 }, { "epoch": 0.58, "grad_norm": 0.6440129280090332, "learning_rate": 7.361527907759893e-05, "loss": 0.7506, "step": 5145 }, { "epoch": 0.59, "grad_norm": 0.8600013256072998, "learning_rate": 7.344304901029121e-05, "loss": 0.792, "step": 5150 }, { "epoch": 0.59, "grad_norm": 0.7030536532402039, "learning_rate": 7.327090365522302e-05, "loss": 0.8071, "step": 5155 }, { "epoch": 0.59, "grad_norm": 0.7202965617179871, "learning_rate": 7.309884356150924e-05, "loss": 0.6725, "step": 5160 }, { "epoch": 0.59, "grad_norm": 0.6689905524253845, "learning_rate": 7.292686927799288e-05, "loss": 0.7843, "step": 5165 }, { "epoch": 0.59, "grad_norm": 0.6386666297912598, "learning_rate": 7.275498135324322e-05, "loss": 0.7353, "step": 5170 }, { "epoch": 0.59, "grad_norm": 0.8615363836288452, "learning_rate": 7.258318033555394e-05, "loss": 0.8075, "step": 5175 }, { "epoch": 0.59, "grad_norm": 0.5758625864982605, "learning_rate": 7.241146677294168e-05, "loss": 0.7186, "step": 5180 }, { "epoch": 0.59, "grad_norm": 0.6153453588485718, "learning_rate": 7.2239841213144e-05, "loss": 0.8593, "step": 5185 }, { "epoch": 0.59, "grad_norm": 0.728493869304657, "learning_rate": 7.20683042036178e-05, "loss": 0.7179, "step": 5190 }, { "epoch": 0.59, "grad_norm": 0.7267651557922363, "learning_rate": 7.189685629153749e-05, "loss": 0.8491, "step": 5195 }, { "epoch": 0.59, "grad_norm": 0.6467040777206421, "learning_rate": 7.17254980237933e-05, "loss": 0.7203, "step": 5200 }, { "epoch": 0.59, "grad_norm": 0.6471399664878845, "learning_rate": 7.155422994698948e-05, "loss": 0.8021, "step": 5205 }, { "epoch": 0.59, "grad_norm": 0.5818991661071777, "learning_rate": 7.138305260744256e-05, "loss": 0.8434, "step": 5210 }, { "epoch": 0.59, "grad_norm": 0.5848895311355591, "learning_rate": 7.121196655117974e-05, "loss": 0.7041, "step": 5215 }, { "epoch": 0.59, "grad_norm": 0.6077134609222412, "learning_rate": 7.104097232393691e-05, "loss": 0.718, "step": 5220 }, { "epoch": 0.59, "grad_norm": 0.6883412599563599, "learning_rate": 7.08700704711572e-05, "loss": 0.8343, "step": 5225 }, { "epoch": 0.59, "grad_norm": 0.7491132020950317, "learning_rate": 7.069926153798888e-05, "loss": 0.7679, "step": 5230 }, { "epoch": 0.6, "grad_norm": 0.740841805934906, "learning_rate": 7.052854606928396e-05, "loss": 0.8734, "step": 5235 }, { "epoch": 0.6, "grad_norm": 0.5949041247367859, "learning_rate": 7.03579246095963e-05, "loss": 0.7022, "step": 5240 }, { "epoch": 0.6, "grad_norm": 0.8379162549972534, "learning_rate": 7.018739770317985e-05, "loss": 0.7814, "step": 5245 }, { "epoch": 0.6, "grad_norm": 0.6472388505935669, "learning_rate": 7.001696589398699e-05, "loss": 0.8437, "step": 5250 }, { "epoch": 0.6, "grad_norm": 0.6293101906776428, "learning_rate": 6.984662972566674e-05, "loss": 0.825, "step": 5255 }, { "epoch": 0.6, "grad_norm": 0.6227039694786072, "learning_rate": 6.967638974156299e-05, "loss": 0.8873, "step": 5260 }, { "epoch": 0.6, "grad_norm": 0.5379999279975891, "learning_rate": 6.950624648471288e-05, "loss": 0.7832, "step": 5265 }, { "epoch": 0.6, "grad_norm": 0.6406290531158447, "learning_rate": 6.933620049784501e-05, "loss": 0.808, "step": 5270 }, { "epoch": 0.6, "grad_norm": 0.5745694041252136, "learning_rate": 6.91662523233777e-05, "loss": 0.8721, "step": 5275 }, { "epoch": 0.6, "grad_norm": 0.5971238017082214, "learning_rate": 6.899640250341726e-05, "loss": 0.8064, "step": 5280 }, { "epoch": 0.6, "grad_norm": 0.6286386847496033, "learning_rate": 6.882665157975617e-05, "loss": 0.7531, "step": 5285 }, { "epoch": 0.6, "grad_norm": 0.7099630236625671, "learning_rate": 6.865700009387161e-05, "loss": 0.8179, "step": 5290 }, { "epoch": 0.6, "grad_norm": 0.6734933853149414, "learning_rate": 6.848744858692344e-05, "loss": 0.7745, "step": 5295 }, { "epoch": 0.6, "grad_norm": 0.6301453709602356, "learning_rate": 6.831799759975273e-05, "loss": 0.7436, "step": 5300 }, { "epoch": 0.6, "grad_norm": 0.6269943118095398, "learning_rate": 6.814864767287978e-05, "loss": 0.845, "step": 5305 }, { "epoch": 0.6, "grad_norm": 0.6703981757164001, "learning_rate": 6.797939934650262e-05, "loss": 0.6949, "step": 5310 }, { "epoch": 0.6, "grad_norm": 0.6487662196159363, "learning_rate": 6.781025316049512e-05, "loss": 0.8129, "step": 5315 }, { "epoch": 0.6, "grad_norm": 0.6541668772697449, "learning_rate": 6.764120965440537e-05, "loss": 0.7955, "step": 5320 }, { "epoch": 0.61, "grad_norm": 0.5929297208786011, "learning_rate": 6.747226936745394e-05, "loss": 0.733, "step": 5325 }, { "epoch": 0.61, "grad_norm": 0.6727938652038574, "learning_rate": 6.730343283853214e-05, "loss": 0.7599, "step": 5330 }, { "epoch": 0.61, "grad_norm": 0.6011897325515747, "learning_rate": 6.713470060620033e-05, "loss": 0.7038, "step": 5335 }, { "epoch": 0.61, "grad_norm": 0.5427403450012207, "learning_rate": 6.696607320868612e-05, "loss": 0.6685, "step": 5340 }, { "epoch": 0.61, "grad_norm": 0.6301597952842712, "learning_rate": 6.679755118388275e-05, "loss": 0.7247, "step": 5345 }, { "epoch": 0.61, "grad_norm": 0.7286877036094666, "learning_rate": 6.662913506934736e-05, "loss": 0.7292, "step": 5350 }, { "epoch": 0.61, "grad_norm": 0.5117044448852539, "learning_rate": 6.646082540229923e-05, "loss": 0.687, "step": 5355 }, { "epoch": 0.61, "grad_norm": 0.5101413726806641, "learning_rate": 6.629262271961811e-05, "loss": 0.6912, "step": 5360 }, { "epoch": 0.61, "grad_norm": 0.5790354609489441, "learning_rate": 6.61245275578425e-05, "loss": 0.7624, "step": 5365 }, { "epoch": 0.61, "grad_norm": 0.6759084463119507, "learning_rate": 6.595654045316782e-05, "loss": 0.7712, "step": 5370 }, { "epoch": 0.61, "grad_norm": 0.6083055138587952, "learning_rate": 6.578866194144492e-05, "loss": 0.7512, "step": 5375 }, { "epoch": 0.61, "grad_norm": 0.6502549648284912, "learning_rate": 6.562089255817823e-05, "loss": 0.8051, "step": 5380 }, { "epoch": 0.61, "grad_norm": 0.5895970463752747, "learning_rate": 6.545323283852407e-05, "loss": 0.6741, "step": 5385 }, { "epoch": 0.61, "grad_norm": 0.630634069442749, "learning_rate": 6.528568331728895e-05, "loss": 0.8695, "step": 5390 }, { "epoch": 0.61, "grad_norm": 0.6064544320106506, "learning_rate": 6.51182445289279e-05, "loss": 0.7345, "step": 5395 }, { "epoch": 0.61, "grad_norm": 0.6453479528427124, "learning_rate": 6.495091700754266e-05, "loss": 0.743, "step": 5400 }, { "epoch": 0.61, "grad_norm": 0.6371870040893555, "learning_rate": 6.478370128688005e-05, "loss": 0.7806, "step": 5405 }, { "epoch": 0.62, "grad_norm": 0.6676410436630249, "learning_rate": 6.461659790033038e-05, "loss": 0.767, "step": 5410 }, { "epoch": 0.62, "grad_norm": 0.6216074824333191, "learning_rate": 6.444960738092548e-05, "loss": 0.6315, "step": 5415 }, { "epoch": 0.62, "grad_norm": 0.6729696989059448, "learning_rate": 6.428273026133731e-05, "loss": 0.7697, "step": 5420 }, { "epoch": 0.62, "grad_norm": 0.5996483564376831, "learning_rate": 6.411596707387594e-05, "loss": 0.7897, "step": 5425 }, { "epoch": 0.62, "grad_norm": 0.9583771824836731, "learning_rate": 6.39493183504881e-05, "loss": 0.7393, "step": 5430 }, { "epoch": 0.62, "grad_norm": 0.62912917137146, "learning_rate": 6.378278462275542e-05, "loss": 0.8387, "step": 5435 }, { "epoch": 0.62, "grad_norm": 0.617304801940918, "learning_rate": 6.361636642189269e-05, "loss": 0.8835, "step": 5440 }, { "epoch": 0.62, "grad_norm": 0.6186046004295349, "learning_rate": 6.345006427874615e-05, "loss": 0.6838, "step": 5445 }, { "epoch": 0.62, "grad_norm": 0.5584391355514526, "learning_rate": 6.328387872379193e-05, "loss": 0.6263, "step": 5450 }, { "epoch": 0.62, "grad_norm": 0.49611517786979675, "learning_rate": 6.311781028713414e-05, "loss": 0.7853, "step": 5455 }, { "epoch": 0.62, "grad_norm": 0.5180690288543701, "learning_rate": 6.295185949850339e-05, "loss": 0.7376, "step": 5460 }, { "epoch": 0.62, "grad_norm": 0.6937770247459412, "learning_rate": 6.278602688725497e-05, "loss": 0.7639, "step": 5465 }, { "epoch": 0.62, "grad_norm": 0.6713787317276001, "learning_rate": 6.262031298236728e-05, "loss": 0.7227, "step": 5470 }, { "epoch": 0.62, "grad_norm": 0.7440084218978882, "learning_rate": 6.245471831243996e-05, "loss": 0.7153, "step": 5475 }, { "epoch": 0.62, "grad_norm": 0.6448591947555542, "learning_rate": 6.228924340569233e-05, "loss": 0.7611, "step": 5480 }, { "epoch": 0.62, "grad_norm": 0.6704011559486389, "learning_rate": 6.212388878996177e-05, "loss": 0.8411, "step": 5485 }, { "epoch": 0.62, "grad_norm": 0.6402199864387512, "learning_rate": 6.195865499270186e-05, "loss": 0.6718, "step": 5490 }, { "epoch": 0.62, "grad_norm": 0.6440407037734985, "learning_rate": 6.179354254098085e-05, "loss": 0.6699, "step": 5495 }, { "epoch": 0.63, "grad_norm": 0.6990440487861633, "learning_rate": 6.162855196147986e-05, "loss": 0.7178, "step": 5500 }, { "epoch": 0.63, "grad_norm": 0.6173884272575378, "learning_rate": 6.146368378049134e-05, "loss": 0.8273, "step": 5505 }, { "epoch": 0.63, "grad_norm": 0.6001347303390503, "learning_rate": 6.129893852391721e-05, "loss": 0.8893, "step": 5510 }, { "epoch": 0.63, "grad_norm": 0.5927189588546753, "learning_rate": 6.113431671726735e-05, "loss": 0.7978, "step": 5515 }, { "epoch": 0.63, "grad_norm": 0.7115824222564697, "learning_rate": 6.0969818885657835e-05, "loss": 0.8715, "step": 5520 }, { "epoch": 0.63, "grad_norm": 0.6806285977363586, "learning_rate": 6.080544555380927e-05, "loss": 0.6983, "step": 5525 }, { "epoch": 0.63, "grad_norm": 0.6034073829650879, "learning_rate": 6.06411972460452e-05, "loss": 0.7861, "step": 5530 }, { "epoch": 0.63, "grad_norm": 0.6072666645050049, "learning_rate": 6.047707448629023e-05, "loss": 0.796, "step": 5535 }, { "epoch": 0.63, "grad_norm": 0.6448633670806885, "learning_rate": 6.0313077798068575e-05, "loss": 0.7532, "step": 5540 }, { "epoch": 0.63, "grad_norm": 0.6306453943252563, "learning_rate": 6.014920770450232e-05, "loss": 0.7327, "step": 5545 }, { "epoch": 0.63, "grad_norm": 0.8060635328292847, "learning_rate": 5.998546472830965e-05, "loss": 0.7468, "step": 5550 }, { "epoch": 0.63, "grad_norm": 0.6441696286201477, "learning_rate": 5.9821849391803375e-05, "loss": 0.6766, "step": 5555 }, { "epoch": 0.63, "grad_norm": 0.7230235934257507, "learning_rate": 5.9658362216889095e-05, "loss": 0.7493, "step": 5560 }, { "epoch": 0.63, "grad_norm": 0.7482134103775024, "learning_rate": 5.949500372506354e-05, "loss": 0.6559, "step": 5565 }, { "epoch": 0.63, "grad_norm": 0.5693408846855164, "learning_rate": 5.933177443741309e-05, "loss": 0.674, "step": 5570 }, { "epoch": 0.63, "grad_norm": 0.6371012330055237, "learning_rate": 5.916867487461186e-05, "loss": 0.7717, "step": 5575 }, { "epoch": 0.63, "grad_norm": 0.600836455821991, "learning_rate": 5.900570555692029e-05, "loss": 0.8248, "step": 5580 }, { "epoch": 0.64, "grad_norm": 0.748927891254425, "learning_rate": 5.884286700418328e-05, "loss": 0.7426, "step": 5585 }, { "epoch": 0.64, "grad_norm": 0.6669610142707825, "learning_rate": 5.8680159735828555e-05, "loss": 0.7964, "step": 5590 }, { "epoch": 0.64, "grad_norm": 0.6355113387107849, "learning_rate": 5.85175842708652e-05, "loss": 0.8308, "step": 5595 }, { "epoch": 0.64, "grad_norm": 0.6880905628204346, "learning_rate": 5.835514112788179e-05, "loss": 0.711, "step": 5600 }, { "epoch": 0.64, "grad_norm": 0.7189605236053467, "learning_rate": 5.819283082504482e-05, "loss": 0.7994, "step": 5605 }, { "epoch": 0.64, "grad_norm": 0.521282970905304, "learning_rate": 5.8030653880097066e-05, "loss": 0.6762, "step": 5610 }, { "epoch": 0.64, "grad_norm": 0.6705846786499023, "learning_rate": 5.7868610810355896e-05, "loss": 0.7619, "step": 5615 }, { "epoch": 0.64, "grad_norm": 0.7209597826004028, "learning_rate": 5.770670213271165e-05, "loss": 0.8654, "step": 5620 }, { "epoch": 0.64, "grad_norm": 0.6950438022613525, "learning_rate": 5.7544928363625974e-05, "loss": 0.7806, "step": 5625 }, { "epoch": 0.64, "grad_norm": 0.7177894711494446, "learning_rate": 5.738329001913014e-05, "loss": 0.8453, "step": 5630 }, { "epoch": 0.64, "grad_norm": 0.6125010848045349, "learning_rate": 5.722178761482356e-05, "loss": 0.7242, "step": 5635 }, { "epoch": 0.64, "grad_norm": 0.650780975818634, "learning_rate": 5.706042166587193e-05, "loss": 0.8344, "step": 5640 }, { "epoch": 0.64, "grad_norm": 0.6594287157058716, "learning_rate": 5.6899192687005585e-05, "loss": 0.8212, "step": 5645 }, { "epoch": 0.64, "grad_norm": 0.8201553225517273, "learning_rate": 5.673810119251814e-05, "loss": 0.9035, "step": 5650 }, { "epoch": 0.64, "grad_norm": 0.5799230933189392, "learning_rate": 5.657714769626455e-05, "loss": 0.7603, "step": 5655 }, { "epoch": 0.64, "grad_norm": 0.7373571395874023, "learning_rate": 5.641633271165955e-05, "loss": 0.8696, "step": 5660 }, { "epoch": 0.64, "grad_norm": 0.6818183660507202, "learning_rate": 5.6255656751676143e-05, "loss": 0.7146, "step": 5665 }, { "epoch": 0.64, "grad_norm": 0.640631377696991, "learning_rate": 5.609512032884385e-05, "loss": 0.8767, "step": 5670 }, { "epoch": 0.65, "grad_norm": 0.6422764658927917, "learning_rate": 5.5934723955246917e-05, "loss": 0.7608, "step": 5675 }, { "epoch": 0.65, "grad_norm": 0.633758544921875, "learning_rate": 5.5774468142523104e-05, "loss": 0.7595, "step": 5680 }, { "epoch": 0.65, "grad_norm": 0.6388438940048218, "learning_rate": 5.5614353401861675e-05, "loss": 0.8263, "step": 5685 }, { "epoch": 0.65, "grad_norm": 0.6727738976478577, "learning_rate": 5.545438024400192e-05, "loss": 0.7927, "step": 5690 }, { "epoch": 0.65, "grad_norm": 0.6814836263656616, "learning_rate": 5.529454917923149e-05, "loss": 0.8103, "step": 5695 }, { "epoch": 0.65, "grad_norm": 0.772027850151062, "learning_rate": 5.513486071738481e-05, "loss": 0.707, "step": 5700 }, { "epoch": 0.65, "grad_norm": 0.6700000166893005, "learning_rate": 5.4975315367841374e-05, "loss": 0.8225, "step": 5705 }, { "epoch": 0.65, "grad_norm": 0.7606449723243713, "learning_rate": 5.481591363952421e-05, "loss": 0.7547, "step": 5710 }, { "epoch": 0.65, "grad_norm": 0.6637645363807678, "learning_rate": 5.465665604089829e-05, "loss": 0.8537, "step": 5715 }, { "epoch": 0.65, "grad_norm": 0.6221731901168823, "learning_rate": 5.449754307996871e-05, "loss": 0.6966, "step": 5720 }, { "epoch": 0.65, "grad_norm": 0.6201137900352478, "learning_rate": 5.433857526427923e-05, "loss": 0.7586, "step": 5725 }, { "epoch": 0.65, "grad_norm": 0.715356171131134, "learning_rate": 5.417975310091068e-05, "loss": 0.9557, "step": 5730 }, { "epoch": 0.65, "grad_norm": 0.7795970439910889, "learning_rate": 5.402107709647921e-05, "loss": 0.8289, "step": 5735 }, { "epoch": 0.65, "grad_norm": 0.6747726798057556, "learning_rate": 5.3862547757134816e-05, "loss": 0.6959, "step": 5740 }, { "epoch": 0.65, "grad_norm": 0.6822933554649353, "learning_rate": 5.370416558855955e-05, "loss": 0.7804, "step": 5745 }, { "epoch": 0.65, "grad_norm": 0.6174326539039612, "learning_rate": 5.354593109596621e-05, "loss": 0.8149, "step": 5750 }, { "epoch": 0.65, "grad_norm": 0.6532416939735413, "learning_rate": 5.338784478409628e-05, "loss": 0.8078, "step": 5755 }, { "epoch": 0.65, "grad_norm": 0.6764138340950012, "learning_rate": 5.3229907157218737e-05, "loss": 0.802, "step": 5760 }, { "epoch": 0.66, "grad_norm": 0.6659644842147827, "learning_rate": 5.307211871912828e-05, "loss": 0.7593, "step": 5765 }, { "epoch": 0.66, "grad_norm": 0.8311240077018738, "learning_rate": 5.291447997314367e-05, "loss": 0.7216, "step": 5770 }, { "epoch": 0.66, "grad_norm": 0.6325226426124573, "learning_rate": 5.275699142210615e-05, "loss": 0.7994, "step": 5775 }, { "epoch": 0.66, "grad_norm": 0.678072988986969, "learning_rate": 5.259965356837795e-05, "loss": 0.8148, "step": 5780 }, { "epoch": 0.66, "grad_norm": 0.6537745594978333, "learning_rate": 5.244246691384051e-05, "loss": 0.8272, "step": 5785 }, { "epoch": 0.66, "grad_norm": 0.6158615350723267, "learning_rate": 5.228543195989303e-05, "loss": 0.6634, "step": 5790 }, { "epoch": 0.66, "grad_norm": 0.5966333746910095, "learning_rate": 5.212854920745075e-05, "loss": 0.733, "step": 5795 }, { "epoch": 0.66, "grad_norm": 0.6967484951019287, "learning_rate": 5.1971819156943545e-05, "loss": 0.855, "step": 5800 }, { "epoch": 0.66, "grad_norm": 0.6322739124298096, "learning_rate": 5.181524230831409e-05, "loss": 0.8315, "step": 5805 }, { "epoch": 0.66, "grad_norm": 0.6847050786018372, "learning_rate": 5.1658819161016294e-05, "loss": 0.8198, "step": 5810 }, { "epoch": 0.66, "grad_norm": 0.6578314304351807, "learning_rate": 5.150255021401399e-05, "loss": 0.8202, "step": 5815 }, { "epoch": 0.66, "grad_norm": 0.6339473128318787, "learning_rate": 5.134643596577897e-05, "loss": 0.7877, "step": 5820 }, { "epoch": 0.66, "grad_norm": 0.5576701760292053, "learning_rate": 5.1190476914289645e-05, "loss": 0.7409, "step": 5825 }, { "epoch": 0.66, "grad_norm": 1.082842469215393, "learning_rate": 5.103467355702928e-05, "loss": 0.6842, "step": 5830 }, { "epoch": 0.66, "grad_norm": 0.7405625581741333, "learning_rate": 5.087902639098472e-05, "loss": 0.7636, "step": 5835 }, { "epoch": 0.66, "grad_norm": 0.5836653709411621, "learning_rate": 5.0723535912644294e-05, "loss": 0.7134, "step": 5840 }, { "epoch": 0.66, "grad_norm": 0.7886473536491394, "learning_rate": 5.0568202617996675e-05, "loss": 0.8099, "step": 5845 }, { "epoch": 0.67, "grad_norm": 0.6854684948921204, "learning_rate": 5.0413027002529214e-05, "loss": 0.7903, "step": 5850 }, { "epoch": 0.67, "grad_norm": 0.6613681316375732, "learning_rate": 5.025800956122619e-05, "loss": 0.7311, "step": 5855 }, { "epoch": 0.67, "grad_norm": 0.6116787791252136, "learning_rate": 5.010315078856733e-05, "loss": 0.7632, "step": 5860 }, { "epoch": 0.67, "grad_norm": 0.74680095911026, "learning_rate": 4.99484511785263e-05, "loss": 0.8084, "step": 5865 }, { "epoch": 0.67, "grad_norm": 0.6365654468536377, "learning_rate": 4.979391122456899e-05, "loss": 0.6616, "step": 5870 }, { "epoch": 0.67, "grad_norm": 0.7090327143669128, "learning_rate": 4.9639531419652075e-05, "loss": 0.8372, "step": 5875 }, { "epoch": 0.67, "grad_norm": 0.7884336709976196, "learning_rate": 4.948531225622129e-05, "loss": 0.7197, "step": 5880 }, { "epoch": 0.67, "grad_norm": 0.7464672923088074, "learning_rate": 4.933125422621013e-05, "loss": 0.7625, "step": 5885 }, { "epoch": 0.67, "grad_norm": 0.5780450701713562, "learning_rate": 4.9177357821037964e-05, "loss": 0.7627, "step": 5890 }, { "epoch": 0.67, "grad_norm": 0.6157823801040649, "learning_rate": 4.902362353160851e-05, "loss": 0.8363, "step": 5895 }, { "epoch": 0.67, "grad_norm": 0.6948485374450684, "learning_rate": 4.8870051848308603e-05, "loss": 0.7677, "step": 5900 }, { "epoch": 0.67, "grad_norm": 0.7331840395927429, "learning_rate": 4.871664326100625e-05, "loss": 0.887, "step": 5905 }, { "epoch": 0.67, "grad_norm": 0.6237366199493408, "learning_rate": 4.856339825904921e-05, "loss": 0.7934, "step": 5910 }, { "epoch": 0.67, "grad_norm": 0.6936962008476257, "learning_rate": 4.841031733126345e-05, "loss": 0.7613, "step": 5915 }, { "epoch": 0.67, "grad_norm": 0.6001924276351929, "learning_rate": 4.825740096595159e-05, "loss": 0.7355, "step": 5920 }, { "epoch": 0.67, "grad_norm": 0.5404167771339417, "learning_rate": 4.8104649650891295e-05, "loss": 0.7993, "step": 5925 }, { "epoch": 0.67, "grad_norm": 0.656347930431366, "learning_rate": 4.795206387333371e-05, "loss": 0.7538, "step": 5930 }, { "epoch": 0.67, "grad_norm": 0.6526111364364624, "learning_rate": 4.779964412000206e-05, "loss": 0.7349, "step": 5935 }, { "epoch": 0.68, "grad_norm": 0.7296586632728577, "learning_rate": 4.7647390877089884e-05, "loss": 0.7385, "step": 5940 }, { "epoch": 0.68, "grad_norm": 0.6850858330726624, "learning_rate": 4.749530463025961e-05, "loss": 0.7774, "step": 5945 }, { "epoch": 0.68, "grad_norm": 0.7608144283294678, "learning_rate": 4.734338586464096e-05, "loss": 0.7299, "step": 5950 }, { "epoch": 0.68, "grad_norm": 0.6235141158103943, "learning_rate": 4.719163506482942e-05, "loss": 0.734, "step": 5955 }, { "epoch": 0.68, "grad_norm": 0.6358844637870789, "learning_rate": 4.704005271488472e-05, "loss": 0.78, "step": 5960 }, { "epoch": 0.68, "grad_norm": 0.7749031186103821, "learning_rate": 4.6888639298329216e-05, "loss": 0.8022, "step": 5965 }, { "epoch": 0.68, "grad_norm": 0.5940665006637573, "learning_rate": 4.673739529814653e-05, "loss": 0.7931, "step": 5970 }, { "epoch": 0.68, "grad_norm": 0.674860417842865, "learning_rate": 4.658632119677965e-05, "loss": 0.8207, "step": 5975 }, { "epoch": 0.68, "grad_norm": 0.5767741799354553, "learning_rate": 4.643541747612974e-05, "loss": 0.6727, "step": 5980 }, { "epoch": 0.68, "grad_norm": 0.6625251770019531, "learning_rate": 4.6284684617554555e-05, "loss": 0.7103, "step": 5985 }, { "epoch": 0.68, "grad_norm": 0.6491448879241943, "learning_rate": 4.613412310186669e-05, "loss": 0.7724, "step": 5990 }, { "epoch": 0.68, "grad_norm": 0.8284974098205566, "learning_rate": 4.5983733409332265e-05, "loss": 0.8283, "step": 5995 }, { "epoch": 0.68, "grad_norm": 0.6128905415534973, "learning_rate": 4.5833516019669275e-05, "loss": 0.7541, "step": 6000 }, { "epoch": 0.68, "grad_norm": 0.642086923122406, "learning_rate": 4.568347141204611e-05, "loss": 0.7939, "step": 6005 }, { "epoch": 0.68, "grad_norm": 0.6303368806838989, "learning_rate": 4.553360006508003e-05, "loss": 0.7202, "step": 6010 }, { "epoch": 0.68, "grad_norm": 0.5753061771392822, "learning_rate": 4.538390245683555e-05, "loss": 0.7782, "step": 6015 }, { "epoch": 0.68, "grad_norm": 0.6998262405395508, "learning_rate": 4.523437906482313e-05, "loss": 0.723, "step": 6020 }, { "epoch": 0.69, "grad_norm": 0.5152105689048767, "learning_rate": 4.508503036599743e-05, "loss": 0.7258, "step": 6025 }, { "epoch": 0.69, "grad_norm": 0.7657172679901123, "learning_rate": 4.493585683675575e-05, "loss": 0.7904, "step": 6030 }, { "epoch": 0.69, "grad_norm": 0.7116373777389526, "learning_rate": 4.478685895293685e-05, "loss": 0.7479, "step": 6035 }, { "epoch": 0.69, "grad_norm": 0.6210662722587585, "learning_rate": 4.463803718981905e-05, "loss": 0.7604, "step": 6040 }, { "epoch": 0.69, "grad_norm": 0.6628497242927551, "learning_rate": 4.448939202211896e-05, "loss": 0.7996, "step": 6045 }, { "epoch": 0.69, "grad_norm": 0.7364223599433899, "learning_rate": 4.434092392398978e-05, "loss": 0.8187, "step": 6050 }, { "epoch": 0.69, "grad_norm": 0.7144873142242432, "learning_rate": 4.4192633369020066e-05, "loss": 0.8995, "step": 6055 }, { "epoch": 0.69, "grad_norm": 0.7300914525985718, "learning_rate": 4.404452083023183e-05, "loss": 0.755, "step": 6060 }, { "epoch": 0.69, "grad_norm": 0.7752556800842285, "learning_rate": 4.389658678007933e-05, "loss": 0.835, "step": 6065 }, { "epoch": 0.69, "grad_norm": 0.631405770778656, "learning_rate": 4.3748831690447565e-05, "loss": 0.769, "step": 6070 }, { "epoch": 0.69, "grad_norm": 0.6509888172149658, "learning_rate": 4.360125603265057e-05, "loss": 0.7765, "step": 6075 }, { "epoch": 0.69, "grad_norm": 0.6003565788269043, "learning_rate": 4.345386027743005e-05, "loss": 0.6942, "step": 6080 }, { "epoch": 0.69, "grad_norm": 0.7362748384475708, "learning_rate": 4.330664489495385e-05, "loss": 0.803, "step": 6085 }, { "epoch": 0.69, "grad_norm": 0.7295613288879395, "learning_rate": 4.315961035481445e-05, "loss": 0.7736, "step": 6090 }, { "epoch": 0.69, "grad_norm": 0.7188911437988281, "learning_rate": 4.30127571260275e-05, "loss": 0.7708, "step": 6095 }, { "epoch": 0.69, "grad_norm": 0.6692444086074829, "learning_rate": 4.286608567703024e-05, "loss": 0.8271, "step": 6100 }, { "epoch": 0.69, "grad_norm": 0.8137506246566772, "learning_rate": 4.271959647568017e-05, "loss": 0.7915, "step": 6105 }, { "epoch": 0.69, "grad_norm": 0.7544528245925903, "learning_rate": 4.257328998925338e-05, "loss": 0.7633, "step": 6110 }, { "epoch": 0.7, "grad_norm": 0.7398918271064758, "learning_rate": 4.242716668444304e-05, "loss": 0.7613, "step": 6115 }, { "epoch": 0.7, "grad_norm": 0.6789968609809875, "learning_rate": 4.2281227027358187e-05, "loss": 0.8473, "step": 6120 }, { "epoch": 0.7, "grad_norm": 0.5836973190307617, "learning_rate": 4.2135471483521925e-05, "loss": 0.7006, "step": 6125 }, { "epoch": 0.7, "grad_norm": 0.6515111327171326, "learning_rate": 4.198990051787012e-05, "loss": 0.7468, "step": 6130 }, { "epoch": 0.7, "grad_norm": 0.5469412207603455, "learning_rate": 4.184451459474983e-05, "loss": 0.7166, "step": 6135 }, { "epoch": 0.7, "grad_norm": 0.6362948417663574, "learning_rate": 4.169931417791788e-05, "loss": 0.6851, "step": 6140 }, { "epoch": 0.7, "grad_norm": 0.7324738502502441, "learning_rate": 4.155429973053935e-05, "loss": 0.6982, "step": 6145 }, { "epoch": 0.7, "grad_norm": 0.6752232313156128, "learning_rate": 4.140947171518609e-05, "loss": 0.7343, "step": 6150 }, { "epoch": 0.7, "grad_norm": 0.7206586003303528, "learning_rate": 4.126483059383534e-05, "loss": 0.7781, "step": 6155 }, { "epoch": 0.7, "grad_norm": 0.6868919134140015, "learning_rate": 4.112037682786811e-05, "loss": 0.8156, "step": 6160 }, { "epoch": 0.7, "grad_norm": 0.709814190864563, "learning_rate": 4.0976110878067783e-05, "loss": 0.8395, "step": 6165 }, { "epoch": 0.7, "grad_norm": 0.6500702500343323, "learning_rate": 4.083203320461867e-05, "loss": 0.7951, "step": 6170 }, { "epoch": 0.7, "grad_norm": 0.7838833332061768, "learning_rate": 4.068814426710447e-05, "loss": 0.8203, "step": 6175 }, { "epoch": 0.7, "grad_norm": 0.6506848335266113, "learning_rate": 4.0544444524506875e-05, "loss": 0.7022, "step": 6180 }, { "epoch": 0.7, "grad_norm": 0.8521379828453064, "learning_rate": 4.040093443520404e-05, "loss": 0.7542, "step": 6185 }, { "epoch": 0.7, "grad_norm": 0.5580353736877441, "learning_rate": 4.025761445696929e-05, "loss": 0.7818, "step": 6190 }, { "epoch": 0.7, "grad_norm": 0.8889886736869812, "learning_rate": 4.011448504696933e-05, "loss": 0.7498, "step": 6195 }, { "epoch": 0.7, "grad_norm": 0.6307147145271301, "learning_rate": 3.997154666176306e-05, "loss": 0.8169, "step": 6200 }, { "epoch": 0.71, "grad_norm": 0.740788459777832, "learning_rate": 3.982879975730015e-05, "loss": 0.6286, "step": 6205 }, { "epoch": 0.71, "grad_norm": 0.7221429347991943, "learning_rate": 3.9686244788919345e-05, "loss": 0.8662, "step": 6210 }, { "epoch": 0.71, "grad_norm": 0.578336238861084, "learning_rate": 3.9543882211347206e-05, "loss": 0.7928, "step": 6215 }, { "epoch": 0.71, "grad_norm": 0.6942074298858643, "learning_rate": 3.940171247869658e-05, "loss": 0.7358, "step": 6220 }, { "epoch": 0.71, "grad_norm": 0.8117511868476868, "learning_rate": 3.925973604446517e-05, "loss": 0.699, "step": 6225 }, { "epoch": 0.71, "grad_norm": 0.6607906818389893, "learning_rate": 3.91179533615341e-05, "loss": 0.7651, "step": 6230 }, { "epoch": 0.71, "grad_norm": 0.705653190612793, "learning_rate": 3.8976364882166414e-05, "loss": 0.8838, "step": 6235 }, { "epoch": 0.71, "grad_norm": 0.6348661780357361, "learning_rate": 3.8834971058005796e-05, "loss": 0.7253, "step": 6240 }, { "epoch": 0.71, "grad_norm": 0.7973907589912415, "learning_rate": 3.869377234007494e-05, "loss": 0.7247, "step": 6245 }, { "epoch": 0.71, "grad_norm": 0.7183629274368286, "learning_rate": 3.855276917877407e-05, "loss": 0.7184, "step": 6250 }, { "epoch": 0.71, "grad_norm": 0.6137698292732239, "learning_rate": 3.8411962023879844e-05, "loss": 0.8104, "step": 6255 }, { "epoch": 0.71, "grad_norm": 0.753842294216156, "learning_rate": 3.827135132454351e-05, "loss": 0.7628, "step": 6260 }, { "epoch": 0.71, "grad_norm": 0.658284068107605, "learning_rate": 3.813093752928973e-05, "loss": 0.7889, "step": 6265 }, { "epoch": 0.71, "grad_norm": 0.5758240818977356, "learning_rate": 3.799072108601511e-05, "loss": 0.7978, "step": 6270 }, { "epoch": 0.71, "grad_norm": 0.6841059327125549, "learning_rate": 3.78507024419867e-05, "loss": 0.8056, "step": 6275 }, { "epoch": 0.71, "grad_norm": 0.6589455604553223, "learning_rate": 3.771088204384051e-05, "loss": 0.7676, "step": 6280 }, { "epoch": 0.71, "grad_norm": 0.5679621696472168, "learning_rate": 3.757126033758028e-05, "loss": 0.7095, "step": 6285 }, { "epoch": 0.72, "grad_norm": 0.6563994288444519, "learning_rate": 3.7431837768576017e-05, "loss": 0.7954, "step": 6290 }, { "epoch": 0.72, "grad_norm": 0.6838070750236511, "learning_rate": 3.7292614781562384e-05, "loss": 0.8108, "step": 6295 }, { "epoch": 0.72, "grad_norm": 0.7321529984474182, "learning_rate": 3.715359182063748e-05, "loss": 0.8576, "step": 6300 }, { "epoch": 0.72, "grad_norm": 0.7272433042526245, "learning_rate": 3.701476932926132e-05, "loss": 0.7887, "step": 6305 }, { "epoch": 0.72, "grad_norm": 0.7071683406829834, "learning_rate": 3.68761477502545e-05, "loss": 0.8411, "step": 6310 }, { "epoch": 0.72, "grad_norm": 0.5736758708953857, "learning_rate": 3.673772752579665e-05, "loss": 0.7584, "step": 6315 }, { "epoch": 0.72, "grad_norm": 0.656456708908081, "learning_rate": 3.659950909742525e-05, "loss": 0.8634, "step": 6320 }, { "epoch": 0.72, "grad_norm": 0.6820427775382996, "learning_rate": 3.646149290603398e-05, "loss": 0.862, "step": 6325 }, { "epoch": 0.72, "grad_norm": 0.8717942833900452, "learning_rate": 3.6323679391871446e-05, "loss": 0.7477, "step": 6330 }, { "epoch": 0.72, "grad_norm": 0.7242932915687561, "learning_rate": 3.6186068994539745e-05, "loss": 0.762, "step": 6335 }, { "epoch": 0.72, "grad_norm": 0.7470153570175171, "learning_rate": 3.6048662152993065e-05, "loss": 0.7616, "step": 6340 }, { "epoch": 0.72, "grad_norm": 0.725097119808197, "learning_rate": 3.59114593055363e-05, "loss": 0.808, "step": 6345 }, { "epoch": 0.72, "grad_norm": 0.6063842177391052, "learning_rate": 3.5774460889823566e-05, "loss": 0.8324, "step": 6350 }, { "epoch": 0.72, "grad_norm": 0.7853096723556519, "learning_rate": 3.563766734285704e-05, "loss": 0.8145, "step": 6355 }, { "epoch": 0.72, "grad_norm": 0.6266723871231079, "learning_rate": 3.5501079100985254e-05, "loss": 0.7249, "step": 6360 }, { "epoch": 0.72, "grad_norm": 0.6518904566764832, "learning_rate": 3.5364696599901835e-05, "loss": 0.8258, "step": 6365 }, { "epoch": 0.72, "grad_norm": 0.6777641177177429, "learning_rate": 3.522852027464426e-05, "loss": 0.8226, "step": 6370 }, { "epoch": 0.72, "grad_norm": 0.6241174936294556, "learning_rate": 3.509255055959224e-05, "loss": 0.7909, "step": 6375 }, { "epoch": 0.73, "grad_norm": 0.7724244594573975, "learning_rate": 3.495678788846648e-05, "loss": 0.6656, "step": 6380 }, { "epoch": 0.73, "grad_norm": 0.6773399710655212, "learning_rate": 3.4821232694327224e-05, "loss": 0.704, "step": 6385 }, { "epoch": 0.73, "grad_norm": 0.6904076337814331, "learning_rate": 3.4685885409572893e-05, "loss": 0.7869, "step": 6390 }, { "epoch": 0.73, "grad_norm": 0.6653614044189453, "learning_rate": 3.455074646593876e-05, "loss": 0.8249, "step": 6395 }, { "epoch": 0.73, "grad_norm": 0.871178925037384, "learning_rate": 3.441581629449542e-05, "loss": 0.8137, "step": 6400 }, { "epoch": 0.73, "grad_norm": 0.5883834362030029, "learning_rate": 3.4281095325647684e-05, "loss": 0.798, "step": 6405 }, { "epoch": 0.73, "grad_norm": 0.6861564517021179, "learning_rate": 3.41465839891329e-05, "loss": 0.7559, "step": 6410 }, { "epoch": 0.73, "grad_norm": 0.6617395877838135, "learning_rate": 3.401228271401978e-05, "loss": 0.782, "step": 6415 }, { "epoch": 0.73, "grad_norm": 0.6430420279502869, "learning_rate": 3.387819192870697e-05, "loss": 0.7349, "step": 6420 }, { "epoch": 0.73, "grad_norm": 0.7476750612258911, "learning_rate": 3.374431206092168e-05, "loss": 0.8074, "step": 6425 }, { "epoch": 0.73, "grad_norm": 0.6503930687904358, "learning_rate": 3.3610643537718345e-05, "loss": 0.7641, "step": 6430 }, { "epoch": 0.73, "grad_norm": 0.5740126967430115, "learning_rate": 3.3477186785477186e-05, "loss": 0.6907, "step": 6435 }, { "epoch": 0.73, "grad_norm": 0.613012433052063, "learning_rate": 3.334394222990307e-05, "loss": 0.7404, "step": 6440 }, { "epoch": 0.73, "grad_norm": 0.7166250348091125, "learning_rate": 3.3210910296023776e-05, "loss": 0.8843, "step": 6445 }, { "epoch": 0.73, "grad_norm": 0.7031468152999878, "learning_rate": 3.3078091408188985e-05, "loss": 0.7878, "step": 6450 }, { "epoch": 0.73, "grad_norm": 0.8461940288543701, "learning_rate": 3.29454859900688e-05, "loss": 0.8152, "step": 6455 }, { "epoch": 0.73, "grad_norm": 0.7095286846160889, "learning_rate": 3.281309446465236e-05, "loss": 0.7507, "step": 6460 }, { "epoch": 0.74, "grad_norm": 0.5577227473258972, "learning_rate": 3.2680917254246515e-05, "loss": 0.6388, "step": 6465 }, { "epoch": 0.74, "grad_norm": 0.6764439940452576, "learning_rate": 3.2548954780474484e-05, "loss": 0.8344, "step": 6470 }, { "epoch": 0.74, "grad_norm": 0.7055917978286743, "learning_rate": 3.241720746427456e-05, "loss": 0.813, "step": 6475 }, { "epoch": 0.74, "grad_norm": 0.7273811101913452, "learning_rate": 3.228567572589864e-05, "loss": 0.8289, "step": 6480 }, { "epoch": 0.74, "grad_norm": 0.8205636739730835, "learning_rate": 3.215435998491102e-05, "loss": 0.8189, "step": 6485 }, { "epoch": 0.74, "grad_norm": 0.5480372309684753, "learning_rate": 3.202326066018701e-05, "loss": 0.8921, "step": 6490 }, { "epoch": 0.74, "grad_norm": 0.6462122797966003, "learning_rate": 3.189237816991161e-05, "loss": 0.7161, "step": 6495 }, { "epoch": 0.74, "grad_norm": 0.678535521030426, "learning_rate": 3.176171293157798e-05, "loss": 0.6587, "step": 6500 }, { "epoch": 0.74, "grad_norm": 0.6733066439628601, "learning_rate": 3.163126536198653e-05, "loss": 0.7188, "step": 6505 }, { "epoch": 0.74, "grad_norm": 0.5796012878417969, "learning_rate": 3.150103587724318e-05, "loss": 0.7414, "step": 6510 }, { "epoch": 0.74, "grad_norm": 0.6976703405380249, "learning_rate": 3.137102489275824e-05, "loss": 0.6708, "step": 6515 }, { "epoch": 0.74, "grad_norm": 0.7101731300354004, "learning_rate": 3.1241232823245026e-05, "loss": 0.6906, "step": 6520 }, { "epoch": 0.74, "grad_norm": 0.7651215195655823, "learning_rate": 3.111166008271866e-05, "loss": 0.8167, "step": 6525 }, { "epoch": 0.74, "grad_norm": 0.7553005218505859, "learning_rate": 3.098230708449445e-05, "loss": 0.6965, "step": 6530 }, { "epoch": 0.74, "grad_norm": 0.6705849766731262, "learning_rate": 3.0853174241186865e-05, "loss": 0.7017, "step": 6535 }, { "epoch": 0.74, "grad_norm": 0.6687391996383667, "learning_rate": 3.072426196470818e-05, "loss": 0.7522, "step": 6540 }, { "epoch": 0.74, "grad_norm": 0.5514757633209229, "learning_rate": 3.0595570666266996e-05, "loss": 0.6664, "step": 6545 }, { "epoch": 0.74, "grad_norm": 0.7751938700675964, "learning_rate": 3.046710075636706e-05, "loss": 0.7551, "step": 6550 }, { "epoch": 0.75, "grad_norm": 0.6046425700187683, "learning_rate": 3.033885264480595e-05, "loss": 0.7579, "step": 6555 }, { "epoch": 0.75, "grad_norm": 0.6243056058883667, "learning_rate": 3.0210826740673727e-05, "loss": 0.7391, "step": 6560 }, { "epoch": 0.75, "grad_norm": 0.6286137700080872, "learning_rate": 3.0083023452351633e-05, "loss": 0.8608, "step": 6565 }, { "epoch": 0.75, "grad_norm": 0.6316589117050171, "learning_rate": 2.99554431875108e-05, "loss": 0.7529, "step": 6570 }, { "epoch": 0.75, "grad_norm": 0.6141951084136963, "learning_rate": 2.982808635311104e-05, "loss": 0.7676, "step": 6575 }, { "epoch": 0.75, "grad_norm": 0.7614745497703552, "learning_rate": 2.9700953355399386e-05, "loss": 0.77, "step": 6580 }, { "epoch": 0.75, "grad_norm": 0.6968342661857605, "learning_rate": 2.9574044599908766e-05, "loss": 0.836, "step": 6585 }, { "epoch": 0.75, "grad_norm": 0.6154257655143738, "learning_rate": 2.9447360491457033e-05, "loss": 0.8214, "step": 6590 }, { "epoch": 0.75, "grad_norm": 0.7290199398994446, "learning_rate": 2.93209014341453e-05, "loss": 0.7725, "step": 6595 }, { "epoch": 0.75, "grad_norm": 0.5743769407272339, "learning_rate": 2.9194667831356837e-05, "loss": 0.7704, "step": 6600 }, { "epoch": 0.75, "grad_norm": 0.6820549964904785, "learning_rate": 2.9068660085755773e-05, "loss": 0.7054, "step": 6605 }, { "epoch": 0.75, "grad_norm": 0.5897756814956665, "learning_rate": 2.894287859928577e-05, "loss": 0.7094, "step": 6610 }, { "epoch": 0.75, "grad_norm": 0.7251617908477783, "learning_rate": 2.881732377316878e-05, "loss": 0.9164, "step": 6615 }, { "epoch": 0.75, "grad_norm": 0.749259889125824, "learning_rate": 2.8691996007903686e-05, "loss": 0.8373, "step": 6620 }, { "epoch": 0.75, "grad_norm": 0.680907666683197, "learning_rate": 2.8566895703265217e-05, "loss": 0.8164, "step": 6625 }, { "epoch": 0.75, "grad_norm": 0.7473598122596741, "learning_rate": 2.844202325830241e-05, "loss": 0.8554, "step": 6630 }, { "epoch": 0.75, "grad_norm": 0.8276415467262268, "learning_rate": 2.831737907133751e-05, "loss": 0.7296, "step": 6635 }, { "epoch": 0.75, "grad_norm": 0.6819781064987183, "learning_rate": 2.8192963539964677e-05, "loss": 0.7994, "step": 6640 }, { "epoch": 0.76, "grad_norm": 0.6347323060035706, "learning_rate": 2.8068777061048668e-05, "loss": 0.7408, "step": 6645 }, { "epoch": 0.76, "grad_norm": 0.6867766976356506, "learning_rate": 2.794482003072364e-05, "loss": 0.8205, "step": 6650 }, { "epoch": 0.76, "grad_norm": 0.6473438143730164, "learning_rate": 2.782109284439176e-05, "loss": 0.7703, "step": 6655 }, { "epoch": 0.76, "grad_norm": 0.7229429483413696, "learning_rate": 2.7697595896722207e-05, "loss": 0.747, "step": 6660 }, { "epoch": 0.76, "grad_norm": 0.6416674852371216, "learning_rate": 2.7574329581649526e-05, "loss": 0.714, "step": 6665 }, { "epoch": 0.76, "grad_norm": 0.6123269200325012, "learning_rate": 2.7451294292372686e-05, "loss": 0.7289, "step": 6670 }, { "epoch": 0.76, "grad_norm": 0.7136829495429993, "learning_rate": 2.732849042135377e-05, "loss": 0.7079, "step": 6675 }, { "epoch": 0.76, "grad_norm": 0.6285436153411865, "learning_rate": 2.7205918360316597e-05, "loss": 0.7023, "step": 6680 }, { "epoch": 0.76, "grad_norm": 0.6968997716903687, "learning_rate": 2.7083578500245566e-05, "loss": 0.6881, "step": 6685 }, { "epoch": 0.76, "grad_norm": 0.9395476579666138, "learning_rate": 2.6961471231384417e-05, "loss": 0.761, "step": 6690 }, { "epoch": 0.76, "grad_norm": 0.7061296105384827, "learning_rate": 2.6839596943234947e-05, "loss": 0.8281, "step": 6695 }, { "epoch": 0.76, "grad_norm": 0.6853705644607544, "learning_rate": 2.671795602455578e-05, "loss": 0.7588, "step": 6700 }, { "epoch": 0.76, "grad_norm": 0.7715938687324524, "learning_rate": 2.6596548863361117e-05, "loss": 0.8728, "step": 6705 }, { "epoch": 0.76, "grad_norm": 0.588621199131012, "learning_rate": 2.647537584691957e-05, "loss": 0.7497, "step": 6710 }, { "epoch": 0.76, "grad_norm": 0.7696636319160461, "learning_rate": 2.6354437361752848e-05, "loss": 0.803, "step": 6715 }, { "epoch": 0.76, "grad_norm": 0.7940670251846313, "learning_rate": 2.623373379363444e-05, "loss": 0.8015, "step": 6720 }, { "epoch": 0.76, "grad_norm": 0.679672360420227, "learning_rate": 2.6113265527588648e-05, "loss": 0.72, "step": 6725 }, { "epoch": 0.77, "grad_norm": 0.6845086812973022, "learning_rate": 2.5993032947889117e-05, "loss": 0.6869, "step": 6730 }, { "epoch": 0.77, "grad_norm": 0.7502850890159607, "learning_rate": 2.5873036438057674e-05, "loss": 0.677, "step": 6735 }, { "epoch": 0.77, "grad_norm": 0.6978954672813416, "learning_rate": 2.5753276380863144e-05, "loss": 0.7409, "step": 6740 }, { "epoch": 0.77, "grad_norm": 0.624478816986084, "learning_rate": 2.5633753158320185e-05, "loss": 0.8546, "step": 6745 }, { "epoch": 0.77, "grad_norm": 0.8565587401390076, "learning_rate": 2.551446715168785e-05, "loss": 0.7651, "step": 6750 }, { "epoch": 0.77, "grad_norm": 0.7041921019554138, "learning_rate": 2.539541874146857e-05, "loss": 0.8322, "step": 6755 }, { "epoch": 0.77, "grad_norm": 0.6443396806716919, "learning_rate": 2.5276608307406945e-05, "loss": 0.7984, "step": 6760 }, { "epoch": 0.77, "grad_norm": 0.6369118094444275, "learning_rate": 2.5158036228488426e-05, "loss": 0.7303, "step": 6765 }, { "epoch": 0.77, "grad_norm": 0.9061431884765625, "learning_rate": 2.503970288293811e-05, "loss": 0.7423, "step": 6770 }, { "epoch": 0.77, "grad_norm": 0.6243317723274231, "learning_rate": 2.492160864821964e-05, "loss": 0.7388, "step": 6775 }, { "epoch": 0.77, "grad_norm": 0.8769336938858032, "learning_rate": 2.480375390103389e-05, "loss": 0.8951, "step": 6780 }, { "epoch": 0.77, "grad_norm": 0.7571940422058105, "learning_rate": 2.4686139017317833e-05, "loss": 0.6837, "step": 6785 }, { "epoch": 0.77, "grad_norm": 0.7043862342834473, "learning_rate": 2.4568764372243268e-05, "loss": 0.6231, "step": 6790 }, { "epoch": 0.77, "grad_norm": 0.6525396108627319, "learning_rate": 2.4451630340215805e-05, "loss": 0.7283, "step": 6795 }, { "epoch": 0.77, "grad_norm": 0.8056162595748901, "learning_rate": 2.433473729487341e-05, "loss": 0.7733, "step": 6800 }, { "epoch": 0.77, "grad_norm": 0.7512674927711487, "learning_rate": 2.4218085609085316e-05, "loss": 0.7004, "step": 6805 }, { "epoch": 0.77, "grad_norm": 0.6620556712150574, "learning_rate": 2.4101675654951006e-05, "loss": 0.7546, "step": 6810 }, { "epoch": 0.77, "grad_norm": 0.6622219085693359, "learning_rate": 2.3985507803798768e-05, "loss": 0.7237, "step": 6815 }, { "epoch": 0.78, "grad_norm": 0.6944047212600708, "learning_rate": 2.3869582426184644e-05, "loss": 0.7919, "step": 6820 }, { "epoch": 0.78, "grad_norm": 0.8888928890228271, "learning_rate": 2.375389989189124e-05, "loss": 0.7315, "step": 6825 }, { "epoch": 0.78, "grad_norm": 0.6875692009925842, "learning_rate": 2.3638460569926523e-05, "loss": 0.8011, "step": 6830 }, { "epoch": 0.78, "grad_norm": 0.6747264266014099, "learning_rate": 2.3523264828522662e-05, "loss": 0.8427, "step": 6835 }, { "epoch": 0.78, "grad_norm": 0.7636617422103882, "learning_rate": 2.3408313035134798e-05, "loss": 0.8938, "step": 6840 }, { "epoch": 0.78, "grad_norm": 0.7376787066459656, "learning_rate": 2.3293605556440033e-05, "loss": 0.7261, "step": 6845 }, { "epoch": 0.78, "grad_norm": 0.6401947736740112, "learning_rate": 2.3179142758336026e-05, "loss": 0.8163, "step": 6850 }, { "epoch": 0.78, "grad_norm": 0.5118803977966309, "learning_rate": 2.3064925005939986e-05, "loss": 0.6642, "step": 6855 }, { "epoch": 0.78, "grad_norm": 0.7035655379295349, "learning_rate": 2.2950952663587498e-05, "loss": 0.8234, "step": 6860 }, { "epoch": 0.78, "grad_norm": 0.5912356376647949, "learning_rate": 2.2837226094831278e-05, "loss": 0.6674, "step": 6865 }, { "epoch": 0.78, "grad_norm": 0.6668829917907715, "learning_rate": 2.272374566244011e-05, "loss": 0.747, "step": 6870 }, { "epoch": 0.78, "grad_norm": 0.674974262714386, "learning_rate": 2.2610511728397587e-05, "loss": 0.8882, "step": 6875 }, { "epoch": 0.78, "grad_norm": 0.8864641785621643, "learning_rate": 2.2497524653901146e-05, "loss": 0.7622, "step": 6880 }, { "epoch": 0.78, "grad_norm": 0.7429625391960144, "learning_rate": 2.238478479936059e-05, "loss": 0.7334, "step": 6885 }, { "epoch": 0.78, "grad_norm": 0.71225905418396, "learning_rate": 2.2272292524397252e-05, "loss": 0.7476, "step": 6890 }, { "epoch": 0.78, "grad_norm": 0.7212709188461304, "learning_rate": 2.2160048187842742e-05, "loss": 0.8207, "step": 6895 }, { "epoch": 0.78, "grad_norm": 0.7402102947235107, "learning_rate": 2.204805214773774e-05, "loss": 0.8319, "step": 6900 }, { "epoch": 0.79, "grad_norm": 0.847400426864624, "learning_rate": 2.193630476133087e-05, "loss": 0.7596, "step": 6905 }, { "epoch": 0.79, "grad_norm": 0.6712285876274109, "learning_rate": 2.1824806385077744e-05, "loss": 0.791, "step": 6910 }, { "epoch": 0.79, "grad_norm": 0.7666999697685242, "learning_rate": 2.1713557374639458e-05, "loss": 0.7542, "step": 6915 }, { "epoch": 0.79, "grad_norm": 0.9645623564720154, "learning_rate": 2.1602558084881796e-05, "loss": 0.8158, "step": 6920 }, { "epoch": 0.79, "grad_norm": 0.6309619545936584, "learning_rate": 2.149180886987401e-05, "loss": 0.7656, "step": 6925 }, { "epoch": 0.79, "grad_norm": 0.6789581775665283, "learning_rate": 2.1381310082887563e-05, "loss": 0.801, "step": 6930 }, { "epoch": 0.79, "grad_norm": 0.7283372282981873, "learning_rate": 2.127106207639519e-05, "loss": 0.8794, "step": 6935 }, { "epoch": 0.79, "grad_norm": 0.6633392572402954, "learning_rate": 2.116106520206952e-05, "loss": 0.9224, "step": 6940 }, { "epoch": 0.79, "grad_norm": 0.6747632622718811, "learning_rate": 2.10513198107823e-05, "loss": 0.789, "step": 6945 }, { "epoch": 0.79, "grad_norm": 0.5490332841873169, "learning_rate": 2.0941826252602993e-05, "loss": 0.7228, "step": 6950 }, { "epoch": 0.79, "grad_norm": 0.6821791529655457, "learning_rate": 2.0832584876797723e-05, "loss": 0.7467, "step": 6955 }, { "epoch": 0.79, "grad_norm": 0.5666481852531433, "learning_rate": 2.0723596031828295e-05, "loss": 0.7156, "step": 6960 }, { "epoch": 0.79, "grad_norm": 0.631689727306366, "learning_rate": 2.061486006535095e-05, "loss": 0.7043, "step": 6965 }, { "epoch": 0.79, "grad_norm": 0.6669164896011353, "learning_rate": 2.0506377324215153e-05, "loss": 0.6862, "step": 6970 }, { "epoch": 0.79, "grad_norm": 0.6554184556007385, "learning_rate": 2.0398148154462826e-05, "loss": 0.7031, "step": 6975 }, { "epoch": 0.79, "grad_norm": 0.6377161145210266, "learning_rate": 2.029017290132693e-05, "loss": 0.7723, "step": 6980 }, { "epoch": 0.79, "grad_norm": 0.6968027353286743, "learning_rate": 2.0182451909230493e-05, "loss": 0.7634, "step": 6985 }, { "epoch": 0.79, "grad_norm": 0.6753907799720764, "learning_rate": 2.0074985521785495e-05, "loss": 0.8347, "step": 6990 }, { "epoch": 0.8, "grad_norm": 0.6418029069900513, "learning_rate": 1.9967774081791756e-05, "loss": 0.731, "step": 6995 }, { "epoch": 0.8, "grad_norm": 0.630171537399292, "learning_rate": 1.9860817931235877e-05, "loss": 0.8453, "step": 7000 }, { "epoch": 0.8, "grad_norm": 0.7392057180404663, "learning_rate": 1.9754117411290096e-05, "loss": 0.6955, "step": 7005 }, { "epoch": 0.8, "grad_norm": 0.7004316449165344, "learning_rate": 1.9647672862311316e-05, "loss": 0.7702, "step": 7010 }, { "epoch": 0.8, "grad_norm": 0.6444839239120483, "learning_rate": 1.9541484623839836e-05, "loss": 0.7155, "step": 7015 }, { "epoch": 0.8, "grad_norm": 0.6129523515701294, "learning_rate": 1.9435553034598398e-05, "loss": 0.7409, "step": 7020 }, { "epoch": 0.8, "grad_norm": 0.8170189261436462, "learning_rate": 1.9329878432491112e-05, "loss": 0.752, "step": 7025 }, { "epoch": 0.8, "grad_norm": 0.6246253252029419, "learning_rate": 1.9224461154602292e-05, "loss": 0.7082, "step": 7030 }, { "epoch": 0.8, "grad_norm": 0.7174789309501648, "learning_rate": 1.9119301537195455e-05, "loss": 0.8013, "step": 7035 }, { "epoch": 0.8, "grad_norm": 0.7129815220832825, "learning_rate": 1.901439991571221e-05, "loss": 0.8366, "step": 7040 }, { "epoch": 0.8, "grad_norm": 0.696826159954071, "learning_rate": 1.890975662477128e-05, "loss": 0.6921, "step": 7045 }, { "epoch": 0.8, "grad_norm": 0.6623396277427673, "learning_rate": 1.8805371998167222e-05, "loss": 0.8734, "step": 7050 }, { "epoch": 0.8, "grad_norm": 0.7665203809738159, "learning_rate": 1.8701246368869563e-05, "loss": 0.826, "step": 7055 }, { "epoch": 0.8, "grad_norm": 0.6396629810333252, "learning_rate": 1.859738006902172e-05, "loss": 0.7085, "step": 7060 }, { "epoch": 0.8, "grad_norm": 0.6628134846687317, "learning_rate": 1.849377342993982e-05, "loss": 0.7313, "step": 7065 }, { "epoch": 0.8, "grad_norm": 0.6394834518432617, "learning_rate": 1.839042678211176e-05, "loss": 0.7605, "step": 7070 }, { "epoch": 0.8, "grad_norm": 0.6791489124298096, "learning_rate": 1.8287340455196068e-05, "loss": 0.8351, "step": 7075 }, { "epoch": 0.81, "grad_norm": 0.6268406510353088, "learning_rate": 1.8184514778020935e-05, "loss": 0.753, "step": 7080 }, { "epoch": 0.81, "grad_norm": 0.7322360873222351, "learning_rate": 1.80819500785831e-05, "loss": 0.7054, "step": 7085 }, { "epoch": 0.81, "grad_norm": 0.6677795052528381, "learning_rate": 1.7979646684046782e-05, "loss": 0.8602, "step": 7090 }, { "epoch": 0.81, "grad_norm": 0.6794170141220093, "learning_rate": 1.787760492074281e-05, "loss": 0.6957, "step": 7095 }, { "epoch": 0.81, "grad_norm": 0.7083120942115784, "learning_rate": 1.7775825114167344e-05, "loss": 0.8359, "step": 7100 }, { "epoch": 0.81, "grad_norm": 0.7114768624305725, "learning_rate": 1.767430758898092e-05, "loss": 0.8783, "step": 7105 }, { "epoch": 0.81, "grad_norm": 0.6977761387825012, "learning_rate": 1.7573052669007552e-05, "loss": 0.8449, "step": 7110 }, { "epoch": 0.81, "grad_norm": 0.7091259360313416, "learning_rate": 1.7472060677233503e-05, "loss": 0.7588, "step": 7115 }, { "epoch": 0.81, "grad_norm": 0.8661299347877502, "learning_rate": 1.737133193580638e-05, "loss": 0.7614, "step": 7120 }, { "epoch": 0.81, "grad_norm": 0.6567466855049133, "learning_rate": 1.727086676603401e-05, "loss": 0.8487, "step": 7125 }, { "epoch": 0.81, "grad_norm": 0.637890100479126, "learning_rate": 1.7170665488383597e-05, "loss": 0.8408, "step": 7130 }, { "epoch": 0.81, "grad_norm": 0.6123327612876892, "learning_rate": 1.70707284224804e-05, "loss": 0.7182, "step": 7135 }, { "epoch": 0.81, "grad_norm": 0.7284800410270691, "learning_rate": 1.697105588710698e-05, "loss": 0.6767, "step": 7140 }, { "epoch": 0.81, "grad_norm": 0.8092273473739624, "learning_rate": 1.6871648200202127e-05, "loss": 0.8872, "step": 7145 }, { "epoch": 0.81, "grad_norm": 0.6525725722312927, "learning_rate": 1.677250567885974e-05, "loss": 0.8425, "step": 7150 }, { "epoch": 0.81, "grad_norm": 0.7781700491905212, "learning_rate": 1.667362863932792e-05, "loss": 0.8548, "step": 7155 }, { "epoch": 0.81, "grad_norm": 0.6901320815086365, "learning_rate": 1.6575017397007896e-05, "loss": 0.7168, "step": 7160 }, { "epoch": 0.81, "grad_norm": 0.637550413608551, "learning_rate": 1.6476672266453087e-05, "loss": 0.8974, "step": 7165 }, { "epoch": 0.82, "grad_norm": 0.6621235609054565, "learning_rate": 1.6378593561368016e-05, "loss": 0.7816, "step": 7170 }, { "epoch": 0.82, "grad_norm": 0.6375472545623779, "learning_rate": 1.6280781594607364e-05, "loss": 0.7895, "step": 7175 }, { "epoch": 0.82, "grad_norm": 0.6751498579978943, "learning_rate": 1.6183236678175028e-05, "loss": 0.8145, "step": 7180 }, { "epoch": 0.82, "grad_norm": 0.6726610064506531, "learning_rate": 1.6085959123222995e-05, "loss": 0.7143, "step": 7185 }, { "epoch": 0.82, "grad_norm": 0.7486894130706787, "learning_rate": 1.5988949240050343e-05, "loss": 0.8543, "step": 7190 }, { "epoch": 0.82, "grad_norm": 0.6508118510246277, "learning_rate": 1.5892207338102494e-05, "loss": 0.7921, "step": 7195 }, { "epoch": 0.82, "grad_norm": 0.6627808809280396, "learning_rate": 1.579573372596993e-05, "loss": 0.8042, "step": 7200 }, { "epoch": 0.82, "grad_norm": 0.6512479782104492, "learning_rate": 1.5699528711387357e-05, "loss": 0.7605, "step": 7205 }, { "epoch": 0.82, "grad_norm": 0.6039304137229919, "learning_rate": 1.560359260123272e-05, "loss": 0.7278, "step": 7210 }, { "epoch": 0.82, "grad_norm": 0.7306320667266846, "learning_rate": 1.550792570152618e-05, "loss": 0.7868, "step": 7215 }, { "epoch": 0.82, "grad_norm": 0.7047690153121948, "learning_rate": 1.5412528317429197e-05, "loss": 0.6986, "step": 7220 }, { "epoch": 0.82, "grad_norm": 0.7422201037406921, "learning_rate": 1.531740075324345e-05, "loss": 0.6588, "step": 7225 }, { "epoch": 0.82, "grad_norm": 0.7010562419891357, "learning_rate": 1.5222543312410042e-05, "loss": 0.78, "step": 7230 }, { "epoch": 0.82, "grad_norm": 0.7242764234542847, "learning_rate": 1.5127956297508338e-05, "loss": 0.6492, "step": 7235 }, { "epoch": 0.82, "grad_norm": 0.5822871923446655, "learning_rate": 1.5033640010255145e-05, "loss": 0.6479, "step": 7240 }, { "epoch": 0.82, "grad_norm": 0.7780561447143555, "learning_rate": 1.493959475150365e-05, "loss": 0.7035, "step": 7245 }, { "epoch": 0.82, "grad_norm": 0.7585102319717407, "learning_rate": 1.484582082124254e-05, "loss": 0.7921, "step": 7250 }, { "epoch": 0.82, "grad_norm": 0.6618209481239319, "learning_rate": 1.4752318518594987e-05, "loss": 0.6766, "step": 7255 }, { "epoch": 0.83, "grad_norm": 0.6074077486991882, "learning_rate": 1.46590881418177e-05, "loss": 0.7541, "step": 7260 }, { "epoch": 0.83, "grad_norm": 0.7971552610397339, "learning_rate": 1.4566129988300093e-05, "loss": 0.935, "step": 7265 }, { "epoch": 0.83, "grad_norm": 0.6890114545822144, "learning_rate": 1.4473444354563082e-05, "loss": 0.6824, "step": 7270 }, { "epoch": 0.83, "grad_norm": 0.6999330520629883, "learning_rate": 1.438103153625835e-05, "loss": 0.7515, "step": 7275 }, { "epoch": 0.83, "grad_norm": 0.7287675738334656, "learning_rate": 1.4288891828167428e-05, "loss": 0.7248, "step": 7280 }, { "epoch": 0.83, "grad_norm": 0.6378064155578613, "learning_rate": 1.4197025524200547e-05, "loss": 0.7629, "step": 7285 }, { "epoch": 0.83, "grad_norm": 0.6283854842185974, "learning_rate": 1.4105432917395911e-05, "loss": 0.7033, "step": 7290 }, { "epoch": 0.83, "grad_norm": 0.7032335996627808, "learning_rate": 1.4014114299918612e-05, "loss": 0.8074, "step": 7295 }, { "epoch": 0.83, "grad_norm": 0.6264139413833618, "learning_rate": 1.3923069963059821e-05, "loss": 0.7572, "step": 7300 }, { "epoch": 0.83, "grad_norm": 0.7607216835021973, "learning_rate": 1.3832300197235748e-05, "loss": 0.6808, "step": 7305 }, { "epoch": 0.83, "grad_norm": 0.7733168005943298, "learning_rate": 1.3741805291986787e-05, "loss": 0.7818, "step": 7310 }, { "epoch": 0.83, "grad_norm": 0.7297649383544922, "learning_rate": 1.3651585535976596e-05, "loss": 0.7182, "step": 7315 }, { "epoch": 0.83, "grad_norm": 0.7502012848854065, "learning_rate": 1.3561641216991162e-05, "loss": 0.7778, "step": 7320 }, { "epoch": 0.83, "grad_norm": 0.7216570377349854, "learning_rate": 1.3471972621937756e-05, "loss": 0.7803, "step": 7325 }, { "epoch": 0.83, "grad_norm": 0.7050399780273438, "learning_rate": 1.3382580036844295e-05, "loss": 0.8175, "step": 7330 }, { "epoch": 0.83, "grad_norm": 0.8511192798614502, "learning_rate": 1.3293463746858182e-05, "loss": 0.8151, "step": 7335 }, { "epoch": 0.83, "grad_norm": 0.7982873320579529, "learning_rate": 1.3204624036245505e-05, "loss": 0.7518, "step": 7340 }, { "epoch": 0.84, "grad_norm": 0.6801825761795044, "learning_rate": 1.3116061188390083e-05, "loss": 0.7761, "step": 7345 }, { "epoch": 0.84, "grad_norm": 0.6566172242164612, "learning_rate": 1.3027775485792681e-05, "loss": 0.7077, "step": 7350 }, { "epoch": 0.84, "grad_norm": 0.6745688319206238, "learning_rate": 1.2939767210069876e-05, "loss": 0.7668, "step": 7355 }, { "epoch": 0.84, "grad_norm": 0.5891917943954468, "learning_rate": 1.285203664195338e-05, "loss": 0.7246, "step": 7360 }, { "epoch": 0.84, "grad_norm": 0.7064673900604248, "learning_rate": 1.2764584061289098e-05, "loss": 0.734, "step": 7365 }, { "epoch": 0.84, "grad_norm": 0.7701008915901184, "learning_rate": 1.267740974703614e-05, "loss": 0.8342, "step": 7370 }, { "epoch": 0.84, "grad_norm": 0.6421898007392883, "learning_rate": 1.2590513977266006e-05, "loss": 0.7911, "step": 7375 }, { "epoch": 0.84, "grad_norm": 0.7210098505020142, "learning_rate": 1.2503897029161715e-05, "loss": 0.8654, "step": 7380 }, { "epoch": 0.84, "grad_norm": 0.8242705464363098, "learning_rate": 1.2417559179016836e-05, "loss": 0.798, "step": 7385 }, { "epoch": 0.84, "grad_norm": 0.7865302562713623, "learning_rate": 1.2331500702234722e-05, "loss": 0.8011, "step": 7390 }, { "epoch": 0.84, "grad_norm": 0.6898683309555054, "learning_rate": 1.2245721873327521e-05, "loss": 0.8357, "step": 7395 }, { "epoch": 0.84, "grad_norm": 0.6212425827980042, "learning_rate": 1.2160222965915401e-05, "loss": 0.7448, "step": 7400 }, { "epoch": 0.84, "grad_norm": 0.7410294413566589, "learning_rate": 1.2075004252725619e-05, "loss": 0.7546, "step": 7405 }, { "epoch": 0.84, "grad_norm": 0.6324216723442078, "learning_rate": 1.199006600559156e-05, "loss": 0.774, "step": 7410 }, { "epoch": 0.84, "grad_norm": 0.8635458946228027, "learning_rate": 1.190540849545213e-05, "loss": 0.697, "step": 7415 }, { "epoch": 0.84, "grad_norm": 0.6437505483627319, "learning_rate": 1.1821031992350628e-05, "loss": 0.797, "step": 7420 }, { "epoch": 0.84, "grad_norm": 0.7493045330047607, "learning_rate": 1.1736936765434004e-05, "loss": 0.749, "step": 7425 }, { "epoch": 0.84, "grad_norm": 0.6989094614982605, "learning_rate": 1.1653123082951966e-05, "loss": 0.8166, "step": 7430 }, { "epoch": 0.85, "grad_norm": 0.7406939268112183, "learning_rate": 1.1569591212256237e-05, "loss": 0.7769, "step": 7435 }, { "epoch": 0.85, "grad_norm": 0.6417839527130127, "learning_rate": 1.1486341419799474e-05, "loss": 0.7865, "step": 7440 }, { "epoch": 0.85, "grad_norm": 0.6294339299201965, "learning_rate": 1.1403373971134624e-05, "loss": 0.7634, "step": 7445 }, { "epoch": 0.85, "grad_norm": 0.615867018699646, "learning_rate": 1.1320689130914019e-05, "loss": 0.689, "step": 7450 }, { "epoch": 0.85, "grad_norm": 0.6019276976585388, "learning_rate": 1.1238287162888483e-05, "loss": 0.7225, "step": 7455 }, { "epoch": 0.85, "grad_norm": 1.1533637046813965, "learning_rate": 1.1156168329906535e-05, "loss": 0.7035, "step": 7460 }, { "epoch": 0.85, "grad_norm": 0.6268231272697449, "learning_rate": 1.1074332893913542e-05, "loss": 0.7861, "step": 7465 }, { "epoch": 0.85, "grad_norm": 0.7267367243766785, "learning_rate": 1.0992781115950868e-05, "loss": 0.7226, "step": 7470 }, { "epoch": 0.85, "grad_norm": 0.6262809634208679, "learning_rate": 1.0911513256155092e-05, "loss": 0.7548, "step": 7475 }, { "epoch": 0.85, "grad_norm": 0.6905179023742676, "learning_rate": 1.0830529573757076e-05, "loss": 0.7447, "step": 7480 }, { "epoch": 0.85, "grad_norm": 0.7611731290817261, "learning_rate": 1.074983032708129e-05, "loss": 0.7509, "step": 7485 }, { "epoch": 0.85, "grad_norm": 0.708593487739563, "learning_rate": 1.0669415773544866e-05, "loss": 0.7352, "step": 7490 }, { "epoch": 0.85, "grad_norm": 0.6629523038864136, "learning_rate": 1.0589286169656742e-05, "loss": 0.7969, "step": 7495 }, { "epoch": 0.85, "grad_norm": 0.6062517166137695, "learning_rate": 1.0509441771017026e-05, "loss": 0.7943, "step": 7500 }, { "epoch": 0.85, "grad_norm": 0.8772883415222168, "learning_rate": 1.0429882832316006e-05, "loss": 0.7385, "step": 7505 }, { "epoch": 0.85, "grad_norm": 0.7119115591049194, "learning_rate": 1.0350609607333384e-05, "loss": 0.7723, "step": 7510 }, { "epoch": 0.85, "grad_norm": 0.6540088653564453, "learning_rate": 1.0271622348937581e-05, "loss": 0.713, "step": 7515 }, { "epoch": 0.86, "grad_norm": 0.6274105906486511, "learning_rate": 1.0192921309084702e-05, "loss": 0.7161, "step": 7520 }, { "epoch": 0.86, "grad_norm": 0.6521838307380676, "learning_rate": 1.0114506738817942e-05, "loss": 0.6904, "step": 7525 }, { "epoch": 0.86, "grad_norm": 0.5840886235237122, "learning_rate": 1.0036378888266663e-05, "loss": 0.736, "step": 7530 }, { "epoch": 0.86, "grad_norm": 0.7056083679199219, "learning_rate": 9.9585380066457e-06, "loss": 0.7663, "step": 7535 }, { "epoch": 0.86, "grad_norm": 0.7185655236244202, "learning_rate": 9.880984342254462e-06, "loss": 0.7682, "step": 7540 }, { "epoch": 0.86, "grad_norm": 0.7891202569007874, "learning_rate": 9.803718142476181e-06, "loss": 0.8107, "step": 7545 }, { "epoch": 0.86, "grad_norm": 0.8266674876213074, "learning_rate": 9.72673965377714e-06, "loss": 0.6771, "step": 7550 }, { "epoch": 0.86, "grad_norm": 0.6623797416687012, "learning_rate": 9.650049121705851e-06, "loss": 0.774, "step": 7555 }, { "epoch": 0.86, "grad_norm": 0.7579045295715332, "learning_rate": 9.573646790892298e-06, "loss": 0.8548, "step": 7560 }, { "epoch": 0.86, "grad_norm": 0.7818818688392639, "learning_rate": 9.497532905047202e-06, "loss": 0.7678, "step": 7565 }, { "epoch": 0.86, "grad_norm": 0.6764331459999084, "learning_rate": 9.421707706961136e-06, "loss": 0.7864, "step": 7570 }, { "epoch": 0.86, "grad_norm": 0.6758101582527161, "learning_rate": 9.34617143850378e-06, "loss": 0.7407, "step": 7575 }, { "epoch": 0.86, "grad_norm": 0.7731313109397888, "learning_rate": 9.270924340623267e-06, "loss": 0.6977, "step": 7580 }, { "epoch": 0.86, "grad_norm": 0.7126657366752625, "learning_rate": 9.195966653345255e-06, "loss": 0.7612, "step": 7585 }, { "epoch": 0.86, "grad_norm": 0.8116534948348999, "learning_rate": 9.121298615772256e-06, "loss": 0.85, "step": 7590 }, { "epoch": 0.86, "grad_norm": 0.7227126359939575, "learning_rate": 9.04692046608281e-06, "loss": 0.7488, "step": 7595 }, { "epoch": 0.86, "grad_norm": 0.7809383273124695, "learning_rate": 8.972832441530876e-06, "loss": 0.8398, "step": 7600 }, { "epoch": 0.86, "grad_norm": 0.7499304413795471, "learning_rate": 8.899034778444804e-06, "loss": 0.7438, "step": 7605 }, { "epoch": 0.87, "grad_norm": 0.7327824234962463, "learning_rate": 8.825527712226833e-06, "loss": 0.8292, "step": 7610 }, { "epoch": 0.87, "grad_norm": 0.7105828523635864, "learning_rate": 8.752311477352259e-06, "loss": 0.6284, "step": 7615 }, { "epoch": 0.87, "grad_norm": 0.7823231220245361, "learning_rate": 8.679386307368631e-06, "loss": 0.7742, "step": 7620 }, { "epoch": 0.87, "grad_norm": 0.6463223099708557, "learning_rate": 8.606752434895061e-06, "loss": 0.6789, "step": 7625 }, { "epoch": 0.87, "grad_norm": 0.6511633396148682, "learning_rate": 8.53441009162148e-06, "loss": 0.706, "step": 7630 }, { "epoch": 0.87, "grad_norm": 0.6864632368087769, "learning_rate": 8.462359508307882e-06, "loss": 0.7008, "step": 7635 }, { "epoch": 0.87, "grad_norm": 0.7921543717384338, "learning_rate": 8.390600914783598e-06, "loss": 0.7399, "step": 7640 }, { "epoch": 0.87, "grad_norm": 0.6895166039466858, "learning_rate": 8.319134539946549e-06, "loss": 0.8166, "step": 7645 }, { "epoch": 0.87, "grad_norm": 0.7610728144645691, "learning_rate": 8.247960611762562e-06, "loss": 0.7897, "step": 7650 }, { "epoch": 0.87, "grad_norm": 0.566939115524292, "learning_rate": 8.177079357264583e-06, "loss": 0.744, "step": 7655 }, { "epoch": 0.87, "grad_norm": 0.7656039595603943, "learning_rate": 8.10649100255194e-06, "loss": 0.8224, "step": 7660 }, { "epoch": 0.87, "grad_norm": 0.7149258852005005, "learning_rate": 8.036195772789734e-06, "loss": 0.76, "step": 7665 }, { "epoch": 0.87, "grad_norm": 0.7486068606376648, "learning_rate": 7.966193892208007e-06, "loss": 0.718, "step": 7670 }, { "epoch": 0.87, "grad_norm": 0.674845278263092, "learning_rate": 7.896485584101066e-06, "loss": 0.7689, "step": 7675 }, { "epoch": 0.87, "grad_norm": 0.74075847864151, "learning_rate": 7.827071070826775e-06, "loss": 0.8496, "step": 7680 }, { "epoch": 0.87, "grad_norm": 0.6453339457511902, "learning_rate": 7.757950573805839e-06, "loss": 0.7359, "step": 7685 }, { "epoch": 0.87, "grad_norm": 0.7150562405586243, "learning_rate": 7.689124313521112e-06, "loss": 0.7677, "step": 7690 }, { "epoch": 0.87, "grad_norm": 0.6305781602859497, "learning_rate": 7.620592509516844e-06, "loss": 0.7111, "step": 7695 }, { "epoch": 0.88, "grad_norm": 0.639238178730011, "learning_rate": 7.5523553803980795e-06, "loss": 0.7908, "step": 7700 }, { "epoch": 0.88, "grad_norm": 0.6141027808189392, "learning_rate": 7.48441314382986e-06, "loss": 0.7716, "step": 7705 }, { "epoch": 0.88, "grad_norm": 0.732420802116394, "learning_rate": 7.416766016536569e-06, "loss": 0.7698, "step": 7710 }, { "epoch": 0.88, "grad_norm": 0.7163684368133545, "learning_rate": 7.349414214301243e-06, "loss": 0.7367, "step": 7715 }, { "epoch": 0.88, "grad_norm": 0.7079850435256958, "learning_rate": 7.282357951964902e-06, "loss": 0.7766, "step": 7720 }, { "epoch": 0.88, "grad_norm": 0.6404287815093994, "learning_rate": 7.215597443425815e-06, "loss": 0.7758, "step": 7725 }, { "epoch": 0.88, "grad_norm": 0.66849285364151, "learning_rate": 7.149132901638844e-06, "loss": 0.7765, "step": 7730 }, { "epoch": 0.88, "grad_norm": 0.6282253265380859, "learning_rate": 7.082964538614823e-06, "loss": 0.7917, "step": 7735 }, { "epoch": 0.88, "grad_norm": 0.7129911184310913, "learning_rate": 7.017092565419747e-06, "loss": 0.7986, "step": 7740 }, { "epoch": 0.88, "grad_norm": 0.6497909426689148, "learning_rate": 6.951517192174195e-06, "loss": 0.7719, "step": 7745 }, { "epoch": 0.88, "grad_norm": 0.7094148397445679, "learning_rate": 6.88623862805271e-06, "loss": 0.7931, "step": 7750 }, { "epoch": 0.88, "grad_norm": 0.5958633422851562, "learning_rate": 6.821257081282972e-06, "loss": 0.8135, "step": 7755 }, { "epoch": 0.88, "grad_norm": 0.6889587044715881, "learning_rate": 6.756572759145285e-06, "loss": 0.7241, "step": 7760 }, { "epoch": 0.88, "grad_norm": 0.6314862370491028, "learning_rate": 6.6921858679718345e-06, "loss": 0.598, "step": 7765 }, { "epoch": 0.88, "grad_norm": 0.6266872882843018, "learning_rate": 6.62809661314604e-06, "loss": 0.8213, "step": 7770 }, { "epoch": 0.88, "grad_norm": 0.8507253527641296, "learning_rate": 6.564305199101939e-06, "loss": 0.8546, "step": 7775 }, { "epoch": 0.88, "grad_norm": 0.6322931051254272, "learning_rate": 6.500811829323461e-06, "loss": 0.8104, "step": 7780 }, { "epoch": 0.89, "grad_norm": 0.5790343880653381, "learning_rate": 6.4376167063438965e-06, "loss": 0.6864, "step": 7785 }, { "epoch": 0.89, "grad_norm": 0.7141791582107544, "learning_rate": 6.3747200317451294e-06, "loss": 0.7184, "step": 7790 }, { "epoch": 0.89, "grad_norm": 0.7355352640151978, "learning_rate": 6.3121220061570065e-06, "loss": 0.7836, "step": 7795 }, { "epoch": 0.89, "grad_norm": 0.6871746778488159, "learning_rate": 6.249822829256835e-06, "loss": 0.7652, "step": 7800 }, { "epoch": 0.89, "grad_norm": 0.8354716897010803, "learning_rate": 6.1878226997685525e-06, "loss": 0.7652, "step": 7805 }, { "epoch": 0.89, "grad_norm": 0.6610546708106995, "learning_rate": 6.1261218154622264e-06, "loss": 0.7605, "step": 7810 }, { "epoch": 0.89, "grad_norm": 0.6841945648193359, "learning_rate": 6.064720373153365e-06, "loss": 0.6656, "step": 7815 }, { "epoch": 0.89, "grad_norm": 0.7112994194030762, "learning_rate": 6.003618568702351e-06, "loss": 0.7717, "step": 7820 }, { "epoch": 0.89, "grad_norm": 0.6608765721321106, "learning_rate": 5.942816597013712e-06, "loss": 0.7229, "step": 7825 }, { "epoch": 0.89, "grad_norm": 0.6349248886108398, "learning_rate": 5.882314652035581e-06, "loss": 0.7397, "step": 7830 }, { "epoch": 0.89, "grad_norm": 0.7428655624389648, "learning_rate": 5.822112926759071e-06, "loss": 0.7451, "step": 7835 }, { "epoch": 0.89, "grad_norm": 0.6516420245170593, "learning_rate": 5.7622116132176495e-06, "loss": 0.6874, "step": 7840 }, { "epoch": 0.89, "grad_norm": 0.6775025129318237, "learning_rate": 5.7026109024864716e-06, "loss": 0.8085, "step": 7845 }, { "epoch": 0.89, "grad_norm": 0.6347958445549011, "learning_rate": 5.643310984681882e-06, "loss": 0.7767, "step": 7850 }, { "epoch": 0.89, "grad_norm": 0.7689727544784546, "learning_rate": 5.5843120489607045e-06, "loss": 0.7772, "step": 7855 }, { "epoch": 0.89, "grad_norm": 0.6171656250953674, "learning_rate": 5.525614283519697e-06, "loss": 0.7042, "step": 7860 }, { "epoch": 0.89, "grad_norm": 0.7996339797973633, "learning_rate": 5.46721787559491e-06, "loss": 0.8328, "step": 7865 }, { "epoch": 0.89, "grad_norm": 0.9136884212493896, "learning_rate": 5.409123011461159e-06, "loss": 0.7864, "step": 7870 }, { "epoch": 0.9, "grad_norm": 0.7024335265159607, "learning_rate": 5.351329876431377e-06, "loss": 0.7926, "step": 7875 }, { "epoch": 0.9, "grad_norm": 0.8358083367347717, "learning_rate": 5.293838654855965e-06, "loss": 0.7301, "step": 7880 }, { "epoch": 0.9, "grad_norm": 0.6557454466819763, "learning_rate": 5.236649530122361e-06, "loss": 0.8171, "step": 7885 }, { "epoch": 0.9, "grad_norm": 0.7085033059120178, "learning_rate": 5.17976268465431e-06, "loss": 0.8429, "step": 7890 }, { "epoch": 0.9, "grad_norm": 0.7088157534599304, "learning_rate": 5.123178299911357e-06, "loss": 0.8049, "step": 7895 }, { "epoch": 0.9, "grad_norm": 0.7027326822280884, "learning_rate": 5.0668965563882235e-06, "loss": 0.8121, "step": 7900 }, { "epoch": 0.9, "grad_norm": 0.5936857461929321, "learning_rate": 5.0109176336142984e-06, "loss": 0.6958, "step": 7905 }, { "epoch": 0.9, "grad_norm": 0.7407312989234924, "learning_rate": 4.95524171015298e-06, "loss": 0.6585, "step": 7910 }, { "epoch": 0.9, "grad_norm": 0.8003597855567932, "learning_rate": 4.899868963601173e-06, "loss": 0.6724, "step": 7915 }, { "epoch": 0.9, "grad_norm": 0.6756457090377808, "learning_rate": 4.844799570588699e-06, "loss": 0.722, "step": 7920 }, { "epoch": 0.9, "grad_norm": 0.7574281096458435, "learning_rate": 4.79003370677773e-06, "loss": 0.8335, "step": 7925 }, { "epoch": 0.9, "grad_norm": 0.71590656042099, "learning_rate": 4.735571546862217e-06, "loss": 0.7708, "step": 7930 }, { "epoch": 0.9, "grad_norm": 0.5873979330062866, "learning_rate": 4.681413264567358e-06, "loss": 0.6377, "step": 7935 }, { "epoch": 0.9, "grad_norm": 0.7385571599006653, "learning_rate": 4.627559032649031e-06, "loss": 0.7705, "step": 7940 }, { "epoch": 0.9, "grad_norm": 0.5803728103637695, "learning_rate": 4.574009022893255e-06, "loss": 0.7057, "step": 7945 }, { "epoch": 0.9, "grad_norm": 0.6368845105171204, "learning_rate": 4.520763406115592e-06, "loss": 0.7599, "step": 7950 }, { "epoch": 0.9, "grad_norm": 0.7460334897041321, "learning_rate": 4.467822352160722e-06, "loss": 0.7941, "step": 7955 }, { "epoch": 0.91, "grad_norm": 0.6857527494430542, "learning_rate": 4.415186029901719e-06, "loss": 0.7451, "step": 7960 }, { "epoch": 0.91, "grad_norm": 0.7004684209823608, "learning_rate": 4.362854607239652e-06, "loss": 0.7884, "step": 7965 }, { "epoch": 0.91, "grad_norm": 0.7447669506072998, "learning_rate": 4.310828251103072e-06, "loss": 0.7734, "step": 7970 }, { "epoch": 0.91, "grad_norm": 0.6418143510818481, "learning_rate": 4.259107127447348e-06, "loss": 0.7259, "step": 7975 }, { "epoch": 0.91, "grad_norm": 0.680642306804657, "learning_rate": 4.20769140125421e-06, "loss": 0.7835, "step": 7980 }, { "epoch": 0.91, "grad_norm": 0.6749563217163086, "learning_rate": 4.156581236531265e-06, "loss": 0.7962, "step": 7985 }, { "epoch": 0.91, "grad_norm": 0.6090324521064758, "learning_rate": 4.1057767963113895e-06, "loss": 0.7743, "step": 7990 }, { "epoch": 0.91, "grad_norm": 0.5747905969619751, "learning_rate": 4.055278242652272e-06, "loss": 0.7332, "step": 7995 }, { "epoch": 0.91, "grad_norm": 0.59056156873703, "learning_rate": 4.00508573663585e-06, "loss": 0.796, "step": 8000 }, { "epoch": 0.91, "grad_norm": 0.7366644740104675, "learning_rate": 3.955199438367874e-06, "loss": 0.6988, "step": 8005 }, { "epoch": 0.91, "grad_norm": 0.8058913946151733, "learning_rate": 3.905619506977287e-06, "loss": 0.7612, "step": 8010 }, { "epoch": 0.91, "grad_norm": 0.696850061416626, "learning_rate": 3.85634610061576e-06, "loss": 0.7504, "step": 8015 }, { "epoch": 0.91, "grad_norm": 0.636525571346283, "learning_rate": 3.807379376457276e-06, "loss": 0.8115, "step": 8020 }, { "epoch": 0.91, "grad_norm": 0.6842799186706543, "learning_rate": 3.7587194906974934e-06, "loss": 0.7669, "step": 8025 }, { "epoch": 0.91, "grad_norm": 0.665556788444519, "learning_rate": 3.7103665985533275e-06, "loss": 0.8176, "step": 8030 }, { "epoch": 0.91, "grad_norm": 0.7209974527359009, "learning_rate": 3.662320854262413e-06, "loss": 0.7631, "step": 8035 }, { "epoch": 0.91, "grad_norm": 0.6841431856155396, "learning_rate": 3.61458241108269e-06, "loss": 0.8565, "step": 8040 }, { "epoch": 0.91, "grad_norm": 0.6740476489067078, "learning_rate": 3.567151421291781e-06, "loss": 0.7445, "step": 8045 }, { "epoch": 0.92, "grad_norm": 0.6454640626907349, "learning_rate": 3.5200280361866287e-06, "loss": 0.7506, "step": 8050 }, { "epoch": 0.92, "grad_norm": 0.696540355682373, "learning_rate": 3.473212406082993e-06, "loss": 0.7865, "step": 8055 }, { "epoch": 0.92, "grad_norm": 0.6765589118003845, "learning_rate": 3.426704680314896e-06, "loss": 0.7273, "step": 8060 }, { "epoch": 0.92, "grad_norm": 0.7231236100196838, "learning_rate": 3.3805050072342246e-06, "loss": 0.7769, "step": 8065 }, { "epoch": 0.92, "grad_norm": 0.7834519147872925, "learning_rate": 3.334613534210218e-06, "loss": 0.7718, "step": 8070 }, { "epoch": 0.92, "grad_norm": 0.7390478253364563, "learning_rate": 3.2890304076290122e-06, "loss": 0.7633, "step": 8075 }, { "epoch": 0.92, "grad_norm": 0.6524391174316406, "learning_rate": 3.2437557728931643e-06, "loss": 0.7352, "step": 8080 }, { "epoch": 0.92, "grad_norm": 0.7138876914978027, "learning_rate": 3.1987897744212068e-06, "loss": 0.7841, "step": 8085 }, { "epoch": 0.92, "grad_norm": 0.6502974033355713, "learning_rate": 3.1541325556471713e-06, "loss": 0.8611, "step": 8090 }, { "epoch": 0.92, "grad_norm": 0.7619243860244751, "learning_rate": 3.1097842590201433e-06, "loss": 0.8266, "step": 8095 }, { "epoch": 0.92, "grad_norm": 0.6988756060600281, "learning_rate": 3.06574502600373e-06, "loss": 0.7378, "step": 8100 }, { "epoch": 0.92, "grad_norm": 0.6371370553970337, "learning_rate": 3.0220149970757947e-06, "loss": 0.7703, "step": 8105 }, { "epoch": 0.92, "grad_norm": 0.6489354372024536, "learning_rate": 2.9785943117277893e-06, "loss": 0.7766, "step": 8110 }, { "epoch": 0.92, "grad_norm": 0.757959246635437, "learning_rate": 2.9354831084644652e-06, "loss": 0.8003, "step": 8115 }, { "epoch": 0.92, "grad_norm": 0.7314863204956055, "learning_rate": 2.8926815248033533e-06, "loss": 0.8064, "step": 8120 }, { "epoch": 0.92, "grad_norm": 0.6922609210014343, "learning_rate": 2.8501896972743748e-06, "loss": 0.6391, "step": 8125 }, { "epoch": 0.92, "grad_norm": 0.7072837352752686, "learning_rate": 2.8080077614193513e-06, "loss": 0.6651, "step": 8130 }, { "epoch": 0.92, "grad_norm": 0.6854616403579712, "learning_rate": 2.766135851791607e-06, "loss": 0.8429, "step": 8135 }, { "epoch": 0.93, "grad_norm": 0.6851674318313599, "learning_rate": 2.724574101955557e-06, "loss": 0.7833, "step": 8140 }, { "epoch": 0.93, "grad_norm": 0.7848556041717529, "learning_rate": 2.6833226444862526e-06, "loss": 0.7674, "step": 8145 }, { "epoch": 0.93, "grad_norm": 0.6435510516166687, "learning_rate": 2.6423816109689357e-06, "loss": 0.6399, "step": 8150 }, { "epoch": 0.93, "grad_norm": 0.6673296689987183, "learning_rate": 2.6017511319986752e-06, "loss": 0.7375, "step": 8155 }, { "epoch": 0.93, "grad_norm": 0.7312006950378418, "learning_rate": 2.56143133717992e-06, "loss": 0.7741, "step": 8160 }, { "epoch": 0.93, "grad_norm": 0.6996206641197205, "learning_rate": 2.5214223551260686e-06, "loss": 0.7002, "step": 8165 }, { "epoch": 0.93, "grad_norm": 0.7395245432853699, "learning_rate": 2.481724313459111e-06, "loss": 0.7389, "step": 8170 }, { "epoch": 0.93, "grad_norm": 0.8081804513931274, "learning_rate": 2.4423373388091753e-06, "loss": 0.7366, "step": 8175 }, { "epoch": 0.93, "grad_norm": 0.7896558046340942, "learning_rate": 2.4032615568141183e-06, "loss": 0.7466, "step": 8180 }, { "epoch": 0.93, "grad_norm": 0.6015843152999878, "learning_rate": 2.3644970921191445e-06, "loss": 0.701, "step": 8185 }, { "epoch": 0.93, "grad_norm": 0.8499304056167603, "learning_rate": 2.326044068376465e-06, "loss": 0.7778, "step": 8190 }, { "epoch": 0.93, "grad_norm": 0.5991072058677673, "learning_rate": 2.287902608244774e-06, "loss": 0.7927, "step": 8195 }, { "epoch": 0.93, "grad_norm": 0.6677555441856384, "learning_rate": 2.250072833388972e-06, "loss": 0.7374, "step": 8200 }, { "epoch": 0.93, "grad_norm": 1.1476563215255737, "learning_rate": 2.2125548644797323e-06, "loss": 0.6659, "step": 8205 }, { "epoch": 0.93, "grad_norm": 0.6608490347862244, "learning_rate": 2.1753488211931016e-06, "loss": 0.8061, "step": 8210 }, { "epoch": 0.93, "grad_norm": 0.7551729679107666, "learning_rate": 2.1384548222101342e-06, "loss": 0.7791, "step": 8215 }, { "epoch": 0.93, "grad_norm": 0.7040895819664001, "learning_rate": 2.1018729852165574e-06, "loss": 0.7368, "step": 8220 }, { "epoch": 0.94, "grad_norm": 0.8961385488510132, "learning_rate": 2.065603426902296e-06, "loss": 0.8227, "step": 8225 }, { "epoch": 0.94, "grad_norm": 0.5928700566291809, "learning_rate": 2.0296462629611934e-06, "loss": 0.7019, "step": 8230 }, { "epoch": 0.94, "grad_norm": 0.6719995737075806, "learning_rate": 1.994001608090612e-06, "loss": 0.6937, "step": 8235 }, { "epoch": 0.94, "grad_norm": 0.6805534362792969, "learning_rate": 1.9586695759910233e-06, "loss": 0.6968, "step": 8240 }, { "epoch": 0.94, "grad_norm": 0.6675357818603516, "learning_rate": 1.92365027936573e-06, "loss": 0.79, "step": 8245 }, { "epoch": 0.94, "grad_norm": 0.7165231704711914, "learning_rate": 1.888943829920431e-06, "loss": 0.7975, "step": 8250 }, { "epoch": 0.94, "grad_norm": 0.663772463798523, "learning_rate": 1.8545503383629147e-06, "loss": 0.64, "step": 8255 }, { "epoch": 0.94, "grad_norm": 0.6648406982421875, "learning_rate": 1.8204699144026893e-06, "loss": 0.7558, "step": 8260 }, { "epoch": 0.94, "grad_norm": 0.707249641418457, "learning_rate": 1.7867026667505725e-06, "loss": 0.8406, "step": 8265 }, { "epoch": 0.94, "grad_norm": 0.6816351413726807, "learning_rate": 1.7532487031184819e-06, "loss": 0.8215, "step": 8270 }, { "epoch": 0.94, "grad_norm": 0.7135679721832275, "learning_rate": 1.720108130218967e-06, "loss": 0.7175, "step": 8275 }, { "epoch": 0.94, "grad_norm": 0.6640385389328003, "learning_rate": 1.6872810537649331e-06, "loss": 0.7476, "step": 8280 }, { "epoch": 0.94, "grad_norm": 0.6773399114608765, "learning_rate": 1.6547675784692517e-06, "loss": 0.6793, "step": 8285 }, { "epoch": 0.94, "grad_norm": 0.6559042930603027, "learning_rate": 1.6225678080444951e-06, "loss": 0.7233, "step": 8290 }, { "epoch": 0.94, "grad_norm": 0.8715654015541077, "learning_rate": 1.5906818452025463e-06, "loss": 0.8084, "step": 8295 }, { "epoch": 0.94, "grad_norm": 0.6095733046531677, "learning_rate": 1.5591097916543006e-06, "loss": 0.6426, "step": 8300 }, { "epoch": 0.94, "grad_norm": 0.669862687587738, "learning_rate": 1.5278517481093436e-06, "loss": 0.6873, "step": 8305 }, { "epoch": 0.94, "grad_norm": 0.6891058683395386, "learning_rate": 1.4969078142756277e-06, "loss": 0.8274, "step": 8310 }, { "epoch": 0.95, "grad_norm": 0.6286734938621521, "learning_rate": 1.4662780888591076e-06, "loss": 0.7603, "step": 8315 }, { "epoch": 0.95, "grad_norm": 0.6344169974327087, "learning_rate": 1.4359626695635176e-06, "loss": 0.7516, "step": 8320 }, { "epoch": 0.95, "grad_norm": 0.7188591361045837, "learning_rate": 1.405961653089971e-06, "loss": 0.7821, "step": 8325 }, { "epoch": 0.95, "grad_norm": 0.6320931911468506, "learning_rate": 1.3762751351367064e-06, "loss": 0.8255, "step": 8330 }, { "epoch": 0.95, "grad_norm": 0.6263067126274109, "learning_rate": 1.3469032103987534e-06, "loss": 0.8558, "step": 8335 }, { "epoch": 0.95, "grad_norm": 0.7200011014938354, "learning_rate": 1.317845972567655e-06, "loss": 0.7277, "step": 8340 }, { "epoch": 0.95, "grad_norm": 0.8542875647544861, "learning_rate": 1.289103514331147e-06, "loss": 0.7618, "step": 8345 }, { "epoch": 0.95, "grad_norm": 0.6276410818099976, "learning_rate": 1.2606759273728564e-06, "loss": 0.7309, "step": 8350 }, { "epoch": 0.95, "grad_norm": 0.7522603273391724, "learning_rate": 1.2325633023720695e-06, "loss": 0.7636, "step": 8355 }, { "epoch": 0.95, "grad_norm": 0.7600675225257874, "learning_rate": 1.204765729003332e-06, "loss": 0.8992, "step": 8360 }, { "epoch": 0.95, "grad_norm": 0.6711683869361877, "learning_rate": 1.1772832959362933e-06, "loss": 0.806, "step": 8365 }, { "epoch": 0.95, "grad_norm": 0.5430208444595337, "learning_rate": 1.150116090835307e-06, "loss": 0.7333, "step": 8370 }, { "epoch": 0.95, "grad_norm": 0.7371984720230103, "learning_rate": 1.1232642003592197e-06, "loss": 0.772, "step": 8375 }, { "epoch": 0.95, "grad_norm": 0.6525667905807495, "learning_rate": 1.096727710161094e-06, "loss": 0.7436, "step": 8380 }, { "epoch": 0.95, "grad_norm": 0.7685186862945557, "learning_rate": 1.070506704887886e-06, "loss": 0.829, "step": 8385 }, { "epoch": 0.95, "grad_norm": 0.6457349061965942, "learning_rate": 1.0446012681802343e-06, "loss": 0.7674, "step": 8390 }, { "epoch": 0.95, "grad_norm": 0.5994265079498291, "learning_rate": 1.0190114826721497e-06, "loss": 0.7031, "step": 8395 }, { "epoch": 0.96, "grad_norm": 0.7038202881813049, "learning_rate": 9.937374299907931e-07, "loss": 0.7039, "step": 8400 }, { "epoch": 0.96, "grad_norm": 0.593492329120636, "learning_rate": 9.687791907561527e-07, "loss": 0.6959, "step": 8405 }, { "epoch": 0.96, "grad_norm": 0.60867840051651, "learning_rate": 9.441368445808451e-07, "loss": 0.6178, "step": 8410 }, { "epoch": 0.96, "grad_norm": 0.7278956174850464, "learning_rate": 9.198104700698595e-07, "loss": 0.8133, "step": 8415 }, { "epoch": 0.96, "grad_norm": 1.0709000825881958, "learning_rate": 8.958001448202357e-07, "loss": 0.912, "step": 8420 }, { "epoch": 0.96, "grad_norm": 0.7433140277862549, "learning_rate": 8.721059454209424e-07, "loss": 0.7567, "step": 8425 }, { "epoch": 0.96, "grad_norm": 0.7910411953926086, "learning_rate": 8.487279474524989e-07, "loss": 0.8209, "step": 8430 }, { "epoch": 0.96, "grad_norm": 0.6875026226043701, "learning_rate": 8.256662254867986e-07, "loss": 0.7594, "step": 8435 }, { "epoch": 0.96, "grad_norm": 0.8132088780403137, "learning_rate": 8.029208530869081e-07, "loss": 0.9099, "step": 8440 }, { "epoch": 0.96, "grad_norm": 0.7695709466934204, "learning_rate": 7.804919028067681e-07, "loss": 0.7659, "step": 8445 }, { "epoch": 0.96, "grad_norm": 0.8422206044197083, "learning_rate": 7.58379446190971e-07, "loss": 0.7784, "step": 8450 }, { "epoch": 0.96, "grad_norm": 0.6463632583618164, "learning_rate": 7.365835537745725e-07, "loss": 0.7772, "step": 8455 }, { "epoch": 0.96, "grad_norm": 0.6620475649833679, "learning_rate": 7.151042950828246e-07, "loss": 0.7338, "step": 8460 }, { "epoch": 0.96, "grad_norm": 0.6927075386047363, "learning_rate": 6.939417386309766e-07, "loss": 0.7169, "step": 8465 }, { "epoch": 0.96, "grad_norm": 0.6566410064697266, "learning_rate": 6.730959519240409e-07, "loss": 0.7196, "step": 8470 }, { "epoch": 0.96, "grad_norm": 0.6217153072357178, "learning_rate": 6.525670014566166e-07, "loss": 0.6497, "step": 8475 }, { "epoch": 0.96, "grad_norm": 0.6876888871192932, "learning_rate": 6.323549527126216e-07, "loss": 0.7666, "step": 8480 }, { "epoch": 0.96, "grad_norm": 0.736946702003479, "learning_rate": 6.124598701651052e-07, "loss": 0.8501, "step": 8485 }, { "epoch": 0.97, "grad_norm": 0.5957123041152954, "learning_rate": 5.928818172760697e-07, "loss": 0.7375, "step": 8490 }, { "epoch": 0.97, "grad_norm": 0.6869728565216064, "learning_rate": 5.736208564962265e-07, "loss": 0.7024, "step": 8495 }, { "epoch": 0.97, "grad_norm": 0.7970830798149109, "learning_rate": 5.546770492648401e-07, "loss": 0.7898, "step": 8500 }, { "epoch": 0.97, "grad_norm": 0.7111850380897522, "learning_rate": 5.360504560094736e-07, "loss": 0.7083, "step": 8505 }, { "epoch": 0.97, "grad_norm": 0.6761876344680786, "learning_rate": 5.177411361458661e-07, "loss": 0.7067, "step": 8510 }, { "epoch": 0.97, "grad_norm": 0.6835991144180298, "learning_rate": 4.997491480776773e-07, "loss": 0.695, "step": 8515 }, { "epoch": 0.97, "grad_norm": 0.6578661203384399, "learning_rate": 4.82074549196343e-07, "loss": 0.7607, "step": 8520 }, { "epoch": 0.97, "grad_norm": 0.684698224067688, "learning_rate": 4.6471739588089814e-07, "loss": 0.7665, "step": 8525 }, { "epoch": 0.97, "grad_norm": 0.7736423015594482, "learning_rate": 4.476777434977653e-07, "loss": 0.8594, "step": 8530 }, { "epoch": 0.97, "grad_norm": 0.6832173466682434, "learning_rate": 4.30955646400566e-07, "loss": 0.6579, "step": 8535 }, { "epoch": 0.97, "grad_norm": 0.731162965297699, "learning_rate": 4.14551157930021e-07, "loss": 0.8462, "step": 8540 }, { "epoch": 0.97, "grad_norm": 0.7852889895439148, "learning_rate": 3.984643304136948e-07, "loss": 0.8533, "step": 8545 }, { "epoch": 0.97, "grad_norm": 0.8519006967544556, "learning_rate": 3.826952151658958e-07, "loss": 0.6815, "step": 8550 }, { "epoch": 0.97, "grad_norm": 0.6394432783126831, "learning_rate": 3.6724386248745415e-07, "loss": 0.7208, "step": 8555 }, { "epoch": 0.97, "grad_norm": 0.602070152759552, "learning_rate": 3.5211032166561077e-07, "loss": 0.7506, "step": 8560 }, { "epoch": 0.97, "grad_norm": 0.6670829653739929, "learning_rate": 3.372946409738398e-07, "loss": 0.7156, "step": 8565 }, { "epoch": 0.97, "grad_norm": 0.7526081204414368, "learning_rate": 3.2279686767168196e-07, "loss": 0.8094, "step": 8570 }, { "epoch": 0.97, "grad_norm": 0.68255215883255, "learning_rate": 3.086170480046113e-07, "loss": 0.7417, "step": 8575 }, { "epoch": 0.98, "grad_norm": 0.6421539187431335, "learning_rate": 2.947552272038911e-07, "loss": 0.7067, "step": 8580 }, { "epoch": 0.98, "grad_norm": 0.7133721113204956, "learning_rate": 2.812114494864182e-07, "loss": 0.7699, "step": 8585 }, { "epoch": 0.98, "grad_norm": 0.7376065254211426, "learning_rate": 2.679857580545786e-07, "loss": 0.8313, "step": 8590 }, { "epoch": 0.98, "grad_norm": 0.7178567051887512, "learning_rate": 2.550781950961034e-07, "loss": 0.7967, "step": 8595 }, { "epoch": 0.98, "grad_norm": 0.7155488133430481, "learning_rate": 2.42488801783991e-07, "loss": 0.7509, "step": 8600 }, { "epoch": 0.98, "grad_norm": 0.7433626055717468, "learning_rate": 2.3021761827628496e-07, "loss": 0.7372, "step": 8605 }, { "epoch": 0.98, "grad_norm": 0.7234955430030823, "learning_rate": 2.182646837160185e-07, "loss": 0.6896, "step": 8610 }, { "epoch": 0.98, "grad_norm": 0.790782630443573, "learning_rate": 2.0663003623105914e-07, "loss": 0.7696, "step": 8615 }, { "epoch": 0.98, "grad_norm": 0.852401614189148, "learning_rate": 1.953137129339977e-07, "loss": 0.7851, "step": 8620 }, { "epoch": 0.98, "grad_norm": 0.7792094349861145, "learning_rate": 1.8431574992199275e-07, "loss": 0.7594, "step": 8625 }, { "epoch": 0.98, "grad_norm": 0.7497335076332092, "learning_rate": 1.7363618227672628e-07, "loss": 0.7123, "step": 8630 }, { "epoch": 0.98, "grad_norm": 0.612318217754364, "learning_rate": 1.632750440642261e-07, "loss": 0.7277, "step": 8635 }, { "epoch": 0.98, "grad_norm": 0.6249239444732666, "learning_rate": 1.5323236833479916e-07, "loss": 0.7046, "step": 8640 }, { "epoch": 0.98, "grad_norm": 0.5679818987846375, "learning_rate": 1.4350818712292048e-07, "loss": 0.6912, "step": 8645 }, { "epoch": 0.98, "grad_norm": 0.6267695426940918, "learning_rate": 1.3410253144707785e-07, "loss": 0.6389, "step": 8650 }, { "epoch": 0.98, "grad_norm": 0.7306914925575256, "learning_rate": 1.2501543130974959e-07, "loss": 0.7294, "step": 8655 }, { "epoch": 0.98, "grad_norm": 0.7982718348503113, "learning_rate": 1.162469156972712e-07, "loss": 0.753, "step": 8660 }, { "epoch": 0.99, "grad_norm": 0.658090353012085, "learning_rate": 1.0779701257974672e-07, "loss": 0.6063, "step": 8665 }, { "epoch": 0.99, "grad_norm": 0.6100469827651978, "learning_rate": 9.96657489109487e-08, "loss": 0.8564, "step": 8670 }, { "epoch": 0.99, "grad_norm": 1.1241860389709473, "learning_rate": 9.185315062826272e-08, "loss": 0.7727, "step": 8675 }, { "epoch": 0.99, "grad_norm": 0.6518518328666687, "learning_rate": 8.435924265256523e-08, "loss": 0.7119, "step": 8680 }, { "epoch": 0.99, "grad_norm": 0.6026988625526428, "learning_rate": 7.718404888816811e-08, "loss": 0.7094, "step": 8685 }, { "epoch": 0.99, "grad_norm": 0.6683136820793152, "learning_rate": 7.032759222274087e-08, "loss": 0.7913, "step": 8690 }, { "epoch": 0.99, "grad_norm": 0.6700591444969177, "learning_rate": 6.378989452724416e-08, "loss": 0.794, "step": 8695 }, { "epoch": 0.99, "grad_norm": 0.6720278859138489, "learning_rate": 5.757097665584077e-08, "loss": 0.6934, "step": 8700 }, { "epoch": 0.99, "grad_norm": 0.7080987691879272, "learning_rate": 5.1670858445829195e-08, "loss": 0.8342, "step": 8705 }, { "epoch": 0.99, "grad_norm": 0.7693775296211243, "learning_rate": 4.6089558717610226e-08, "loss": 0.7174, "step": 8710 }, { "epoch": 0.99, "grad_norm": 0.664599597454071, "learning_rate": 4.082709527459816e-08, "loss": 0.7675, "step": 8715 }, { "epoch": 0.99, "grad_norm": 0.6770430207252502, "learning_rate": 3.588348490317639e-08, "loss": 0.7371, "step": 8720 }, { "epoch": 0.99, "grad_norm": 0.7018636465072632, "learning_rate": 3.125874337261969e-08, "loss": 0.7199, "step": 8725 }, { "epoch": 0.99, "grad_norm": 0.6643563508987427, "learning_rate": 2.6952885435105323e-08, "loss": 0.6487, "step": 8730 }, { "epoch": 0.99, "grad_norm": 0.6865366697311401, "learning_rate": 2.2965924825579797e-08, "loss": 0.7733, "step": 8735 }, { "epoch": 0.99, "grad_norm": 0.6612882614135742, "learning_rate": 1.9297874261792193e-08, "loss": 0.737, "step": 8740 }, { "epoch": 0.99, "grad_norm": 0.62270188331604, "learning_rate": 1.5948745444216428e-08, "loss": 0.7636, "step": 8745 }, { "epoch": 0.99, "grad_norm": 0.6600854396820068, "learning_rate": 1.2918549056006867e-08, "loss": 0.6694, "step": 8750 }, { "epoch": 1.0, "grad_norm": 0.7353793382644653, "learning_rate": 1.0207294762987208e-08, "loss": 0.7041, "step": 8755 }, { "epoch": 1.0, "grad_norm": 0.7038155794143677, "learning_rate": 7.81499121359497e-09, "loss": 0.689, "step": 8760 }, { "epoch": 1.0, "grad_norm": 0.8333893418312073, "learning_rate": 5.7416460388926004e-09, "loss": 0.8068, "step": 8765 }, { "epoch": 1.0, "grad_norm": 0.6727174520492554, "learning_rate": 3.9872658525008655e-09, "loss": 0.7088, "step": 8770 }, { "epoch": 1.0, "grad_norm": 0.6914416551589966, "learning_rate": 2.5518562505988386e-09, "loss": 0.6869, "step": 8775 }, { "epoch": 1.0, "grad_norm": 0.6037778258323669, "learning_rate": 1.435421811901705e-09, "loss": 0.7188, "step": 8780 }, { "epoch": 1.0, "grad_norm": 0.7393903732299805, "learning_rate": 6.37966097649656e-10, "loss": 0.7448, "step": 8785 }, { "epoch": 1.0, "grad_norm": 0.6320146322250366, "learning_rate": 1.5949165159678813e-10, "loss": 0.7623, "step": 8790 }, { "epoch": 1.0, "grad_norm": 0.6219347715377808, "learning_rate": 0.0, "loss": 0.7163, "step": 8795 }, { "epoch": 1.0, "step": 8795, "total_flos": 4.313969195378278e+16, "train_loss": 0.0, "train_runtime": 0.0081, "train_samples_per_second": 1092292.541, "train_steps_per_second": 1092292.541 } ], "logging_steps": 5, "max_steps": 8795, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 4.313969195378278e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }