diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12343 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 8795, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.6929430365562439, + "learning_rate": 0.0001999998405083484, + "loss": 1.6838, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.5281903743743896, + "learning_rate": 0.00019999936203390236, + "loss": 1.4915, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.6337839961051941, + "learning_rate": 0.0001999985645781881, + "loss": 1.2662, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 0.6382477879524231, + "learning_rate": 0.00019999744814374942, + "loss": 1.1492, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 0.6016966104507446, + "learning_rate": 0.0001999960127341475, + "loss": 1.189, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 0.48733243346214294, + "learning_rate": 0.00019999425835396113, + "loss": 1.2111, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 0.4674977660179138, + "learning_rate": 0.0001999921850087864, + "loss": 1.0462, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 0.48517295718193054, + "learning_rate": 0.00019998979270523704, + "loss": 0.9653, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.5423499345779419, + "learning_rate": 0.000199987081450944, + "loss": 1.0769, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 0.5383784174919128, + "learning_rate": 0.0001999840512545558, + "loss": 1.0471, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 0.37535467743873596, + "learning_rate": 0.00019998070212573824, + "loss": 1.1045, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 0.5334364175796509, + "learning_rate": 0.00019997703407517443, + "loss": 0.9794, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 0.5327732563018799, + "learning_rate": 0.0001999730471145649, + "loss": 1.0664, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 0.4103855788707733, + "learning_rate": 0.0001999687412566274, + "loss": 1.0529, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 0.5592668652534485, + "learning_rate": 0.00019996411651509684, + "loss": 0.9661, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 0.5078962445259094, + "learning_rate": 0.0001999591729047254, + "loss": 1.0836, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 0.5035337209701538, + "learning_rate": 0.0001999539104412824, + "loss": 1.0011, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 0.5704492330551147, + "learning_rate": 0.00019994832914155416, + "loss": 0.9957, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 0.4775836169719696, + "learning_rate": 0.00019994242902334416, + "loss": 1.0125, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 0.4531150460243225, + "learning_rate": 0.00019993621010547277, + "loss": 0.9085, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 0.5194448232650757, + "learning_rate": 0.00019992967240777727, + "loss": 0.9871, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 0.44902467727661133, + "learning_rate": 0.00019992281595111185, + "loss": 0.912, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 0.5913822650909424, + "learning_rate": 0.00019991564075734744, + "loss": 0.941, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 0.524722158908844, + "learning_rate": 0.00019990814684937174, + "loss": 1.0098, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 0.6472988724708557, + "learning_rate": 0.00019990033425108905, + "loss": 0.9765, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 0.5326634049415588, + "learning_rate": 0.00019989220298742026, + "loss": 0.9342, + "step": 130 + }, + { + "epoch": 0.02, + "grad_norm": 0.5234887599945068, + "learning_rate": 0.00019988375308430275, + "loss": 0.9872, + "step": 135 + }, + { + "epoch": 0.02, + "grad_norm": 0.49476635456085205, + "learning_rate": 0.00019987498456869025, + "loss": 0.9218, + "step": 140 + }, + { + "epoch": 0.02, + "grad_norm": 0.4328886568546295, + "learning_rate": 0.00019986589746855295, + "loss": 1.0531, + "step": 145 + }, + { + "epoch": 0.02, + "grad_norm": 0.5224207639694214, + "learning_rate": 0.0001998564918128771, + "loss": 0.9544, + "step": 150 + }, + { + "epoch": 0.02, + "grad_norm": 0.4767338037490845, + "learning_rate": 0.0001998467676316652, + "loss": 0.9498, + "step": 155 + }, + { + "epoch": 0.02, + "grad_norm": 0.4608069658279419, + "learning_rate": 0.00019983672495593578, + "loss": 0.8847, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 0.4687401056289673, + "learning_rate": 0.00019982636381772327, + "loss": 0.938, + "step": 165 + }, + { + "epoch": 0.02, + "grad_norm": 0.5468807220458984, + "learning_rate": 0.000199815684250078, + "loss": 0.9296, + "step": 170 + }, + { + "epoch": 0.02, + "grad_norm": 0.5462723970413208, + "learning_rate": 0.00019980468628706604, + "loss": 0.9583, + "step": 175 + }, + { + "epoch": 0.02, + "grad_norm": 0.5237535834312439, + "learning_rate": 0.00019979336996376893, + "loss": 0.9683, + "step": 180 + }, + { + "epoch": 0.02, + "grad_norm": 0.5200393199920654, + "learning_rate": 0.000199781735316284, + "loss": 0.9164, + "step": 185 + }, + { + "epoch": 0.02, + "grad_norm": 0.480398565530777, + "learning_rate": 0.00019976978238172373, + "loss": 0.9312, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 0.49409154057502747, + "learning_rate": 0.000199757511198216, + "loss": 0.9962, + "step": 195 + }, + { + "epoch": 0.02, + "grad_norm": 0.5906654000282288, + "learning_rate": 0.00019974492180490388, + "loss": 0.9277, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 0.5397025346755981, + "learning_rate": 0.00019973201424194542, + "loss": 0.8953, + "step": 205 + }, + { + "epoch": 0.02, + "grad_norm": 0.5493289232254028, + "learning_rate": 0.00019971878855051358, + "loss": 0.9364, + "step": 210 + }, + { + "epoch": 0.02, + "grad_norm": 0.5614472031593323, + "learning_rate": 0.0001997052447727961, + "loss": 0.9419, + "step": 215 + }, + { + "epoch": 0.03, + "grad_norm": 0.5108090043067932, + "learning_rate": 0.0001996913829519954, + "loss": 0.9203, + "step": 220 + }, + { + "epoch": 0.03, + "grad_norm": 0.5096657872200012, + "learning_rate": 0.0001996772031323283, + "loss": 0.8614, + "step": 225 + }, + { + "epoch": 0.03, + "grad_norm": 0.541383683681488, + "learning_rate": 0.00019966270535902618, + "loss": 0.9603, + "step": 230 + }, + { + "epoch": 0.03, + "grad_norm": 0.5045916438102722, + "learning_rate": 0.00019964788967833438, + "loss": 0.9289, + "step": 235 + }, + { + "epoch": 0.03, + "grad_norm": 0.5646150708198547, + "learning_rate": 0.00019963275613751256, + "loss": 0.8575, + "step": 240 + }, + { + "epoch": 0.03, + "grad_norm": 0.4511927664279938, + "learning_rate": 0.0001996173047848341, + "loss": 0.9126, + "step": 245 + }, + { + "epoch": 0.03, + "grad_norm": 0.5300033688545227, + "learning_rate": 0.0001996015356695863, + "loss": 0.9346, + "step": 250 + }, + { + "epoch": 0.03, + "grad_norm": 0.5646623969078064, + "learning_rate": 0.00019958544884207, + "loss": 0.8818, + "step": 255 + }, + { + "epoch": 0.03, + "grad_norm": 0.4728727638721466, + "learning_rate": 0.00019956904435359943, + "loss": 0.9771, + "step": 260 + }, + { + "epoch": 0.03, + "grad_norm": 0.49879685044288635, + "learning_rate": 0.00019955232225650225, + "loss": 0.8741, + "step": 265 + }, + { + "epoch": 0.03, + "grad_norm": 0.47209540009498596, + "learning_rate": 0.00019953528260411912, + "loss": 0.8946, + "step": 270 + }, + { + "epoch": 0.03, + "grad_norm": 0.5027173161506653, + "learning_rate": 0.00019951792545080369, + "loss": 0.8586, + "step": 275 + }, + { + "epoch": 0.03, + "grad_norm": 0.5433563590049744, + "learning_rate": 0.00019950025085192232, + "loss": 0.9146, + "step": 280 + }, + { + "epoch": 0.03, + "grad_norm": 0.493032306432724, + "learning_rate": 0.00019948225886385414, + "loss": 0.9083, + "step": 285 + }, + { + "epoch": 0.03, + "grad_norm": 0.5513988733291626, + "learning_rate": 0.00019946394954399054, + "loss": 0.8455, + "step": 290 + }, + { + "epoch": 0.03, + "grad_norm": 0.588005542755127, + "learning_rate": 0.00019944532295073516, + "loss": 1.0188, + "step": 295 + }, + { + "epoch": 0.03, + "grad_norm": 0.5429700016975403, + "learning_rate": 0.00019942637914350378, + "loss": 0.9102, + "step": 300 + }, + { + "epoch": 0.03, + "grad_norm": 0.6686177849769592, + "learning_rate": 0.00019940711818272394, + "loss": 1.0241, + "step": 305 + }, + { + "epoch": 0.04, + "grad_norm": 0.5708988308906555, + "learning_rate": 0.00019938754012983488, + "loss": 0.9757, + "step": 310 + }, + { + "epoch": 0.04, + "grad_norm": 0.5050585865974426, + "learning_rate": 0.0001993676450472874, + "loss": 1.0048, + "step": 315 + }, + { + "epoch": 0.04, + "grad_norm": 0.5606969594955444, + "learning_rate": 0.00019934743299854338, + "loss": 0.9737, + "step": 320 + }, + { + "epoch": 0.04, + "grad_norm": 0.5190075635910034, + "learning_rate": 0.00019932690404807598, + "loss": 0.9677, + "step": 325 + }, + { + "epoch": 0.04, + "grad_norm": 0.5396589040756226, + "learning_rate": 0.00019930605826136904, + "loss": 0.9119, + "step": 330 + }, + { + "epoch": 0.04, + "grad_norm": 0.5063149333000183, + "learning_rate": 0.0001992848957049172, + "loss": 0.9627, + "step": 335 + }, + { + "epoch": 0.04, + "grad_norm": 0.5704573392868042, + "learning_rate": 0.00019926341644622544, + "loss": 0.9782, + "step": 340 + }, + { + "epoch": 0.04, + "grad_norm": 0.542542040348053, + "learning_rate": 0.00019924162055380903, + "loss": 0.9253, + "step": 345 + }, + { + "epoch": 0.04, + "grad_norm": 0.4711320102214813, + "learning_rate": 0.00019921950809719324, + "loss": 0.9039, + "step": 350 + }, + { + "epoch": 0.04, + "grad_norm": 0.5174899697303772, + "learning_rate": 0.00019919707914691311, + "loss": 0.8753, + "step": 355 + }, + { + "epoch": 0.04, + "grad_norm": 0.5371158719062805, + "learning_rate": 0.0001991743337745132, + "loss": 0.9531, + "step": 360 + }, + { + "epoch": 0.04, + "grad_norm": 0.49162808060646057, + "learning_rate": 0.00019915127205254751, + "loss": 0.9007, + "step": 365 + }, + { + "epoch": 0.04, + "grad_norm": 0.5364812016487122, + "learning_rate": 0.00019912789405457905, + "loss": 0.8619, + "step": 370 + }, + { + "epoch": 0.04, + "grad_norm": 0.5108892321586609, + "learning_rate": 0.00019910419985517977, + "loss": 0.9384, + "step": 375 + }, + { + "epoch": 0.04, + "grad_norm": 0.6169962286949158, + "learning_rate": 0.00019908018952993016, + "loss": 1.0248, + "step": 380 + }, + { + "epoch": 0.04, + "grad_norm": 0.5570533275604248, + "learning_rate": 0.00019905586315541917, + "loss": 0.9526, + "step": 385 + }, + { + "epoch": 0.04, + "grad_norm": 0.5810699462890625, + "learning_rate": 0.00019903122080924387, + "loss": 0.9722, + "step": 390 + }, + { + "epoch": 0.04, + "grad_norm": 0.4718081057071686, + "learning_rate": 0.00019900626257000922, + "loss": 0.8801, + "step": 395 + }, + { + "epoch": 0.05, + "grad_norm": 0.6052469611167908, + "learning_rate": 0.00019898098851732786, + "loss": 0.8618, + "step": 400 + }, + { + "epoch": 0.05, + "grad_norm": 0.491576611995697, + "learning_rate": 0.0001989553987318198, + "loss": 0.8347, + "step": 405 + }, + { + "epoch": 0.05, + "grad_norm": 0.4725711941719055, + "learning_rate": 0.00019892949329511212, + "loss": 0.8782, + "step": 410 + }, + { + "epoch": 0.05, + "grad_norm": 0.4833574891090393, + "learning_rate": 0.00019890327228983893, + "loss": 0.9227, + "step": 415 + }, + { + "epoch": 0.05, + "grad_norm": 0.4949423670768738, + "learning_rate": 0.0001988767357996408, + "loss": 0.9076, + "step": 420 + }, + { + "epoch": 0.05, + "grad_norm": 0.4801913797855377, + "learning_rate": 0.0001988498839091647, + "loss": 0.8838, + "step": 425 + }, + { + "epoch": 0.05, + "grad_norm": 0.5127063393592834, + "learning_rate": 0.00019882271670406372, + "loss": 0.7314, + "step": 430 + }, + { + "epoch": 0.05, + "grad_norm": 0.524604320526123, + "learning_rate": 0.00019879523427099665, + "loss": 0.8783, + "step": 435 + }, + { + "epoch": 0.05, + "grad_norm": 0.6428730487823486, + "learning_rate": 0.00019876743669762793, + "loss": 0.977, + "step": 440 + }, + { + "epoch": 0.05, + "grad_norm": 0.5448722243309021, + "learning_rate": 0.00019873932407262715, + "loss": 0.9373, + "step": 445 + }, + { + "epoch": 0.05, + "grad_norm": 0.5859642028808594, + "learning_rate": 0.00019871089648566885, + "loss": 0.9717, + "step": 450 + }, + { + "epoch": 0.05, + "grad_norm": 0.5077968835830688, + "learning_rate": 0.00019868215402743235, + "loss": 0.8836, + "step": 455 + }, + { + "epoch": 0.05, + "grad_norm": 0.5130951404571533, + "learning_rate": 0.00019865309678960123, + "loss": 1.012, + "step": 460 + }, + { + "epoch": 0.05, + "grad_norm": 0.5490102767944336, + "learning_rate": 0.0001986237248648633, + "loss": 0.8743, + "step": 465 + }, + { + "epoch": 0.05, + "grad_norm": 0.5080798864364624, + "learning_rate": 0.00019859403834691003, + "loss": 0.9109, + "step": 470 + }, + { + "epoch": 0.05, + "grad_norm": 0.5114970207214355, + "learning_rate": 0.0001985640373304365, + "loss": 0.9453, + "step": 475 + }, + { + "epoch": 0.05, + "grad_norm": 0.500872790813446, + "learning_rate": 0.0001985337219111409, + "loss": 0.9275, + "step": 480 + }, + { + "epoch": 0.06, + "grad_norm": 0.4827217757701874, + "learning_rate": 0.00019850309218572438, + "loss": 0.9394, + "step": 485 + }, + { + "epoch": 0.06, + "grad_norm": 0.5236514210700989, + "learning_rate": 0.00019847214825189066, + "loss": 0.9253, + "step": 490 + }, + { + "epoch": 0.06, + "grad_norm": 0.49005958437919617, + "learning_rate": 0.0001984408902083457, + "loss": 0.8693, + "step": 495 + }, + { + "epoch": 0.06, + "grad_norm": 0.5737020373344421, + "learning_rate": 0.00019840931815479746, + "loss": 0.9076, + "step": 500 + }, + { + "epoch": 0.06, + "grad_norm": 0.5236185193061829, + "learning_rate": 0.00019837743219195552, + "loss": 0.9429, + "step": 505 + }, + { + "epoch": 0.06, + "grad_norm": 0.5625522136688232, + "learning_rate": 0.00019834523242153078, + "loss": 0.9305, + "step": 510 + }, + { + "epoch": 0.06, + "grad_norm": 0.5392923951148987, + "learning_rate": 0.0001983127189462351, + "loss": 0.8803, + "step": 515 + }, + { + "epoch": 0.06, + "grad_norm": 0.5529817938804626, + "learning_rate": 0.00019827989186978103, + "loss": 0.9182, + "step": 520 + }, + { + "epoch": 0.06, + "grad_norm": 0.5279693007469177, + "learning_rate": 0.00019824675129688152, + "loss": 0.9022, + "step": 525 + }, + { + "epoch": 0.06, + "grad_norm": 0.6373478174209595, + "learning_rate": 0.00019821329733324942, + "loss": 0.9551, + "step": 530 + }, + { + "epoch": 0.06, + "grad_norm": 0.5089979767799377, + "learning_rate": 0.00019817953008559734, + "loss": 0.8277, + "step": 535 + }, + { + "epoch": 0.06, + "grad_norm": 0.5426749587059021, + "learning_rate": 0.00019814544966163708, + "loss": 1.012, + "step": 540 + }, + { + "epoch": 0.06, + "grad_norm": 0.5283740758895874, + "learning_rate": 0.0001981110561700796, + "loss": 0.8224, + "step": 545 + }, + { + "epoch": 0.06, + "grad_norm": 0.4337727725505829, + "learning_rate": 0.00019807634972063428, + "loss": 0.858, + "step": 550 + }, + { + "epoch": 0.06, + "grad_norm": 0.4819512963294983, + "learning_rate": 0.000198041330424009, + "loss": 0.7776, + "step": 555 + }, + { + "epoch": 0.06, + "grad_norm": 0.550744891166687, + "learning_rate": 0.00019800599839190941, + "loss": 0.8895, + "step": 560 + }, + { + "epoch": 0.06, + "grad_norm": 0.4993669092655182, + "learning_rate": 0.0001979703537370388, + "loss": 0.9043, + "step": 565 + }, + { + "epoch": 0.06, + "grad_norm": 0.618319571018219, + "learning_rate": 0.00019793439657309772, + "loss": 0.8229, + "step": 570 + }, + { + "epoch": 0.07, + "grad_norm": 0.5560276508331299, + "learning_rate": 0.00019789812701478346, + "loss": 0.9208, + "step": 575 + }, + { + "epoch": 0.07, + "grad_norm": 0.5499486327171326, + "learning_rate": 0.00019786154517778987, + "loss": 0.8309, + "step": 580 + }, + { + "epoch": 0.07, + "grad_norm": 0.8018636703491211, + "learning_rate": 0.00019782465117880693, + "loss": 0.9529, + "step": 585 + }, + { + "epoch": 0.07, + "grad_norm": 0.5488550066947937, + "learning_rate": 0.0001977874451355203, + "loss": 0.7879, + "step": 590 + }, + { + "epoch": 0.07, + "grad_norm": 0.5369092226028442, + "learning_rate": 0.00019774992716661106, + "loss": 0.8819, + "step": 595 + }, + { + "epoch": 0.07, + "grad_norm": 0.5121927857398987, + "learning_rate": 0.00019771209739175523, + "loss": 0.8949, + "step": 600 + }, + { + "epoch": 0.07, + "grad_norm": 0.5077289938926697, + "learning_rate": 0.00019767395593162353, + "loss": 0.9174, + "step": 605 + }, + { + "epoch": 0.07, + "grad_norm": 0.6287967562675476, + "learning_rate": 0.00019763550290788085, + "loss": 0.8388, + "step": 610 + }, + { + "epoch": 0.07, + "grad_norm": 0.5971408486366272, + "learning_rate": 0.0001975967384431859, + "loss": 0.8899, + "step": 615 + }, + { + "epoch": 0.07, + "grad_norm": 0.5257498025894165, + "learning_rate": 0.00019755766266119085, + "loss": 1.0072, + "step": 620 + }, + { + "epoch": 0.07, + "grad_norm": 0.5270594954490662, + "learning_rate": 0.00019751827568654089, + "loss": 0.9276, + "step": 625 + }, + { + "epoch": 0.07, + "grad_norm": 0.5067614912986755, + "learning_rate": 0.00019747857764487395, + "loss": 0.8488, + "step": 630 + }, + { + "epoch": 0.07, + "grad_norm": 0.5074208378791809, + "learning_rate": 0.0001974385686628201, + "loss": 0.7905, + "step": 635 + }, + { + "epoch": 0.07, + "grad_norm": 0.5764243602752686, + "learning_rate": 0.00019739824886800134, + "loss": 0.8907, + "step": 640 + }, + { + "epoch": 0.07, + "grad_norm": 0.6378028392791748, + "learning_rate": 0.00019735761838903106, + "loss": 0.9187, + "step": 645 + }, + { + "epoch": 0.07, + "grad_norm": 0.8019906282424927, + "learning_rate": 0.00019731667735551375, + "loss": 0.9371, + "step": 650 + }, + { + "epoch": 0.07, + "grad_norm": 0.5774128437042236, + "learning_rate": 0.00019727542589804444, + "loss": 0.9012, + "step": 655 + }, + { + "epoch": 0.08, + "grad_norm": 0.6190884709358215, + "learning_rate": 0.00019723386414820842, + "loss": 0.941, + "step": 660 + }, + { + "epoch": 0.08, + "grad_norm": 0.526430070400238, + "learning_rate": 0.00019719199223858068, + "loss": 0.77, + "step": 665 + }, + { + "epoch": 0.08, + "grad_norm": 0.4773986041545868, + "learning_rate": 0.00019714981030272567, + "loss": 0.8433, + "step": 670 + }, + { + "epoch": 0.08, + "grad_norm": 0.5461030006408691, + "learning_rate": 0.00019710731847519665, + "loss": 0.8035, + "step": 675 + }, + { + "epoch": 0.08, + "grad_norm": 0.6205869317054749, + "learning_rate": 0.00019706451689153556, + "loss": 0.9234, + "step": 680 + }, + { + "epoch": 0.08, + "grad_norm": 0.5920631885528564, + "learning_rate": 0.00019702140568827222, + "loss": 0.884, + "step": 685 + }, + { + "epoch": 0.08, + "grad_norm": 0.5703709721565247, + "learning_rate": 0.0001969779850029242, + "loss": 0.833, + "step": 690 + }, + { + "epoch": 0.08, + "grad_norm": 0.6153601408004761, + "learning_rate": 0.00019693425497399627, + "loss": 0.9611, + "step": 695 + }, + { + "epoch": 0.08, + "grad_norm": 0.4773414433002472, + "learning_rate": 0.00019689021574097987, + "loss": 0.8674, + "step": 700 + }, + { + "epoch": 0.08, + "grad_norm": 0.5657919645309448, + "learning_rate": 0.00019684586744435283, + "loss": 0.9631, + "step": 705 + }, + { + "epoch": 0.08, + "grad_norm": 0.5606719255447388, + "learning_rate": 0.0001968012102255788, + "loss": 0.8937, + "step": 710 + }, + { + "epoch": 0.08, + "grad_norm": 0.510236382484436, + "learning_rate": 0.00019675624422710682, + "loss": 0.8685, + "step": 715 + }, + { + "epoch": 0.08, + "grad_norm": 0.6055552363395691, + "learning_rate": 0.000196710969592371, + "loss": 0.9136, + "step": 720 + }, + { + "epoch": 0.08, + "grad_norm": 0.52703857421875, + "learning_rate": 0.0001966653864657898, + "loss": 0.8531, + "step": 725 + }, + { + "epoch": 0.08, + "grad_norm": 0.5948337316513062, + "learning_rate": 0.00019661949499276578, + "loss": 0.9152, + "step": 730 + }, + { + "epoch": 0.08, + "grad_norm": 0.5188962817192078, + "learning_rate": 0.00019657329531968512, + "loss": 0.8868, + "step": 735 + }, + { + "epoch": 0.08, + "grad_norm": 0.5457183718681335, + "learning_rate": 0.000196526787593917, + "loss": 0.8716, + "step": 740 + }, + { + "epoch": 0.08, + "grad_norm": 0.5485244393348694, + "learning_rate": 0.0001964799719638134, + "loss": 0.8099, + "step": 745 + }, + { + "epoch": 0.09, + "grad_norm": 0.5232064723968506, + "learning_rate": 0.00019643284857870822, + "loss": 0.8874, + "step": 750 + }, + { + "epoch": 0.09, + "grad_norm": 0.5872951149940491, + "learning_rate": 0.00019638541758891734, + "loss": 0.7815, + "step": 755 + }, + { + "epoch": 0.09, + "grad_norm": 0.5641792416572571, + "learning_rate": 0.0001963376791457376, + "loss": 0.8868, + "step": 760 + }, + { + "epoch": 0.09, + "grad_norm": 1.058152198791504, + "learning_rate": 0.0001962896334014467, + "loss": 0.8355, + "step": 765 + }, + { + "epoch": 0.09, + "grad_norm": 0.5059372782707214, + "learning_rate": 0.00019624128050930252, + "loss": 0.7938, + "step": 770 + }, + { + "epoch": 0.09, + "grad_norm": 0.5510187745094299, + "learning_rate": 0.00019619262062354275, + "loss": 0.8468, + "step": 775 + }, + { + "epoch": 0.09, + "grad_norm": 0.4972304105758667, + "learning_rate": 0.00019614365389938426, + "loss": 0.8065, + "step": 780 + }, + { + "epoch": 0.09, + "grad_norm": 0.7013474106788635, + "learning_rate": 0.00019609438049302273, + "loss": 0.91, + "step": 785 + }, + { + "epoch": 0.09, + "grad_norm": 0.5787790417671204, + "learning_rate": 0.00019604480056163213, + "loss": 0.9695, + "step": 790 + }, + { + "epoch": 0.09, + "grad_norm": 0.45666056871414185, + "learning_rate": 0.00019599491426336413, + "loss": 0.9191, + "step": 795 + }, + { + "epoch": 0.09, + "grad_norm": 0.5539554953575134, + "learning_rate": 0.00019594472175734774, + "loss": 0.9278, + "step": 800 + }, + { + "epoch": 0.09, + "grad_norm": 0.5579492449760437, + "learning_rate": 0.0001958942232036886, + "loss": 0.9074, + "step": 805 + }, + { + "epoch": 0.09, + "grad_norm": 0.48329582810401917, + "learning_rate": 0.00019584341876346874, + "loss": 0.773, + "step": 810 + }, + { + "epoch": 0.09, + "grad_norm": 0.5779743790626526, + "learning_rate": 0.0001957923085987458, + "loss": 0.7915, + "step": 815 + }, + { + "epoch": 0.09, + "grad_norm": 0.5726707577705383, + "learning_rate": 0.0001957408928725527, + "loss": 0.8683, + "step": 820 + }, + { + "epoch": 0.09, + "grad_norm": 0.5140533447265625, + "learning_rate": 0.00019568917174889693, + "loss": 0.8599, + "step": 825 + }, + { + "epoch": 0.09, + "grad_norm": 0.5206133723258972, + "learning_rate": 0.00019563714539276036, + "loss": 0.8629, + "step": 830 + }, + { + "epoch": 0.09, + "grad_norm": 0.6327289342880249, + "learning_rate": 0.0001955848139700983, + "loss": 0.8577, + "step": 835 + }, + { + "epoch": 0.1, + "grad_norm": 0.5038033127784729, + "learning_rate": 0.00019553217764783928, + "loss": 0.8652, + "step": 840 + }, + { + "epoch": 0.1, + "grad_norm": 0.5202915072441101, + "learning_rate": 0.0001954792365938844, + "loss": 0.9021, + "step": 845 + }, + { + "epoch": 0.1, + "grad_norm": 0.5957011580467224, + "learning_rate": 0.00019542599097710676, + "loss": 0.8404, + "step": 850 + }, + { + "epoch": 0.1, + "grad_norm": 0.6331242918968201, + "learning_rate": 0.00019537244096735096, + "loss": 1.0052, + "step": 855 + }, + { + "epoch": 0.1, + "grad_norm": 0.43145543336868286, + "learning_rate": 0.00019531858673543266, + "loss": 0.8814, + "step": 860 + }, + { + "epoch": 0.1, + "grad_norm": 0.5859599709510803, + "learning_rate": 0.0001952644284531378, + "loss": 0.8131, + "step": 865 + }, + { + "epoch": 0.1, + "grad_norm": 0.5870697498321533, + "learning_rate": 0.00019520996629322228, + "loss": 0.8458, + "step": 870 + }, + { + "epoch": 0.1, + "grad_norm": 0.4881855845451355, + "learning_rate": 0.00019515520042941132, + "loss": 0.808, + "step": 875 + }, + { + "epoch": 0.1, + "grad_norm": 0.5687686204910278, + "learning_rate": 0.00019510013103639883, + "loss": 0.8764, + "step": 880 + }, + { + "epoch": 0.1, + "grad_norm": 0.49425217509269714, + "learning_rate": 0.00019504475828984705, + "loss": 0.9531, + "step": 885 + }, + { + "epoch": 0.1, + "grad_norm": 0.6400182247161865, + "learning_rate": 0.00019498908236638572, + "loss": 0.9817, + "step": 890 + }, + { + "epoch": 0.1, + "grad_norm": 0.50630122423172, + "learning_rate": 0.0001949331034436118, + "loss": 0.8061, + "step": 895 + }, + { + "epoch": 0.1, + "grad_norm": 0.6477893590927124, + "learning_rate": 0.00019487682170008866, + "loss": 0.8433, + "step": 900 + }, + { + "epoch": 0.1, + "grad_norm": 0.5080977082252502, + "learning_rate": 0.0001948202373153457, + "loss": 0.753, + "step": 905 + }, + { + "epoch": 0.1, + "grad_norm": 0.6288018226623535, + "learning_rate": 0.00019476335046987763, + "loss": 0.8719, + "step": 910 + }, + { + "epoch": 0.1, + "grad_norm": 0.5954068899154663, + "learning_rate": 0.00019470616134514406, + "loss": 0.9141, + "step": 915 + }, + { + "epoch": 0.1, + "grad_norm": 0.9248217940330505, + "learning_rate": 0.00019464867012356865, + "loss": 0.8477, + "step": 920 + }, + { + "epoch": 0.11, + "grad_norm": 0.5251208543777466, + "learning_rate": 0.00019459087698853883, + "loss": 0.805, + "step": 925 + }, + { + "epoch": 0.11, + "grad_norm": 0.5578038692474365, + "learning_rate": 0.0001945327821244051, + "loss": 0.9431, + "step": 930 + }, + { + "epoch": 0.11, + "grad_norm": 0.5071999430656433, + "learning_rate": 0.0001944743857164803, + "loss": 0.8176, + "step": 935 + }, + { + "epoch": 0.11, + "grad_norm": 0.5080613493919373, + "learning_rate": 0.00019441568795103932, + "loss": 0.9004, + "step": 940 + }, + { + "epoch": 0.11, + "grad_norm": 0.5936622619628906, + "learning_rate": 0.00019435668901531813, + "loss": 0.9215, + "step": 945 + }, + { + "epoch": 0.11, + "grad_norm": 0.5861743092536926, + "learning_rate": 0.00019429738909751353, + "loss": 0.8413, + "step": 950 + }, + { + "epoch": 0.11, + "grad_norm": 0.561229407787323, + "learning_rate": 0.00019423778838678236, + "loss": 0.8808, + "step": 955 + }, + { + "epoch": 0.11, + "grad_norm": 0.6140787601470947, + "learning_rate": 0.00019417788707324095, + "loss": 0.8038, + "step": 960 + }, + { + "epoch": 0.11, + "grad_norm": 0.4852558672428131, + "learning_rate": 0.00019411768534796444, + "loss": 0.7745, + "step": 965 + }, + { + "epoch": 0.11, + "grad_norm": 0.6961116194725037, + "learning_rate": 0.00019405718340298632, + "loss": 0.9561, + "step": 970 + }, + { + "epoch": 0.11, + "grad_norm": 0.5008209943771362, + "learning_rate": 0.00019399638143129767, + "loss": 0.8497, + "step": 975 + }, + { + "epoch": 0.11, + "grad_norm": 0.6141425967216492, + "learning_rate": 0.00019393527962684664, + "loss": 0.8182, + "step": 980 + }, + { + "epoch": 0.11, + "grad_norm": 0.7052502036094666, + "learning_rate": 0.0001938738781845378, + "loss": 0.8945, + "step": 985 + }, + { + "epoch": 0.11, + "grad_norm": 0.5405805110931396, + "learning_rate": 0.00019381217730023146, + "loss": 0.9554, + "step": 990 + }, + { + "epoch": 0.11, + "grad_norm": 0.6514587998390198, + "learning_rate": 0.00019375017717074318, + "loss": 0.811, + "step": 995 + }, + { + "epoch": 0.11, + "grad_norm": 0.5553276538848877, + "learning_rate": 0.000193687877993843, + "loss": 0.8157, + "step": 1000 + }, + { + "epoch": 0.11, + "grad_norm": 0.5811892151832581, + "learning_rate": 0.00019362527996825488, + "loss": 0.8352, + "step": 1005 + }, + { + "epoch": 0.11, + "grad_norm": 0.4772842526435852, + "learning_rate": 0.00019356238329365613, + "loss": 0.8563, + "step": 1010 + }, + { + "epoch": 0.12, + "grad_norm": 0.6040914058685303, + "learning_rate": 0.00019349918817067655, + "loss": 0.8884, + "step": 1015 + }, + { + "epoch": 0.12, + "grad_norm": 0.638569712638855, + "learning_rate": 0.0001934356948008981, + "loss": 0.938, + "step": 1020 + }, + { + "epoch": 0.12, + "grad_norm": 0.47892439365386963, + "learning_rate": 0.00019337190338685397, + "loss": 0.9725, + "step": 1025 + }, + { + "epoch": 0.12, + "grad_norm": 0.5647065043449402, + "learning_rate": 0.0001933078141320282, + "loss": 0.8063, + "step": 1030 + }, + { + "epoch": 0.12, + "grad_norm": 0.4954369068145752, + "learning_rate": 0.0001932434272408547, + "loss": 0.8535, + "step": 1035 + }, + { + "epoch": 0.12, + "grad_norm": 0.655193567276001, + "learning_rate": 0.00019317874291871704, + "loss": 0.876, + "step": 1040 + }, + { + "epoch": 0.12, + "grad_norm": 0.6297211050987244, + "learning_rate": 0.0001931137613719473, + "loss": 0.9351, + "step": 1045 + }, + { + "epoch": 0.12, + "grad_norm": 0.5889206528663635, + "learning_rate": 0.0001930484828078258, + "loss": 0.918, + "step": 1050 + }, + { + "epoch": 0.12, + "grad_norm": 0.5167111158370972, + "learning_rate": 0.00019298290743458027, + "loss": 1.1183, + "step": 1055 + }, + { + "epoch": 0.12, + "grad_norm": 0.44684621691703796, + "learning_rate": 0.0001929170354613852, + "loss": 0.8091, + "step": 1060 + }, + { + "epoch": 0.12, + "grad_norm": 0.5272998213768005, + "learning_rate": 0.00019285086709836116, + "loss": 0.8537, + "step": 1065 + }, + { + "epoch": 0.12, + "grad_norm": 0.5813104510307312, + "learning_rate": 0.0001927844025565742, + "loss": 0.8221, + "step": 1070 + }, + { + "epoch": 0.12, + "grad_norm": 0.507824718952179, + "learning_rate": 0.00019271764204803512, + "loss": 0.9199, + "step": 1075 + }, + { + "epoch": 0.12, + "grad_norm": 0.5733903050422668, + "learning_rate": 0.00019265058578569878, + "loss": 0.7459, + "step": 1080 + }, + { + "epoch": 0.12, + "grad_norm": 0.6304961442947388, + "learning_rate": 0.00019258323398346346, + "loss": 0.8584, + "step": 1085 + }, + { + "epoch": 0.12, + "grad_norm": 0.5296971797943115, + "learning_rate": 0.00019251558685617014, + "loss": 0.8552, + "step": 1090 + }, + { + "epoch": 0.12, + "grad_norm": 0.5370798707008362, + "learning_rate": 0.00019244764461960191, + "loss": 0.9183, + "step": 1095 + }, + { + "epoch": 0.13, + "grad_norm": 0.5226066708564758, + "learning_rate": 0.00019237940749048318, + "loss": 0.883, + "step": 1100 + }, + { + "epoch": 0.13, + "grad_norm": 0.6162168383598328, + "learning_rate": 0.00019231087568647893, + "loss": 0.924, + "step": 1105 + }, + { + "epoch": 0.13, + "grad_norm": 0.6270351409912109, + "learning_rate": 0.00019224204942619417, + "loss": 0.9861, + "step": 1110 + }, + { + "epoch": 0.13, + "grad_norm": 0.6001037955284119, + "learning_rate": 0.00019217292892917325, + "loss": 0.8134, + "step": 1115 + }, + { + "epoch": 0.13, + "grad_norm": 0.5420514345169067, + "learning_rate": 0.00019210351441589896, + "loss": 0.7618, + "step": 1120 + }, + { + "epoch": 0.13, + "grad_norm": 0.6640905141830444, + "learning_rate": 0.000192033806107792, + "loss": 0.9157, + "step": 1125 + }, + { + "epoch": 0.13, + "grad_norm": 0.5504305362701416, + "learning_rate": 0.00019196380422721026, + "loss": 0.8995, + "step": 1130 + }, + { + "epoch": 0.13, + "grad_norm": 0.6820715069770813, + "learning_rate": 0.00019189350899744806, + "loss": 0.9161, + "step": 1135 + }, + { + "epoch": 0.13, + "grad_norm": 0.5563843250274658, + "learning_rate": 0.00019182292064273544, + "loss": 0.7748, + "step": 1140 + }, + { + "epoch": 0.13, + "grad_norm": 0.6278269290924072, + "learning_rate": 0.00019175203938823744, + "loss": 0.7787, + "step": 1145 + }, + { + "epoch": 0.13, + "grad_norm": 0.5549231171607971, + "learning_rate": 0.00019168086546005346, + "loss": 0.9038, + "step": 1150 + }, + { + "epoch": 0.13, + "grad_norm": 0.5321454405784607, + "learning_rate": 0.0001916093990852164, + "loss": 0.7601, + "step": 1155 + }, + { + "epoch": 0.13, + "grad_norm": 0.5376914143562317, + "learning_rate": 0.0001915376404916921, + "loss": 0.8395, + "step": 1160 + }, + { + "epoch": 0.13, + "grad_norm": 0.5487528443336487, + "learning_rate": 0.00019146558990837853, + "loss": 0.7653, + "step": 1165 + }, + { + "epoch": 0.13, + "grad_norm": 0.54075688123703, + "learning_rate": 0.00019139324756510496, + "loss": 0.8811, + "step": 1170 + }, + { + "epoch": 0.13, + "grad_norm": 0.5763316750526428, + "learning_rate": 0.00019132061369263136, + "loss": 0.9043, + "step": 1175 + }, + { + "epoch": 0.13, + "grad_norm": 0.5854605436325073, + "learning_rate": 0.00019124768852264774, + "loss": 0.9339, + "step": 1180 + }, + { + "epoch": 0.13, + "grad_norm": 0.533819854259491, + "learning_rate": 0.00019117447228777316, + "loss": 0.8174, + "step": 1185 + }, + { + "epoch": 0.14, + "grad_norm": 0.48118382692337036, + "learning_rate": 0.00019110096522155523, + "loss": 0.8992, + "step": 1190 + }, + { + "epoch": 0.14, + "grad_norm": 0.5746036171913147, + "learning_rate": 0.00019102716755846913, + "loss": 0.7809, + "step": 1195 + }, + { + "epoch": 0.14, + "grad_norm": 0.5010830163955688, + "learning_rate": 0.00019095307953391718, + "loss": 0.834, + "step": 1200 + }, + { + "epoch": 0.14, + "grad_norm": 0.5111698508262634, + "learning_rate": 0.00019087870138422775, + "loss": 0.7903, + "step": 1205 + }, + { + "epoch": 0.14, + "grad_norm": 0.5492734909057617, + "learning_rate": 0.00019080403334665474, + "loss": 0.8529, + "step": 1210 + }, + { + "epoch": 0.14, + "grad_norm": 0.5762624144554138, + "learning_rate": 0.00019072907565937674, + "loss": 0.8261, + "step": 1215 + }, + { + "epoch": 0.14, + "grad_norm": 0.7704640030860901, + "learning_rate": 0.00019065382856149623, + "loss": 0.8578, + "step": 1220 + }, + { + "epoch": 0.14, + "grad_norm": 0.5472645163536072, + "learning_rate": 0.0001905782922930389, + "loss": 0.7221, + "step": 1225 + }, + { + "epoch": 0.14, + "grad_norm": 0.4950571656227112, + "learning_rate": 0.0001905024670949528, + "loss": 0.8874, + "step": 1230 + }, + { + "epoch": 0.14, + "grad_norm": 0.6059962511062622, + "learning_rate": 0.00019042635320910768, + "loss": 0.8667, + "step": 1235 + }, + { + "epoch": 0.14, + "grad_norm": 0.5352611541748047, + "learning_rate": 0.00019034995087829416, + "loss": 0.8767, + "step": 1240 + }, + { + "epoch": 0.14, + "grad_norm": 0.5850544571876526, + "learning_rate": 0.00019027326034622288, + "loss": 0.8335, + "step": 1245 + }, + { + "epoch": 0.14, + "grad_norm": 0.6183121204376221, + "learning_rate": 0.00019019628185752382, + "loss": 0.866, + "step": 1250 + }, + { + "epoch": 0.14, + "grad_norm": 0.5667844414710999, + "learning_rate": 0.00019011901565774554, + "loss": 0.8816, + "step": 1255 + }, + { + "epoch": 0.14, + "grad_norm": 0.6229298114776611, + "learning_rate": 0.0001900414619933543, + "loss": 0.7841, + "step": 1260 + }, + { + "epoch": 0.14, + "grad_norm": 0.5172335505485535, + "learning_rate": 0.00018996362111173336, + "loss": 0.841, + "step": 1265 + }, + { + "epoch": 0.14, + "grad_norm": 0.5118042230606079, + "learning_rate": 0.00018988549326118208, + "loss": 0.8585, + "step": 1270 + }, + { + "epoch": 0.14, + "grad_norm": 0.5746064782142639, + "learning_rate": 0.0001898070786909153, + "loss": 0.8849, + "step": 1275 + }, + { + "epoch": 0.15, + "grad_norm": 0.5497964024543762, + "learning_rate": 0.00018972837765106245, + "loss": 0.8754, + "step": 1280 + }, + { + "epoch": 0.15, + "grad_norm": 0.4933513402938843, + "learning_rate": 0.0001896493903926666, + "loss": 0.8773, + "step": 1285 + }, + { + "epoch": 0.15, + "grad_norm": 0.5292490124702454, + "learning_rate": 0.00018957011716768402, + "loss": 0.825, + "step": 1290 + }, + { + "epoch": 0.15, + "grad_norm": 0.5365048050880432, + "learning_rate": 0.00018949055822898298, + "loss": 0.8519, + "step": 1295 + }, + { + "epoch": 0.15, + "grad_norm": 0.5695987939834595, + "learning_rate": 0.00018941071383034327, + "loss": 0.9198, + "step": 1300 + }, + { + "epoch": 0.15, + "grad_norm": 0.5951317548751831, + "learning_rate": 0.00018933058422645514, + "loss": 0.8947, + "step": 1305 + }, + { + "epoch": 0.15, + "grad_norm": 0.5998417139053345, + "learning_rate": 0.00018925016967291872, + "loss": 0.9372, + "step": 1310 + }, + { + "epoch": 0.15, + "grad_norm": 0.5713790655136108, + "learning_rate": 0.00018916947042624293, + "loss": 0.9231, + "step": 1315 + }, + { + "epoch": 0.15, + "grad_norm": 0.5592827796936035, + "learning_rate": 0.00018908848674384493, + "loss": 0.8792, + "step": 1320 + }, + { + "epoch": 0.15, + "grad_norm": 0.6018208265304565, + "learning_rate": 0.00018900721888404917, + "loss": 0.8533, + "step": 1325 + }, + { + "epoch": 0.15, + "grad_norm": 0.5641770958900452, + "learning_rate": 0.0001889256671060865, + "loss": 0.8096, + "step": 1330 + }, + { + "epoch": 0.15, + "grad_norm": 0.49681103229522705, + "learning_rate": 0.00018884383167009348, + "loss": 0.7558, + "step": 1335 + }, + { + "epoch": 0.15, + "grad_norm": 0.6027383208274841, + "learning_rate": 0.0001887617128371115, + "loss": 0.7801, + "step": 1340 + }, + { + "epoch": 0.15, + "grad_norm": 0.5480598211288452, + "learning_rate": 0.00018867931086908598, + "loss": 0.9335, + "step": 1345 + }, + { + "epoch": 0.15, + "grad_norm": 0.5374003648757935, + "learning_rate": 0.00018859662602886538, + "loss": 0.8229, + "step": 1350 + }, + { + "epoch": 0.15, + "grad_norm": 0.5359901785850525, + "learning_rate": 0.00018851365858020054, + "loss": 0.888, + "step": 1355 + }, + { + "epoch": 0.15, + "grad_norm": 0.6158584356307983, + "learning_rate": 0.0001884304087877438, + "loss": 0.8434, + "step": 1360 + }, + { + "epoch": 0.16, + "grad_norm": 0.6495197415351868, + "learning_rate": 0.00018834687691704805, + "loss": 0.902, + "step": 1365 + }, + { + "epoch": 0.16, + "grad_norm": 0.5516675710678101, + "learning_rate": 0.000188263063234566, + "loss": 0.8055, + "step": 1370 + }, + { + "epoch": 0.16, + "grad_norm": 0.5624775886535645, + "learning_rate": 0.00018817896800764938, + "loss": 0.8212, + "step": 1375 + }, + { + "epoch": 0.16, + "grad_norm": 0.47556230425834656, + "learning_rate": 0.00018809459150454788, + "loss": 0.7061, + "step": 1380 + }, + { + "epoch": 0.16, + "grad_norm": 0.5220008492469788, + "learning_rate": 0.00018800993399440845, + "loss": 0.7378, + "step": 1385 + }, + { + "epoch": 0.16, + "grad_norm": 0.5644752979278564, + "learning_rate": 0.00018792499574727441, + "loss": 0.8245, + "step": 1390 + }, + { + "epoch": 0.16, + "grad_norm": 0.6211121678352356, + "learning_rate": 0.0001878397770340846, + "loss": 0.9034, + "step": 1395 + }, + { + "epoch": 0.16, + "grad_norm": 0.5424923896789551, + "learning_rate": 0.00018775427812667248, + "loss": 0.9151, + "step": 1400 + }, + { + "epoch": 0.16, + "grad_norm": 0.5865684151649475, + "learning_rate": 0.00018766849929776532, + "loss": 0.8795, + "step": 1405 + }, + { + "epoch": 0.16, + "grad_norm": 0.6018354892730713, + "learning_rate": 0.0001875824408209832, + "loss": 0.6693, + "step": 1410 + }, + { + "epoch": 0.16, + "grad_norm": 0.5678966045379639, + "learning_rate": 0.0001874961029708383, + "loss": 0.8994, + "step": 1415 + }, + { + "epoch": 0.16, + "grad_norm": 0.4987218677997589, + "learning_rate": 0.000187409486022734, + "loss": 0.9152, + "step": 1420 + }, + { + "epoch": 0.16, + "grad_norm": 0.5679898858070374, + "learning_rate": 0.00018732259025296388, + "loss": 0.7408, + "step": 1425 + }, + { + "epoch": 0.16, + "grad_norm": 0.46406781673431396, + "learning_rate": 0.0001872354159387109, + "loss": 0.7725, + "step": 1430 + }, + { + "epoch": 0.16, + "grad_norm": 0.5870163440704346, + "learning_rate": 0.00018714796335804663, + "loss": 0.9934, + "step": 1435 + }, + { + "epoch": 0.16, + "grad_norm": 0.5287370681762695, + "learning_rate": 0.00018706023278993014, + "loss": 0.8331, + "step": 1440 + }, + { + "epoch": 0.16, + "grad_norm": 0.6170235872268677, + "learning_rate": 0.00018697222451420734, + "loss": 0.8614, + "step": 1445 + }, + { + "epoch": 0.16, + "grad_norm": 0.5744518041610718, + "learning_rate": 0.00018688393881160993, + "loss": 0.8481, + "step": 1450 + }, + { + "epoch": 0.17, + "grad_norm": 0.6607945561408997, + "learning_rate": 0.0001867953759637545, + "loss": 0.7717, + "step": 1455 + }, + { + "epoch": 0.17, + "grad_norm": 0.6847087740898132, + "learning_rate": 0.00018670653625314185, + "loss": 0.9084, + "step": 1460 + }, + { + "epoch": 0.17, + "grad_norm": 0.5219533443450928, + "learning_rate": 0.00018661741996315573, + "loss": 0.7862, + "step": 1465 + }, + { + "epoch": 0.17, + "grad_norm": 0.541492223739624, + "learning_rate": 0.00018652802737806226, + "loss": 0.7658, + "step": 1470 + }, + { + "epoch": 0.17, + "grad_norm": 0.5432409048080444, + "learning_rate": 0.00018643835878300887, + "loss": 0.7273, + "step": 1475 + }, + { + "epoch": 0.17, + "grad_norm": 0.5351329445838928, + "learning_rate": 0.00018634841446402343, + "loss": 0.7721, + "step": 1480 + }, + { + "epoch": 0.17, + "grad_norm": 0.5327515602111816, + "learning_rate": 0.0001862581947080132, + "loss": 0.9053, + "step": 1485 + }, + { + "epoch": 0.17, + "grad_norm": 0.5060054063796997, + "learning_rate": 0.00018616769980276426, + "loss": 0.8919, + "step": 1490 + }, + { + "epoch": 0.17, + "grad_norm": 0.6146079897880554, + "learning_rate": 0.0001860769300369402, + "loss": 0.8494, + "step": 1495 + }, + { + "epoch": 0.17, + "grad_norm": 0.5177258849143982, + "learning_rate": 0.0001859858857000814, + "loss": 0.8473, + "step": 1500 + }, + { + "epoch": 0.17, + "grad_norm": 0.5560600757598877, + "learning_rate": 0.0001858945670826041, + "loss": 0.8193, + "step": 1505 + }, + { + "epoch": 0.17, + "grad_norm": 0.6341832876205444, + "learning_rate": 0.00018580297447579947, + "loss": 0.8413, + "step": 1510 + }, + { + "epoch": 0.17, + "grad_norm": 0.5502546429634094, + "learning_rate": 0.0001857111081718326, + "loss": 0.8251, + "step": 1515 + }, + { + "epoch": 0.17, + "grad_norm": 0.6432173252105713, + "learning_rate": 0.00018561896846374168, + "loss": 0.879, + "step": 1520 + }, + { + "epoch": 0.17, + "grad_norm": 0.5981735587120056, + "learning_rate": 0.00018552655564543695, + "loss": 0.8699, + "step": 1525 + }, + { + "epoch": 0.17, + "grad_norm": 0.5953419804573059, + "learning_rate": 0.00018543387001169993, + "loss": 0.8163, + "step": 1530 + }, + { + "epoch": 0.17, + "grad_norm": 0.5923714637756348, + "learning_rate": 0.0001853409118581823, + "loss": 0.9091, + "step": 1535 + }, + { + "epoch": 0.18, + "grad_norm": 0.9148262739181519, + "learning_rate": 0.00018524768148140504, + "loss": 0.7836, + "step": 1540 + }, + { + "epoch": 0.18, + "grad_norm": 0.5257108807563782, + "learning_rate": 0.00018515417917875748, + "loss": 0.7875, + "step": 1545 + }, + { + "epoch": 0.18, + "grad_norm": 0.6084693074226379, + "learning_rate": 0.00018506040524849637, + "loss": 0.8409, + "step": 1550 + }, + { + "epoch": 0.18, + "grad_norm": 0.49939265847206116, + "learning_rate": 0.00018496635998974489, + "loss": 0.8631, + "step": 1555 + }, + { + "epoch": 0.18, + "grad_norm": 0.5126424431800842, + "learning_rate": 0.00018487204370249167, + "loss": 0.8033, + "step": 1560 + }, + { + "epoch": 0.18, + "grad_norm": 0.4886222183704376, + "learning_rate": 0.00018477745668758996, + "loss": 0.8473, + "step": 1565 + }, + { + "epoch": 0.18, + "grad_norm": 0.5652774572372437, + "learning_rate": 0.00018468259924675655, + "loss": 0.9041, + "step": 1570 + }, + { + "epoch": 0.18, + "grad_norm": 0.5118491053581238, + "learning_rate": 0.00018458747168257085, + "loss": 0.8749, + "step": 1575 + }, + { + "epoch": 0.18, + "grad_norm": 0.5526809692382812, + "learning_rate": 0.00018449207429847384, + "loss": 0.915, + "step": 1580 + }, + { + "epoch": 0.18, + "grad_norm": 0.6415011882781982, + "learning_rate": 0.0001843964073987673, + "loss": 0.878, + "step": 1585 + }, + { + "epoch": 0.18, + "grad_norm": 0.5006015300750732, + "learning_rate": 0.00018430047128861266, + "loss": 0.7848, + "step": 1590 + }, + { + "epoch": 0.18, + "grad_norm": 0.5490383505821228, + "learning_rate": 0.0001842042662740301, + "loss": 0.7836, + "step": 1595 + }, + { + "epoch": 0.18, + "grad_norm": 0.5240172743797302, + "learning_rate": 0.00018410779266189752, + "loss": 0.7642, + "step": 1600 + }, + { + "epoch": 0.18, + "grad_norm": 0.6275254487991333, + "learning_rate": 0.00018401105075994967, + "loss": 0.8773, + "step": 1605 + }, + { + "epoch": 0.18, + "grad_norm": 0.5745741724967957, + "learning_rate": 0.00018391404087677704, + "loss": 0.904, + "step": 1610 + }, + { + "epoch": 0.18, + "grad_norm": 0.6686891913414001, + "learning_rate": 0.00018381676332182497, + "loss": 0.8057, + "step": 1615 + }, + { + "epoch": 0.18, + "grad_norm": 0.501735508441925, + "learning_rate": 0.00018371921840539264, + "loss": 0.8361, + "step": 1620 + }, + { + "epoch": 0.18, + "grad_norm": 0.6197385191917419, + "learning_rate": 0.000183621406438632, + "loss": 0.9193, + "step": 1625 + }, + { + "epoch": 0.19, + "grad_norm": 0.5199822783470154, + "learning_rate": 0.00018352332773354695, + "loss": 0.8461, + "step": 1630 + }, + { + "epoch": 0.19, + "grad_norm": 0.48266005516052246, + "learning_rate": 0.00018342498260299212, + "loss": 0.7336, + "step": 1635 + }, + { + "epoch": 0.19, + "grad_norm": 0.7070842981338501, + "learning_rate": 0.0001833263713606721, + "loss": 0.9886, + "step": 1640 + }, + { + "epoch": 0.19, + "grad_norm": 0.5649372339248657, + "learning_rate": 0.00018322749432114028, + "loss": 0.8146, + "step": 1645 + }, + { + "epoch": 0.19, + "grad_norm": 0.5961683988571167, + "learning_rate": 0.00018312835179979788, + "loss": 0.7934, + "step": 1650 + }, + { + "epoch": 0.19, + "grad_norm": 0.5285751819610596, + "learning_rate": 0.00018302894411289304, + "loss": 0.9225, + "step": 1655 + }, + { + "epoch": 0.19, + "grad_norm": 0.5439310669898987, + "learning_rate": 0.0001829292715775196, + "loss": 0.9477, + "step": 1660 + }, + { + "epoch": 0.19, + "grad_norm": 0.5521465539932251, + "learning_rate": 0.00018282933451161643, + "loss": 0.8531, + "step": 1665 + }, + { + "epoch": 0.19, + "grad_norm": 0.6291643381118774, + "learning_rate": 0.00018272913323396598, + "loss": 0.9441, + "step": 1670 + }, + { + "epoch": 0.19, + "grad_norm": 0.5861973762512207, + "learning_rate": 0.00018262866806419362, + "loss": 0.7543, + "step": 1675 + }, + { + "epoch": 0.19, + "grad_norm": 0.6347075700759888, + "learning_rate": 0.0001825279393227665, + "loss": 0.8737, + "step": 1680 + }, + { + "epoch": 0.19, + "grad_norm": 0.6788772344589233, + "learning_rate": 0.00018242694733099245, + "loss": 0.8511, + "step": 1685 + }, + { + "epoch": 0.19, + "grad_norm": 0.5907087326049805, + "learning_rate": 0.0001823256924110191, + "loss": 0.8505, + "step": 1690 + }, + { + "epoch": 0.19, + "grad_norm": 0.6607640385627747, + "learning_rate": 0.0001822241748858327, + "loss": 0.856, + "step": 1695 + }, + { + "epoch": 0.19, + "grad_norm": 0.6193135976791382, + "learning_rate": 0.0001821223950792572, + "loss": 0.8215, + "step": 1700 + }, + { + "epoch": 0.19, + "grad_norm": 0.6171255707740784, + "learning_rate": 0.00018202035331595323, + "loss": 0.7666, + "step": 1705 + }, + { + "epoch": 0.19, + "grad_norm": 0.49476027488708496, + "learning_rate": 0.00018191804992141695, + "loss": 0.8192, + "step": 1710 + }, + { + "epoch": 0.19, + "grad_norm": 0.6600732803344727, + "learning_rate": 0.0001818154852219791, + "loss": 0.8839, + "step": 1715 + }, + { + "epoch": 0.2, + "grad_norm": 0.6551568508148193, + "learning_rate": 0.00018171265954480394, + "loss": 0.7813, + "step": 1720 + }, + { + "epoch": 0.2, + "grad_norm": 0.539866030216217, + "learning_rate": 0.00018160957321788828, + "loss": 0.8957, + "step": 1725 + }, + { + "epoch": 0.2, + "grad_norm": 0.5746687054634094, + "learning_rate": 0.00018150622657006016, + "loss": 0.9047, + "step": 1730 + }, + { + "epoch": 0.2, + "grad_norm": 0.642020583152771, + "learning_rate": 0.0001814026199309783, + "loss": 0.7867, + "step": 1735 + }, + { + "epoch": 0.2, + "grad_norm": 0.47608810663223267, + "learning_rate": 0.00018129875363113044, + "loss": 0.9095, + "step": 1740 + }, + { + "epoch": 0.2, + "grad_norm": 0.5929521918296814, + "learning_rate": 0.0001811946280018328, + "loss": 0.8587, + "step": 1745 + }, + { + "epoch": 0.2, + "grad_norm": 0.6232489347457886, + "learning_rate": 0.00018109024337522876, + "loss": 0.8188, + "step": 1750 + }, + { + "epoch": 0.2, + "grad_norm": 0.47936391830444336, + "learning_rate": 0.00018098560008428778, + "loss": 0.8164, + "step": 1755 + }, + { + "epoch": 0.2, + "grad_norm": 0.6068043112754822, + "learning_rate": 0.00018088069846280456, + "loss": 0.8273, + "step": 1760 + }, + { + "epoch": 0.2, + "grad_norm": 0.5103864073753357, + "learning_rate": 0.00018077553884539773, + "loss": 0.8453, + "step": 1765 + }, + { + "epoch": 0.2, + "grad_norm": 0.5663210153579712, + "learning_rate": 0.0001806701215675089, + "loss": 0.8745, + "step": 1770 + }, + { + "epoch": 0.2, + "grad_norm": 0.6410555243492126, + "learning_rate": 0.00018056444696540162, + "loss": 0.8698, + "step": 1775 + }, + { + "epoch": 0.2, + "grad_norm": 0.7071842551231384, + "learning_rate": 0.00018045851537616016, + "loss": 0.7438, + "step": 1780 + }, + { + "epoch": 0.2, + "grad_norm": 0.551302969455719, + "learning_rate": 0.0001803523271376887, + "loss": 0.8576, + "step": 1785 + }, + { + "epoch": 0.2, + "grad_norm": 0.5493254065513611, + "learning_rate": 0.0001802458825887099, + "loss": 0.7339, + "step": 1790 + }, + { + "epoch": 0.2, + "grad_norm": 0.6084312200546265, + "learning_rate": 0.00018013918206876415, + "loss": 0.7465, + "step": 1795 + }, + { + "epoch": 0.2, + "grad_norm": 0.52370685338974, + "learning_rate": 0.00018003222591820824, + "loss": 0.8574, + "step": 1800 + }, + { + "epoch": 0.21, + "grad_norm": 0.46578991413116455, + "learning_rate": 0.00017992501447821452, + "loss": 0.7609, + "step": 1805 + }, + { + "epoch": 0.21, + "grad_norm": 0.6183673143386841, + "learning_rate": 0.00017981754809076952, + "loss": 0.8273, + "step": 1810 + }, + { + "epoch": 0.21, + "grad_norm": 0.6610841751098633, + "learning_rate": 0.0001797098270986731, + "loss": 0.9363, + "step": 1815 + }, + { + "epoch": 0.21, + "grad_norm": 0.5758525729179382, + "learning_rate": 0.00017960185184553716, + "loss": 0.7438, + "step": 1820 + }, + { + "epoch": 0.21, + "grad_norm": 0.5575788021087646, + "learning_rate": 0.00017949362267578485, + "loss": 0.8472, + "step": 1825 + }, + { + "epoch": 0.21, + "grad_norm": 0.5445650815963745, + "learning_rate": 0.0001793851399346491, + "loss": 0.895, + "step": 1830 + }, + { + "epoch": 0.21, + "grad_norm": 0.509607195854187, + "learning_rate": 0.0001792764039681717, + "loss": 0.8065, + "step": 1835 + }, + { + "epoch": 0.21, + "grad_norm": 0.5268949270248413, + "learning_rate": 0.00017916741512320227, + "loss": 0.7979, + "step": 1840 + }, + { + "epoch": 0.21, + "grad_norm": 0.5786017775535583, + "learning_rate": 0.00017905817374739704, + "loss": 0.8833, + "step": 1845 + }, + { + "epoch": 0.21, + "grad_norm": 0.5071285963058472, + "learning_rate": 0.0001789486801892177, + "loss": 0.7521, + "step": 1850 + }, + { + "epoch": 0.21, + "grad_norm": 0.620481550693512, + "learning_rate": 0.0001788389347979305, + "loss": 0.8354, + "step": 1855 + }, + { + "epoch": 0.21, + "grad_norm": 0.5200830698013306, + "learning_rate": 0.00017872893792360484, + "loss": 0.9292, + "step": 1860 + }, + { + "epoch": 0.21, + "grad_norm": 0.594330370426178, + "learning_rate": 0.00017861868991711247, + "loss": 0.9231, + "step": 1865 + }, + { + "epoch": 0.21, + "grad_norm": 0.542568564414978, + "learning_rate": 0.00017850819113012601, + "loss": 0.6837, + "step": 1870 + }, + { + "epoch": 0.21, + "grad_norm": 0.6251922845840454, + "learning_rate": 0.0001783974419151182, + "loss": 0.7409, + "step": 1875 + }, + { + "epoch": 0.21, + "grad_norm": 0.5484223365783691, + "learning_rate": 0.0001782864426253606, + "loss": 0.824, + "step": 1880 + }, + { + "epoch": 0.21, + "grad_norm": 0.7240644693374634, + "learning_rate": 0.00017817519361492228, + "loss": 0.7806, + "step": 1885 + }, + { + "epoch": 0.21, + "grad_norm": 0.5933576822280884, + "learning_rate": 0.00017806369523866913, + "loss": 0.8518, + "step": 1890 + }, + { + "epoch": 0.22, + "grad_norm": 0.6882241368293762, + "learning_rate": 0.00017795194785226229, + "loss": 0.9101, + "step": 1895 + }, + { + "epoch": 0.22, + "grad_norm": 0.5205492973327637, + "learning_rate": 0.00017783995181215728, + "loss": 0.8973, + "step": 1900 + }, + { + "epoch": 0.22, + "grad_norm": 0.6640832424163818, + "learning_rate": 0.00017772770747560273, + "loss": 0.9665, + "step": 1905 + }, + { + "epoch": 0.22, + "grad_norm": 0.6470832228660583, + "learning_rate": 0.00017761521520063945, + "loss": 0.8718, + "step": 1910 + }, + { + "epoch": 0.22, + "grad_norm": 0.704579770565033, + "learning_rate": 0.0001775024753460989, + "loss": 0.9091, + "step": 1915 + }, + { + "epoch": 0.22, + "grad_norm": 0.7193452715873718, + "learning_rate": 0.00017738948827160242, + "loss": 0.7795, + "step": 1920 + }, + { + "epoch": 0.22, + "grad_norm": 0.5544724464416504, + "learning_rate": 0.0001772762543375599, + "loss": 0.8755, + "step": 1925 + }, + { + "epoch": 0.22, + "grad_norm": 0.5948014855384827, + "learning_rate": 0.00017716277390516876, + "loss": 0.8527, + "step": 1930 + }, + { + "epoch": 0.22, + "grad_norm": 0.7045935392379761, + "learning_rate": 0.00017704904733641255, + "loss": 0.9894, + "step": 1935 + }, + { + "epoch": 0.22, + "grad_norm": 0.5722846388816833, + "learning_rate": 0.00017693507499406, + "loss": 0.8367, + "step": 1940 + }, + { + "epoch": 0.22, + "grad_norm": 0.5934826135635376, + "learning_rate": 0.000176820857241664, + "loss": 0.96, + "step": 1945 + }, + { + "epoch": 0.22, + "grad_norm": 0.6156487464904785, + "learning_rate": 0.00017670639444355998, + "loss": 0.8629, + "step": 1950 + }, + { + "epoch": 0.22, + "grad_norm": 0.6569040417671204, + "learning_rate": 0.0001765916869648652, + "loss": 0.8539, + "step": 1955 + }, + { + "epoch": 0.22, + "grad_norm": 0.5664108395576477, + "learning_rate": 0.0001764767351714774, + "loss": 0.8514, + "step": 1960 + }, + { + "epoch": 0.22, + "grad_norm": 0.681502640247345, + "learning_rate": 0.0001763615394300735, + "loss": 0.9149, + "step": 1965 + }, + { + "epoch": 0.22, + "grad_norm": 0.5791369080543518, + "learning_rate": 0.00017624610010810878, + "loss": 0.8566, + "step": 1970 + }, + { + "epoch": 0.22, + "grad_norm": 0.5944411158561707, + "learning_rate": 0.00017613041757381538, + "loss": 0.8279, + "step": 1975 + }, + { + "epoch": 0.23, + "grad_norm": 0.5867791175842285, + "learning_rate": 0.00017601449219620125, + "loss": 0.8742, + "step": 1980 + }, + { + "epoch": 0.23, + "grad_norm": 0.5641552805900574, + "learning_rate": 0.00017589832434504902, + "loss": 0.7705, + "step": 1985 + }, + { + "epoch": 0.23, + "grad_norm": 0.5856841206550598, + "learning_rate": 0.0001757819143909147, + "loss": 0.8156, + "step": 1990 + }, + { + "epoch": 0.23, + "grad_norm": 0.5216368436813354, + "learning_rate": 0.00017566526270512665, + "loss": 0.965, + "step": 1995 + }, + { + "epoch": 0.23, + "grad_norm": 0.6632785797119141, + "learning_rate": 0.0001755483696597842, + "loss": 0.88, + "step": 2000 + }, + { + "epoch": 0.23, + "grad_norm": 0.5716277360916138, + "learning_rate": 0.0001754312356277567, + "loss": 0.8717, + "step": 2005 + }, + { + "epoch": 0.23, + "grad_norm": 0.5452391505241394, + "learning_rate": 0.0001753138609826822, + "loss": 0.7268, + "step": 2010 + }, + { + "epoch": 0.23, + "grad_norm": 0.5985669493675232, + "learning_rate": 0.00017519624609896615, + "loss": 0.8528, + "step": 2015 + }, + { + "epoch": 0.23, + "grad_norm": 0.605197548866272, + "learning_rate": 0.0001750783913517804, + "loss": 0.9327, + "step": 2020 + }, + { + "epoch": 0.23, + "grad_norm": 0.7269711494445801, + "learning_rate": 0.0001749602971170619, + "loss": 0.9022, + "step": 2025 + }, + { + "epoch": 0.23, + "grad_norm": 0.6030486226081848, + "learning_rate": 0.00017484196377151161, + "loss": 0.851, + "step": 2030 + }, + { + "epoch": 0.23, + "grad_norm": 0.5089486837387085, + "learning_rate": 0.00017472339169259307, + "loss": 0.7455, + "step": 2035 + }, + { + "epoch": 0.23, + "grad_norm": 0.5635782480239868, + "learning_rate": 0.00017460458125853143, + "loss": 1.0043, + "step": 2040 + }, + { + "epoch": 0.23, + "grad_norm": 0.5171327590942383, + "learning_rate": 0.0001744855328483122, + "loss": 0.8556, + "step": 2045 + }, + { + "epoch": 0.23, + "grad_norm": 0.5177988409996033, + "learning_rate": 0.00017436624684167984, + "loss": 0.8203, + "step": 2050 + }, + { + "epoch": 0.23, + "grad_norm": 0.702873170375824, + "learning_rate": 0.00017424672361913686, + "loss": 0.8809, + "step": 2055 + }, + { + "epoch": 0.23, + "grad_norm": 0.5576356649398804, + "learning_rate": 0.00017412696356194235, + "loss": 0.8858, + "step": 2060 + }, + { + "epoch": 0.23, + "grad_norm": 0.6151427030563354, + "learning_rate": 0.0001740069670521109, + "loss": 0.9176, + "step": 2065 + }, + { + "epoch": 0.24, + "grad_norm": 0.5806076526641846, + "learning_rate": 0.00017388673447241138, + "loss": 1.0125, + "step": 2070 + }, + { + "epoch": 0.24, + "grad_norm": 0.6101603507995605, + "learning_rate": 0.00017376626620636557, + "loss": 0.8164, + "step": 2075 + }, + { + "epoch": 0.24, + "grad_norm": 0.6350337266921997, + "learning_rate": 0.00017364556263824719, + "loss": 0.86, + "step": 2080 + }, + { + "epoch": 0.24, + "grad_norm": 0.5311822891235352, + "learning_rate": 0.00017352462415308044, + "loss": 0.8738, + "step": 2085 + }, + { + "epoch": 0.24, + "grad_norm": 0.5520626306533813, + "learning_rate": 0.0001734034511366389, + "loss": 0.8186, + "step": 2090 + }, + { + "epoch": 0.24, + "grad_norm": 0.6877513527870178, + "learning_rate": 0.00017328204397544424, + "loss": 0.8194, + "step": 2095 + }, + { + "epoch": 0.24, + "grad_norm": 0.6582801342010498, + "learning_rate": 0.00017316040305676508, + "loss": 0.9177, + "step": 2100 + }, + { + "epoch": 0.24, + "grad_norm": 0.637827455997467, + "learning_rate": 0.0001730385287686156, + "loss": 0.8896, + "step": 2105 + }, + { + "epoch": 0.24, + "grad_norm": 0.5763481855392456, + "learning_rate": 0.00017291642149975446, + "loss": 0.8431, + "step": 2110 + }, + { + "epoch": 0.24, + "grad_norm": 0.5631943941116333, + "learning_rate": 0.00017279408163968342, + "loss": 0.8405, + "step": 2115 + }, + { + "epoch": 0.24, + "grad_norm": 0.5558026432991028, + "learning_rate": 0.00017267150957864623, + "loss": 0.8788, + "step": 2120 + }, + { + "epoch": 0.24, + "grad_norm": 0.6178169250488281, + "learning_rate": 0.00017254870570762733, + "loss": 0.8176, + "step": 2125 + }, + { + "epoch": 0.24, + "grad_norm": 0.5574104189872742, + "learning_rate": 0.0001724256704183505, + "loss": 0.8623, + "step": 2130 + }, + { + "epoch": 0.24, + "grad_norm": 0.6056390404701233, + "learning_rate": 0.00017230240410327782, + "loss": 0.8526, + "step": 2135 + }, + { + "epoch": 0.24, + "grad_norm": 0.584563136100769, + "learning_rate": 0.00017217890715560822, + "loss": 0.8111, + "step": 2140 + }, + { + "epoch": 0.24, + "grad_norm": 0.5519852042198181, + "learning_rate": 0.0001720551799692764, + "loss": 0.767, + "step": 2145 + }, + { + "epoch": 0.24, + "grad_norm": 0.5955138206481934, + "learning_rate": 0.00017193122293895138, + "loss": 0.9088, + "step": 2150 + }, + { + "epoch": 0.25, + "grad_norm": 0.637936532497406, + "learning_rate": 0.00017180703646003535, + "loss": 0.7683, + "step": 2155 + }, + { + "epoch": 0.25, + "grad_norm": 0.5777458548545837, + "learning_rate": 0.0001716826209286625, + "loss": 0.8103, + "step": 2160 + }, + { + "epoch": 0.25, + "grad_norm": 0.5864526629447937, + "learning_rate": 0.0001715579767416976, + "loss": 0.8297, + "step": 2165 + }, + { + "epoch": 0.25, + "grad_norm": 0.5473954677581787, + "learning_rate": 0.0001714331042967348, + "loss": 0.8161, + "step": 2170 + }, + { + "epoch": 0.25, + "grad_norm": 0.5501269102096558, + "learning_rate": 0.00017130800399209632, + "loss": 0.774, + "step": 2175 + }, + { + "epoch": 0.25, + "grad_norm": 0.622527539730072, + "learning_rate": 0.00017118267622683123, + "loss": 0.9349, + "step": 2180 + }, + { + "epoch": 0.25, + "grad_norm": 0.6729717254638672, + "learning_rate": 0.00017105712140071426, + "loss": 0.8568, + "step": 2185 + }, + { + "epoch": 0.25, + "grad_norm": 0.5891861319541931, + "learning_rate": 0.00017093133991424425, + "loss": 0.787, + "step": 2190 + }, + { + "epoch": 0.25, + "grad_norm": 0.683480978012085, + "learning_rate": 0.00017080533216864318, + "loss": 0.8124, + "step": 2195 + }, + { + "epoch": 0.25, + "grad_norm": 0.7241565585136414, + "learning_rate": 0.00017067909856585472, + "loss": 0.6895, + "step": 2200 + }, + { + "epoch": 0.25, + "grad_norm": 0.6606466770172119, + "learning_rate": 0.00017055263950854297, + "loss": 0.8192, + "step": 2205 + }, + { + "epoch": 0.25, + "grad_norm": 0.5163353681564331, + "learning_rate": 0.00017042595540009124, + "loss": 0.8085, + "step": 2210 + }, + { + "epoch": 0.25, + "grad_norm": 0.6061686277389526, + "learning_rate": 0.00017029904664460065, + "loss": 0.9215, + "step": 2215 + }, + { + "epoch": 0.25, + "grad_norm": 0.5943601727485657, + "learning_rate": 0.00017017191364688896, + "loss": 0.8554, + "step": 2220 + }, + { + "epoch": 0.25, + "grad_norm": 0.6568538546562195, + "learning_rate": 0.00017004455681248918, + "loss": 0.8472, + "step": 2225 + }, + { + "epoch": 0.25, + "grad_norm": 0.554853081703186, + "learning_rate": 0.0001699169765476484, + "loss": 0.8969, + "step": 2230 + }, + { + "epoch": 0.25, + "grad_norm": 0.4984689950942993, + "learning_rate": 0.0001697891732593263, + "loss": 0.8548, + "step": 2235 + }, + { + "epoch": 0.25, + "grad_norm": 0.6138368248939514, + "learning_rate": 0.00016966114735519406, + "loss": 0.8888, + "step": 2240 + }, + { + "epoch": 0.26, + "grad_norm": 0.8124855756759644, + "learning_rate": 0.00016953289924363297, + "loss": 0.9703, + "step": 2245 + }, + { + "epoch": 0.26, + "grad_norm": 0.6956800222396851, + "learning_rate": 0.00016940442933373304, + "loss": 0.7584, + "step": 2250 + }, + { + "epoch": 0.26, + "grad_norm": 0.7704644799232483, + "learning_rate": 0.00016927573803529185, + "loss": 0.7784, + "step": 2255 + }, + { + "epoch": 0.26, + "grad_norm": 0.5658566355705261, + "learning_rate": 0.00016914682575881314, + "loss": 0.8646, + "step": 2260 + }, + { + "epoch": 0.26, + "grad_norm": 0.6220143437385559, + "learning_rate": 0.00016901769291550558, + "loss": 0.7348, + "step": 2265 + }, + { + "epoch": 0.26, + "grad_norm": 0.6346325278282166, + "learning_rate": 0.00016888833991728137, + "loss": 0.8734, + "step": 2270 + }, + { + "epoch": 0.26, + "grad_norm": 0.5919114947319031, + "learning_rate": 0.00016875876717675496, + "loss": 0.781, + "step": 2275 + }, + { + "epoch": 0.26, + "grad_norm": 0.5826617479324341, + "learning_rate": 0.00016862897510724176, + "loss": 0.8185, + "step": 2280 + }, + { + "epoch": 0.26, + "grad_norm": 0.5474236607551575, + "learning_rate": 0.00016849896412275683, + "loss": 0.7699, + "step": 2285 + }, + { + "epoch": 0.26, + "grad_norm": 0.619706928730011, + "learning_rate": 0.0001683687346380135, + "loss": 0.852, + "step": 2290 + }, + { + "epoch": 0.26, + "grad_norm": 0.8853769302368164, + "learning_rate": 0.000168238287068422, + "loss": 0.8513, + "step": 2295 + }, + { + "epoch": 0.26, + "grad_norm": 0.6231085062026978, + "learning_rate": 0.00016810762183008845, + "loss": 0.7905, + "step": 2300 + }, + { + "epoch": 0.26, + "grad_norm": 0.5759189128875732, + "learning_rate": 0.00016797673933981297, + "loss": 0.8566, + "step": 2305 + }, + { + "epoch": 0.26, + "grad_norm": 0.5591956973075867, + "learning_rate": 0.000167845640015089, + "loss": 0.9151, + "step": 2310 + }, + { + "epoch": 0.26, + "grad_norm": 0.7117490172386169, + "learning_rate": 0.00016771432427410137, + "loss": 0.7662, + "step": 2315 + }, + { + "epoch": 0.26, + "grad_norm": 0.5831469893455505, + "learning_rate": 0.00016758279253572546, + "loss": 0.8582, + "step": 2320 + }, + { + "epoch": 0.26, + "grad_norm": 0.7675443291664124, + "learning_rate": 0.00016745104521952552, + "loss": 0.8713, + "step": 2325 + }, + { + "epoch": 0.26, + "grad_norm": 0.5224565863609314, + "learning_rate": 0.0001673190827457535, + "loss": 0.784, + "step": 2330 + }, + { + "epoch": 0.27, + "grad_norm": 0.6027007102966309, + "learning_rate": 0.00016718690553534766, + "loss": 0.8347, + "step": 2335 + }, + { + "epoch": 0.27, + "grad_norm": 0.5675181746482849, + "learning_rate": 0.0001670545140099312, + "loss": 0.7134, + "step": 2340 + }, + { + "epoch": 0.27, + "grad_norm": 0.6312824487686157, + "learning_rate": 0.00016692190859181102, + "loss": 0.8298, + "step": 2345 + }, + { + "epoch": 0.27, + "grad_norm": 0.5071117281913757, + "learning_rate": 0.00016678908970397624, + "loss": 0.845, + "step": 2350 + }, + { + "epoch": 0.27, + "grad_norm": 0.6821117997169495, + "learning_rate": 0.00016665605777009697, + "loss": 0.8077, + "step": 2355 + }, + { + "epoch": 0.27, + "grad_norm": 0.5504366755485535, + "learning_rate": 0.00016652281321452282, + "loss": 0.8021, + "step": 2360 + }, + { + "epoch": 0.27, + "grad_norm": 0.5457181930541992, + "learning_rate": 0.0001663893564622817, + "loss": 0.7769, + "step": 2365 + }, + { + "epoch": 0.27, + "grad_norm": 0.6445764899253845, + "learning_rate": 0.00016625568793907834, + "loss": 0.7976, + "step": 2370 + }, + { + "epoch": 0.27, + "grad_norm": 0.6524176001548767, + "learning_rate": 0.00016612180807129304, + "loss": 0.7837, + "step": 2375 + }, + { + "epoch": 0.27, + "grad_norm": 0.5401653051376343, + "learning_rate": 0.00016598771728598024, + "loss": 0.6942, + "step": 2380 + }, + { + "epoch": 0.27, + "grad_norm": 0.5231527090072632, + "learning_rate": 0.00016585341601086712, + "loss": 0.7986, + "step": 2385 + }, + { + "epoch": 0.27, + "grad_norm": 0.6786569952964783, + "learning_rate": 0.0001657189046743523, + "loss": 0.8576, + "step": 2390 + }, + { + "epoch": 0.27, + "grad_norm": 0.6088935732841492, + "learning_rate": 0.0001655841837055046, + "loss": 0.8909, + "step": 2395 + }, + { + "epoch": 0.27, + "grad_norm": 0.6915789842605591, + "learning_rate": 0.00016544925353406125, + "loss": 0.7604, + "step": 2400 + }, + { + "epoch": 0.27, + "grad_norm": 0.5768089890480042, + "learning_rate": 0.0001653141145904271, + "loss": 0.8903, + "step": 2405 + }, + { + "epoch": 0.27, + "grad_norm": 0.6887524127960205, + "learning_rate": 0.0001651787673056728, + "loss": 0.8363, + "step": 2410 + }, + { + "epoch": 0.27, + "grad_norm": 0.6299374103546143, + "learning_rate": 0.00016504321211153355, + "loss": 0.8185, + "step": 2415 + }, + { + "epoch": 0.28, + "grad_norm": 0.6226490139961243, + "learning_rate": 0.00016490744944040777, + "loss": 0.8176, + "step": 2420 + }, + { + "epoch": 0.28, + "grad_norm": 0.6668693423271179, + "learning_rate": 0.00016477147972535577, + "loss": 0.8422, + "step": 2425 + }, + { + "epoch": 0.28, + "grad_norm": 0.7947617769241333, + "learning_rate": 0.00016463530340009817, + "loss": 0.8986, + "step": 2430 + }, + { + "epoch": 0.28, + "grad_norm": 0.5857140421867371, + "learning_rate": 0.00016449892089901477, + "loss": 0.75, + "step": 2435 + }, + { + "epoch": 0.28, + "grad_norm": 0.658028244972229, + "learning_rate": 0.00016436233265714297, + "loss": 0.8304, + "step": 2440 + }, + { + "epoch": 0.28, + "grad_norm": 0.5767120122909546, + "learning_rate": 0.00016422553911017642, + "loss": 0.8077, + "step": 2445 + }, + { + "epoch": 0.28, + "grad_norm": 0.607893705368042, + "learning_rate": 0.00016408854069446374, + "loss": 0.8232, + "step": 2450 + }, + { + "epoch": 0.28, + "grad_norm": 0.6129177212715149, + "learning_rate": 0.00016395133784700695, + "loss": 0.8561, + "step": 2455 + }, + { + "epoch": 0.28, + "grad_norm": 0.5294451117515564, + "learning_rate": 0.00016381393100546026, + "loss": 0.7024, + "step": 2460 + }, + { + "epoch": 0.28, + "grad_norm": 0.5510571002960205, + "learning_rate": 0.00016367632060812856, + "loss": 0.8306, + "step": 2465 + }, + { + "epoch": 0.28, + "grad_norm": 5.814347267150879, + "learning_rate": 0.00016353850709396604, + "loss": 0.8465, + "step": 2470 + }, + { + "epoch": 0.28, + "grad_norm": 0.5485466718673706, + "learning_rate": 0.00016340049090257476, + "loss": 0.741, + "step": 2475 + }, + { + "epoch": 0.28, + "grad_norm": 0.6270123720169067, + "learning_rate": 0.00016326227247420337, + "loss": 0.9109, + "step": 2480 + }, + { + "epoch": 0.28, + "grad_norm": 0.6270929574966431, + "learning_rate": 0.00016312385224974554, + "loss": 0.8672, + "step": 2485 + }, + { + "epoch": 0.28, + "grad_norm": 0.5627338886260986, + "learning_rate": 0.0001629852306707387, + "loss": 0.9262, + "step": 2490 + }, + { + "epoch": 0.28, + "grad_norm": 0.6023781299591064, + "learning_rate": 0.00016284640817936254, + "loss": 0.7498, + "step": 2495 + }, + { + "epoch": 0.28, + "grad_norm": 0.5960550308227539, + "learning_rate": 0.00016270738521843763, + "loss": 0.7668, + "step": 2500 + }, + { + "epoch": 0.28, + "grad_norm": 0.5488481521606445, + "learning_rate": 0.000162568162231424, + "loss": 0.8611, + "step": 2505 + }, + { + "epoch": 0.29, + "grad_norm": 0.547410786151886, + "learning_rate": 0.00016242873966241974, + "loss": 0.7795, + "step": 2510 + }, + { + "epoch": 0.29, + "grad_norm": 0.5622233748435974, + "learning_rate": 0.00016228911795615952, + "loss": 0.8837, + "step": 2515 + }, + { + "epoch": 0.29, + "grad_norm": 0.8012298941612244, + "learning_rate": 0.00016214929755801335, + "loss": 0.895, + "step": 2520 + }, + { + "epoch": 0.29, + "grad_norm": 0.6051672101020813, + "learning_rate": 0.00016200927891398489, + "loss": 0.7937, + "step": 2525 + }, + { + "epoch": 0.29, + "grad_norm": 0.6143134832382202, + "learning_rate": 0.00016186906247071025, + "loss": 0.8751, + "step": 2530 + }, + { + "epoch": 0.29, + "grad_norm": 0.6904672980308533, + "learning_rate": 0.0001617286486754565, + "loss": 0.7504, + "step": 2535 + }, + { + "epoch": 0.29, + "grad_norm": 0.5270385146141052, + "learning_rate": 0.00016158803797612019, + "loss": 0.8145, + "step": 2540 + }, + { + "epoch": 0.29, + "grad_norm": 0.6666483879089355, + "learning_rate": 0.00016144723082122596, + "loss": 0.778, + "step": 2545 + }, + { + "epoch": 0.29, + "grad_norm": 0.5860428214073181, + "learning_rate": 0.0001613062276599251, + "loss": 0.8231, + "step": 2550 + }, + { + "epoch": 0.29, + "grad_norm": 0.6345640420913696, + "learning_rate": 0.00016116502894199418, + "loss": 0.8982, + "step": 2555 + }, + { + "epoch": 0.29, + "grad_norm": 0.590721845626831, + "learning_rate": 0.00016102363511783362, + "loss": 0.833, + "step": 2560 + }, + { + "epoch": 0.29, + "grad_norm": 0.5986994504928589, + "learning_rate": 0.00016088204663846595, + "loss": 0.8326, + "step": 2565 + }, + { + "epoch": 0.29, + "grad_norm": 0.5553504824638367, + "learning_rate": 0.00016074026395553487, + "loss": 0.7604, + "step": 2570 + }, + { + "epoch": 0.29, + "grad_norm": 0.6035515666007996, + "learning_rate": 0.00016059828752130345, + "loss": 0.7755, + "step": 2575 + }, + { + "epoch": 0.29, + "grad_norm": 0.6856073141098022, + "learning_rate": 0.0001604561177886528, + "loss": 0.7277, + "step": 2580 + }, + { + "epoch": 0.29, + "grad_norm": 0.5378929972648621, + "learning_rate": 0.00016031375521108066, + "loss": 0.8081, + "step": 2585 + }, + { + "epoch": 0.29, + "grad_norm": 0.6315340399742126, + "learning_rate": 0.00016017120024269986, + "loss": 0.9446, + "step": 2590 + }, + { + "epoch": 0.3, + "grad_norm": 0.5610125660896301, + "learning_rate": 0.00016002845333823695, + "loss": 0.7239, + "step": 2595 + }, + { + "epoch": 0.3, + "grad_norm": 0.6176686882972717, + "learning_rate": 0.00015988551495303073, + "loss": 0.8375, + "step": 2600 + }, + { + "epoch": 0.3, + "grad_norm": 0.5788987278938293, + "learning_rate": 0.00015974238554303076, + "loss": 0.8433, + "step": 2605 + }, + { + "epoch": 0.3, + "grad_norm": 0.6443178057670593, + "learning_rate": 0.00015959906556479596, + "loss": 0.9211, + "step": 2610 + }, + { + "epoch": 0.3, + "grad_norm": 0.6430835723876953, + "learning_rate": 0.00015945555547549315, + "loss": 0.8475, + "step": 2615 + }, + { + "epoch": 0.3, + "grad_norm": 0.6127652525901794, + "learning_rate": 0.00015931185573289555, + "loss": 0.8167, + "step": 2620 + }, + { + "epoch": 0.3, + "grad_norm": 0.5990703105926514, + "learning_rate": 0.00015916796679538134, + "loss": 0.7095, + "step": 2625 + }, + { + "epoch": 0.3, + "grad_norm": 0.6349881887435913, + "learning_rate": 0.00015902388912193222, + "loss": 0.8973, + "step": 2630 + }, + { + "epoch": 0.3, + "grad_norm": 0.5582308173179626, + "learning_rate": 0.0001588796231721319, + "loss": 0.7662, + "step": 2635 + }, + { + "epoch": 0.3, + "grad_norm": 0.5647494792938232, + "learning_rate": 0.00015873516940616468, + "loss": 0.7906, + "step": 2640 + }, + { + "epoch": 0.3, + "grad_norm": 0.630150556564331, + "learning_rate": 0.00015859052828481394, + "loss": 0.8536, + "step": 2645 + }, + { + "epoch": 0.3, + "grad_norm": 0.6978116035461426, + "learning_rate": 0.0001584457002694607, + "loss": 0.7525, + "step": 2650 + }, + { + "epoch": 0.3, + "grad_norm": 0.6089206337928772, + "learning_rate": 0.00015830068582208217, + "loss": 0.849, + "step": 2655 + }, + { + "epoch": 0.3, + "grad_norm": 0.5681561231613159, + "learning_rate": 0.0001581554854052502, + "loss": 0.8956, + "step": 2660 + }, + { + "epoch": 0.3, + "grad_norm": 0.6889328956604004, + "learning_rate": 0.0001580100994821299, + "loss": 0.8592, + "step": 2665 + }, + { + "epoch": 0.3, + "grad_norm": 0.5611298084259033, + "learning_rate": 0.0001578645285164781, + "loss": 0.8436, + "step": 2670 + }, + { + "epoch": 0.3, + "grad_norm": 0.6381964087486267, + "learning_rate": 0.00015771877297264184, + "loss": 0.7636, + "step": 2675 + }, + { + "epoch": 0.3, + "grad_norm": 0.6424821615219116, + "learning_rate": 0.00015757283331555697, + "loss": 0.8919, + "step": 2680 + }, + { + "epoch": 0.31, + "grad_norm": 0.5920271873474121, + "learning_rate": 0.00015742671001074668, + "loss": 0.9166, + "step": 2685 + }, + { + "epoch": 0.31, + "grad_norm": 0.6440754532814026, + "learning_rate": 0.00015728040352431982, + "loss": 0.8435, + "step": 2690 + }, + { + "epoch": 0.31, + "grad_norm": 0.6160061359405518, + "learning_rate": 0.00015713391432296977, + "loss": 0.7851, + "step": 2695 + }, + { + "epoch": 0.31, + "grad_norm": 0.5635654926300049, + "learning_rate": 0.00015698724287397254, + "loss": 0.8102, + "step": 2700 + }, + { + "epoch": 0.31, + "grad_norm": 0.5842839479446411, + "learning_rate": 0.00015684038964518558, + "loss": 0.878, + "step": 2705 + }, + { + "epoch": 0.31, + "grad_norm": 0.6162332892417908, + "learning_rate": 0.00015669335510504618, + "loss": 0.8285, + "step": 2710 + }, + { + "epoch": 0.31, + "grad_norm": 0.5869714617729187, + "learning_rate": 0.00015654613972256997, + "loss": 0.8949, + "step": 2715 + }, + { + "epoch": 0.31, + "grad_norm": 0.6483595371246338, + "learning_rate": 0.00015639874396734943, + "loss": 0.739, + "step": 2720 + }, + { + "epoch": 0.31, + "grad_norm": 0.6540059447288513, + "learning_rate": 0.00015625116830955243, + "loss": 0.9031, + "step": 2725 + }, + { + "epoch": 0.31, + "grad_norm": 0.5853153467178345, + "learning_rate": 0.00015610341321992068, + "loss": 0.7753, + "step": 2730 + }, + { + "epoch": 0.31, + "grad_norm": 0.521800696849823, + "learning_rate": 0.0001559554791697682, + "loss": 0.8599, + "step": 2735 + }, + { + "epoch": 0.31, + "grad_norm": 0.5266906023025513, + "learning_rate": 0.00015580736663097996, + "loss": 0.8546, + "step": 2740 + }, + { + "epoch": 0.31, + "grad_norm": 0.585526168346405, + "learning_rate": 0.00015565907607601023, + "loss": 0.7511, + "step": 2745 + }, + { + "epoch": 0.31, + "grad_norm": 0.5523586273193359, + "learning_rate": 0.00015551060797788107, + "loss": 0.8193, + "step": 2750 + }, + { + "epoch": 0.31, + "grad_norm": 0.6577056050300598, + "learning_rate": 0.00015536196281018097, + "loss": 0.8619, + "step": 2755 + }, + { + "epoch": 0.31, + "grad_norm": 0.5628554224967957, + "learning_rate": 0.00015521314104706318, + "loss": 0.7435, + "step": 2760 + }, + { + "epoch": 0.31, + "grad_norm": 0.5676989555358887, + "learning_rate": 0.00015506414316324426, + "loss": 0.8461, + "step": 2765 + }, + { + "epoch": 0.31, + "grad_norm": 0.6753340363502502, + "learning_rate": 0.0001549149696340026, + "loss": 0.8576, + "step": 2770 + }, + { + "epoch": 0.32, + "grad_norm": 0.6714786887168884, + "learning_rate": 0.00015476562093517688, + "loss": 0.8376, + "step": 2775 + }, + { + "epoch": 0.32, + "grad_norm": 0.5616926550865173, + "learning_rate": 0.00015461609754316446, + "loss": 0.7985, + "step": 2780 + }, + { + "epoch": 0.32, + "grad_norm": 0.5463203191757202, + "learning_rate": 0.00015446639993492003, + "loss": 0.844, + "step": 2785 + }, + { + "epoch": 0.32, + "grad_norm": 0.6310843229293823, + "learning_rate": 0.00015431652858795394, + "loss": 0.8265, + "step": 2790 + }, + { + "epoch": 0.32, + "grad_norm": 0.6531693935394287, + "learning_rate": 0.00015416648398033076, + "loss": 0.9024, + "step": 2795 + }, + { + "epoch": 0.32, + "grad_norm": 0.4960426688194275, + "learning_rate": 0.00015401626659066774, + "loss": 0.8993, + "step": 2800 + }, + { + "epoch": 0.32, + "grad_norm": 0.6042740345001221, + "learning_rate": 0.0001538658768981333, + "loss": 0.8909, + "step": 2805 + }, + { + "epoch": 0.32, + "grad_norm": 0.579511284828186, + "learning_rate": 0.00015371531538244546, + "loss": 0.7696, + "step": 2810 + }, + { + "epoch": 0.32, + "grad_norm": 0.6731118559837341, + "learning_rate": 0.00015356458252387025, + "loss": 0.7309, + "step": 2815 + }, + { + "epoch": 0.32, + "grad_norm": 0.5301114916801453, + "learning_rate": 0.00015341367880322042, + "loss": 0.7494, + "step": 2820 + }, + { + "epoch": 0.32, + "grad_norm": 0.7461742162704468, + "learning_rate": 0.00015326260470185352, + "loss": 0.8429, + "step": 2825 + }, + { + "epoch": 0.32, + "grad_norm": 0.6235002875328064, + "learning_rate": 0.00015311136070167075, + "loss": 0.7959, + "step": 2830 + }, + { + "epoch": 0.32, + "grad_norm": 0.5418492555618286, + "learning_rate": 0.00015295994728511532, + "loss": 0.8112, + "step": 2835 + }, + { + "epoch": 0.32, + "grad_norm": 0.6243062019348145, + "learning_rate": 0.0001528083649351706, + "loss": 0.8134, + "step": 2840 + }, + { + "epoch": 0.32, + "grad_norm": 0.5746551156044006, + "learning_rate": 0.00015265661413535906, + "loss": 0.8692, + "step": 2845 + }, + { + "epoch": 0.32, + "grad_norm": 0.7506961226463318, + "learning_rate": 0.00015250469536974042, + "loss": 0.8399, + "step": 2850 + }, + { + "epoch": 0.32, + "grad_norm": 0.6415050625801086, + "learning_rate": 0.00015235260912291012, + "loss": 0.7829, + "step": 2855 + }, + { + "epoch": 0.33, + "grad_norm": 0.6047353148460388, + "learning_rate": 0.00015220035587999796, + "loss": 0.8918, + "step": 2860 + }, + { + "epoch": 0.33, + "grad_norm": 0.573059618473053, + "learning_rate": 0.00015204793612666627, + "loss": 0.817, + "step": 2865 + }, + { + "epoch": 0.33, + "grad_norm": 0.5482955574989319, + "learning_rate": 0.00015189535034910873, + "loss": 0.7738, + "step": 2870 + }, + { + "epoch": 0.33, + "grad_norm": 0.5009284615516663, + "learning_rate": 0.00015174259903404845, + "loss": 0.7723, + "step": 2875 + }, + { + "epoch": 0.33, + "grad_norm": 0.5775097012519836, + "learning_rate": 0.00015158968266873658, + "loss": 0.804, + "step": 2880 + }, + { + "epoch": 0.33, + "grad_norm": 0.5707810521125793, + "learning_rate": 0.00015143660174095081, + "loss": 0.7024, + "step": 2885 + }, + { + "epoch": 0.33, + "grad_norm": 0.5989297032356262, + "learning_rate": 0.00015128335673899375, + "loss": 0.8147, + "step": 2890 + }, + { + "epoch": 0.33, + "grad_norm": 0.5856032967567444, + "learning_rate": 0.00015112994815169142, + "loss": 0.8361, + "step": 2895 + }, + { + "epoch": 0.33, + "grad_norm": 0.6352725028991699, + "learning_rate": 0.0001509763764683915, + "loss": 0.8242, + "step": 2900 + }, + { + "epoch": 0.33, + "grad_norm": 0.613037645816803, + "learning_rate": 0.00015082264217896208, + "loss": 0.9165, + "step": 2905 + }, + { + "epoch": 0.33, + "grad_norm": 0.555920422077179, + "learning_rate": 0.00015066874577378988, + "loss": 0.8409, + "step": 2910 + }, + { + "epoch": 0.33, + "grad_norm": 0.616894006729126, + "learning_rate": 0.00015051468774377868, + "loss": 0.7981, + "step": 2915 + }, + { + "epoch": 0.33, + "grad_norm": 0.6403358578681946, + "learning_rate": 0.00015036046858034796, + "loss": 0.9592, + "step": 2920 + }, + { + "epoch": 0.33, + "grad_norm": 0.6272974014282227, + "learning_rate": 0.00015020608877543102, + "loss": 0.7743, + "step": 2925 + }, + { + "epoch": 0.33, + "grad_norm": 0.6145214438438416, + "learning_rate": 0.00015005154882147373, + "loss": 0.7876, + "step": 2930 + }, + { + "epoch": 0.33, + "grad_norm": 0.5387482047080994, + "learning_rate": 0.00014989684921143268, + "loss": 0.8426, + "step": 2935 + }, + { + "epoch": 0.33, + "grad_norm": 0.6402955651283264, + "learning_rate": 0.0001497419904387738, + "loss": 0.8531, + "step": 2940 + }, + { + "epoch": 0.33, + "grad_norm": 0.6462345719337463, + "learning_rate": 0.0001495869729974708, + "loss": 0.8089, + "step": 2945 + }, + { + "epoch": 0.34, + "grad_norm": 0.5510848760604858, + "learning_rate": 0.00014943179738200333, + "loss": 0.7983, + "step": 2950 + }, + { + "epoch": 0.34, + "grad_norm": 0.6539138555526733, + "learning_rate": 0.00014927646408735576, + "loss": 0.7698, + "step": 2955 + }, + { + "epoch": 0.34, + "grad_norm": 0.5367149710655212, + "learning_rate": 0.00014912097360901533, + "loss": 0.7783, + "step": 2960 + }, + { + "epoch": 0.34, + "grad_norm": 0.5643497109413147, + "learning_rate": 0.0001489653264429707, + "loss": 0.7887, + "step": 2965 + }, + { + "epoch": 0.34, + "grad_norm": 0.6941254138946533, + "learning_rate": 0.0001488095230857104, + "loss": 0.9146, + "step": 2970 + }, + { + "epoch": 0.34, + "grad_norm": 0.5869201421737671, + "learning_rate": 0.00014865356403422105, + "loss": 0.8394, + "step": 2975 + }, + { + "epoch": 0.34, + "grad_norm": 0.8639833331108093, + "learning_rate": 0.00014849744978598603, + "loss": 0.7952, + "step": 2980 + }, + { + "epoch": 0.34, + "grad_norm": 0.6019383668899536, + "learning_rate": 0.00014834118083898373, + "loss": 0.8434, + "step": 2985 + }, + { + "epoch": 0.34, + "grad_norm": 0.6566647887229919, + "learning_rate": 0.00014818475769168594, + "loss": 0.7786, + "step": 2990 + }, + { + "epoch": 0.34, + "grad_norm": 0.6752328276634216, + "learning_rate": 0.00014802818084305646, + "loss": 0.7453, + "step": 2995 + }, + { + "epoch": 0.34, + "grad_norm": 0.5767747759819031, + "learning_rate": 0.00014787145079254925, + "loss": 0.8015, + "step": 3000 + }, + { + "epoch": 0.34, + "grad_norm": 0.6346129775047302, + "learning_rate": 0.00014771456804010702, + "loss": 0.9022, + "step": 3005 + }, + { + "epoch": 0.34, + "grad_norm": 0.6062515377998352, + "learning_rate": 0.0001475575330861595, + "loss": 0.887, + "step": 3010 + }, + { + "epoch": 0.34, + "grad_norm": 0.5885165929794312, + "learning_rate": 0.00014740034643162208, + "loss": 0.8497, + "step": 3015 + }, + { + "epoch": 0.34, + "grad_norm": 0.5770689845085144, + "learning_rate": 0.00014724300857789385, + "loss": 0.7701, + "step": 3020 + }, + { + "epoch": 0.34, + "grad_norm": 0.6336144804954529, + "learning_rate": 0.00014708552002685633, + "loss": 0.7648, + "step": 3025 + }, + { + "epoch": 0.34, + "grad_norm": 0.614093005657196, + "learning_rate": 0.00014692788128087175, + "loss": 0.7606, + "step": 3030 + }, + { + "epoch": 0.35, + "grad_norm": 0.6177241206169128, + "learning_rate": 0.00014677009284278127, + "loss": 0.7948, + "step": 3035 + }, + { + "epoch": 0.35, + "grad_norm": 0.6105360984802246, + "learning_rate": 0.00014661215521590375, + "loss": 0.6969, + "step": 3040 + }, + { + "epoch": 0.35, + "grad_norm": 0.5755805373191833, + "learning_rate": 0.00014645406890403384, + "loss": 0.912, + "step": 3045 + }, + { + "epoch": 0.35, + "grad_norm": 0.6132418513298035, + "learning_rate": 0.00014629583441144042, + "loss": 0.8358, + "step": 3050 + }, + { + "epoch": 0.35, + "grad_norm": 0.5401825308799744, + "learning_rate": 0.00014613745224286524, + "loss": 0.8461, + "step": 3055 + }, + { + "epoch": 0.35, + "grad_norm": 0.6283666491508484, + "learning_rate": 0.0001459789229035208, + "loss": 0.8416, + "step": 3060 + }, + { + "epoch": 0.35, + "grad_norm": 0.5837119221687317, + "learning_rate": 0.00014582024689908932, + "loss": 0.8082, + "step": 3065 + }, + { + "epoch": 0.35, + "grad_norm": 0.6010891199111938, + "learning_rate": 0.0001456614247357208, + "loss": 0.7427, + "step": 3070 + }, + { + "epoch": 0.35, + "grad_norm": 0.625624418258667, + "learning_rate": 0.00014550245692003132, + "loss": 0.8802, + "step": 3075 + }, + { + "epoch": 0.35, + "grad_norm": 0.5869577527046204, + "learning_rate": 0.00014534334395910171, + "loss": 0.8265, + "step": 3080 + }, + { + "epoch": 0.35, + "grad_norm": 0.6369906067848206, + "learning_rate": 0.0001451840863604758, + "loss": 0.7906, + "step": 3085 + }, + { + "epoch": 0.35, + "grad_norm": 0.5773164629936218, + "learning_rate": 0.00014502468463215866, + "loss": 0.7897, + "step": 3090 + }, + { + "epoch": 0.35, + "grad_norm": 0.646680474281311, + "learning_rate": 0.00014486513928261524, + "loss": 0.8279, + "step": 3095 + }, + { + "epoch": 0.35, + "grad_norm": 0.6799820065498352, + "learning_rate": 0.00014470545082076854, + "loss": 0.8948, + "step": 3100 + }, + { + "epoch": 0.35, + "grad_norm": 0.5638763904571533, + "learning_rate": 0.0001445456197559981, + "loss": 0.8377, + "step": 3105 + }, + { + "epoch": 0.35, + "grad_norm": 0.5281423330307007, + "learning_rate": 0.00014438564659813833, + "loss": 0.8436, + "step": 3110 + }, + { + "epoch": 0.35, + "grad_norm": 0.54918372631073, + "learning_rate": 0.00014422553185747692, + "loss": 0.7828, + "step": 3115 + }, + { + "epoch": 0.35, + "grad_norm": 0.5749617218971252, + "learning_rate": 0.00014406527604475308, + "loss": 0.7934, + "step": 3120 + }, + { + "epoch": 0.36, + "grad_norm": 0.5960623025894165, + "learning_rate": 0.00014390487967115619, + "loss": 0.8148, + "step": 3125 + }, + { + "epoch": 0.36, + "grad_norm": 0.5716930031776428, + "learning_rate": 0.00014374434324832385, + "loss": 0.9293, + "step": 3130 + }, + { + "epoch": 0.36, + "grad_norm": 0.6069075465202332, + "learning_rate": 0.00014358366728834044, + "loss": 0.7865, + "step": 3135 + }, + { + "epoch": 0.36, + "grad_norm": 0.6393450498580933, + "learning_rate": 0.0001434228523037355, + "loss": 0.8179, + "step": 3140 + }, + { + "epoch": 0.36, + "grad_norm": 0.6741149425506592, + "learning_rate": 0.00014326189880748186, + "loss": 0.867, + "step": 3145 + }, + { + "epoch": 0.36, + "grad_norm": 0.5840499997138977, + "learning_rate": 0.00014310080731299443, + "loss": 0.8286, + "step": 3150 + }, + { + "epoch": 0.36, + "grad_norm": 0.5931133031845093, + "learning_rate": 0.0001429395783341281, + "loss": 0.9023, + "step": 3155 + }, + { + "epoch": 0.36, + "grad_norm": 0.7968230843544006, + "learning_rate": 0.00014277821238517643, + "loss": 0.754, + "step": 3160 + }, + { + "epoch": 0.36, + "grad_norm": 0.56842041015625, + "learning_rate": 0.00014261670998086986, + "loss": 0.6804, + "step": 3165 + }, + { + "epoch": 0.36, + "grad_norm": 0.6133948564529419, + "learning_rate": 0.00014245507163637407, + "loss": 0.8501, + "step": 3170 + }, + { + "epoch": 0.36, + "grad_norm": 0.5714240670204163, + "learning_rate": 0.00014229329786728839, + "loss": 0.8027, + "step": 3175 + }, + { + "epoch": 0.36, + "grad_norm": 0.8033362627029419, + "learning_rate": 0.00014213138918964415, + "loss": 0.8119, + "step": 3180 + }, + { + "epoch": 0.36, + "grad_norm": 0.7656739950180054, + "learning_rate": 0.00014196934611990296, + "loss": 0.8129, + "step": 3185 + }, + { + "epoch": 0.36, + "grad_norm": 0.5456616282463074, + "learning_rate": 0.0001418071691749552, + "loss": 0.6827, + "step": 3190 + }, + { + "epoch": 0.36, + "grad_norm": 0.5840951204299927, + "learning_rate": 0.00014164485887211824, + "loss": 0.67, + "step": 3195 + }, + { + "epoch": 0.36, + "grad_norm": 0.7157981395721436, + "learning_rate": 0.0001414824157291348, + "loss": 0.861, + "step": 3200 + }, + { + "epoch": 0.36, + "grad_norm": 0.6238237619400024, + "learning_rate": 0.00014131984026417147, + "loss": 0.8524, + "step": 3205 + }, + { + "epoch": 0.36, + "grad_norm": 0.7766329646110535, + "learning_rate": 0.00014115713299581677, + "loss": 0.7376, + "step": 3210 + }, + { + "epoch": 0.37, + "grad_norm": 0.5822386741638184, + "learning_rate": 0.00014099429444307973, + "loss": 0.9006, + "step": 3215 + }, + { + "epoch": 0.37, + "grad_norm": 0.6553748250007629, + "learning_rate": 0.00014083132512538815, + "loss": 0.781, + "step": 3220 + }, + { + "epoch": 0.37, + "grad_norm": 0.5938107967376709, + "learning_rate": 0.00014066822556258693, + "loss": 0.7423, + "step": 3225 + }, + { + "epoch": 0.37, + "grad_norm": 0.6933189630508423, + "learning_rate": 0.00014050499627493647, + "loss": 0.8366, + "step": 3230 + }, + { + "epoch": 0.37, + "grad_norm": 0.5675535202026367, + "learning_rate": 0.00014034163778311095, + "loss": 0.8206, + "step": 3235 + }, + { + "epoch": 0.37, + "grad_norm": 0.49278539419174194, + "learning_rate": 0.00014017815060819665, + "loss": 0.7769, + "step": 3240 + }, + { + "epoch": 0.37, + "grad_norm": 0.5603547096252441, + "learning_rate": 0.00014001453527169035, + "loss": 0.815, + "step": 3245 + }, + { + "epoch": 0.37, + "grad_norm": 0.6954237818717957, + "learning_rate": 0.00013985079229549772, + "loss": 0.8185, + "step": 3250 + }, + { + "epoch": 0.37, + "grad_norm": 0.5948113799095154, + "learning_rate": 0.00013968692220193144, + "loss": 0.7734, + "step": 3255 + }, + { + "epoch": 0.37, + "grad_norm": 0.5735976696014404, + "learning_rate": 0.00013952292551370978, + "loss": 0.7777, + "step": 3260 + }, + { + "epoch": 0.37, + "grad_norm": 0.6098427176475525, + "learning_rate": 0.00013935880275395482, + "loss": 0.689, + "step": 3265 + }, + { + "epoch": 0.37, + "grad_norm": 0.5526130199432373, + "learning_rate": 0.00013919455444619074, + "loss": 0.7506, + "step": 3270 + }, + { + "epoch": 0.37, + "grad_norm": 0.585938572883606, + "learning_rate": 0.0001390301811143422, + "loss": 0.794, + "step": 3275 + }, + { + "epoch": 0.37, + "grad_norm": 0.737772524356842, + "learning_rate": 0.00013886568328273267, + "loss": 0.8794, + "step": 3280 + }, + { + "epoch": 0.37, + "grad_norm": 0.5786934494972229, + "learning_rate": 0.00013870106147608282, + "loss": 0.8145, + "step": 3285 + }, + { + "epoch": 0.37, + "grad_norm": 0.6087589859962463, + "learning_rate": 0.0001385363162195087, + "loss": 0.898, + "step": 3290 + }, + { + "epoch": 0.37, + "grad_norm": 0.6845444440841675, + "learning_rate": 0.00013837144803852016, + "loss": 0.9058, + "step": 3295 + }, + { + "epoch": 0.38, + "grad_norm": 0.6652143001556396, + "learning_rate": 0.00013820645745901916, + "loss": 0.7903, + "step": 3300 + }, + { + "epoch": 0.38, + "grad_norm": 0.5612785816192627, + "learning_rate": 0.00013804134500729816, + "loss": 0.8815, + "step": 3305 + }, + { + "epoch": 0.38, + "grad_norm": 0.5807976126670837, + "learning_rate": 0.00013787611121003824, + "loss": 0.7484, + "step": 3310 + }, + { + "epoch": 0.38, + "grad_norm": 0.6199260950088501, + "learning_rate": 0.0001377107565943077, + "loss": 0.7713, + "step": 3315 + }, + { + "epoch": 0.38, + "grad_norm": 0.5932952761650085, + "learning_rate": 0.00013754528168756006, + "loss": 0.8288, + "step": 3320 + }, + { + "epoch": 0.38, + "grad_norm": 0.5941212773323059, + "learning_rate": 0.00013737968701763275, + "loss": 0.8343, + "step": 3325 + }, + { + "epoch": 0.38, + "grad_norm": 0.5822454690933228, + "learning_rate": 0.00013721397311274505, + "loss": 0.7255, + "step": 3330 + }, + { + "epoch": 0.38, + "grad_norm": 0.6364741921424866, + "learning_rate": 0.00013704814050149663, + "loss": 0.8083, + "step": 3335 + }, + { + "epoch": 0.38, + "grad_norm": 0.707168459892273, + "learning_rate": 0.0001368821897128659, + "loss": 0.7845, + "step": 3340 + }, + { + "epoch": 0.38, + "grad_norm": 0.6789791584014893, + "learning_rate": 0.0001367161212762081, + "loss": 0.8041, + "step": 3345 + }, + { + "epoch": 0.38, + "grad_norm": 0.6019969582557678, + "learning_rate": 0.00013654993572125384, + "loss": 0.7461, + "step": 3350 + }, + { + "epoch": 0.38, + "grad_norm": 0.9661110639572144, + "learning_rate": 0.00013638363357810734, + "loss": 0.7196, + "step": 3355 + }, + { + "epoch": 0.38, + "grad_norm": 0.6087985038757324, + "learning_rate": 0.00013621721537724458, + "loss": 0.7691, + "step": 3360 + }, + { + "epoch": 0.38, + "grad_norm": 0.628359854221344, + "learning_rate": 0.00013605068164951193, + "loss": 0.8378, + "step": 3365 + }, + { + "epoch": 0.38, + "grad_norm": 0.6273655891418457, + "learning_rate": 0.00013588403292612408, + "loss": 0.7873, + "step": 3370 + }, + { + "epoch": 0.38, + "grad_norm": 0.6866421103477478, + "learning_rate": 0.00013571726973866274, + "loss": 0.7953, + "step": 3375 + }, + { + "epoch": 0.38, + "grad_norm": 0.6045382022857666, + "learning_rate": 0.00013555039261907453, + "loss": 0.8285, + "step": 3380 + }, + { + "epoch": 0.38, + "grad_norm": 0.5913712978363037, + "learning_rate": 0.00013538340209966966, + "loss": 0.775, + "step": 3385 + }, + { + "epoch": 0.39, + "grad_norm": 0.669894278049469, + "learning_rate": 0.00013521629871311995, + "loss": 0.7326, + "step": 3390 + }, + { + "epoch": 0.39, + "grad_norm": 0.6073742508888245, + "learning_rate": 0.00013504908299245738, + "loss": 0.8209, + "step": 3395 + }, + { + "epoch": 0.39, + "grad_norm": 0.7493303418159485, + "learning_rate": 0.00013488175547107215, + "loss": 0.7391, + "step": 3400 + }, + { + "epoch": 0.39, + "grad_norm": 0.5117560625076294, + "learning_rate": 0.00013471431668271103, + "loss": 0.7678, + "step": 3405 + }, + { + "epoch": 0.39, + "grad_norm": 0.5908418297767639, + "learning_rate": 0.00013454676716147593, + "loss": 0.8567, + "step": 3410 + }, + { + "epoch": 0.39, + "grad_norm": 0.4934523403644562, + "learning_rate": 0.00013437910744182178, + "loss": 0.9218, + "step": 3415 + }, + { + "epoch": 0.39, + "grad_norm": 0.6765584349632263, + "learning_rate": 0.0001342113380585551, + "loss": 0.7872, + "step": 3420 + }, + { + "epoch": 0.39, + "grad_norm": 0.5264098048210144, + "learning_rate": 0.0001340434595468322, + "loss": 0.7688, + "step": 3425 + }, + { + "epoch": 0.39, + "grad_norm": 0.621438205242157, + "learning_rate": 0.00013387547244215754, + "loss": 0.8054, + "step": 3430 + }, + { + "epoch": 0.39, + "grad_norm": 0.8711406588554382, + "learning_rate": 0.0001337073772803819, + "loss": 0.8414, + "step": 3435 + }, + { + "epoch": 0.39, + "grad_norm": 0.5573263764381409, + "learning_rate": 0.00013353917459770078, + "loss": 0.7817, + "step": 3440 + }, + { + "epoch": 0.39, + "grad_norm": 0.5504346489906311, + "learning_rate": 0.00013337086493065266, + "loss": 0.7979, + "step": 3445 + }, + { + "epoch": 0.39, + "grad_norm": 0.6491450071334839, + "learning_rate": 0.00013320244881611726, + "loss": 0.8133, + "step": 3450 + }, + { + "epoch": 0.39, + "grad_norm": 0.5762600898742676, + "learning_rate": 0.00013303392679131393, + "loss": 0.7396, + "step": 3455 + }, + { + "epoch": 0.39, + "grad_norm": 0.606731116771698, + "learning_rate": 0.00013286529939379968, + "loss": 0.8597, + "step": 3460 + }, + { + "epoch": 0.39, + "grad_norm": 0.583909273147583, + "learning_rate": 0.00013269656716146785, + "loss": 0.7119, + "step": 3465 + }, + { + "epoch": 0.39, + "grad_norm": 0.5781696438789368, + "learning_rate": 0.0001325277306325461, + "loss": 0.7583, + "step": 3470 + }, + { + "epoch": 0.4, + "grad_norm": 0.6569092273712158, + "learning_rate": 0.00013235879034559467, + "loss": 0.7816, + "step": 3475 + }, + { + "epoch": 0.4, + "grad_norm": 0.5959500670433044, + "learning_rate": 0.0001321897468395049, + "loss": 0.7365, + "step": 3480 + }, + { + "epoch": 0.4, + "grad_norm": 0.6647725701332092, + "learning_rate": 0.0001320206006534974, + "loss": 0.8132, + "step": 3485 + }, + { + "epoch": 0.4, + "grad_norm": 0.7699481248855591, + "learning_rate": 0.00013185135232712022, + "loss": 0.7308, + "step": 3490 + }, + { + "epoch": 0.4, + "grad_norm": 0.5117635130882263, + "learning_rate": 0.00013168200240024728, + "loss": 0.7804, + "step": 3495 + }, + { + "epoch": 0.4, + "grad_norm": 0.6221270561218262, + "learning_rate": 0.00013151255141307657, + "loss": 0.8068, + "step": 3500 + }, + { + "epoch": 0.4, + "grad_norm": 0.6052716374397278, + "learning_rate": 0.0001313429999061284, + "loss": 0.7677, + "step": 3505 + }, + { + "epoch": 0.4, + "grad_norm": 0.6011590361595154, + "learning_rate": 0.00013117334842024385, + "loss": 0.8228, + "step": 3510 + }, + { + "epoch": 0.4, + "grad_norm": 0.6260893940925598, + "learning_rate": 0.0001310035974965828, + "loss": 0.8167, + "step": 3515 + }, + { + "epoch": 0.4, + "grad_norm": 0.5733935832977295, + "learning_rate": 0.0001308337476766223, + "loss": 0.8315, + "step": 3520 + }, + { + "epoch": 0.4, + "grad_norm": 0.6916394233703613, + "learning_rate": 0.00013066379950215498, + "loss": 0.735, + "step": 3525 + }, + { + "epoch": 0.4, + "grad_norm": 0.7169948816299438, + "learning_rate": 0.0001304937535152871, + "loss": 0.7433, + "step": 3530 + }, + { + "epoch": 0.4, + "grad_norm": 0.5684279203414917, + "learning_rate": 0.00013032361025843705, + "loss": 0.7539, + "step": 3535 + }, + { + "epoch": 0.4, + "grad_norm": 0.7120059728622437, + "learning_rate": 0.0001301533702743333, + "loss": 0.913, + "step": 3540 + }, + { + "epoch": 0.4, + "grad_norm": 0.6004089713096619, + "learning_rate": 0.000129983034106013, + "loss": 0.7089, + "step": 3545 + }, + { + "epoch": 0.4, + "grad_norm": 0.5413163900375366, + "learning_rate": 0.00012981260229682018, + "loss": 0.7601, + "step": 3550 + }, + { + "epoch": 0.4, + "grad_norm": 0.5603556036949158, + "learning_rate": 0.0001296420753904037, + "loss": 0.8225, + "step": 3555 + }, + { + "epoch": 0.4, + "grad_norm": 0.6073338389396667, + "learning_rate": 0.00012947145393071608, + "loss": 0.8175, + "step": 3560 + }, + { + "epoch": 0.41, + "grad_norm": 0.5353379249572754, + "learning_rate": 0.00012930073846201116, + "loss": 0.7516, + "step": 3565 + }, + { + "epoch": 0.41, + "grad_norm": 0.5886621475219727, + "learning_rate": 0.00012912992952884283, + "loss": 0.8678, + "step": 3570 + }, + { + "epoch": 0.41, + "grad_norm": 0.5950932502746582, + "learning_rate": 0.0001289590276760631, + "loss": 0.7118, + "step": 3575 + }, + { + "epoch": 0.41, + "grad_norm": 0.6099336743354797, + "learning_rate": 0.00012878803344882028, + "loss": 0.909, + "step": 3580 + }, + { + "epoch": 0.41, + "grad_norm": 0.662491500377655, + "learning_rate": 0.00012861694739255746, + "loss": 0.8293, + "step": 3585 + }, + { + "epoch": 0.41, + "grad_norm": 0.49604055285453796, + "learning_rate": 0.00012844577005301054, + "loss": 0.7738, + "step": 3590 + }, + { + "epoch": 0.41, + "grad_norm": 0.5976107716560364, + "learning_rate": 0.00012827450197620672, + "loss": 0.7678, + "step": 3595 + }, + { + "epoch": 0.41, + "grad_norm": 0.6851217150688171, + "learning_rate": 0.00012810314370846252, + "loss": 0.7946, + "step": 3600 + }, + { + "epoch": 0.41, + "grad_norm": 0.6138992309570312, + "learning_rate": 0.00012793169579638223, + "loss": 0.7757, + "step": 3605 + }, + { + "epoch": 0.41, + "grad_norm": 0.5814104080200195, + "learning_rate": 0.00012776015878685604, + "loss": 0.8763, + "step": 3610 + }, + { + "epoch": 0.41, + "grad_norm": 0.7025929689407349, + "learning_rate": 0.00012758853322705836, + "loss": 0.7614, + "step": 3615 + }, + { + "epoch": 0.41, + "grad_norm": 0.7081409692764282, + "learning_rate": 0.00012741681966444609, + "loss": 0.9598, + "step": 3620 + }, + { + "epoch": 0.41, + "grad_norm": 0.5396503210067749, + "learning_rate": 0.0001272450186467568, + "loss": 0.7691, + "step": 3625 + }, + { + "epoch": 0.41, + "grad_norm": 0.5764672160148621, + "learning_rate": 0.0001270731307220071, + "loss": 0.7361, + "step": 3630 + }, + { + "epoch": 0.41, + "grad_norm": 0.5809180736541748, + "learning_rate": 0.00012690115643849078, + "loss": 0.7498, + "step": 3635 + }, + { + "epoch": 0.41, + "grad_norm": 0.8152880668640137, + "learning_rate": 0.000126729096344777, + "loss": 0.8225, + "step": 3640 + }, + { + "epoch": 0.41, + "grad_norm": 0.6310808658599854, + "learning_rate": 0.0001265569509897088, + "loss": 0.9078, + "step": 3645 + }, + { + "epoch": 0.42, + "grad_norm": 0.540705144405365, + "learning_rate": 0.00012638472092240112, + "loss": 0.7529, + "step": 3650 + }, + { + "epoch": 0.42, + "grad_norm": 0.5447615385055542, + "learning_rate": 0.00012621240669223905, + "loss": 0.8246, + "step": 3655 + }, + { + "epoch": 0.42, + "grad_norm": 0.5487338900566101, + "learning_rate": 0.00012604000884887634, + "loss": 0.8896, + "step": 3660 + }, + { + "epoch": 0.42, + "grad_norm": 0.5725840926170349, + "learning_rate": 0.0001258675279422332, + "loss": 0.8124, + "step": 3665 + }, + { + "epoch": 0.42, + "grad_norm": 0.5746605396270752, + "learning_rate": 0.00012569496452249497, + "loss": 0.8167, + "step": 3670 + }, + { + "epoch": 0.42, + "grad_norm": 0.6197704672813416, + "learning_rate": 0.00012552231914011015, + "loss": 0.7547, + "step": 3675 + }, + { + "epoch": 0.42, + "grad_norm": 0.5929337739944458, + "learning_rate": 0.0001253495923457887, + "loss": 0.8175, + "step": 3680 + }, + { + "epoch": 0.42, + "grad_norm": 0.6313364505767822, + "learning_rate": 0.00012517678469050022, + "loss": 0.8266, + "step": 3685 + }, + { + "epoch": 0.42, + "grad_norm": 0.6353349685668945, + "learning_rate": 0.00012500389672547233, + "loss": 0.7939, + "step": 3690 + }, + { + "epoch": 0.42, + "grad_norm": 0.7956532835960388, + "learning_rate": 0.00012483092900218872, + "loss": 0.7978, + "step": 3695 + }, + { + "epoch": 0.42, + "grad_norm": 0.6404840350151062, + "learning_rate": 0.00012465788207238754, + "loss": 0.8911, + "step": 3700 + }, + { + "epoch": 0.42, + "grad_norm": 0.6020888686180115, + "learning_rate": 0.00012448475648805965, + "loss": 0.8003, + "step": 3705 + }, + { + "epoch": 0.42, + "grad_norm": 0.5578533411026001, + "learning_rate": 0.0001243115528014467, + "loss": 0.7608, + "step": 3710 + }, + { + "epoch": 0.42, + "grad_norm": 0.5616059899330139, + "learning_rate": 0.0001241382715650396, + "loss": 0.8111, + "step": 3715 + }, + { + "epoch": 0.42, + "grad_norm": 0.5208618640899658, + "learning_rate": 0.00012396491333157653, + "loss": 0.7848, + "step": 3720 + }, + { + "epoch": 0.42, + "grad_norm": 0.6344425082206726, + "learning_rate": 0.00012379147865404126, + "loss": 0.82, + "step": 3725 + }, + { + "epoch": 0.42, + "grad_norm": 0.6129960417747498, + "learning_rate": 0.00012361796808566154, + "loss": 0.8048, + "step": 3730 + }, + { + "epoch": 0.42, + "grad_norm": 0.5904735922813416, + "learning_rate": 0.00012344438217990706, + "loss": 0.8064, + "step": 3735 + }, + { + "epoch": 0.43, + "grad_norm": 0.6212365031242371, + "learning_rate": 0.00012327072149048785, + "loss": 0.7793, + "step": 3740 + }, + { + "epoch": 0.43, + "grad_norm": 0.5697383284568787, + "learning_rate": 0.00012309698657135264, + "loss": 0.8082, + "step": 3745 + }, + { + "epoch": 0.43, + "grad_norm": 0.5733943581581116, + "learning_rate": 0.00012292317797668665, + "loss": 0.8163, + "step": 3750 + }, + { + "epoch": 0.43, + "grad_norm": 0.7434484362602234, + "learning_rate": 0.00012274929626091035, + "loss": 0.8446, + "step": 3755 + }, + { + "epoch": 0.43, + "grad_norm": 0.5016825795173645, + "learning_rate": 0.00012257534197867743, + "loss": 0.731, + "step": 3760 + }, + { + "epoch": 0.43, + "grad_norm": 0.6584892868995667, + "learning_rate": 0.00012240131568487292, + "loss": 0.7483, + "step": 3765 + }, + { + "epoch": 0.43, + "grad_norm": 0.5814744234085083, + "learning_rate": 0.0001222272179346117, + "loss": 0.718, + "step": 3770 + }, + { + "epoch": 0.43, + "grad_norm": 0.7187736630439758, + "learning_rate": 0.00012205304928323649, + "loss": 0.8388, + "step": 3775 + }, + { + "epoch": 0.43, + "grad_norm": 0.6147611141204834, + "learning_rate": 0.00012187881028631621, + "loss": 0.8159, + "step": 3780 + }, + { + "epoch": 0.43, + "grad_norm": 0.5736737251281738, + "learning_rate": 0.0001217045014996442, + "loss": 0.6625, + "step": 3785 + }, + { + "epoch": 0.43, + "grad_norm": 0.6499507427215576, + "learning_rate": 0.00012153012347923634, + "loss": 0.8721, + "step": 3790 + }, + { + "epoch": 0.43, + "grad_norm": 0.6190776824951172, + "learning_rate": 0.00012135567678132942, + "loss": 0.7648, + "step": 3795 + }, + { + "epoch": 0.43, + "grad_norm": 0.6103724837303162, + "learning_rate": 0.0001211811619623793, + "loss": 0.7944, + "step": 3800 + }, + { + "epoch": 0.43, + "grad_norm": 0.5723038911819458, + "learning_rate": 0.00012100657957905908, + "loss": 0.7289, + "step": 3805 + }, + { + "epoch": 0.43, + "grad_norm": 0.5608327984809875, + "learning_rate": 0.00012083193018825744, + "loss": 0.8117, + "step": 3810 + }, + { + "epoch": 0.43, + "grad_norm": 0.5121841430664062, + "learning_rate": 0.00012065721434707677, + "loss": 0.9014, + "step": 3815 + }, + { + "epoch": 0.43, + "grad_norm": 0.6049484014511108, + "learning_rate": 0.00012048243261283143, + "loss": 0.7161, + "step": 3820 + }, + { + "epoch": 0.43, + "grad_norm": 0.6480629444122314, + "learning_rate": 0.00012030758554304593, + "loss": 0.8718, + "step": 3825 + }, + { + "epoch": 0.44, + "grad_norm": 0.6752771735191345, + "learning_rate": 0.00012013267369545329, + "loss": 0.8241, + "step": 3830 + }, + { + "epoch": 0.44, + "grad_norm": 0.6116204261779785, + "learning_rate": 0.00011995769762799307, + "loss": 0.8426, + "step": 3835 + }, + { + "epoch": 0.44, + "grad_norm": 0.6204956769943237, + "learning_rate": 0.00011978265789880973, + "loss": 0.8223, + "step": 3840 + }, + { + "epoch": 0.44, + "grad_norm": 0.6253282427787781, + "learning_rate": 0.00011960755506625077, + "loss": 0.7238, + "step": 3845 + }, + { + "epoch": 0.44, + "grad_norm": 0.7981095314025879, + "learning_rate": 0.00011943238968886492, + "loss": 0.7958, + "step": 3850 + }, + { + "epoch": 0.44, + "grad_norm": 0.67615807056427, + "learning_rate": 0.00011925716232540061, + "loss": 0.8668, + "step": 3855 + }, + { + "epoch": 0.44, + "grad_norm": 0.6348207592964172, + "learning_rate": 0.0001190818735348038, + "loss": 0.8649, + "step": 3860 + }, + { + "epoch": 0.44, + "grad_norm": 0.8550826907157898, + "learning_rate": 0.00011890652387621643, + "loss": 0.7417, + "step": 3865 + }, + { + "epoch": 0.44, + "grad_norm": 0.638140082359314, + "learning_rate": 0.00011873111390897475, + "loss": 0.8436, + "step": 3870 + }, + { + "epoch": 0.44, + "grad_norm": 0.7016919255256653, + "learning_rate": 0.00011855564419260714, + "loss": 0.7805, + "step": 3875 + }, + { + "epoch": 0.44, + "grad_norm": 0.6126405000686646, + "learning_rate": 0.00011838011528683279, + "loss": 0.8705, + "step": 3880 + }, + { + "epoch": 0.44, + "grad_norm": 0.6007192730903625, + "learning_rate": 0.00011820452775155957, + "loss": 0.7607, + "step": 3885 + }, + { + "epoch": 0.44, + "grad_norm": 0.5800730586051941, + "learning_rate": 0.00011802888214688235, + "loss": 0.8891, + "step": 3890 + }, + { + "epoch": 0.44, + "grad_norm": 0.6240999102592468, + "learning_rate": 0.00011785317903308137, + "loss": 0.729, + "step": 3895 + }, + { + "epoch": 0.44, + "grad_norm": 0.5820502638816833, + "learning_rate": 0.00011767741897062017, + "loss": 0.7398, + "step": 3900 + }, + { + "epoch": 0.44, + "grad_norm": 0.7410991787910461, + "learning_rate": 0.00011750160252014402, + "loss": 0.7372, + "step": 3905 + }, + { + "epoch": 0.44, + "grad_norm": 0.6214268207550049, + "learning_rate": 0.00011732573024247804, + "loss": 0.7226, + "step": 3910 + }, + { + "epoch": 0.45, + "grad_norm": 0.7160118222236633, + "learning_rate": 0.00011714980269862538, + "loss": 0.8079, + "step": 3915 + }, + { + "epoch": 0.45, + "grad_norm": 0.6754052042961121, + "learning_rate": 0.00011697382044976564, + "loss": 0.7681, + "step": 3920 + }, + { + "epoch": 0.45, + "grad_norm": 0.6616350412368774, + "learning_rate": 0.00011679778405725274, + "loss": 0.8621, + "step": 3925 + }, + { + "epoch": 0.45, + "grad_norm": 0.452217698097229, + "learning_rate": 0.00011662169408261339, + "loss": 0.7792, + "step": 3930 + }, + { + "epoch": 0.45, + "grad_norm": 0.6552872061729431, + "learning_rate": 0.00011644555108754517, + "loss": 0.8509, + "step": 3935 + }, + { + "epoch": 0.45, + "grad_norm": 0.6112470626831055, + "learning_rate": 0.0001162693556339149, + "loss": 0.852, + "step": 3940 + }, + { + "epoch": 0.45, + "grad_norm": 0.6591911315917969, + "learning_rate": 0.00011609310828375661, + "loss": 0.8543, + "step": 3945 + }, + { + "epoch": 0.45, + "grad_norm": 0.6253312826156616, + "learning_rate": 0.00011591680959926994, + "loss": 0.79, + "step": 3950 + }, + { + "epoch": 0.45, + "grad_norm": 0.5114595890045166, + "learning_rate": 0.00011574046014281823, + "loss": 0.8251, + "step": 3955 + }, + { + "epoch": 0.45, + "grad_norm": 0.5724785923957825, + "learning_rate": 0.0001155640604769268, + "loss": 0.8662, + "step": 3960 + }, + { + "epoch": 0.45, + "grad_norm": 0.6301324963569641, + "learning_rate": 0.00011538761116428118, + "loss": 0.7555, + "step": 3965 + }, + { + "epoch": 0.45, + "grad_norm": 0.74462890625, + "learning_rate": 0.00011521111276772518, + "loss": 0.9435, + "step": 3970 + }, + { + "epoch": 0.45, + "grad_norm": 0.7058918476104736, + "learning_rate": 0.00011503456585025918, + "loss": 0.8144, + "step": 3975 + }, + { + "epoch": 0.45, + "grad_norm": 0.5204983949661255, + "learning_rate": 0.00011485797097503848, + "loss": 0.894, + "step": 3980 + }, + { + "epoch": 0.45, + "grad_norm": 0.6754153370857239, + "learning_rate": 0.00011468132870537112, + "loss": 0.8955, + "step": 3985 + }, + { + "epoch": 0.45, + "grad_norm": 0.5927032232284546, + "learning_rate": 0.00011450463960471651, + "loss": 0.82, + "step": 3990 + }, + { + "epoch": 0.45, + "grad_norm": 0.5471856594085693, + "learning_rate": 0.00011432790423668338, + "loss": 0.7896, + "step": 3995 + }, + { + "epoch": 0.45, + "grad_norm": 0.5610169768333435, + "learning_rate": 0.00011415112316502803, + "loss": 0.6074, + "step": 4000 + }, + { + "epoch": 0.46, + "grad_norm": 0.6560962200164795, + "learning_rate": 0.0001139742969536526, + "loss": 0.7843, + "step": 4005 + }, + { + "epoch": 0.46, + "grad_norm": 0.6780028939247131, + "learning_rate": 0.0001137974261666031, + "loss": 0.851, + "step": 4010 + }, + { + "epoch": 0.46, + "grad_norm": 0.6535429954528809, + "learning_rate": 0.00011362051136806789, + "loss": 0.7268, + "step": 4015 + }, + { + "epoch": 0.46, + "grad_norm": 0.7623457908630371, + "learning_rate": 0.0001134435531223756, + "loss": 0.8387, + "step": 4020 + }, + { + "epoch": 0.46, + "grad_norm": 0.5498214364051819, + "learning_rate": 0.00011326655199399345, + "loss": 0.7551, + "step": 4025 + }, + { + "epoch": 0.46, + "grad_norm": 0.6058107614517212, + "learning_rate": 0.00011308950854752558, + "loss": 0.765, + "step": 4030 + }, + { + "epoch": 0.46, + "grad_norm": 0.5960306525230408, + "learning_rate": 0.00011291242334771095, + "loss": 0.8633, + "step": 4035 + }, + { + "epoch": 0.46, + "grad_norm": 0.6701599359512329, + "learning_rate": 0.00011273529695942183, + "loss": 0.7851, + "step": 4040 + }, + { + "epoch": 0.46, + "grad_norm": 0.5869520902633667, + "learning_rate": 0.00011255812994766175, + "loss": 0.8756, + "step": 4045 + }, + { + "epoch": 0.46, + "grad_norm": 0.6017047762870789, + "learning_rate": 0.00011238092287756397, + "loss": 0.6697, + "step": 4050 + }, + { + "epoch": 0.46, + "grad_norm": 0.7067684531211853, + "learning_rate": 0.00011220367631438942, + "loss": 0.7732, + "step": 4055 + }, + { + "epoch": 0.46, + "grad_norm": 0.6048575043678284, + "learning_rate": 0.00011202639082352506, + "loss": 0.7937, + "step": 4060 + }, + { + "epoch": 0.46, + "grad_norm": 0.6676318049430847, + "learning_rate": 0.00011184906697048201, + "loss": 0.8011, + "step": 4065 + }, + { + "epoch": 0.46, + "grad_norm": 0.5223115086555481, + "learning_rate": 0.00011167170532089369, + "loss": 0.7453, + "step": 4070 + }, + { + "epoch": 0.46, + "grad_norm": 0.5543930530548096, + "learning_rate": 0.00011149430644051424, + "loss": 0.8195, + "step": 4075 + }, + { + "epoch": 0.46, + "grad_norm": 0.8007245063781738, + "learning_rate": 0.0001113168708952164, + "loss": 0.889, + "step": 4080 + }, + { + "epoch": 0.46, + "grad_norm": 0.5854726433753967, + "learning_rate": 0.00011113939925098997, + "loss": 0.8129, + "step": 4085 + }, + { + "epoch": 0.47, + "grad_norm": 0.5896696448326111, + "learning_rate": 0.00011096189207393987, + "loss": 0.7341, + "step": 4090 + }, + { + "epoch": 0.47, + "grad_norm": 0.6035794019699097, + "learning_rate": 0.00011078434993028431, + "loss": 0.7217, + "step": 4095 + }, + { + "epoch": 0.47, + "grad_norm": 0.5358508825302124, + "learning_rate": 0.0001106067733863531, + "loss": 0.767, + "step": 4100 + }, + { + "epoch": 0.47, + "grad_norm": 0.6229228973388672, + "learning_rate": 0.00011042916300858583, + "loss": 0.7915, + "step": 4105 + }, + { + "epoch": 0.47, + "grad_norm": 0.5339498519897461, + "learning_rate": 0.00011025151936352987, + "loss": 0.7711, + "step": 4110 + }, + { + "epoch": 0.47, + "grad_norm": 0.8428621292114258, + "learning_rate": 0.00011007384301783883, + "loss": 0.8321, + "step": 4115 + }, + { + "epoch": 0.47, + "grad_norm": 0.6193743348121643, + "learning_rate": 0.00010989613453827057, + "loss": 0.7195, + "step": 4120 + }, + { + "epoch": 0.47, + "grad_norm": 0.7001969814300537, + "learning_rate": 0.00010971839449168543, + "loss": 0.7381, + "step": 4125 + }, + { + "epoch": 0.47, + "grad_norm": 0.6056939959526062, + "learning_rate": 0.00010954062344504458, + "loss": 0.8131, + "step": 4130 + }, + { + "epoch": 0.47, + "grad_norm": 0.7164072394371033, + "learning_rate": 0.00010936282196540788, + "loss": 0.8643, + "step": 4135 + }, + { + "epoch": 0.47, + "grad_norm": 0.6174270510673523, + "learning_rate": 0.00010918499061993241, + "loss": 0.7746, + "step": 4140 + }, + { + "epoch": 0.47, + "grad_norm": 0.602906346321106, + "learning_rate": 0.00010900712997587047, + "loss": 0.8276, + "step": 4145 + }, + { + "epoch": 0.47, + "grad_norm": 0.6625570058822632, + "learning_rate": 0.0001088292406005678, + "loss": 0.8349, + "step": 4150 + }, + { + "epoch": 0.47, + "grad_norm": 0.6430180072784424, + "learning_rate": 0.00010865132306146182, + "loss": 0.911, + "step": 4155 + }, + { + "epoch": 0.47, + "grad_norm": 0.6040902733802795, + "learning_rate": 0.00010847337792607978, + "loss": 0.7391, + "step": 4160 + }, + { + "epoch": 0.47, + "grad_norm": 0.6236419081687927, + "learning_rate": 0.00010829540576203695, + "loss": 0.7698, + "step": 4165 + }, + { + "epoch": 0.47, + "grad_norm": 0.704279363155365, + "learning_rate": 0.00010811740713703476, + "loss": 0.7582, + "step": 4170 + }, + { + "epoch": 0.47, + "grad_norm": 0.5443447232246399, + "learning_rate": 0.00010793938261885916, + "loss": 0.6971, + "step": 4175 + }, + { + "epoch": 0.48, + "grad_norm": 0.5762811303138733, + "learning_rate": 0.00010776133277537865, + "loss": 0.7751, + "step": 4180 + }, + { + "epoch": 0.48, + "grad_norm": 0.5704807639122009, + "learning_rate": 0.00010758325817454248, + "loss": 0.7032, + "step": 4185 + }, + { + "epoch": 0.48, + "grad_norm": 0.5983087420463562, + "learning_rate": 0.0001074051593843789, + "loss": 0.676, + "step": 4190 + }, + { + "epoch": 0.48, + "grad_norm": 0.5941609740257263, + "learning_rate": 0.00010722703697299328, + "loss": 0.7671, + "step": 4195 + }, + { + "epoch": 0.48, + "grad_norm": 0.6191790103912354, + "learning_rate": 0.0001070488915085664, + "loss": 0.7947, + "step": 4200 + }, + { + "epoch": 0.48, + "grad_norm": 0.6260554194450378, + "learning_rate": 0.00010687072355935257, + "loss": 0.88, + "step": 4205 + }, + { + "epoch": 0.48, + "grad_norm": 0.5160613059997559, + "learning_rate": 0.00010669253369367775, + "loss": 0.7526, + "step": 4210 + }, + { + "epoch": 0.48, + "grad_norm": 0.5811184644699097, + "learning_rate": 0.00010651432247993794, + "loss": 0.7775, + "step": 4215 + }, + { + "epoch": 0.48, + "grad_norm": 0.5606276392936707, + "learning_rate": 0.00010633609048659705, + "loss": 0.8119, + "step": 4220 + }, + { + "epoch": 0.48, + "grad_norm": 0.6187065243721008, + "learning_rate": 0.00010615783828218547, + "loss": 0.8063, + "step": 4225 + }, + { + "epoch": 0.48, + "grad_norm": 0.5912905931472778, + "learning_rate": 0.0001059795664352979, + "loss": 0.7661, + "step": 4230 + }, + { + "epoch": 0.48, + "grad_norm": 0.7005530595779419, + "learning_rate": 0.00010580127551459178, + "loss": 0.7361, + "step": 4235 + }, + { + "epoch": 0.48, + "grad_norm": 0.58009934425354, + "learning_rate": 0.00010562296608878545, + "loss": 0.7438, + "step": 4240 + }, + { + "epoch": 0.48, + "grad_norm": 0.7159207463264465, + "learning_rate": 0.00010544463872665611, + "loss": 0.8424, + "step": 4245 + }, + { + "epoch": 0.48, + "grad_norm": 0.5975012183189392, + "learning_rate": 0.00010526629399703833, + "loss": 0.8002, + "step": 4250 + }, + { + "epoch": 0.48, + "grad_norm": 0.7247589230537415, + "learning_rate": 0.00010508793246882202, + "loss": 0.7255, + "step": 4255 + }, + { + "epoch": 0.48, + "grad_norm": 0.6801584959030151, + "learning_rate": 0.0001049095547109506, + "loss": 0.8668, + "step": 4260 + }, + { + "epoch": 0.48, + "grad_norm": 0.596948504447937, + "learning_rate": 0.00010473116129241944, + "loss": 0.7466, + "step": 4265 + }, + { + "epoch": 0.49, + "grad_norm": 0.8162465691566467, + "learning_rate": 0.0001045527527822737, + "loss": 0.7843, + "step": 4270 + }, + { + "epoch": 0.49, + "grad_norm": 0.6837438941001892, + "learning_rate": 0.00010437432974960674, + "loss": 0.8472, + "step": 4275 + }, + { + "epoch": 0.49, + "grad_norm": 0.6489061713218689, + "learning_rate": 0.00010419589276355826, + "loss": 0.8139, + "step": 4280 + }, + { + "epoch": 0.49, + "grad_norm": 0.6814378499984741, + "learning_rate": 0.00010401744239331243, + "loss": 0.7912, + "step": 4285 + }, + { + "epoch": 0.49, + "grad_norm": 0.6457976698875427, + "learning_rate": 0.00010383897920809618, + "loss": 0.9099, + "step": 4290 + }, + { + "epoch": 0.49, + "grad_norm": 0.547829806804657, + "learning_rate": 0.00010366050377717722, + "loss": 0.7389, + "step": 4295 + }, + { + "epoch": 0.49, + "grad_norm": 0.6295444965362549, + "learning_rate": 0.00010348201666986241, + "loss": 0.8012, + "step": 4300 + }, + { + "epoch": 0.49, + "grad_norm": 0.6654611825942993, + "learning_rate": 0.00010330351845549578, + "loss": 0.9024, + "step": 4305 + }, + { + "epoch": 0.49, + "grad_norm": 0.5944251418113708, + "learning_rate": 0.00010312500970345688, + "loss": 0.7721, + "step": 4310 + }, + { + "epoch": 0.49, + "grad_norm": 0.6546233296394348, + "learning_rate": 0.0001029464909831588, + "loss": 0.7065, + "step": 4315 + }, + { + "epoch": 0.49, + "grad_norm": 0.6331318020820618, + "learning_rate": 0.00010276796286404644, + "loss": 0.7166, + "step": 4320 + }, + { + "epoch": 0.49, + "grad_norm": 0.9102218151092529, + "learning_rate": 0.00010258942591559475, + "loss": 0.8608, + "step": 4325 + }, + { + "epoch": 0.49, + "grad_norm": 0.5956624150276184, + "learning_rate": 0.00010241088070730669, + "loss": 0.7382, + "step": 4330 + }, + { + "epoch": 0.49, + "grad_norm": 0.6455515623092651, + "learning_rate": 0.00010223232780871173, + "loss": 0.8699, + "step": 4335 + }, + { + "epoch": 0.49, + "grad_norm": 0.6276233196258545, + "learning_rate": 0.00010205376778936379, + "loss": 0.8406, + "step": 4340 + }, + { + "epoch": 0.49, + "grad_norm": 0.6347047090530396, + "learning_rate": 0.0001018752012188395, + "loss": 0.7482, + "step": 4345 + }, + { + "epoch": 0.49, + "grad_norm": 0.6462671756744385, + "learning_rate": 0.00010169662866673646, + "loss": 0.7888, + "step": 4350 + }, + { + "epoch": 0.5, + "grad_norm": 0.7501237392425537, + "learning_rate": 0.00010151805070267121, + "loss": 0.7205, + "step": 4355 + }, + { + "epoch": 0.5, + "grad_norm": 0.6664027571678162, + "learning_rate": 0.00010133946789627773, + "loss": 0.7484, + "step": 4360 + }, + { + "epoch": 0.5, + "grad_norm": 0.5977053642272949, + "learning_rate": 0.00010116088081720527, + "loss": 0.7813, + "step": 4365 + }, + { + "epoch": 0.5, + "grad_norm": 0.5906143188476562, + "learning_rate": 0.00010098229003511683, + "loss": 0.7464, + "step": 4370 + }, + { + "epoch": 0.5, + "grad_norm": 0.6777781844139099, + "learning_rate": 0.00010080369611968723, + "loss": 0.7871, + "step": 4375 + }, + { + "epoch": 0.5, + "grad_norm": 0.6508293747901917, + "learning_rate": 0.00010062509964060118, + "loss": 0.7998, + "step": 4380 + }, + { + "epoch": 0.5, + "grad_norm": 0.6403629779815674, + "learning_rate": 0.00010044650116755165, + "loss": 0.7488, + "step": 4385 + }, + { + "epoch": 0.5, + "grad_norm": 0.5627826452255249, + "learning_rate": 0.00010026790127023793, + "loss": 0.7261, + "step": 4390 + }, + { + "epoch": 0.5, + "grad_norm": 0.5952783823013306, + "learning_rate": 0.0001000893005183639, + "loss": 0.7672, + "step": 4395 + }, + { + "epoch": 0.5, + "grad_norm": 0.5621878504753113, + "learning_rate": 9.991069948163614e-05, + "loss": 0.7098, + "step": 4400 + }, + { + "epoch": 0.5, + "grad_norm": 0.7619518041610718, + "learning_rate": 9.97320987297621e-05, + "loss": 0.8246, + "step": 4405 + }, + { + "epoch": 0.5, + "grad_norm": 0.6683063507080078, + "learning_rate": 9.955349883244837e-05, + "loss": 0.7404, + "step": 4410 + }, + { + "epoch": 0.5, + "grad_norm": 0.5906397104263306, + "learning_rate": 9.937490035939885e-05, + "loss": 0.7627, + "step": 4415 + }, + { + "epoch": 0.5, + "grad_norm": 0.6680077910423279, + "learning_rate": 9.919630388031278e-05, + "loss": 0.7825, + "step": 4420 + }, + { + "epoch": 0.5, + "grad_norm": 0.7676994800567627, + "learning_rate": 9.901770996488315e-05, + "loss": 0.8636, + "step": 4425 + }, + { + "epoch": 0.5, + "grad_norm": 0.7993883490562439, + "learning_rate": 9.883911918279476e-05, + "loss": 0.8637, + "step": 4430 + }, + { + "epoch": 0.5, + "grad_norm": 0.6810579895973206, + "learning_rate": 9.86605321037223e-05, + "loss": 0.7691, + "step": 4435 + }, + { + "epoch": 0.5, + "grad_norm": 0.6630615592002869, + "learning_rate": 9.84819492973288e-05, + "loss": 0.8173, + "step": 4440 + }, + { + "epoch": 0.51, + "grad_norm": 0.6403682231903076, + "learning_rate": 9.830337133326355e-05, + "loss": 0.9035, + "step": 4445 + }, + { + "epoch": 0.51, + "grad_norm": 0.5860075354576111, + "learning_rate": 9.81247987811605e-05, + "loss": 0.8773, + "step": 4450 + }, + { + "epoch": 0.51, + "grad_norm": 0.5799443125724792, + "learning_rate": 9.794623221063625e-05, + "loss": 0.8335, + "step": 4455 + }, + { + "epoch": 0.51, + "grad_norm": 0.6200720071792603, + "learning_rate": 9.776767219128828e-05, + "loss": 0.7708, + "step": 4460 + }, + { + "epoch": 0.51, + "grad_norm": 0.6964035630226135, + "learning_rate": 9.758911929269334e-05, + "loss": 0.8915, + "step": 4465 + }, + { + "epoch": 0.51, + "grad_norm": 0.5871273875236511, + "learning_rate": 9.741057408440528e-05, + "loss": 0.7691, + "step": 4470 + }, + { + "epoch": 0.51, + "grad_norm": 0.5692684054374695, + "learning_rate": 9.723203713595355e-05, + "loss": 0.7232, + "step": 4475 + }, + { + "epoch": 0.51, + "grad_norm": 0.6924221515655518, + "learning_rate": 9.705350901684119e-05, + "loss": 0.819, + "step": 4480 + }, + { + "epoch": 0.51, + "grad_norm": 0.6366986036300659, + "learning_rate": 9.687499029654314e-05, + "loss": 0.6907, + "step": 4485 + }, + { + "epoch": 0.51, + "grad_norm": 0.6119223237037659, + "learning_rate": 9.669648154450425e-05, + "loss": 0.8127, + "step": 4490 + }, + { + "epoch": 0.51, + "grad_norm": 0.6606696248054504, + "learning_rate": 9.651798333013762e-05, + "loss": 0.7755, + "step": 4495 + }, + { + "epoch": 0.51, + "grad_norm": 0.566644549369812, + "learning_rate": 9.63394962228228e-05, + "loss": 0.9107, + "step": 4500 + }, + { + "epoch": 0.51, + "grad_norm": 0.6656118035316467, + "learning_rate": 9.616102079190382e-05, + "loss": 0.7586, + "step": 4505 + }, + { + "epoch": 0.51, + "grad_norm": 0.5353782773017883, + "learning_rate": 9.598255760668758e-05, + "loss": 0.6815, + "step": 4510 + }, + { + "epoch": 0.51, + "grad_norm": 0.7555747032165527, + "learning_rate": 9.580410723644177e-05, + "loss": 0.9274, + "step": 4515 + }, + { + "epoch": 0.51, + "grad_norm": 0.5931302905082703, + "learning_rate": 9.562567025039327e-05, + "loss": 0.7938, + "step": 4520 + }, + { + "epoch": 0.51, + "grad_norm": 0.6406871676445007, + "learning_rate": 9.544724721772631e-05, + "loss": 0.8135, + "step": 4525 + }, + { + "epoch": 0.52, + "grad_norm": 0.5799956321716309, + "learning_rate": 9.526883870758056e-05, + "loss": 0.7286, + "step": 4530 + }, + { + "epoch": 0.52, + "grad_norm": 0.6960060596466064, + "learning_rate": 9.50904452890494e-05, + "loss": 0.8437, + "step": 4535 + }, + { + "epoch": 0.52, + "grad_norm": 0.5997190475463867, + "learning_rate": 9.491206753117803e-05, + "loss": 0.769, + "step": 4540 + }, + { + "epoch": 0.52, + "grad_norm": 0.5737845301628113, + "learning_rate": 9.473370600296169e-05, + "loss": 0.7596, + "step": 4545 + }, + { + "epoch": 0.52, + "grad_norm": 0.7565434575080872, + "learning_rate": 9.45553612733439e-05, + "loss": 0.7319, + "step": 4550 + }, + { + "epoch": 0.52, + "grad_norm": 0.6088534593582153, + "learning_rate": 9.437703391121456e-05, + "loss": 0.7568, + "step": 4555 + }, + { + "epoch": 0.52, + "grad_norm": 0.6066818833351135, + "learning_rate": 9.419872448540821e-05, + "loss": 0.6767, + "step": 4560 + }, + { + "epoch": 0.52, + "grad_norm": 0.5788107514381409, + "learning_rate": 9.402043356470215e-05, + "loss": 0.7041, + "step": 4565 + }, + { + "epoch": 0.52, + "grad_norm": 0.7555676102638245, + "learning_rate": 9.384216171781457e-05, + "loss": 0.6752, + "step": 4570 + }, + { + "epoch": 0.52, + "grad_norm": 0.5866352915763855, + "learning_rate": 9.366390951340297e-05, + "loss": 0.8096, + "step": 4575 + }, + { + "epoch": 0.52, + "grad_norm": 0.5765191912651062, + "learning_rate": 9.348567752006207e-05, + "loss": 0.7834, + "step": 4580 + }, + { + "epoch": 0.52, + "grad_norm": 0.5642898082733154, + "learning_rate": 9.330746630632224e-05, + "loss": 0.8147, + "step": 4585 + }, + { + "epoch": 0.52, + "grad_norm": 0.7535125017166138, + "learning_rate": 9.312927644064741e-05, + "loss": 0.8679, + "step": 4590 + }, + { + "epoch": 0.52, + "grad_norm": 0.6822525262832642, + "learning_rate": 9.295110849143361e-05, + "loss": 0.8211, + "step": 4595 + }, + { + "epoch": 0.52, + "grad_norm": 0.6119083762168884, + "learning_rate": 9.277296302700676e-05, + "loss": 0.7726, + "step": 4600 + }, + { + "epoch": 0.52, + "grad_norm": 0.6004748344421387, + "learning_rate": 9.259484061562113e-05, + "loss": 0.7189, + "step": 4605 + }, + { + "epoch": 0.52, + "grad_norm": 0.5561325550079346, + "learning_rate": 9.241674182545754e-05, + "loss": 0.7632, + "step": 4610 + }, + { + "epoch": 0.52, + "grad_norm": 0.674069344997406, + "learning_rate": 9.223866722462134e-05, + "loss": 0.734, + "step": 4615 + }, + { + "epoch": 0.53, + "grad_norm": 0.6486625075340271, + "learning_rate": 9.206061738114086e-05, + "loss": 0.7448, + "step": 4620 + }, + { + "epoch": 0.53, + "grad_norm": 0.585586428642273, + "learning_rate": 9.188259286296528e-05, + "loss": 0.6513, + "step": 4625 + }, + { + "epoch": 0.53, + "grad_norm": 0.6953662037849426, + "learning_rate": 9.170459423796309e-05, + "loss": 0.7195, + "step": 4630 + }, + { + "epoch": 0.53, + "grad_norm": 0.6233338117599487, + "learning_rate": 9.152662207392024e-05, + "loss": 0.8018, + "step": 4635 + }, + { + "epoch": 0.53, + "grad_norm": 0.6842007637023926, + "learning_rate": 9.134867693853816e-05, + "loss": 0.7632, + "step": 4640 + }, + { + "epoch": 0.53, + "grad_norm": 0.6602983474731445, + "learning_rate": 9.117075939943221e-05, + "loss": 0.7252, + "step": 4645 + }, + { + "epoch": 0.53, + "grad_norm": 0.5861719846725464, + "learning_rate": 9.099287002412956e-05, + "loss": 0.6509, + "step": 4650 + }, + { + "epoch": 0.53, + "grad_norm": 0.6650002002716064, + "learning_rate": 9.081500938006761e-05, + "loss": 0.8915, + "step": 4655 + }, + { + "epoch": 0.53, + "grad_norm": 0.5285857915878296, + "learning_rate": 9.063717803459213e-05, + "loss": 0.7512, + "step": 4660 + }, + { + "epoch": 0.53, + "grad_norm": 0.5826857089996338, + "learning_rate": 9.045937655495544e-05, + "loss": 0.9319, + "step": 4665 + }, + { + "epoch": 0.53, + "grad_norm": 0.6033085584640503, + "learning_rate": 9.028160550831458e-05, + "loss": 0.7979, + "step": 4670 + }, + { + "epoch": 0.53, + "grad_norm": 0.7138378620147705, + "learning_rate": 9.010386546172949e-05, + "loss": 0.763, + "step": 4675 + }, + { + "epoch": 0.53, + "grad_norm": 0.6795177459716797, + "learning_rate": 8.99261569821612e-05, + "loss": 0.7867, + "step": 4680 + }, + { + "epoch": 0.53, + "grad_norm": 0.6258403658866882, + "learning_rate": 8.974848063647015e-05, + "loss": 0.7436, + "step": 4685 + }, + { + "epoch": 0.53, + "grad_norm": 0.6418249607086182, + "learning_rate": 8.957083699141419e-05, + "loss": 0.9125, + "step": 4690 + }, + { + "epoch": 0.53, + "grad_norm": 0.5499231815338135, + "learning_rate": 8.939322661364689e-05, + "loss": 0.7395, + "step": 4695 + }, + { + "epoch": 0.53, + "grad_norm": 0.5704962015151978, + "learning_rate": 8.92156500697157e-05, + "loss": 0.7699, + "step": 4700 + }, + { + "epoch": 0.53, + "grad_norm": 0.6621077060699463, + "learning_rate": 8.903810792606018e-05, + "loss": 0.7511, + "step": 4705 + }, + { + "epoch": 0.54, + "grad_norm": 0.49777495861053467, + "learning_rate": 8.886060074901005e-05, + "loss": 0.6749, + "step": 4710 + }, + { + "epoch": 0.54, + "grad_norm": 0.609890878200531, + "learning_rate": 8.868312910478362e-05, + "loss": 0.6744, + "step": 4715 + }, + { + "epoch": 0.54, + "grad_norm": 0.6777181625366211, + "learning_rate": 8.85056935594858e-05, + "loss": 0.8498, + "step": 4720 + }, + { + "epoch": 0.54, + "grad_norm": 0.6590617895126343, + "learning_rate": 8.832829467910631e-05, + "loss": 0.7196, + "step": 4725 + }, + { + "epoch": 0.54, + "grad_norm": 0.7174111604690552, + "learning_rate": 8.815093302951804e-05, + "loss": 0.7889, + "step": 4730 + }, + { + "epoch": 0.54, + "grad_norm": 0.7540302276611328, + "learning_rate": 8.797360917647498e-05, + "loss": 0.8266, + "step": 4735 + }, + { + "epoch": 0.54, + "grad_norm": 0.6515617966651917, + "learning_rate": 8.77963236856106e-05, + "loss": 0.8768, + "step": 4740 + }, + { + "epoch": 0.54, + "grad_norm": 0.5930551290512085, + "learning_rate": 8.761907712243606e-05, + "loss": 0.7096, + "step": 4745 + }, + { + "epoch": 0.54, + "grad_norm": 0.6492053866386414, + "learning_rate": 8.744187005233826e-05, + "loss": 0.7805, + "step": 4750 + }, + { + "epoch": 0.54, + "grad_norm": 0.5830654501914978, + "learning_rate": 8.72647030405782e-05, + "loss": 0.8618, + "step": 4755 + }, + { + "epoch": 0.54, + "grad_norm": 0.5837696194648743, + "learning_rate": 8.708757665228909e-05, + "loss": 0.8436, + "step": 4760 + }, + { + "epoch": 0.54, + "grad_norm": 0.6426911354064941, + "learning_rate": 8.691049145247445e-05, + "loss": 0.8142, + "step": 4765 + }, + { + "epoch": 0.54, + "grad_norm": 0.5991452932357788, + "learning_rate": 8.673344800600657e-05, + "loss": 0.8628, + "step": 4770 + }, + { + "epoch": 0.54, + "grad_norm": 0.5630876421928406, + "learning_rate": 8.655644687762443e-05, + "loss": 0.6788, + "step": 4775 + }, + { + "epoch": 0.54, + "grad_norm": 0.692578911781311, + "learning_rate": 8.637948863193214e-05, + "loss": 0.8166, + "step": 4780 + }, + { + "epoch": 0.54, + "grad_norm": 0.7014926075935364, + "learning_rate": 8.620257383339694e-05, + "loss": 0.8809, + "step": 4785 + }, + { + "epoch": 0.54, + "grad_norm": 0.717907190322876, + "learning_rate": 8.602570304634745e-05, + "loss": 0.6915, + "step": 4790 + }, + { + "epoch": 0.55, + "grad_norm": 0.6019871234893799, + "learning_rate": 8.584887683497199e-05, + "loss": 0.7664, + "step": 4795 + }, + { + "epoch": 0.55, + "grad_norm": 0.5950486063957214, + "learning_rate": 8.567209576331663e-05, + "loss": 0.7824, + "step": 4800 + }, + { + "epoch": 0.55, + "grad_norm": 0.5407964587211609, + "learning_rate": 8.54953603952835e-05, + "loss": 0.7581, + "step": 4805 + }, + { + "epoch": 0.55, + "grad_norm": 0.6220264434814453, + "learning_rate": 8.531867129462888e-05, + "loss": 0.7908, + "step": 4810 + }, + { + "epoch": 0.55, + "grad_norm": 0.6367149353027344, + "learning_rate": 8.514202902496157e-05, + "loss": 0.8258, + "step": 4815 + }, + { + "epoch": 0.55, + "grad_norm": 0.5935396552085876, + "learning_rate": 8.496543414974083e-05, + "loss": 0.6965, + "step": 4820 + }, + { + "epoch": 0.55, + "grad_norm": 0.6432780027389526, + "learning_rate": 8.478888723227485e-05, + "loss": 0.8545, + "step": 4825 + }, + { + "epoch": 0.55, + "grad_norm": 0.6651439666748047, + "learning_rate": 8.461238883571885e-05, + "loss": 0.8435, + "step": 4830 + }, + { + "epoch": 0.55, + "grad_norm": 0.6177381873130798, + "learning_rate": 8.443593952307319e-05, + "loss": 0.8476, + "step": 4835 + }, + { + "epoch": 0.55, + "grad_norm": 0.647454559803009, + "learning_rate": 8.42595398571818e-05, + "loss": 0.7337, + "step": 4840 + }, + { + "epoch": 0.55, + "grad_norm": 0.6528860330581665, + "learning_rate": 8.408319040073011e-05, + "loss": 0.8445, + "step": 4845 + }, + { + "epoch": 0.55, + "grad_norm": 0.5049680471420288, + "learning_rate": 8.390689171624341e-05, + "loss": 0.8322, + "step": 4850 + }, + { + "epoch": 0.55, + "grad_norm": 0.7274385094642639, + "learning_rate": 8.373064436608512e-05, + "loss": 0.9198, + "step": 4855 + }, + { + "epoch": 0.55, + "grad_norm": 0.6594643592834473, + "learning_rate": 8.355444891245482e-05, + "loss": 0.7995, + "step": 4860 + }, + { + "epoch": 0.55, + "grad_norm": 0.6585742235183716, + "learning_rate": 8.337830591738664e-05, + "loss": 0.7419, + "step": 4865 + }, + { + "epoch": 0.55, + "grad_norm": 0.7104743719100952, + "learning_rate": 8.32022159427473e-05, + "loss": 0.7447, + "step": 4870 + }, + { + "epoch": 0.55, + "grad_norm": 0.7039633393287659, + "learning_rate": 8.302617955023437e-05, + "loss": 0.8402, + "step": 4875 + }, + { + "epoch": 0.55, + "grad_norm": 0.5455465912818909, + "learning_rate": 8.285019730137463e-05, + "loss": 0.8199, + "step": 4880 + }, + { + "epoch": 0.56, + "grad_norm": 0.5330617427825928, + "learning_rate": 8.2674269757522e-05, + "loss": 0.6672, + "step": 4885 + }, + { + "epoch": 0.56, + "grad_norm": 0.6174395084381104, + "learning_rate": 8.2498397479856e-05, + "loss": 0.8038, + "step": 4890 + }, + { + "epoch": 0.56, + "grad_norm": 0.7968682646751404, + "learning_rate": 8.232258102937987e-05, + "loss": 0.7963, + "step": 4895 + }, + { + "epoch": 0.56, + "grad_norm": 0.6184393763542175, + "learning_rate": 8.214682096691866e-05, + "loss": 0.7699, + "step": 4900 + }, + { + "epoch": 0.56, + "grad_norm": 0.6042989492416382, + "learning_rate": 8.197111785311768e-05, + "loss": 0.7207, + "step": 4905 + }, + { + "epoch": 0.56, + "grad_norm": 0.6476695537567139, + "learning_rate": 8.179547224844047e-05, + "loss": 0.8862, + "step": 4910 + }, + { + "epoch": 0.56, + "grad_norm": 0.7073302865028381, + "learning_rate": 8.161988471316723e-05, + "loss": 0.6839, + "step": 4915 + }, + { + "epoch": 0.56, + "grad_norm": 0.7182896733283997, + "learning_rate": 8.144435580739284e-05, + "loss": 0.8197, + "step": 4920 + }, + { + "epoch": 0.56, + "grad_norm": 0.5961290001869202, + "learning_rate": 8.126888609102528e-05, + "loss": 0.7861, + "step": 4925 + }, + { + "epoch": 0.56, + "grad_norm": 0.621379554271698, + "learning_rate": 8.109347612378358e-05, + "loss": 0.8238, + "step": 4930 + }, + { + "epoch": 0.56, + "grad_norm": 0.5518468618392944, + "learning_rate": 8.091812646519623e-05, + "loss": 0.8338, + "step": 4935 + }, + { + "epoch": 0.56, + "grad_norm": 0.715250551700592, + "learning_rate": 8.074283767459941e-05, + "loss": 0.8835, + "step": 4940 + }, + { + "epoch": 0.56, + "grad_norm": 0.6616021394729614, + "learning_rate": 8.056761031113506e-05, + "loss": 0.7993, + "step": 4945 + }, + { + "epoch": 0.56, + "grad_norm": 0.6410152316093445, + "learning_rate": 8.03924449337493e-05, + "loss": 0.7862, + "step": 4950 + }, + { + "epoch": 0.56, + "grad_norm": 0.5611416101455688, + "learning_rate": 8.02173421011903e-05, + "loss": 0.8749, + "step": 4955 + }, + { + "epoch": 0.56, + "grad_norm": 0.5541871786117554, + "learning_rate": 8.004230237200694e-05, + "loss": 0.6462, + "step": 4960 + }, + { + "epoch": 0.56, + "grad_norm": 0.594853401184082, + "learning_rate": 7.986732630454674e-05, + "loss": 0.8506, + "step": 4965 + }, + { + "epoch": 0.57, + "grad_norm": 0.6446029543876648, + "learning_rate": 7.969241445695406e-05, + "loss": 0.8062, + "step": 4970 + }, + { + "epoch": 0.57, + "grad_norm": 0.5990138649940491, + "learning_rate": 7.95175673871686e-05, + "loss": 0.8053, + "step": 4975 + }, + { + "epoch": 0.57, + "grad_norm": 0.7167456746101379, + "learning_rate": 7.934278565292328e-05, + "loss": 0.8316, + "step": 4980 + }, + { + "epoch": 0.57, + "grad_norm": 0.8522974848747253, + "learning_rate": 7.916806981174258e-05, + "loss": 0.9252, + "step": 4985 + }, + { + "epoch": 0.57, + "grad_norm": 0.703740119934082, + "learning_rate": 7.899342042094095e-05, + "loss": 0.8845, + "step": 4990 + }, + { + "epoch": 0.57, + "grad_norm": 0.6203125715255737, + "learning_rate": 7.88188380376207e-05, + "loss": 0.6643, + "step": 4995 + }, + { + "epoch": 0.57, + "grad_norm": 0.6809445023536682, + "learning_rate": 7.864432321867057e-05, + "loss": 0.8035, + "step": 5000 + }, + { + "epoch": 0.57, + "grad_norm": 0.5899534821510315, + "learning_rate": 7.846987652076372e-05, + "loss": 0.8727, + "step": 5005 + }, + { + "epoch": 0.57, + "grad_norm": 0.6520495414733887, + "learning_rate": 7.829549850035584e-05, + "loss": 0.792, + "step": 5010 + }, + { + "epoch": 0.57, + "grad_norm": 0.4904904067516327, + "learning_rate": 7.812118971368383e-05, + "loss": 0.7416, + "step": 5015 + }, + { + "epoch": 0.57, + "grad_norm": 0.6947556734085083, + "learning_rate": 7.794695071676355e-05, + "loss": 0.7412, + "step": 5020 + }, + { + "epoch": 0.57, + "grad_norm": 0.6248174905776978, + "learning_rate": 7.777278206538832e-05, + "loss": 0.8281, + "step": 5025 + }, + { + "epoch": 0.57, + "grad_norm": 0.6942163109779358, + "learning_rate": 7.759868431512709e-05, + "loss": 0.8064, + "step": 5030 + }, + { + "epoch": 0.57, + "grad_norm": 0.8450883626937866, + "learning_rate": 7.742465802132262e-05, + "loss": 0.7073, + "step": 5035 + }, + { + "epoch": 0.57, + "grad_norm": 0.6221974492073059, + "learning_rate": 7.725070373908967e-05, + "loss": 0.7276, + "step": 5040 + }, + { + "epoch": 0.57, + "grad_norm": 0.5618094205856323, + "learning_rate": 7.707682202331338e-05, + "loss": 0.7289, + "step": 5045 + }, + { + "epoch": 0.57, + "grad_norm": 0.5586840510368347, + "learning_rate": 7.690301342864739e-05, + "loss": 0.7277, + "step": 5050 + }, + { + "epoch": 0.57, + "grad_norm": 0.6269596815109253, + "learning_rate": 7.672927850951213e-05, + "loss": 0.8247, + "step": 5055 + }, + { + "epoch": 0.58, + "grad_norm": 0.5967304110527039, + "learning_rate": 7.655561782009298e-05, + "loss": 0.7651, + "step": 5060 + }, + { + "epoch": 0.58, + "grad_norm": 0.5993629097938538, + "learning_rate": 7.638203191433848e-05, + "loss": 0.8346, + "step": 5065 + }, + { + "epoch": 0.58, + "grad_norm": 0.6899064779281616, + "learning_rate": 7.620852134595875e-05, + "loss": 0.8915, + "step": 5070 + }, + { + "epoch": 0.58, + "grad_norm": 0.6052946448326111, + "learning_rate": 7.60350866684235e-05, + "loss": 0.7582, + "step": 5075 + }, + { + "epoch": 0.58, + "grad_norm": 0.6006975173950195, + "learning_rate": 7.586172843496042e-05, + "loss": 0.7252, + "step": 5080 + }, + { + "epoch": 0.58, + "grad_norm": 0.7047126293182373, + "learning_rate": 7.568844719855328e-05, + "loss": 0.7539, + "step": 5085 + }, + { + "epoch": 0.58, + "grad_norm": 0.529346227645874, + "learning_rate": 7.551524351194039e-05, + "loss": 0.7305, + "step": 5090 + }, + { + "epoch": 0.58, + "grad_norm": 0.870637834072113, + "learning_rate": 7.534211792761248e-05, + "loss": 0.7647, + "step": 5095 + }, + { + "epoch": 0.58, + "grad_norm": 0.6663516163825989, + "learning_rate": 7.51690709978113e-05, + "loss": 0.8185, + "step": 5100 + }, + { + "epoch": 0.58, + "grad_norm": 0.688782811164856, + "learning_rate": 7.49961032745277e-05, + "loss": 0.7494, + "step": 5105 + }, + { + "epoch": 0.58, + "grad_norm": 0.7706353068351746, + "learning_rate": 7.482321530949976e-05, + "loss": 0.825, + "step": 5110 + }, + { + "epoch": 0.58, + "grad_norm": 0.6962013840675354, + "learning_rate": 7.465040765421132e-05, + "loss": 0.9311, + "step": 5115 + }, + { + "epoch": 0.58, + "grad_norm": 0.6817395091056824, + "learning_rate": 7.447768085988987e-05, + "loss": 0.8245, + "step": 5120 + }, + { + "epoch": 0.58, + "grad_norm": 0.5578342080116272, + "learning_rate": 7.430503547750505e-05, + "loss": 0.872, + "step": 5125 + }, + { + "epoch": 0.58, + "grad_norm": 0.6477236151695251, + "learning_rate": 7.413247205776683e-05, + "loss": 0.7653, + "step": 5130 + }, + { + "epoch": 0.58, + "grad_norm": 0.6161476373672485, + "learning_rate": 7.395999115112369e-05, + "loss": 0.7479, + "step": 5135 + }, + { + "epoch": 0.58, + "grad_norm": 0.6497325897216797, + "learning_rate": 7.378759330776093e-05, + "loss": 0.704, + "step": 5140 + }, + { + "epoch": 0.58, + "grad_norm": 0.6440129280090332, + "learning_rate": 7.361527907759893e-05, + "loss": 0.7506, + "step": 5145 + }, + { + "epoch": 0.59, + "grad_norm": 0.8600013256072998, + "learning_rate": 7.344304901029121e-05, + "loss": 0.792, + "step": 5150 + }, + { + "epoch": 0.59, + "grad_norm": 0.7030536532402039, + "learning_rate": 7.327090365522302e-05, + "loss": 0.8071, + "step": 5155 + }, + { + "epoch": 0.59, + "grad_norm": 0.7202965617179871, + "learning_rate": 7.309884356150924e-05, + "loss": 0.6725, + "step": 5160 + }, + { + "epoch": 0.59, + "grad_norm": 0.6689905524253845, + "learning_rate": 7.292686927799288e-05, + "loss": 0.7843, + "step": 5165 + }, + { + "epoch": 0.59, + "grad_norm": 0.6386666297912598, + "learning_rate": 7.275498135324322e-05, + "loss": 0.7353, + "step": 5170 + }, + { + "epoch": 0.59, + "grad_norm": 0.8615363836288452, + "learning_rate": 7.258318033555394e-05, + "loss": 0.8075, + "step": 5175 + }, + { + "epoch": 0.59, + "grad_norm": 0.5758625864982605, + "learning_rate": 7.241146677294168e-05, + "loss": 0.7186, + "step": 5180 + }, + { + "epoch": 0.59, + "grad_norm": 0.6153453588485718, + "learning_rate": 7.2239841213144e-05, + "loss": 0.8593, + "step": 5185 + }, + { + "epoch": 0.59, + "grad_norm": 0.728493869304657, + "learning_rate": 7.20683042036178e-05, + "loss": 0.7179, + "step": 5190 + }, + { + "epoch": 0.59, + "grad_norm": 0.7267651557922363, + "learning_rate": 7.189685629153749e-05, + "loss": 0.8491, + "step": 5195 + }, + { + "epoch": 0.59, + "grad_norm": 0.6467040777206421, + "learning_rate": 7.17254980237933e-05, + "loss": 0.7203, + "step": 5200 + }, + { + "epoch": 0.59, + "grad_norm": 0.6471399664878845, + "learning_rate": 7.155422994698948e-05, + "loss": 0.8021, + "step": 5205 + }, + { + "epoch": 0.59, + "grad_norm": 0.5818991661071777, + "learning_rate": 7.138305260744256e-05, + "loss": 0.8434, + "step": 5210 + }, + { + "epoch": 0.59, + "grad_norm": 0.5848895311355591, + "learning_rate": 7.121196655117974e-05, + "loss": 0.7041, + "step": 5215 + }, + { + "epoch": 0.59, + "grad_norm": 0.6077134609222412, + "learning_rate": 7.104097232393691e-05, + "loss": 0.718, + "step": 5220 + }, + { + "epoch": 0.59, + "grad_norm": 0.6883412599563599, + "learning_rate": 7.08700704711572e-05, + "loss": 0.8343, + "step": 5225 + }, + { + "epoch": 0.59, + "grad_norm": 0.7491132020950317, + "learning_rate": 7.069926153798888e-05, + "loss": 0.7679, + "step": 5230 + }, + { + "epoch": 0.6, + "grad_norm": 0.740841805934906, + "learning_rate": 7.052854606928396e-05, + "loss": 0.8734, + "step": 5235 + }, + { + "epoch": 0.6, + "grad_norm": 0.5949041247367859, + "learning_rate": 7.03579246095963e-05, + "loss": 0.7022, + "step": 5240 + }, + { + "epoch": 0.6, + "grad_norm": 0.8379162549972534, + "learning_rate": 7.018739770317985e-05, + "loss": 0.7814, + "step": 5245 + }, + { + "epoch": 0.6, + "grad_norm": 0.6472388505935669, + "learning_rate": 7.001696589398699e-05, + "loss": 0.8437, + "step": 5250 + }, + { + "epoch": 0.6, + "grad_norm": 0.6293101906776428, + "learning_rate": 6.984662972566674e-05, + "loss": 0.825, + "step": 5255 + }, + { + "epoch": 0.6, + "grad_norm": 0.6227039694786072, + "learning_rate": 6.967638974156299e-05, + "loss": 0.8873, + "step": 5260 + }, + { + "epoch": 0.6, + "grad_norm": 0.5379999279975891, + "learning_rate": 6.950624648471288e-05, + "loss": 0.7832, + "step": 5265 + }, + { + "epoch": 0.6, + "grad_norm": 0.6406290531158447, + "learning_rate": 6.933620049784501e-05, + "loss": 0.808, + "step": 5270 + }, + { + "epoch": 0.6, + "grad_norm": 0.5745694041252136, + "learning_rate": 6.91662523233777e-05, + "loss": 0.8721, + "step": 5275 + }, + { + "epoch": 0.6, + "grad_norm": 0.5971238017082214, + "learning_rate": 6.899640250341726e-05, + "loss": 0.8064, + "step": 5280 + }, + { + "epoch": 0.6, + "grad_norm": 0.6286386847496033, + "learning_rate": 6.882665157975617e-05, + "loss": 0.7531, + "step": 5285 + }, + { + "epoch": 0.6, + "grad_norm": 0.7099630236625671, + "learning_rate": 6.865700009387161e-05, + "loss": 0.8179, + "step": 5290 + }, + { + "epoch": 0.6, + "grad_norm": 0.6734933853149414, + "learning_rate": 6.848744858692344e-05, + "loss": 0.7745, + "step": 5295 + }, + { + "epoch": 0.6, + "grad_norm": 0.6301453709602356, + "learning_rate": 6.831799759975273e-05, + "loss": 0.7436, + "step": 5300 + }, + { + "epoch": 0.6, + "grad_norm": 0.6269943118095398, + "learning_rate": 6.814864767287978e-05, + "loss": 0.845, + "step": 5305 + }, + { + "epoch": 0.6, + "grad_norm": 0.6703981757164001, + "learning_rate": 6.797939934650262e-05, + "loss": 0.6949, + "step": 5310 + }, + { + "epoch": 0.6, + "grad_norm": 0.6487662196159363, + "learning_rate": 6.781025316049512e-05, + "loss": 0.8129, + "step": 5315 + }, + { + "epoch": 0.6, + "grad_norm": 0.6541668772697449, + "learning_rate": 6.764120965440537e-05, + "loss": 0.7955, + "step": 5320 + }, + { + "epoch": 0.61, + "grad_norm": 0.5929297208786011, + "learning_rate": 6.747226936745394e-05, + "loss": 0.733, + "step": 5325 + }, + { + "epoch": 0.61, + "grad_norm": 0.6727938652038574, + "learning_rate": 6.730343283853214e-05, + "loss": 0.7599, + "step": 5330 + }, + { + "epoch": 0.61, + "grad_norm": 0.6011897325515747, + "learning_rate": 6.713470060620033e-05, + "loss": 0.7038, + "step": 5335 + }, + { + "epoch": 0.61, + "grad_norm": 0.5427403450012207, + "learning_rate": 6.696607320868612e-05, + "loss": 0.6685, + "step": 5340 + }, + { + "epoch": 0.61, + "grad_norm": 0.6301597952842712, + "learning_rate": 6.679755118388275e-05, + "loss": 0.7247, + "step": 5345 + }, + { + "epoch": 0.61, + "grad_norm": 0.7286877036094666, + "learning_rate": 6.662913506934736e-05, + "loss": 0.7292, + "step": 5350 + }, + { + "epoch": 0.61, + "grad_norm": 0.5117044448852539, + "learning_rate": 6.646082540229923e-05, + "loss": 0.687, + "step": 5355 + }, + { + "epoch": 0.61, + "grad_norm": 0.5101413726806641, + "learning_rate": 6.629262271961811e-05, + "loss": 0.6912, + "step": 5360 + }, + { + "epoch": 0.61, + "grad_norm": 0.5790354609489441, + "learning_rate": 6.61245275578425e-05, + "loss": 0.7624, + "step": 5365 + }, + { + "epoch": 0.61, + "grad_norm": 0.6759084463119507, + "learning_rate": 6.595654045316782e-05, + "loss": 0.7712, + "step": 5370 + }, + { + "epoch": 0.61, + "grad_norm": 0.6083055138587952, + "learning_rate": 6.578866194144492e-05, + "loss": 0.7512, + "step": 5375 + }, + { + "epoch": 0.61, + "grad_norm": 0.6502549648284912, + "learning_rate": 6.562089255817823e-05, + "loss": 0.8051, + "step": 5380 + }, + { + "epoch": 0.61, + "grad_norm": 0.5895970463752747, + "learning_rate": 6.545323283852407e-05, + "loss": 0.6741, + "step": 5385 + }, + { + "epoch": 0.61, + "grad_norm": 0.630634069442749, + "learning_rate": 6.528568331728895e-05, + "loss": 0.8695, + "step": 5390 + }, + { + "epoch": 0.61, + "grad_norm": 0.6064544320106506, + "learning_rate": 6.51182445289279e-05, + "loss": 0.7345, + "step": 5395 + }, + { + "epoch": 0.61, + "grad_norm": 0.6453479528427124, + "learning_rate": 6.495091700754266e-05, + "loss": 0.743, + "step": 5400 + }, + { + "epoch": 0.61, + "grad_norm": 0.6371870040893555, + "learning_rate": 6.478370128688005e-05, + "loss": 0.7806, + "step": 5405 + }, + { + "epoch": 0.62, + "grad_norm": 0.6676410436630249, + "learning_rate": 6.461659790033038e-05, + "loss": 0.767, + "step": 5410 + }, + { + "epoch": 0.62, + "grad_norm": 0.6216074824333191, + "learning_rate": 6.444960738092548e-05, + "loss": 0.6315, + "step": 5415 + }, + { + "epoch": 0.62, + "grad_norm": 0.6729696989059448, + "learning_rate": 6.428273026133731e-05, + "loss": 0.7697, + "step": 5420 + }, + { + "epoch": 0.62, + "grad_norm": 0.5996483564376831, + "learning_rate": 6.411596707387594e-05, + "loss": 0.7897, + "step": 5425 + }, + { + "epoch": 0.62, + "grad_norm": 0.9583771824836731, + "learning_rate": 6.39493183504881e-05, + "loss": 0.7393, + "step": 5430 + }, + { + "epoch": 0.62, + "grad_norm": 0.62912917137146, + "learning_rate": 6.378278462275542e-05, + "loss": 0.8387, + "step": 5435 + }, + { + "epoch": 0.62, + "grad_norm": 0.617304801940918, + "learning_rate": 6.361636642189269e-05, + "loss": 0.8835, + "step": 5440 + }, + { + "epoch": 0.62, + "grad_norm": 0.6186046004295349, + "learning_rate": 6.345006427874615e-05, + "loss": 0.6838, + "step": 5445 + }, + { + "epoch": 0.62, + "grad_norm": 0.5584391355514526, + "learning_rate": 6.328387872379193e-05, + "loss": 0.6263, + "step": 5450 + }, + { + "epoch": 0.62, + "grad_norm": 0.49611517786979675, + "learning_rate": 6.311781028713414e-05, + "loss": 0.7853, + "step": 5455 + }, + { + "epoch": 0.62, + "grad_norm": 0.5180690288543701, + "learning_rate": 6.295185949850339e-05, + "loss": 0.7376, + "step": 5460 + }, + { + "epoch": 0.62, + "grad_norm": 0.6937770247459412, + "learning_rate": 6.278602688725497e-05, + "loss": 0.7639, + "step": 5465 + }, + { + "epoch": 0.62, + "grad_norm": 0.6713787317276001, + "learning_rate": 6.262031298236728e-05, + "loss": 0.7227, + "step": 5470 + }, + { + "epoch": 0.62, + "grad_norm": 0.7440084218978882, + "learning_rate": 6.245471831243996e-05, + "loss": 0.7153, + "step": 5475 + }, + { + "epoch": 0.62, + "grad_norm": 0.6448591947555542, + "learning_rate": 6.228924340569233e-05, + "loss": 0.7611, + "step": 5480 + }, + { + "epoch": 0.62, + "grad_norm": 0.6704011559486389, + "learning_rate": 6.212388878996177e-05, + "loss": 0.8411, + "step": 5485 + }, + { + "epoch": 0.62, + "grad_norm": 0.6402199864387512, + "learning_rate": 6.195865499270186e-05, + "loss": 0.6718, + "step": 5490 + }, + { + "epoch": 0.62, + "grad_norm": 0.6440407037734985, + "learning_rate": 6.179354254098085e-05, + "loss": 0.6699, + "step": 5495 + }, + { + "epoch": 0.63, + "grad_norm": 0.6990440487861633, + "learning_rate": 6.162855196147986e-05, + "loss": 0.7178, + "step": 5500 + }, + { + "epoch": 0.63, + "grad_norm": 0.6173884272575378, + "learning_rate": 6.146368378049134e-05, + "loss": 0.8273, + "step": 5505 + }, + { + "epoch": 0.63, + "grad_norm": 0.6001347303390503, + "learning_rate": 6.129893852391721e-05, + "loss": 0.8893, + "step": 5510 + }, + { + "epoch": 0.63, + "grad_norm": 0.5927189588546753, + "learning_rate": 6.113431671726735e-05, + "loss": 0.7978, + "step": 5515 + }, + { + "epoch": 0.63, + "grad_norm": 0.7115824222564697, + "learning_rate": 6.0969818885657835e-05, + "loss": 0.8715, + "step": 5520 + }, + { + "epoch": 0.63, + "grad_norm": 0.6806285977363586, + "learning_rate": 6.080544555380927e-05, + "loss": 0.6983, + "step": 5525 + }, + { + "epoch": 0.63, + "grad_norm": 0.6034073829650879, + "learning_rate": 6.06411972460452e-05, + "loss": 0.7861, + "step": 5530 + }, + { + "epoch": 0.63, + "grad_norm": 0.6072666645050049, + "learning_rate": 6.047707448629023e-05, + "loss": 0.796, + "step": 5535 + }, + { + "epoch": 0.63, + "grad_norm": 0.6448633670806885, + "learning_rate": 6.0313077798068575e-05, + "loss": 0.7532, + "step": 5540 + }, + { + "epoch": 0.63, + "grad_norm": 0.6306453943252563, + "learning_rate": 6.014920770450232e-05, + "loss": 0.7327, + "step": 5545 + }, + { + "epoch": 0.63, + "grad_norm": 0.8060635328292847, + "learning_rate": 5.998546472830965e-05, + "loss": 0.7468, + "step": 5550 + }, + { + "epoch": 0.63, + "grad_norm": 0.6441696286201477, + "learning_rate": 5.9821849391803375e-05, + "loss": 0.6766, + "step": 5555 + }, + { + "epoch": 0.63, + "grad_norm": 0.7230235934257507, + "learning_rate": 5.9658362216889095e-05, + "loss": 0.7493, + "step": 5560 + }, + { + "epoch": 0.63, + "grad_norm": 0.7482134103775024, + "learning_rate": 5.949500372506354e-05, + "loss": 0.6559, + "step": 5565 + }, + { + "epoch": 0.63, + "grad_norm": 0.5693408846855164, + "learning_rate": 5.933177443741309e-05, + "loss": 0.674, + "step": 5570 + }, + { + "epoch": 0.63, + "grad_norm": 0.6371012330055237, + "learning_rate": 5.916867487461186e-05, + "loss": 0.7717, + "step": 5575 + }, + { + "epoch": 0.63, + "grad_norm": 0.600836455821991, + "learning_rate": 5.900570555692029e-05, + "loss": 0.8248, + "step": 5580 + }, + { + "epoch": 0.64, + "grad_norm": 0.748927891254425, + "learning_rate": 5.884286700418328e-05, + "loss": 0.7426, + "step": 5585 + }, + { + "epoch": 0.64, + "grad_norm": 0.6669610142707825, + "learning_rate": 5.8680159735828555e-05, + "loss": 0.7964, + "step": 5590 + }, + { + "epoch": 0.64, + "grad_norm": 0.6355113387107849, + "learning_rate": 5.85175842708652e-05, + "loss": 0.8308, + "step": 5595 + }, + { + "epoch": 0.64, + "grad_norm": 0.6880905628204346, + "learning_rate": 5.835514112788179e-05, + "loss": 0.711, + "step": 5600 + }, + { + "epoch": 0.64, + "grad_norm": 0.7189605236053467, + "learning_rate": 5.819283082504482e-05, + "loss": 0.7994, + "step": 5605 + }, + { + "epoch": 0.64, + "grad_norm": 0.521282970905304, + "learning_rate": 5.8030653880097066e-05, + "loss": 0.6762, + "step": 5610 + }, + { + "epoch": 0.64, + "grad_norm": 0.6705846786499023, + "learning_rate": 5.7868610810355896e-05, + "loss": 0.7619, + "step": 5615 + }, + { + "epoch": 0.64, + "grad_norm": 0.7209597826004028, + "learning_rate": 5.770670213271165e-05, + "loss": 0.8654, + "step": 5620 + }, + { + "epoch": 0.64, + "grad_norm": 0.6950438022613525, + "learning_rate": 5.7544928363625974e-05, + "loss": 0.7806, + "step": 5625 + }, + { + "epoch": 0.64, + "grad_norm": 0.7177894711494446, + "learning_rate": 5.738329001913014e-05, + "loss": 0.8453, + "step": 5630 + }, + { + "epoch": 0.64, + "grad_norm": 0.6125010848045349, + "learning_rate": 5.722178761482356e-05, + "loss": 0.7242, + "step": 5635 + }, + { + "epoch": 0.64, + "grad_norm": 0.650780975818634, + "learning_rate": 5.706042166587193e-05, + "loss": 0.8344, + "step": 5640 + }, + { + "epoch": 0.64, + "grad_norm": 0.6594287157058716, + "learning_rate": 5.6899192687005585e-05, + "loss": 0.8212, + "step": 5645 + }, + { + "epoch": 0.64, + "grad_norm": 0.8201553225517273, + "learning_rate": 5.673810119251814e-05, + "loss": 0.9035, + "step": 5650 + }, + { + "epoch": 0.64, + "grad_norm": 0.5799230933189392, + "learning_rate": 5.657714769626455e-05, + "loss": 0.7603, + "step": 5655 + }, + { + "epoch": 0.64, + "grad_norm": 0.7373571395874023, + "learning_rate": 5.641633271165955e-05, + "loss": 0.8696, + "step": 5660 + }, + { + "epoch": 0.64, + "grad_norm": 0.6818183660507202, + "learning_rate": 5.6255656751676143e-05, + "loss": 0.7146, + "step": 5665 + }, + { + "epoch": 0.64, + "grad_norm": 0.640631377696991, + "learning_rate": 5.609512032884385e-05, + "loss": 0.8767, + "step": 5670 + }, + { + "epoch": 0.65, + "grad_norm": 0.6422764658927917, + "learning_rate": 5.5934723955246917e-05, + "loss": 0.7608, + "step": 5675 + }, + { + "epoch": 0.65, + "grad_norm": 0.633758544921875, + "learning_rate": 5.5774468142523104e-05, + "loss": 0.7595, + "step": 5680 + }, + { + "epoch": 0.65, + "grad_norm": 0.6388438940048218, + "learning_rate": 5.5614353401861675e-05, + "loss": 0.8263, + "step": 5685 + }, + { + "epoch": 0.65, + "grad_norm": 0.6727738976478577, + "learning_rate": 5.545438024400192e-05, + "loss": 0.7927, + "step": 5690 + }, + { + "epoch": 0.65, + "grad_norm": 0.6814836263656616, + "learning_rate": 5.529454917923149e-05, + "loss": 0.8103, + "step": 5695 + }, + { + "epoch": 0.65, + "grad_norm": 0.772027850151062, + "learning_rate": 5.513486071738481e-05, + "loss": 0.707, + "step": 5700 + }, + { + "epoch": 0.65, + "grad_norm": 0.6700000166893005, + "learning_rate": 5.4975315367841374e-05, + "loss": 0.8225, + "step": 5705 + }, + { + "epoch": 0.65, + "grad_norm": 0.7606449723243713, + "learning_rate": 5.481591363952421e-05, + "loss": 0.7547, + "step": 5710 + }, + { + "epoch": 0.65, + "grad_norm": 0.6637645363807678, + "learning_rate": 5.465665604089829e-05, + "loss": 0.8537, + "step": 5715 + }, + { + "epoch": 0.65, + "grad_norm": 0.6221731901168823, + "learning_rate": 5.449754307996871e-05, + "loss": 0.6966, + "step": 5720 + }, + { + "epoch": 0.65, + "grad_norm": 0.6201137900352478, + "learning_rate": 5.433857526427923e-05, + "loss": 0.7586, + "step": 5725 + }, + { + "epoch": 0.65, + "grad_norm": 0.715356171131134, + "learning_rate": 5.417975310091068e-05, + "loss": 0.9557, + "step": 5730 + }, + { + "epoch": 0.65, + "grad_norm": 0.7795970439910889, + "learning_rate": 5.402107709647921e-05, + "loss": 0.8289, + "step": 5735 + }, + { + "epoch": 0.65, + "grad_norm": 0.6747726798057556, + "learning_rate": 5.3862547757134816e-05, + "loss": 0.6959, + "step": 5740 + }, + { + "epoch": 0.65, + "grad_norm": 0.6822933554649353, + "learning_rate": 5.370416558855955e-05, + "loss": 0.7804, + "step": 5745 + }, + { + "epoch": 0.65, + "grad_norm": 0.6174326539039612, + "learning_rate": 5.354593109596621e-05, + "loss": 0.8149, + "step": 5750 + }, + { + "epoch": 0.65, + "grad_norm": 0.6532416939735413, + "learning_rate": 5.338784478409628e-05, + "loss": 0.8078, + "step": 5755 + }, + { + "epoch": 0.65, + "grad_norm": 0.6764138340950012, + "learning_rate": 5.3229907157218737e-05, + "loss": 0.802, + "step": 5760 + }, + { + "epoch": 0.66, + "grad_norm": 0.6659644842147827, + "learning_rate": 5.307211871912828e-05, + "loss": 0.7593, + "step": 5765 + }, + { + "epoch": 0.66, + "grad_norm": 0.8311240077018738, + "learning_rate": 5.291447997314367e-05, + "loss": 0.7216, + "step": 5770 + }, + { + "epoch": 0.66, + "grad_norm": 0.6325226426124573, + "learning_rate": 5.275699142210615e-05, + "loss": 0.7994, + "step": 5775 + }, + { + "epoch": 0.66, + "grad_norm": 0.678072988986969, + "learning_rate": 5.259965356837795e-05, + "loss": 0.8148, + "step": 5780 + }, + { + "epoch": 0.66, + "grad_norm": 0.6537745594978333, + "learning_rate": 5.244246691384051e-05, + "loss": 0.8272, + "step": 5785 + }, + { + "epoch": 0.66, + "grad_norm": 0.6158615350723267, + "learning_rate": 5.228543195989303e-05, + "loss": 0.6634, + "step": 5790 + }, + { + "epoch": 0.66, + "grad_norm": 0.5966333746910095, + "learning_rate": 5.212854920745075e-05, + "loss": 0.733, + "step": 5795 + }, + { + "epoch": 0.66, + "grad_norm": 0.6967484951019287, + "learning_rate": 5.1971819156943545e-05, + "loss": 0.855, + "step": 5800 + }, + { + "epoch": 0.66, + "grad_norm": 0.6322739124298096, + "learning_rate": 5.181524230831409e-05, + "loss": 0.8315, + "step": 5805 + }, + { + "epoch": 0.66, + "grad_norm": 0.6847050786018372, + "learning_rate": 5.1658819161016294e-05, + "loss": 0.8198, + "step": 5810 + }, + { + "epoch": 0.66, + "grad_norm": 0.6578314304351807, + "learning_rate": 5.150255021401399e-05, + "loss": 0.8202, + "step": 5815 + }, + { + "epoch": 0.66, + "grad_norm": 0.6339473128318787, + "learning_rate": 5.134643596577897e-05, + "loss": 0.7877, + "step": 5820 + }, + { + "epoch": 0.66, + "grad_norm": 0.5576701760292053, + "learning_rate": 5.1190476914289645e-05, + "loss": 0.7409, + "step": 5825 + }, + { + "epoch": 0.66, + "grad_norm": 1.082842469215393, + "learning_rate": 5.103467355702928e-05, + "loss": 0.6842, + "step": 5830 + }, + { + "epoch": 0.66, + "grad_norm": 0.7405625581741333, + "learning_rate": 5.087902639098472e-05, + "loss": 0.7636, + "step": 5835 + }, + { + "epoch": 0.66, + "grad_norm": 0.5836653709411621, + "learning_rate": 5.0723535912644294e-05, + "loss": 0.7134, + "step": 5840 + }, + { + "epoch": 0.66, + "grad_norm": 0.7886473536491394, + "learning_rate": 5.0568202617996675e-05, + "loss": 0.8099, + "step": 5845 + }, + { + "epoch": 0.67, + "grad_norm": 0.6854684948921204, + "learning_rate": 5.0413027002529214e-05, + "loss": 0.7903, + "step": 5850 + }, + { + "epoch": 0.67, + "grad_norm": 0.6613681316375732, + "learning_rate": 5.025800956122619e-05, + "loss": 0.7311, + "step": 5855 + }, + { + "epoch": 0.67, + "grad_norm": 0.6116787791252136, + "learning_rate": 5.010315078856733e-05, + "loss": 0.7632, + "step": 5860 + }, + { + "epoch": 0.67, + "grad_norm": 0.74680095911026, + "learning_rate": 4.99484511785263e-05, + "loss": 0.8084, + "step": 5865 + }, + { + "epoch": 0.67, + "grad_norm": 0.6365654468536377, + "learning_rate": 4.979391122456899e-05, + "loss": 0.6616, + "step": 5870 + }, + { + "epoch": 0.67, + "grad_norm": 0.7090327143669128, + "learning_rate": 4.9639531419652075e-05, + "loss": 0.8372, + "step": 5875 + }, + { + "epoch": 0.67, + "grad_norm": 0.7884336709976196, + "learning_rate": 4.948531225622129e-05, + "loss": 0.7197, + "step": 5880 + }, + { + "epoch": 0.67, + "grad_norm": 0.7464672923088074, + "learning_rate": 4.933125422621013e-05, + "loss": 0.7625, + "step": 5885 + }, + { + "epoch": 0.67, + "grad_norm": 0.5780450701713562, + "learning_rate": 4.9177357821037964e-05, + "loss": 0.7627, + "step": 5890 + }, + { + "epoch": 0.67, + "grad_norm": 0.6157823801040649, + "learning_rate": 4.902362353160851e-05, + "loss": 0.8363, + "step": 5895 + }, + { + "epoch": 0.67, + "grad_norm": 0.6948485374450684, + "learning_rate": 4.8870051848308603e-05, + "loss": 0.7677, + "step": 5900 + }, + { + "epoch": 0.67, + "grad_norm": 0.7331840395927429, + "learning_rate": 4.871664326100625e-05, + "loss": 0.887, + "step": 5905 + }, + { + "epoch": 0.67, + "grad_norm": 0.6237366199493408, + "learning_rate": 4.856339825904921e-05, + "loss": 0.7934, + "step": 5910 + }, + { + "epoch": 0.67, + "grad_norm": 0.6936962008476257, + "learning_rate": 4.841031733126345e-05, + "loss": 0.7613, + "step": 5915 + }, + { + "epoch": 0.67, + "grad_norm": 0.6001924276351929, + "learning_rate": 4.825740096595159e-05, + "loss": 0.7355, + "step": 5920 + }, + { + "epoch": 0.67, + "grad_norm": 0.5404167771339417, + "learning_rate": 4.8104649650891295e-05, + "loss": 0.7993, + "step": 5925 + }, + { + "epoch": 0.67, + "grad_norm": 0.656347930431366, + "learning_rate": 4.795206387333371e-05, + "loss": 0.7538, + "step": 5930 + }, + { + "epoch": 0.67, + "grad_norm": 0.6526111364364624, + "learning_rate": 4.779964412000206e-05, + "loss": 0.7349, + "step": 5935 + }, + { + "epoch": 0.68, + "grad_norm": 0.7296586632728577, + "learning_rate": 4.7647390877089884e-05, + "loss": 0.7385, + "step": 5940 + }, + { + "epoch": 0.68, + "grad_norm": 0.6850858330726624, + "learning_rate": 4.749530463025961e-05, + "loss": 0.7774, + "step": 5945 + }, + { + "epoch": 0.68, + "grad_norm": 0.7608144283294678, + "learning_rate": 4.734338586464096e-05, + "loss": 0.7299, + "step": 5950 + }, + { + "epoch": 0.68, + "grad_norm": 0.6235141158103943, + "learning_rate": 4.719163506482942e-05, + "loss": 0.734, + "step": 5955 + }, + { + "epoch": 0.68, + "grad_norm": 0.6358844637870789, + "learning_rate": 4.704005271488472e-05, + "loss": 0.78, + "step": 5960 + }, + { + "epoch": 0.68, + "grad_norm": 0.7749031186103821, + "learning_rate": 4.6888639298329216e-05, + "loss": 0.8022, + "step": 5965 + }, + { + "epoch": 0.68, + "grad_norm": 0.5940665006637573, + "learning_rate": 4.673739529814653e-05, + "loss": 0.7931, + "step": 5970 + }, + { + "epoch": 0.68, + "grad_norm": 0.674860417842865, + "learning_rate": 4.658632119677965e-05, + "loss": 0.8207, + "step": 5975 + }, + { + "epoch": 0.68, + "grad_norm": 0.5767741799354553, + "learning_rate": 4.643541747612974e-05, + "loss": 0.6727, + "step": 5980 + }, + { + "epoch": 0.68, + "grad_norm": 0.6625251770019531, + "learning_rate": 4.6284684617554555e-05, + "loss": 0.7103, + "step": 5985 + }, + { + "epoch": 0.68, + "grad_norm": 0.6491448879241943, + "learning_rate": 4.613412310186669e-05, + "loss": 0.7724, + "step": 5990 + }, + { + "epoch": 0.68, + "grad_norm": 0.8284974098205566, + "learning_rate": 4.5983733409332265e-05, + "loss": 0.8283, + "step": 5995 + }, + { + "epoch": 0.68, + "grad_norm": 0.6128905415534973, + "learning_rate": 4.5833516019669275e-05, + "loss": 0.7541, + "step": 6000 + }, + { + "epoch": 0.68, + "grad_norm": 0.642086923122406, + "learning_rate": 4.568347141204611e-05, + "loss": 0.7939, + "step": 6005 + }, + { + "epoch": 0.68, + "grad_norm": 0.6303368806838989, + "learning_rate": 4.553360006508003e-05, + "loss": 0.7202, + "step": 6010 + }, + { + "epoch": 0.68, + "grad_norm": 0.5753061771392822, + "learning_rate": 4.538390245683555e-05, + "loss": 0.7782, + "step": 6015 + }, + { + "epoch": 0.68, + "grad_norm": 0.6998262405395508, + "learning_rate": 4.523437906482313e-05, + "loss": 0.723, + "step": 6020 + }, + { + "epoch": 0.69, + "grad_norm": 0.5152105689048767, + "learning_rate": 4.508503036599743e-05, + "loss": 0.7258, + "step": 6025 + }, + { + "epoch": 0.69, + "grad_norm": 0.7657172679901123, + "learning_rate": 4.493585683675575e-05, + "loss": 0.7904, + "step": 6030 + }, + { + "epoch": 0.69, + "grad_norm": 0.7116373777389526, + "learning_rate": 4.478685895293685e-05, + "loss": 0.7479, + "step": 6035 + }, + { + "epoch": 0.69, + "grad_norm": 0.6210662722587585, + "learning_rate": 4.463803718981905e-05, + "loss": 0.7604, + "step": 6040 + }, + { + "epoch": 0.69, + "grad_norm": 0.6628497242927551, + "learning_rate": 4.448939202211896e-05, + "loss": 0.7996, + "step": 6045 + }, + { + "epoch": 0.69, + "grad_norm": 0.7364223599433899, + "learning_rate": 4.434092392398978e-05, + "loss": 0.8187, + "step": 6050 + }, + { + "epoch": 0.69, + "grad_norm": 0.7144873142242432, + "learning_rate": 4.4192633369020066e-05, + "loss": 0.8995, + "step": 6055 + }, + { + "epoch": 0.69, + "grad_norm": 0.7300914525985718, + "learning_rate": 4.404452083023183e-05, + "loss": 0.755, + "step": 6060 + }, + { + "epoch": 0.69, + "grad_norm": 0.7752556800842285, + "learning_rate": 4.389658678007933e-05, + "loss": 0.835, + "step": 6065 + }, + { + "epoch": 0.69, + "grad_norm": 0.631405770778656, + "learning_rate": 4.3748831690447565e-05, + "loss": 0.769, + "step": 6070 + }, + { + "epoch": 0.69, + "grad_norm": 0.6509888172149658, + "learning_rate": 4.360125603265057e-05, + "loss": 0.7765, + "step": 6075 + }, + { + "epoch": 0.69, + "grad_norm": 0.6003565788269043, + "learning_rate": 4.345386027743005e-05, + "loss": 0.6942, + "step": 6080 + }, + { + "epoch": 0.69, + "grad_norm": 0.7362748384475708, + "learning_rate": 4.330664489495385e-05, + "loss": 0.803, + "step": 6085 + }, + { + "epoch": 0.69, + "grad_norm": 0.7295613288879395, + "learning_rate": 4.315961035481445e-05, + "loss": 0.7736, + "step": 6090 + }, + { + "epoch": 0.69, + "grad_norm": 0.7188911437988281, + "learning_rate": 4.30127571260275e-05, + "loss": 0.7708, + "step": 6095 + }, + { + "epoch": 0.69, + "grad_norm": 0.6692444086074829, + "learning_rate": 4.286608567703024e-05, + "loss": 0.8271, + "step": 6100 + }, + { + "epoch": 0.69, + "grad_norm": 0.8137506246566772, + "learning_rate": 4.271959647568017e-05, + "loss": 0.7915, + "step": 6105 + }, + { + "epoch": 0.69, + "grad_norm": 0.7544528245925903, + "learning_rate": 4.257328998925338e-05, + "loss": 0.7633, + "step": 6110 + }, + { + "epoch": 0.7, + "grad_norm": 0.7398918271064758, + "learning_rate": 4.242716668444304e-05, + "loss": 0.7613, + "step": 6115 + }, + { + "epoch": 0.7, + "grad_norm": 0.6789968609809875, + "learning_rate": 4.2281227027358187e-05, + "loss": 0.8473, + "step": 6120 + }, + { + "epoch": 0.7, + "grad_norm": 0.5836973190307617, + "learning_rate": 4.2135471483521925e-05, + "loss": 0.7006, + "step": 6125 + }, + { + "epoch": 0.7, + "grad_norm": 0.6515111327171326, + "learning_rate": 4.198990051787012e-05, + "loss": 0.7468, + "step": 6130 + }, + { + "epoch": 0.7, + "grad_norm": 0.5469412207603455, + "learning_rate": 4.184451459474983e-05, + "loss": 0.7166, + "step": 6135 + }, + { + "epoch": 0.7, + "grad_norm": 0.6362948417663574, + "learning_rate": 4.169931417791788e-05, + "loss": 0.6851, + "step": 6140 + }, + { + "epoch": 0.7, + "grad_norm": 0.7324738502502441, + "learning_rate": 4.155429973053935e-05, + "loss": 0.6982, + "step": 6145 + }, + { + "epoch": 0.7, + "grad_norm": 0.6752232313156128, + "learning_rate": 4.140947171518609e-05, + "loss": 0.7343, + "step": 6150 + }, + { + "epoch": 0.7, + "grad_norm": 0.7206586003303528, + "learning_rate": 4.126483059383534e-05, + "loss": 0.7781, + "step": 6155 + }, + { + "epoch": 0.7, + "grad_norm": 0.6868919134140015, + "learning_rate": 4.112037682786811e-05, + "loss": 0.8156, + "step": 6160 + }, + { + "epoch": 0.7, + "grad_norm": 0.709814190864563, + "learning_rate": 4.0976110878067783e-05, + "loss": 0.8395, + "step": 6165 + }, + { + "epoch": 0.7, + "grad_norm": 0.6500702500343323, + "learning_rate": 4.083203320461867e-05, + "loss": 0.7951, + "step": 6170 + }, + { + "epoch": 0.7, + "grad_norm": 0.7838833332061768, + "learning_rate": 4.068814426710447e-05, + "loss": 0.8203, + "step": 6175 + }, + { + "epoch": 0.7, + "grad_norm": 0.6506848335266113, + "learning_rate": 4.0544444524506875e-05, + "loss": 0.7022, + "step": 6180 + }, + { + "epoch": 0.7, + "grad_norm": 0.8521379828453064, + "learning_rate": 4.040093443520404e-05, + "loss": 0.7542, + "step": 6185 + }, + { + "epoch": 0.7, + "grad_norm": 0.5580353736877441, + "learning_rate": 4.025761445696929e-05, + "loss": 0.7818, + "step": 6190 + }, + { + "epoch": 0.7, + "grad_norm": 0.8889886736869812, + "learning_rate": 4.011448504696933e-05, + "loss": 0.7498, + "step": 6195 + }, + { + "epoch": 0.7, + "grad_norm": 0.6307147145271301, + "learning_rate": 3.997154666176306e-05, + "loss": 0.8169, + "step": 6200 + }, + { + "epoch": 0.71, + "grad_norm": 0.740788459777832, + "learning_rate": 3.982879975730015e-05, + "loss": 0.6286, + "step": 6205 + }, + { + "epoch": 0.71, + "grad_norm": 0.7221429347991943, + "learning_rate": 3.9686244788919345e-05, + "loss": 0.8662, + "step": 6210 + }, + { + "epoch": 0.71, + "grad_norm": 0.578336238861084, + "learning_rate": 3.9543882211347206e-05, + "loss": 0.7928, + "step": 6215 + }, + { + "epoch": 0.71, + "grad_norm": 0.6942074298858643, + "learning_rate": 3.940171247869658e-05, + "loss": 0.7358, + "step": 6220 + }, + { + "epoch": 0.71, + "grad_norm": 0.8117511868476868, + "learning_rate": 3.925973604446517e-05, + "loss": 0.699, + "step": 6225 + }, + { + "epoch": 0.71, + "grad_norm": 0.6607906818389893, + "learning_rate": 3.91179533615341e-05, + "loss": 0.7651, + "step": 6230 + }, + { + "epoch": 0.71, + "grad_norm": 0.705653190612793, + "learning_rate": 3.8976364882166414e-05, + "loss": 0.8838, + "step": 6235 + }, + { + "epoch": 0.71, + "grad_norm": 0.6348661780357361, + "learning_rate": 3.8834971058005796e-05, + "loss": 0.7253, + "step": 6240 + }, + { + "epoch": 0.71, + "grad_norm": 0.7973907589912415, + "learning_rate": 3.869377234007494e-05, + "loss": 0.7247, + "step": 6245 + }, + { + "epoch": 0.71, + "grad_norm": 0.7183629274368286, + "learning_rate": 3.855276917877407e-05, + "loss": 0.7184, + "step": 6250 + }, + { + "epoch": 0.71, + "grad_norm": 0.6137698292732239, + "learning_rate": 3.8411962023879844e-05, + "loss": 0.8104, + "step": 6255 + }, + { + "epoch": 0.71, + "grad_norm": 0.753842294216156, + "learning_rate": 3.827135132454351e-05, + "loss": 0.7628, + "step": 6260 + }, + { + "epoch": 0.71, + "grad_norm": 0.658284068107605, + "learning_rate": 3.813093752928973e-05, + "loss": 0.7889, + "step": 6265 + }, + { + "epoch": 0.71, + "grad_norm": 0.5758240818977356, + "learning_rate": 3.799072108601511e-05, + "loss": 0.7978, + "step": 6270 + }, + { + "epoch": 0.71, + "grad_norm": 0.6841059327125549, + "learning_rate": 3.78507024419867e-05, + "loss": 0.8056, + "step": 6275 + }, + { + "epoch": 0.71, + "grad_norm": 0.6589455604553223, + "learning_rate": 3.771088204384051e-05, + "loss": 0.7676, + "step": 6280 + }, + { + "epoch": 0.71, + "grad_norm": 0.5679621696472168, + "learning_rate": 3.757126033758028e-05, + "loss": 0.7095, + "step": 6285 + }, + { + "epoch": 0.72, + "grad_norm": 0.6563994288444519, + "learning_rate": 3.7431837768576017e-05, + "loss": 0.7954, + "step": 6290 + }, + { + "epoch": 0.72, + "grad_norm": 0.6838070750236511, + "learning_rate": 3.7292614781562384e-05, + "loss": 0.8108, + "step": 6295 + }, + { + "epoch": 0.72, + "grad_norm": 0.7321529984474182, + "learning_rate": 3.715359182063748e-05, + "loss": 0.8576, + "step": 6300 + }, + { + "epoch": 0.72, + "grad_norm": 0.7272433042526245, + "learning_rate": 3.701476932926132e-05, + "loss": 0.7887, + "step": 6305 + }, + { + "epoch": 0.72, + "grad_norm": 0.7071683406829834, + "learning_rate": 3.68761477502545e-05, + "loss": 0.8411, + "step": 6310 + }, + { + "epoch": 0.72, + "grad_norm": 0.5736758708953857, + "learning_rate": 3.673772752579665e-05, + "loss": 0.7584, + "step": 6315 + }, + { + "epoch": 0.72, + "grad_norm": 0.656456708908081, + "learning_rate": 3.659950909742525e-05, + "loss": 0.8634, + "step": 6320 + }, + { + "epoch": 0.72, + "grad_norm": 0.6820427775382996, + "learning_rate": 3.646149290603398e-05, + "loss": 0.862, + "step": 6325 + }, + { + "epoch": 0.72, + "grad_norm": 0.8717942833900452, + "learning_rate": 3.6323679391871446e-05, + "loss": 0.7477, + "step": 6330 + }, + { + "epoch": 0.72, + "grad_norm": 0.7242932915687561, + "learning_rate": 3.6186068994539745e-05, + "loss": 0.762, + "step": 6335 + }, + { + "epoch": 0.72, + "grad_norm": 0.7470153570175171, + "learning_rate": 3.6048662152993065e-05, + "loss": 0.7616, + "step": 6340 + }, + { + "epoch": 0.72, + "grad_norm": 0.725097119808197, + "learning_rate": 3.59114593055363e-05, + "loss": 0.808, + "step": 6345 + }, + { + "epoch": 0.72, + "grad_norm": 0.6063842177391052, + "learning_rate": 3.5774460889823566e-05, + "loss": 0.8324, + "step": 6350 + }, + { + "epoch": 0.72, + "grad_norm": 0.7853096723556519, + "learning_rate": 3.563766734285704e-05, + "loss": 0.8145, + "step": 6355 + }, + { + "epoch": 0.72, + "grad_norm": 0.6266723871231079, + "learning_rate": 3.5501079100985254e-05, + "loss": 0.7249, + "step": 6360 + }, + { + "epoch": 0.72, + "grad_norm": 0.6518904566764832, + "learning_rate": 3.5364696599901835e-05, + "loss": 0.8258, + "step": 6365 + }, + { + "epoch": 0.72, + "grad_norm": 0.6777641177177429, + "learning_rate": 3.522852027464426e-05, + "loss": 0.8226, + "step": 6370 + }, + { + "epoch": 0.72, + "grad_norm": 0.6241174936294556, + "learning_rate": 3.509255055959224e-05, + "loss": 0.7909, + "step": 6375 + }, + { + "epoch": 0.73, + "grad_norm": 0.7724244594573975, + "learning_rate": 3.495678788846648e-05, + "loss": 0.6656, + "step": 6380 + }, + { + "epoch": 0.73, + "grad_norm": 0.6773399710655212, + "learning_rate": 3.4821232694327224e-05, + "loss": 0.704, + "step": 6385 + }, + { + "epoch": 0.73, + "grad_norm": 0.6904076337814331, + "learning_rate": 3.4685885409572893e-05, + "loss": 0.7869, + "step": 6390 + }, + { + "epoch": 0.73, + "grad_norm": 0.6653614044189453, + "learning_rate": 3.455074646593876e-05, + "loss": 0.8249, + "step": 6395 + }, + { + "epoch": 0.73, + "grad_norm": 0.871178925037384, + "learning_rate": 3.441581629449542e-05, + "loss": 0.8137, + "step": 6400 + }, + { + "epoch": 0.73, + "grad_norm": 0.5883834362030029, + "learning_rate": 3.4281095325647684e-05, + "loss": 0.798, + "step": 6405 + }, + { + "epoch": 0.73, + "grad_norm": 0.6861564517021179, + "learning_rate": 3.41465839891329e-05, + "loss": 0.7559, + "step": 6410 + }, + { + "epoch": 0.73, + "grad_norm": 0.6617395877838135, + "learning_rate": 3.401228271401978e-05, + "loss": 0.782, + "step": 6415 + }, + { + "epoch": 0.73, + "grad_norm": 0.6430420279502869, + "learning_rate": 3.387819192870697e-05, + "loss": 0.7349, + "step": 6420 + }, + { + "epoch": 0.73, + "grad_norm": 0.7476750612258911, + "learning_rate": 3.374431206092168e-05, + "loss": 0.8074, + "step": 6425 + }, + { + "epoch": 0.73, + "grad_norm": 0.6503930687904358, + "learning_rate": 3.3610643537718345e-05, + "loss": 0.7641, + "step": 6430 + }, + { + "epoch": 0.73, + "grad_norm": 0.5740126967430115, + "learning_rate": 3.3477186785477186e-05, + "loss": 0.6907, + "step": 6435 + }, + { + "epoch": 0.73, + "grad_norm": 0.613012433052063, + "learning_rate": 3.334394222990307e-05, + "loss": 0.7404, + "step": 6440 + }, + { + "epoch": 0.73, + "grad_norm": 0.7166250348091125, + "learning_rate": 3.3210910296023776e-05, + "loss": 0.8843, + "step": 6445 + }, + { + "epoch": 0.73, + "grad_norm": 0.7031468152999878, + "learning_rate": 3.3078091408188985e-05, + "loss": 0.7878, + "step": 6450 + }, + { + "epoch": 0.73, + "grad_norm": 0.8461940288543701, + "learning_rate": 3.29454859900688e-05, + "loss": 0.8152, + "step": 6455 + }, + { + "epoch": 0.73, + "grad_norm": 0.7095286846160889, + "learning_rate": 3.281309446465236e-05, + "loss": 0.7507, + "step": 6460 + }, + { + "epoch": 0.74, + "grad_norm": 0.5577227473258972, + "learning_rate": 3.2680917254246515e-05, + "loss": 0.6388, + "step": 6465 + }, + { + "epoch": 0.74, + "grad_norm": 0.6764439940452576, + "learning_rate": 3.2548954780474484e-05, + "loss": 0.8344, + "step": 6470 + }, + { + "epoch": 0.74, + "grad_norm": 0.7055917978286743, + "learning_rate": 3.241720746427456e-05, + "loss": 0.813, + "step": 6475 + }, + { + "epoch": 0.74, + "grad_norm": 0.7273811101913452, + "learning_rate": 3.228567572589864e-05, + "loss": 0.8289, + "step": 6480 + }, + { + "epoch": 0.74, + "grad_norm": 0.8205636739730835, + "learning_rate": 3.215435998491102e-05, + "loss": 0.8189, + "step": 6485 + }, + { + "epoch": 0.74, + "grad_norm": 0.5480372309684753, + "learning_rate": 3.202326066018701e-05, + "loss": 0.8921, + "step": 6490 + }, + { + "epoch": 0.74, + "grad_norm": 0.6462122797966003, + "learning_rate": 3.189237816991161e-05, + "loss": 0.7161, + "step": 6495 + }, + { + "epoch": 0.74, + "grad_norm": 0.678535521030426, + "learning_rate": 3.176171293157798e-05, + "loss": 0.6587, + "step": 6500 + }, + { + "epoch": 0.74, + "grad_norm": 0.6733066439628601, + "learning_rate": 3.163126536198653e-05, + "loss": 0.7188, + "step": 6505 + }, + { + "epoch": 0.74, + "grad_norm": 0.5796012878417969, + "learning_rate": 3.150103587724318e-05, + "loss": 0.7414, + "step": 6510 + }, + { + "epoch": 0.74, + "grad_norm": 0.6976703405380249, + "learning_rate": 3.137102489275824e-05, + "loss": 0.6708, + "step": 6515 + }, + { + "epoch": 0.74, + "grad_norm": 0.7101731300354004, + "learning_rate": 3.1241232823245026e-05, + "loss": 0.6906, + "step": 6520 + }, + { + "epoch": 0.74, + "grad_norm": 0.7651215195655823, + "learning_rate": 3.111166008271866e-05, + "loss": 0.8167, + "step": 6525 + }, + { + "epoch": 0.74, + "grad_norm": 0.7553005218505859, + "learning_rate": 3.098230708449445e-05, + "loss": 0.6965, + "step": 6530 + }, + { + "epoch": 0.74, + "grad_norm": 0.6705849766731262, + "learning_rate": 3.0853174241186865e-05, + "loss": 0.7017, + "step": 6535 + }, + { + "epoch": 0.74, + "grad_norm": 0.6687391996383667, + "learning_rate": 3.072426196470818e-05, + "loss": 0.7522, + "step": 6540 + }, + { + "epoch": 0.74, + "grad_norm": 0.5514757633209229, + "learning_rate": 3.0595570666266996e-05, + "loss": 0.6664, + "step": 6545 + }, + { + "epoch": 0.74, + "grad_norm": 0.7751938700675964, + "learning_rate": 3.046710075636706e-05, + "loss": 0.7551, + "step": 6550 + }, + { + "epoch": 0.75, + "grad_norm": 0.6046425700187683, + "learning_rate": 3.033885264480595e-05, + "loss": 0.7579, + "step": 6555 + }, + { + "epoch": 0.75, + "grad_norm": 0.6243056058883667, + "learning_rate": 3.0210826740673727e-05, + "loss": 0.7391, + "step": 6560 + }, + { + "epoch": 0.75, + "grad_norm": 0.6286137700080872, + "learning_rate": 3.0083023452351633e-05, + "loss": 0.8608, + "step": 6565 + }, + { + "epoch": 0.75, + "grad_norm": 0.6316589117050171, + "learning_rate": 2.99554431875108e-05, + "loss": 0.7529, + "step": 6570 + }, + { + "epoch": 0.75, + "grad_norm": 0.6141951084136963, + "learning_rate": 2.982808635311104e-05, + "loss": 0.7676, + "step": 6575 + }, + { + "epoch": 0.75, + "grad_norm": 0.7614745497703552, + "learning_rate": 2.9700953355399386e-05, + "loss": 0.77, + "step": 6580 + }, + { + "epoch": 0.75, + "grad_norm": 0.6968342661857605, + "learning_rate": 2.9574044599908766e-05, + "loss": 0.836, + "step": 6585 + }, + { + "epoch": 0.75, + "grad_norm": 0.6154257655143738, + "learning_rate": 2.9447360491457033e-05, + "loss": 0.8214, + "step": 6590 + }, + { + "epoch": 0.75, + "grad_norm": 0.7290199398994446, + "learning_rate": 2.93209014341453e-05, + "loss": 0.7725, + "step": 6595 + }, + { + "epoch": 0.75, + "grad_norm": 0.5743769407272339, + "learning_rate": 2.9194667831356837e-05, + "loss": 0.7704, + "step": 6600 + }, + { + "epoch": 0.75, + "grad_norm": 0.6820549964904785, + "learning_rate": 2.9068660085755773e-05, + "loss": 0.7054, + "step": 6605 + }, + { + "epoch": 0.75, + "grad_norm": 0.5897756814956665, + "learning_rate": 2.894287859928577e-05, + "loss": 0.7094, + "step": 6610 + }, + { + "epoch": 0.75, + "grad_norm": 0.7251617908477783, + "learning_rate": 2.881732377316878e-05, + "loss": 0.9164, + "step": 6615 + }, + { + "epoch": 0.75, + "grad_norm": 0.749259889125824, + "learning_rate": 2.8691996007903686e-05, + "loss": 0.8373, + "step": 6620 + }, + { + "epoch": 0.75, + "grad_norm": 0.680907666683197, + "learning_rate": 2.8566895703265217e-05, + "loss": 0.8164, + "step": 6625 + }, + { + "epoch": 0.75, + "grad_norm": 0.7473598122596741, + "learning_rate": 2.844202325830241e-05, + "loss": 0.8554, + "step": 6630 + }, + { + "epoch": 0.75, + "grad_norm": 0.8276415467262268, + "learning_rate": 2.831737907133751e-05, + "loss": 0.7296, + "step": 6635 + }, + { + "epoch": 0.75, + "grad_norm": 0.6819781064987183, + "learning_rate": 2.8192963539964677e-05, + "loss": 0.7994, + "step": 6640 + }, + { + "epoch": 0.76, + "grad_norm": 0.6347323060035706, + "learning_rate": 2.8068777061048668e-05, + "loss": 0.7408, + "step": 6645 + }, + { + "epoch": 0.76, + "grad_norm": 0.6867766976356506, + "learning_rate": 2.794482003072364e-05, + "loss": 0.8205, + "step": 6650 + }, + { + "epoch": 0.76, + "grad_norm": 0.6473438143730164, + "learning_rate": 2.782109284439176e-05, + "loss": 0.7703, + "step": 6655 + }, + { + "epoch": 0.76, + "grad_norm": 0.7229429483413696, + "learning_rate": 2.7697595896722207e-05, + "loss": 0.747, + "step": 6660 + }, + { + "epoch": 0.76, + "grad_norm": 0.6416674852371216, + "learning_rate": 2.7574329581649526e-05, + "loss": 0.714, + "step": 6665 + }, + { + "epoch": 0.76, + "grad_norm": 0.6123269200325012, + "learning_rate": 2.7451294292372686e-05, + "loss": 0.7289, + "step": 6670 + }, + { + "epoch": 0.76, + "grad_norm": 0.7136829495429993, + "learning_rate": 2.732849042135377e-05, + "loss": 0.7079, + "step": 6675 + }, + { + "epoch": 0.76, + "grad_norm": 0.6285436153411865, + "learning_rate": 2.7205918360316597e-05, + "loss": 0.7023, + "step": 6680 + }, + { + "epoch": 0.76, + "grad_norm": 0.6968997716903687, + "learning_rate": 2.7083578500245566e-05, + "loss": 0.6881, + "step": 6685 + }, + { + "epoch": 0.76, + "grad_norm": 0.9395476579666138, + "learning_rate": 2.6961471231384417e-05, + "loss": 0.761, + "step": 6690 + }, + { + "epoch": 0.76, + "grad_norm": 0.7061296105384827, + "learning_rate": 2.6839596943234947e-05, + "loss": 0.8281, + "step": 6695 + }, + { + "epoch": 0.76, + "grad_norm": 0.6853705644607544, + "learning_rate": 2.671795602455578e-05, + "loss": 0.7588, + "step": 6700 + }, + { + "epoch": 0.76, + "grad_norm": 0.7715938687324524, + "learning_rate": 2.6596548863361117e-05, + "loss": 0.8728, + "step": 6705 + }, + { + "epoch": 0.76, + "grad_norm": 0.588621199131012, + "learning_rate": 2.647537584691957e-05, + "loss": 0.7497, + "step": 6710 + }, + { + "epoch": 0.76, + "grad_norm": 0.7696636319160461, + "learning_rate": 2.6354437361752848e-05, + "loss": 0.803, + "step": 6715 + }, + { + "epoch": 0.76, + "grad_norm": 0.7940670251846313, + "learning_rate": 2.623373379363444e-05, + "loss": 0.8015, + "step": 6720 + }, + { + "epoch": 0.76, + "grad_norm": 0.679672360420227, + "learning_rate": 2.6113265527588648e-05, + "loss": 0.72, + "step": 6725 + }, + { + "epoch": 0.77, + "grad_norm": 0.6845086812973022, + "learning_rate": 2.5993032947889117e-05, + "loss": 0.6869, + "step": 6730 + }, + { + "epoch": 0.77, + "grad_norm": 0.7502850890159607, + "learning_rate": 2.5873036438057674e-05, + "loss": 0.677, + "step": 6735 + }, + { + "epoch": 0.77, + "grad_norm": 0.6978954672813416, + "learning_rate": 2.5753276380863144e-05, + "loss": 0.7409, + "step": 6740 + }, + { + "epoch": 0.77, + "grad_norm": 0.624478816986084, + "learning_rate": 2.5633753158320185e-05, + "loss": 0.8546, + "step": 6745 + }, + { + "epoch": 0.77, + "grad_norm": 0.8565587401390076, + "learning_rate": 2.551446715168785e-05, + "loss": 0.7651, + "step": 6750 + }, + { + "epoch": 0.77, + "grad_norm": 0.7041921019554138, + "learning_rate": 2.539541874146857e-05, + "loss": 0.8322, + "step": 6755 + }, + { + "epoch": 0.77, + "grad_norm": 0.6443396806716919, + "learning_rate": 2.5276608307406945e-05, + "loss": 0.7984, + "step": 6760 + }, + { + "epoch": 0.77, + "grad_norm": 0.6369118094444275, + "learning_rate": 2.5158036228488426e-05, + "loss": 0.7303, + "step": 6765 + }, + { + "epoch": 0.77, + "grad_norm": 0.9061431884765625, + "learning_rate": 2.503970288293811e-05, + "loss": 0.7423, + "step": 6770 + }, + { + "epoch": 0.77, + "grad_norm": 0.6243317723274231, + "learning_rate": 2.492160864821964e-05, + "loss": 0.7388, + "step": 6775 + }, + { + "epoch": 0.77, + "grad_norm": 0.8769336938858032, + "learning_rate": 2.480375390103389e-05, + "loss": 0.8951, + "step": 6780 + }, + { + "epoch": 0.77, + "grad_norm": 0.7571940422058105, + "learning_rate": 2.4686139017317833e-05, + "loss": 0.6837, + "step": 6785 + }, + { + "epoch": 0.77, + "grad_norm": 0.7043862342834473, + "learning_rate": 2.4568764372243268e-05, + "loss": 0.6231, + "step": 6790 + }, + { + "epoch": 0.77, + "grad_norm": 0.6525396108627319, + "learning_rate": 2.4451630340215805e-05, + "loss": 0.7283, + "step": 6795 + }, + { + "epoch": 0.77, + "grad_norm": 0.8056162595748901, + "learning_rate": 2.433473729487341e-05, + "loss": 0.7733, + "step": 6800 + }, + { + "epoch": 0.77, + "grad_norm": 0.7512674927711487, + "learning_rate": 2.4218085609085316e-05, + "loss": 0.7004, + "step": 6805 + }, + { + "epoch": 0.77, + "grad_norm": 0.6620556712150574, + "learning_rate": 2.4101675654951006e-05, + "loss": 0.7546, + "step": 6810 + }, + { + "epoch": 0.77, + "grad_norm": 0.6622219085693359, + "learning_rate": 2.3985507803798768e-05, + "loss": 0.7237, + "step": 6815 + }, + { + "epoch": 0.78, + "grad_norm": 0.6944047212600708, + "learning_rate": 2.3869582426184644e-05, + "loss": 0.7919, + "step": 6820 + }, + { + "epoch": 0.78, + "grad_norm": 0.8888928890228271, + "learning_rate": 2.375389989189124e-05, + "loss": 0.7315, + "step": 6825 + }, + { + "epoch": 0.78, + "grad_norm": 0.6875692009925842, + "learning_rate": 2.3638460569926523e-05, + "loss": 0.8011, + "step": 6830 + }, + { + "epoch": 0.78, + "grad_norm": 0.6747264266014099, + "learning_rate": 2.3523264828522662e-05, + "loss": 0.8427, + "step": 6835 + }, + { + "epoch": 0.78, + "grad_norm": 0.7636617422103882, + "learning_rate": 2.3408313035134798e-05, + "loss": 0.8938, + "step": 6840 + }, + { + "epoch": 0.78, + "grad_norm": 0.7376787066459656, + "learning_rate": 2.3293605556440033e-05, + "loss": 0.7261, + "step": 6845 + }, + { + "epoch": 0.78, + "grad_norm": 0.6401947736740112, + "learning_rate": 2.3179142758336026e-05, + "loss": 0.8163, + "step": 6850 + }, + { + "epoch": 0.78, + "grad_norm": 0.5118803977966309, + "learning_rate": 2.3064925005939986e-05, + "loss": 0.6642, + "step": 6855 + }, + { + "epoch": 0.78, + "grad_norm": 0.7035655379295349, + "learning_rate": 2.2950952663587498e-05, + "loss": 0.8234, + "step": 6860 + }, + { + "epoch": 0.78, + "grad_norm": 0.5912356376647949, + "learning_rate": 2.2837226094831278e-05, + "loss": 0.6674, + "step": 6865 + }, + { + "epoch": 0.78, + "grad_norm": 0.6668829917907715, + "learning_rate": 2.272374566244011e-05, + "loss": 0.747, + "step": 6870 + }, + { + "epoch": 0.78, + "grad_norm": 0.674974262714386, + "learning_rate": 2.2610511728397587e-05, + "loss": 0.8882, + "step": 6875 + }, + { + "epoch": 0.78, + "grad_norm": 0.8864641785621643, + "learning_rate": 2.2497524653901146e-05, + "loss": 0.7622, + "step": 6880 + }, + { + "epoch": 0.78, + "grad_norm": 0.7429625391960144, + "learning_rate": 2.238478479936059e-05, + "loss": 0.7334, + "step": 6885 + }, + { + "epoch": 0.78, + "grad_norm": 0.71225905418396, + "learning_rate": 2.2272292524397252e-05, + "loss": 0.7476, + "step": 6890 + }, + { + "epoch": 0.78, + "grad_norm": 0.7212709188461304, + "learning_rate": 2.2160048187842742e-05, + "loss": 0.8207, + "step": 6895 + }, + { + "epoch": 0.78, + "grad_norm": 0.7402102947235107, + "learning_rate": 2.204805214773774e-05, + "loss": 0.8319, + "step": 6900 + }, + { + "epoch": 0.79, + "grad_norm": 0.847400426864624, + "learning_rate": 2.193630476133087e-05, + "loss": 0.7596, + "step": 6905 + }, + { + "epoch": 0.79, + "grad_norm": 0.6712285876274109, + "learning_rate": 2.1824806385077744e-05, + "loss": 0.791, + "step": 6910 + }, + { + "epoch": 0.79, + "grad_norm": 0.7666999697685242, + "learning_rate": 2.1713557374639458e-05, + "loss": 0.7542, + "step": 6915 + }, + { + "epoch": 0.79, + "grad_norm": 0.9645623564720154, + "learning_rate": 2.1602558084881796e-05, + "loss": 0.8158, + "step": 6920 + }, + { + "epoch": 0.79, + "grad_norm": 0.6309619545936584, + "learning_rate": 2.149180886987401e-05, + "loss": 0.7656, + "step": 6925 + }, + { + "epoch": 0.79, + "grad_norm": 0.6789581775665283, + "learning_rate": 2.1381310082887563e-05, + "loss": 0.801, + "step": 6930 + }, + { + "epoch": 0.79, + "grad_norm": 0.7283372282981873, + "learning_rate": 2.127106207639519e-05, + "loss": 0.8794, + "step": 6935 + }, + { + "epoch": 0.79, + "grad_norm": 0.6633392572402954, + "learning_rate": 2.116106520206952e-05, + "loss": 0.9224, + "step": 6940 + }, + { + "epoch": 0.79, + "grad_norm": 0.6747632622718811, + "learning_rate": 2.10513198107823e-05, + "loss": 0.789, + "step": 6945 + }, + { + "epoch": 0.79, + "grad_norm": 0.5490332841873169, + "learning_rate": 2.0941826252602993e-05, + "loss": 0.7228, + "step": 6950 + }, + { + "epoch": 0.79, + "grad_norm": 0.6821791529655457, + "learning_rate": 2.0832584876797723e-05, + "loss": 0.7467, + "step": 6955 + }, + { + "epoch": 0.79, + "grad_norm": 0.5666481852531433, + "learning_rate": 2.0723596031828295e-05, + "loss": 0.7156, + "step": 6960 + }, + { + "epoch": 0.79, + "grad_norm": 0.631689727306366, + "learning_rate": 2.061486006535095e-05, + "loss": 0.7043, + "step": 6965 + }, + { + "epoch": 0.79, + "grad_norm": 0.6669164896011353, + "learning_rate": 2.0506377324215153e-05, + "loss": 0.6862, + "step": 6970 + }, + { + "epoch": 0.79, + "grad_norm": 0.6554184556007385, + "learning_rate": 2.0398148154462826e-05, + "loss": 0.7031, + "step": 6975 + }, + { + "epoch": 0.79, + "grad_norm": 0.6377161145210266, + "learning_rate": 2.029017290132693e-05, + "loss": 0.7723, + "step": 6980 + }, + { + "epoch": 0.79, + "grad_norm": 0.6968027353286743, + "learning_rate": 2.0182451909230493e-05, + "loss": 0.7634, + "step": 6985 + }, + { + "epoch": 0.79, + "grad_norm": 0.6753907799720764, + "learning_rate": 2.0074985521785495e-05, + "loss": 0.8347, + "step": 6990 + }, + { + "epoch": 0.8, + "grad_norm": 0.6418029069900513, + "learning_rate": 1.9967774081791756e-05, + "loss": 0.731, + "step": 6995 + }, + { + "epoch": 0.8, + "grad_norm": 0.630171537399292, + "learning_rate": 1.9860817931235877e-05, + "loss": 0.8453, + "step": 7000 + }, + { + "epoch": 0.8, + "grad_norm": 0.7392057180404663, + "learning_rate": 1.9754117411290096e-05, + "loss": 0.6955, + "step": 7005 + }, + { + "epoch": 0.8, + "grad_norm": 0.7004316449165344, + "learning_rate": 1.9647672862311316e-05, + "loss": 0.7702, + "step": 7010 + }, + { + "epoch": 0.8, + "grad_norm": 0.6444839239120483, + "learning_rate": 1.9541484623839836e-05, + "loss": 0.7155, + "step": 7015 + }, + { + "epoch": 0.8, + "grad_norm": 0.6129523515701294, + "learning_rate": 1.9435553034598398e-05, + "loss": 0.7409, + "step": 7020 + }, + { + "epoch": 0.8, + "grad_norm": 0.8170189261436462, + "learning_rate": 1.9329878432491112e-05, + "loss": 0.752, + "step": 7025 + }, + { + "epoch": 0.8, + "grad_norm": 0.6246253252029419, + "learning_rate": 1.9224461154602292e-05, + "loss": 0.7082, + "step": 7030 + }, + { + "epoch": 0.8, + "grad_norm": 0.7174789309501648, + "learning_rate": 1.9119301537195455e-05, + "loss": 0.8013, + "step": 7035 + }, + { + "epoch": 0.8, + "grad_norm": 0.7129815220832825, + "learning_rate": 1.901439991571221e-05, + "loss": 0.8366, + "step": 7040 + }, + { + "epoch": 0.8, + "grad_norm": 0.696826159954071, + "learning_rate": 1.890975662477128e-05, + "loss": 0.6921, + "step": 7045 + }, + { + "epoch": 0.8, + "grad_norm": 0.6623396277427673, + "learning_rate": 1.8805371998167222e-05, + "loss": 0.8734, + "step": 7050 + }, + { + "epoch": 0.8, + "grad_norm": 0.7665203809738159, + "learning_rate": 1.8701246368869563e-05, + "loss": 0.826, + "step": 7055 + }, + { + "epoch": 0.8, + "grad_norm": 0.6396629810333252, + "learning_rate": 1.859738006902172e-05, + "loss": 0.7085, + "step": 7060 + }, + { + "epoch": 0.8, + "grad_norm": 0.6628134846687317, + "learning_rate": 1.849377342993982e-05, + "loss": 0.7313, + "step": 7065 + }, + { + "epoch": 0.8, + "grad_norm": 0.6394834518432617, + "learning_rate": 1.839042678211176e-05, + "loss": 0.7605, + "step": 7070 + }, + { + "epoch": 0.8, + "grad_norm": 0.6791489124298096, + "learning_rate": 1.8287340455196068e-05, + "loss": 0.8351, + "step": 7075 + }, + { + "epoch": 0.81, + "grad_norm": 0.6268406510353088, + "learning_rate": 1.8184514778020935e-05, + "loss": 0.753, + "step": 7080 + }, + { + "epoch": 0.81, + "grad_norm": 0.7322360873222351, + "learning_rate": 1.80819500785831e-05, + "loss": 0.7054, + "step": 7085 + }, + { + "epoch": 0.81, + "grad_norm": 0.6677795052528381, + "learning_rate": 1.7979646684046782e-05, + "loss": 0.8602, + "step": 7090 + }, + { + "epoch": 0.81, + "grad_norm": 0.6794170141220093, + "learning_rate": 1.787760492074281e-05, + "loss": 0.6957, + "step": 7095 + }, + { + "epoch": 0.81, + "grad_norm": 0.7083120942115784, + "learning_rate": 1.7775825114167344e-05, + "loss": 0.8359, + "step": 7100 + }, + { + "epoch": 0.81, + "grad_norm": 0.7114768624305725, + "learning_rate": 1.767430758898092e-05, + "loss": 0.8783, + "step": 7105 + }, + { + "epoch": 0.81, + "grad_norm": 0.6977761387825012, + "learning_rate": 1.7573052669007552e-05, + "loss": 0.8449, + "step": 7110 + }, + { + "epoch": 0.81, + "grad_norm": 0.7091259360313416, + "learning_rate": 1.7472060677233503e-05, + "loss": 0.7588, + "step": 7115 + }, + { + "epoch": 0.81, + "grad_norm": 0.8661299347877502, + "learning_rate": 1.737133193580638e-05, + "loss": 0.7614, + "step": 7120 + }, + { + "epoch": 0.81, + "grad_norm": 0.6567466855049133, + "learning_rate": 1.727086676603401e-05, + "loss": 0.8487, + "step": 7125 + }, + { + "epoch": 0.81, + "grad_norm": 0.637890100479126, + "learning_rate": 1.7170665488383597e-05, + "loss": 0.8408, + "step": 7130 + }, + { + "epoch": 0.81, + "grad_norm": 0.6123327612876892, + "learning_rate": 1.70707284224804e-05, + "loss": 0.7182, + "step": 7135 + }, + { + "epoch": 0.81, + "grad_norm": 0.7284800410270691, + "learning_rate": 1.697105588710698e-05, + "loss": 0.6767, + "step": 7140 + }, + { + "epoch": 0.81, + "grad_norm": 0.8092273473739624, + "learning_rate": 1.6871648200202127e-05, + "loss": 0.8872, + "step": 7145 + }, + { + "epoch": 0.81, + "grad_norm": 0.6525725722312927, + "learning_rate": 1.677250567885974e-05, + "loss": 0.8425, + "step": 7150 + }, + { + "epoch": 0.81, + "grad_norm": 0.7781700491905212, + "learning_rate": 1.667362863932792e-05, + "loss": 0.8548, + "step": 7155 + }, + { + "epoch": 0.81, + "grad_norm": 0.6901320815086365, + "learning_rate": 1.6575017397007896e-05, + "loss": 0.7168, + "step": 7160 + }, + { + "epoch": 0.81, + "grad_norm": 0.637550413608551, + "learning_rate": 1.6476672266453087e-05, + "loss": 0.8974, + "step": 7165 + }, + { + "epoch": 0.82, + "grad_norm": 0.6621235609054565, + "learning_rate": 1.6378593561368016e-05, + "loss": 0.7816, + "step": 7170 + }, + { + "epoch": 0.82, + "grad_norm": 0.6375472545623779, + "learning_rate": 1.6280781594607364e-05, + "loss": 0.7895, + "step": 7175 + }, + { + "epoch": 0.82, + "grad_norm": 0.6751498579978943, + "learning_rate": 1.6183236678175028e-05, + "loss": 0.8145, + "step": 7180 + }, + { + "epoch": 0.82, + "grad_norm": 0.6726610064506531, + "learning_rate": 1.6085959123222995e-05, + "loss": 0.7143, + "step": 7185 + }, + { + "epoch": 0.82, + "grad_norm": 0.7486894130706787, + "learning_rate": 1.5988949240050343e-05, + "loss": 0.8543, + "step": 7190 + }, + { + "epoch": 0.82, + "grad_norm": 0.6508118510246277, + "learning_rate": 1.5892207338102494e-05, + "loss": 0.7921, + "step": 7195 + }, + { + "epoch": 0.82, + "grad_norm": 0.6627808809280396, + "learning_rate": 1.579573372596993e-05, + "loss": 0.8042, + "step": 7200 + }, + { + "epoch": 0.82, + "grad_norm": 0.6512479782104492, + "learning_rate": 1.5699528711387357e-05, + "loss": 0.7605, + "step": 7205 + }, + { + "epoch": 0.82, + "grad_norm": 0.6039304137229919, + "learning_rate": 1.560359260123272e-05, + "loss": 0.7278, + "step": 7210 + }, + { + "epoch": 0.82, + "grad_norm": 0.7306320667266846, + "learning_rate": 1.550792570152618e-05, + "loss": 0.7868, + "step": 7215 + }, + { + "epoch": 0.82, + "grad_norm": 0.7047690153121948, + "learning_rate": 1.5412528317429197e-05, + "loss": 0.6986, + "step": 7220 + }, + { + "epoch": 0.82, + "grad_norm": 0.7422201037406921, + "learning_rate": 1.531740075324345e-05, + "loss": 0.6588, + "step": 7225 + }, + { + "epoch": 0.82, + "grad_norm": 0.7010562419891357, + "learning_rate": 1.5222543312410042e-05, + "loss": 0.78, + "step": 7230 + }, + { + "epoch": 0.82, + "grad_norm": 0.7242764234542847, + "learning_rate": 1.5127956297508338e-05, + "loss": 0.6492, + "step": 7235 + }, + { + "epoch": 0.82, + "grad_norm": 0.5822871923446655, + "learning_rate": 1.5033640010255145e-05, + "loss": 0.6479, + "step": 7240 + }, + { + "epoch": 0.82, + "grad_norm": 0.7780561447143555, + "learning_rate": 1.493959475150365e-05, + "loss": 0.7035, + "step": 7245 + }, + { + "epoch": 0.82, + "grad_norm": 0.7585102319717407, + "learning_rate": 1.484582082124254e-05, + "loss": 0.7921, + "step": 7250 + }, + { + "epoch": 0.82, + "grad_norm": 0.6618209481239319, + "learning_rate": 1.4752318518594987e-05, + "loss": 0.6766, + "step": 7255 + }, + { + "epoch": 0.83, + "grad_norm": 0.6074077486991882, + "learning_rate": 1.46590881418177e-05, + "loss": 0.7541, + "step": 7260 + }, + { + "epoch": 0.83, + "grad_norm": 0.7971552610397339, + "learning_rate": 1.4566129988300093e-05, + "loss": 0.935, + "step": 7265 + }, + { + "epoch": 0.83, + "grad_norm": 0.6890114545822144, + "learning_rate": 1.4473444354563082e-05, + "loss": 0.6824, + "step": 7270 + }, + { + "epoch": 0.83, + "grad_norm": 0.6999330520629883, + "learning_rate": 1.438103153625835e-05, + "loss": 0.7515, + "step": 7275 + }, + { + "epoch": 0.83, + "grad_norm": 0.7287675738334656, + "learning_rate": 1.4288891828167428e-05, + "loss": 0.7248, + "step": 7280 + }, + { + "epoch": 0.83, + "grad_norm": 0.6378064155578613, + "learning_rate": 1.4197025524200547e-05, + "loss": 0.7629, + "step": 7285 + }, + { + "epoch": 0.83, + "grad_norm": 0.6283854842185974, + "learning_rate": 1.4105432917395911e-05, + "loss": 0.7033, + "step": 7290 + }, + { + "epoch": 0.83, + "grad_norm": 0.7032335996627808, + "learning_rate": 1.4014114299918612e-05, + "loss": 0.8074, + "step": 7295 + }, + { + "epoch": 0.83, + "grad_norm": 0.6264139413833618, + "learning_rate": 1.3923069963059821e-05, + "loss": 0.7572, + "step": 7300 + }, + { + "epoch": 0.83, + "grad_norm": 0.7607216835021973, + "learning_rate": 1.3832300197235748e-05, + "loss": 0.6808, + "step": 7305 + }, + { + "epoch": 0.83, + "grad_norm": 0.7733168005943298, + "learning_rate": 1.3741805291986787e-05, + "loss": 0.7818, + "step": 7310 + }, + { + "epoch": 0.83, + "grad_norm": 0.7297649383544922, + "learning_rate": 1.3651585535976596e-05, + "loss": 0.7182, + "step": 7315 + }, + { + "epoch": 0.83, + "grad_norm": 0.7502012848854065, + "learning_rate": 1.3561641216991162e-05, + "loss": 0.7778, + "step": 7320 + }, + { + "epoch": 0.83, + "grad_norm": 0.7216570377349854, + "learning_rate": 1.3471972621937756e-05, + "loss": 0.7803, + "step": 7325 + }, + { + "epoch": 0.83, + "grad_norm": 0.7050399780273438, + "learning_rate": 1.3382580036844295e-05, + "loss": 0.8175, + "step": 7330 + }, + { + "epoch": 0.83, + "grad_norm": 0.8511192798614502, + "learning_rate": 1.3293463746858182e-05, + "loss": 0.8151, + "step": 7335 + }, + { + "epoch": 0.83, + "grad_norm": 0.7982873320579529, + "learning_rate": 1.3204624036245505e-05, + "loss": 0.7518, + "step": 7340 + }, + { + "epoch": 0.84, + "grad_norm": 0.6801825761795044, + "learning_rate": 1.3116061188390083e-05, + "loss": 0.7761, + "step": 7345 + }, + { + "epoch": 0.84, + "grad_norm": 0.6566172242164612, + "learning_rate": 1.3027775485792681e-05, + "loss": 0.7077, + "step": 7350 + }, + { + "epoch": 0.84, + "grad_norm": 0.6745688319206238, + "learning_rate": 1.2939767210069876e-05, + "loss": 0.7668, + "step": 7355 + }, + { + "epoch": 0.84, + "grad_norm": 0.5891917943954468, + "learning_rate": 1.285203664195338e-05, + "loss": 0.7246, + "step": 7360 + }, + { + "epoch": 0.84, + "grad_norm": 0.7064673900604248, + "learning_rate": 1.2764584061289098e-05, + "loss": 0.734, + "step": 7365 + }, + { + "epoch": 0.84, + "grad_norm": 0.7701008915901184, + "learning_rate": 1.267740974703614e-05, + "loss": 0.8342, + "step": 7370 + }, + { + "epoch": 0.84, + "grad_norm": 0.6421898007392883, + "learning_rate": 1.2590513977266006e-05, + "loss": 0.7911, + "step": 7375 + }, + { + "epoch": 0.84, + "grad_norm": 0.7210098505020142, + "learning_rate": 1.2503897029161715e-05, + "loss": 0.8654, + "step": 7380 + }, + { + "epoch": 0.84, + "grad_norm": 0.8242705464363098, + "learning_rate": 1.2417559179016836e-05, + "loss": 0.798, + "step": 7385 + }, + { + "epoch": 0.84, + "grad_norm": 0.7865302562713623, + "learning_rate": 1.2331500702234722e-05, + "loss": 0.8011, + "step": 7390 + }, + { + "epoch": 0.84, + "grad_norm": 0.6898683309555054, + "learning_rate": 1.2245721873327521e-05, + "loss": 0.8357, + "step": 7395 + }, + { + "epoch": 0.84, + "grad_norm": 0.6212425827980042, + "learning_rate": 1.2160222965915401e-05, + "loss": 0.7448, + "step": 7400 + }, + { + "epoch": 0.84, + "grad_norm": 0.7410294413566589, + "learning_rate": 1.2075004252725619e-05, + "loss": 0.7546, + "step": 7405 + }, + { + "epoch": 0.84, + "grad_norm": 0.6324216723442078, + "learning_rate": 1.199006600559156e-05, + "loss": 0.774, + "step": 7410 + }, + { + "epoch": 0.84, + "grad_norm": 0.8635458946228027, + "learning_rate": 1.190540849545213e-05, + "loss": 0.697, + "step": 7415 + }, + { + "epoch": 0.84, + "grad_norm": 0.6437505483627319, + "learning_rate": 1.1821031992350628e-05, + "loss": 0.797, + "step": 7420 + }, + { + "epoch": 0.84, + "grad_norm": 0.7493045330047607, + "learning_rate": 1.1736936765434004e-05, + "loss": 0.749, + "step": 7425 + }, + { + "epoch": 0.84, + "grad_norm": 0.6989094614982605, + "learning_rate": 1.1653123082951966e-05, + "loss": 0.8166, + "step": 7430 + }, + { + "epoch": 0.85, + "grad_norm": 0.7406939268112183, + "learning_rate": 1.1569591212256237e-05, + "loss": 0.7769, + "step": 7435 + }, + { + "epoch": 0.85, + "grad_norm": 0.6417839527130127, + "learning_rate": 1.1486341419799474e-05, + "loss": 0.7865, + "step": 7440 + }, + { + "epoch": 0.85, + "grad_norm": 0.6294339299201965, + "learning_rate": 1.1403373971134624e-05, + "loss": 0.7634, + "step": 7445 + }, + { + "epoch": 0.85, + "grad_norm": 0.615867018699646, + "learning_rate": 1.1320689130914019e-05, + "loss": 0.689, + "step": 7450 + }, + { + "epoch": 0.85, + "grad_norm": 0.6019276976585388, + "learning_rate": 1.1238287162888483e-05, + "loss": 0.7225, + "step": 7455 + }, + { + "epoch": 0.85, + "grad_norm": 1.1533637046813965, + "learning_rate": 1.1156168329906535e-05, + "loss": 0.7035, + "step": 7460 + }, + { + "epoch": 0.85, + "grad_norm": 0.6268231272697449, + "learning_rate": 1.1074332893913542e-05, + "loss": 0.7861, + "step": 7465 + }, + { + "epoch": 0.85, + "grad_norm": 0.7267367243766785, + "learning_rate": 1.0992781115950868e-05, + "loss": 0.7226, + "step": 7470 + }, + { + "epoch": 0.85, + "grad_norm": 0.6262809634208679, + "learning_rate": 1.0911513256155092e-05, + "loss": 0.7548, + "step": 7475 + }, + { + "epoch": 0.85, + "grad_norm": 0.6905179023742676, + "learning_rate": 1.0830529573757076e-05, + "loss": 0.7447, + "step": 7480 + }, + { + "epoch": 0.85, + "grad_norm": 0.7611731290817261, + "learning_rate": 1.074983032708129e-05, + "loss": 0.7509, + "step": 7485 + }, + { + "epoch": 0.85, + "grad_norm": 0.708593487739563, + "learning_rate": 1.0669415773544866e-05, + "loss": 0.7352, + "step": 7490 + }, + { + "epoch": 0.85, + "grad_norm": 0.6629523038864136, + "learning_rate": 1.0589286169656742e-05, + "loss": 0.7969, + "step": 7495 + }, + { + "epoch": 0.85, + "grad_norm": 0.6062517166137695, + "learning_rate": 1.0509441771017026e-05, + "loss": 0.7943, + "step": 7500 + }, + { + "epoch": 0.85, + "grad_norm": 0.8772883415222168, + "learning_rate": 1.0429882832316006e-05, + "loss": 0.7385, + "step": 7505 + }, + { + "epoch": 0.85, + "grad_norm": 0.7119115591049194, + "learning_rate": 1.0350609607333384e-05, + "loss": 0.7723, + "step": 7510 + }, + { + "epoch": 0.85, + "grad_norm": 0.6540088653564453, + "learning_rate": 1.0271622348937581e-05, + "loss": 0.713, + "step": 7515 + }, + { + "epoch": 0.86, + "grad_norm": 0.6274105906486511, + "learning_rate": 1.0192921309084702e-05, + "loss": 0.7161, + "step": 7520 + }, + { + "epoch": 0.86, + "grad_norm": 0.6521838307380676, + "learning_rate": 1.0114506738817942e-05, + "loss": 0.6904, + "step": 7525 + }, + { + "epoch": 0.86, + "grad_norm": 0.5840886235237122, + "learning_rate": 1.0036378888266663e-05, + "loss": 0.736, + "step": 7530 + }, + { + "epoch": 0.86, + "grad_norm": 0.7056083679199219, + "learning_rate": 9.9585380066457e-06, + "loss": 0.7663, + "step": 7535 + }, + { + "epoch": 0.86, + "grad_norm": 0.7185655236244202, + "learning_rate": 9.880984342254462e-06, + "loss": 0.7682, + "step": 7540 + }, + { + "epoch": 0.86, + "grad_norm": 0.7891202569007874, + "learning_rate": 9.803718142476181e-06, + "loss": 0.8107, + "step": 7545 + }, + { + "epoch": 0.86, + "grad_norm": 0.8266674876213074, + "learning_rate": 9.72673965377714e-06, + "loss": 0.6771, + "step": 7550 + }, + { + "epoch": 0.86, + "grad_norm": 0.6623797416687012, + "learning_rate": 9.650049121705851e-06, + "loss": 0.774, + "step": 7555 + }, + { + "epoch": 0.86, + "grad_norm": 0.7579045295715332, + "learning_rate": 9.573646790892298e-06, + "loss": 0.8548, + "step": 7560 + }, + { + "epoch": 0.86, + "grad_norm": 0.7818818688392639, + "learning_rate": 9.497532905047202e-06, + "loss": 0.7678, + "step": 7565 + }, + { + "epoch": 0.86, + "grad_norm": 0.6764331459999084, + "learning_rate": 9.421707706961136e-06, + "loss": 0.7864, + "step": 7570 + }, + { + "epoch": 0.86, + "grad_norm": 0.6758101582527161, + "learning_rate": 9.34617143850378e-06, + "loss": 0.7407, + "step": 7575 + }, + { + "epoch": 0.86, + "grad_norm": 0.7731313109397888, + "learning_rate": 9.270924340623267e-06, + "loss": 0.6977, + "step": 7580 + }, + { + "epoch": 0.86, + "grad_norm": 0.7126657366752625, + "learning_rate": 9.195966653345255e-06, + "loss": 0.7612, + "step": 7585 + }, + { + "epoch": 0.86, + "grad_norm": 0.8116534948348999, + "learning_rate": 9.121298615772256e-06, + "loss": 0.85, + "step": 7590 + }, + { + "epoch": 0.86, + "grad_norm": 0.7227126359939575, + "learning_rate": 9.04692046608281e-06, + "loss": 0.7488, + "step": 7595 + }, + { + "epoch": 0.86, + "grad_norm": 0.7809383273124695, + "learning_rate": 8.972832441530876e-06, + "loss": 0.8398, + "step": 7600 + }, + { + "epoch": 0.86, + "grad_norm": 0.7499304413795471, + "learning_rate": 8.899034778444804e-06, + "loss": 0.7438, + "step": 7605 + }, + { + "epoch": 0.87, + "grad_norm": 0.7327824234962463, + "learning_rate": 8.825527712226833e-06, + "loss": 0.8292, + "step": 7610 + }, + { + "epoch": 0.87, + "grad_norm": 0.7105828523635864, + "learning_rate": 8.752311477352259e-06, + "loss": 0.6284, + "step": 7615 + }, + { + "epoch": 0.87, + "grad_norm": 0.7823231220245361, + "learning_rate": 8.679386307368631e-06, + "loss": 0.7742, + "step": 7620 + }, + { + "epoch": 0.87, + "grad_norm": 0.6463223099708557, + "learning_rate": 8.606752434895061e-06, + "loss": 0.6789, + "step": 7625 + }, + { + "epoch": 0.87, + "grad_norm": 0.6511633396148682, + "learning_rate": 8.53441009162148e-06, + "loss": 0.706, + "step": 7630 + }, + { + "epoch": 0.87, + "grad_norm": 0.6864632368087769, + "learning_rate": 8.462359508307882e-06, + "loss": 0.7008, + "step": 7635 + }, + { + "epoch": 0.87, + "grad_norm": 0.7921543717384338, + "learning_rate": 8.390600914783598e-06, + "loss": 0.7399, + "step": 7640 + }, + { + "epoch": 0.87, + "grad_norm": 0.6895166039466858, + "learning_rate": 8.319134539946549e-06, + "loss": 0.8166, + "step": 7645 + }, + { + "epoch": 0.87, + "grad_norm": 0.7610728144645691, + "learning_rate": 8.247960611762562e-06, + "loss": 0.7897, + "step": 7650 + }, + { + "epoch": 0.87, + "grad_norm": 0.566939115524292, + "learning_rate": 8.177079357264583e-06, + "loss": 0.744, + "step": 7655 + }, + { + "epoch": 0.87, + "grad_norm": 0.7656039595603943, + "learning_rate": 8.10649100255194e-06, + "loss": 0.8224, + "step": 7660 + }, + { + "epoch": 0.87, + "grad_norm": 0.7149258852005005, + "learning_rate": 8.036195772789734e-06, + "loss": 0.76, + "step": 7665 + }, + { + "epoch": 0.87, + "grad_norm": 0.7486068606376648, + "learning_rate": 7.966193892208007e-06, + "loss": 0.718, + "step": 7670 + }, + { + "epoch": 0.87, + "grad_norm": 0.674845278263092, + "learning_rate": 7.896485584101066e-06, + "loss": 0.7689, + "step": 7675 + }, + { + "epoch": 0.87, + "grad_norm": 0.74075847864151, + "learning_rate": 7.827071070826775e-06, + "loss": 0.8496, + "step": 7680 + }, + { + "epoch": 0.87, + "grad_norm": 0.6453339457511902, + "learning_rate": 7.757950573805839e-06, + "loss": 0.7359, + "step": 7685 + }, + { + "epoch": 0.87, + "grad_norm": 0.7150562405586243, + "learning_rate": 7.689124313521112e-06, + "loss": 0.7677, + "step": 7690 + }, + { + "epoch": 0.87, + "grad_norm": 0.6305781602859497, + "learning_rate": 7.620592509516844e-06, + "loss": 0.7111, + "step": 7695 + }, + { + "epoch": 0.88, + "grad_norm": 0.639238178730011, + "learning_rate": 7.5523553803980795e-06, + "loss": 0.7908, + "step": 7700 + }, + { + "epoch": 0.88, + "grad_norm": 0.6141027808189392, + "learning_rate": 7.48441314382986e-06, + "loss": 0.7716, + "step": 7705 + }, + { + "epoch": 0.88, + "grad_norm": 0.732420802116394, + "learning_rate": 7.416766016536569e-06, + "loss": 0.7698, + "step": 7710 + }, + { + "epoch": 0.88, + "grad_norm": 0.7163684368133545, + "learning_rate": 7.349414214301243e-06, + "loss": 0.7367, + "step": 7715 + }, + { + "epoch": 0.88, + "grad_norm": 0.7079850435256958, + "learning_rate": 7.282357951964902e-06, + "loss": 0.7766, + "step": 7720 + }, + { + "epoch": 0.88, + "grad_norm": 0.6404287815093994, + "learning_rate": 7.215597443425815e-06, + "loss": 0.7758, + "step": 7725 + }, + { + "epoch": 0.88, + "grad_norm": 0.66849285364151, + "learning_rate": 7.149132901638844e-06, + "loss": 0.7765, + "step": 7730 + }, + { + "epoch": 0.88, + "grad_norm": 0.6282253265380859, + "learning_rate": 7.082964538614823e-06, + "loss": 0.7917, + "step": 7735 + }, + { + "epoch": 0.88, + "grad_norm": 0.7129911184310913, + "learning_rate": 7.017092565419747e-06, + "loss": 0.7986, + "step": 7740 + }, + { + "epoch": 0.88, + "grad_norm": 0.6497909426689148, + "learning_rate": 6.951517192174195e-06, + "loss": 0.7719, + "step": 7745 + }, + { + "epoch": 0.88, + "grad_norm": 0.7094148397445679, + "learning_rate": 6.88623862805271e-06, + "loss": 0.7931, + "step": 7750 + }, + { + "epoch": 0.88, + "grad_norm": 0.5958633422851562, + "learning_rate": 6.821257081282972e-06, + "loss": 0.8135, + "step": 7755 + }, + { + "epoch": 0.88, + "grad_norm": 0.6889587044715881, + "learning_rate": 6.756572759145285e-06, + "loss": 0.7241, + "step": 7760 + }, + { + "epoch": 0.88, + "grad_norm": 0.6314862370491028, + "learning_rate": 6.6921858679718345e-06, + "loss": 0.598, + "step": 7765 + }, + { + "epoch": 0.88, + "grad_norm": 0.6266872882843018, + "learning_rate": 6.62809661314604e-06, + "loss": 0.8213, + "step": 7770 + }, + { + "epoch": 0.88, + "grad_norm": 0.8507253527641296, + "learning_rate": 6.564305199101939e-06, + "loss": 0.8546, + "step": 7775 + }, + { + "epoch": 0.88, + "grad_norm": 0.6322931051254272, + "learning_rate": 6.500811829323461e-06, + "loss": 0.8104, + "step": 7780 + }, + { + "epoch": 0.89, + "grad_norm": 0.5790343880653381, + "learning_rate": 6.4376167063438965e-06, + "loss": 0.6864, + "step": 7785 + }, + { + "epoch": 0.89, + "grad_norm": 0.7141791582107544, + "learning_rate": 6.3747200317451294e-06, + "loss": 0.7184, + "step": 7790 + }, + { + "epoch": 0.89, + "grad_norm": 0.7355352640151978, + "learning_rate": 6.3121220061570065e-06, + "loss": 0.7836, + "step": 7795 + }, + { + "epoch": 0.89, + "grad_norm": 0.6871746778488159, + "learning_rate": 6.249822829256835e-06, + "loss": 0.7652, + "step": 7800 + }, + { + "epoch": 0.89, + "grad_norm": 0.8354716897010803, + "learning_rate": 6.1878226997685525e-06, + "loss": 0.7652, + "step": 7805 + }, + { + "epoch": 0.89, + "grad_norm": 0.6610546708106995, + "learning_rate": 6.1261218154622264e-06, + "loss": 0.7605, + "step": 7810 + }, + { + "epoch": 0.89, + "grad_norm": 0.6841945648193359, + "learning_rate": 6.064720373153365e-06, + "loss": 0.6656, + "step": 7815 + }, + { + "epoch": 0.89, + "grad_norm": 0.7112994194030762, + "learning_rate": 6.003618568702351e-06, + "loss": 0.7717, + "step": 7820 + }, + { + "epoch": 0.89, + "grad_norm": 0.6608765721321106, + "learning_rate": 5.942816597013712e-06, + "loss": 0.7229, + "step": 7825 + }, + { + "epoch": 0.89, + "grad_norm": 0.6349248886108398, + "learning_rate": 5.882314652035581e-06, + "loss": 0.7397, + "step": 7830 + }, + { + "epoch": 0.89, + "grad_norm": 0.7428655624389648, + "learning_rate": 5.822112926759071e-06, + "loss": 0.7451, + "step": 7835 + }, + { + "epoch": 0.89, + "grad_norm": 0.6516420245170593, + "learning_rate": 5.7622116132176495e-06, + "loss": 0.6874, + "step": 7840 + }, + { + "epoch": 0.89, + "grad_norm": 0.6775025129318237, + "learning_rate": 5.7026109024864716e-06, + "loss": 0.8085, + "step": 7845 + }, + { + "epoch": 0.89, + "grad_norm": 0.6347958445549011, + "learning_rate": 5.643310984681882e-06, + "loss": 0.7767, + "step": 7850 + }, + { + "epoch": 0.89, + "grad_norm": 0.7689727544784546, + "learning_rate": 5.5843120489607045e-06, + "loss": 0.7772, + "step": 7855 + }, + { + "epoch": 0.89, + "grad_norm": 0.6171656250953674, + "learning_rate": 5.525614283519697e-06, + "loss": 0.7042, + "step": 7860 + }, + { + "epoch": 0.89, + "grad_norm": 0.7996339797973633, + "learning_rate": 5.46721787559491e-06, + "loss": 0.8328, + "step": 7865 + }, + { + "epoch": 0.89, + "grad_norm": 0.9136884212493896, + "learning_rate": 5.409123011461159e-06, + "loss": 0.7864, + "step": 7870 + }, + { + "epoch": 0.9, + "grad_norm": 0.7024335265159607, + "learning_rate": 5.351329876431377e-06, + "loss": 0.7926, + "step": 7875 + }, + { + "epoch": 0.9, + "grad_norm": 0.8358083367347717, + "learning_rate": 5.293838654855965e-06, + "loss": 0.7301, + "step": 7880 + }, + { + "epoch": 0.9, + "grad_norm": 0.6557454466819763, + "learning_rate": 5.236649530122361e-06, + "loss": 0.8171, + "step": 7885 + }, + { + "epoch": 0.9, + "grad_norm": 0.7085033059120178, + "learning_rate": 5.17976268465431e-06, + "loss": 0.8429, + "step": 7890 + }, + { + "epoch": 0.9, + "grad_norm": 0.7088157534599304, + "learning_rate": 5.123178299911357e-06, + "loss": 0.8049, + "step": 7895 + }, + { + "epoch": 0.9, + "grad_norm": 0.7027326822280884, + "learning_rate": 5.0668965563882235e-06, + "loss": 0.8121, + "step": 7900 + }, + { + "epoch": 0.9, + "grad_norm": 0.5936857461929321, + "learning_rate": 5.0109176336142984e-06, + "loss": 0.6958, + "step": 7905 + }, + { + "epoch": 0.9, + "grad_norm": 0.7407312989234924, + "learning_rate": 4.95524171015298e-06, + "loss": 0.6585, + "step": 7910 + }, + { + "epoch": 0.9, + "grad_norm": 0.8003597855567932, + "learning_rate": 4.899868963601173e-06, + "loss": 0.6724, + "step": 7915 + }, + { + "epoch": 0.9, + "grad_norm": 0.6756457090377808, + "learning_rate": 4.844799570588699e-06, + "loss": 0.722, + "step": 7920 + }, + { + "epoch": 0.9, + "grad_norm": 0.7574281096458435, + "learning_rate": 4.79003370677773e-06, + "loss": 0.8335, + "step": 7925 + }, + { + "epoch": 0.9, + "grad_norm": 0.71590656042099, + "learning_rate": 4.735571546862217e-06, + "loss": 0.7708, + "step": 7930 + }, + { + "epoch": 0.9, + "grad_norm": 0.5873979330062866, + "learning_rate": 4.681413264567358e-06, + "loss": 0.6377, + "step": 7935 + }, + { + "epoch": 0.9, + "grad_norm": 0.7385571599006653, + "learning_rate": 4.627559032649031e-06, + "loss": 0.7705, + "step": 7940 + }, + { + "epoch": 0.9, + "grad_norm": 0.5803728103637695, + "learning_rate": 4.574009022893255e-06, + "loss": 0.7057, + "step": 7945 + }, + { + "epoch": 0.9, + "grad_norm": 0.6368845105171204, + "learning_rate": 4.520763406115592e-06, + "loss": 0.7599, + "step": 7950 + }, + { + "epoch": 0.9, + "grad_norm": 0.7460334897041321, + "learning_rate": 4.467822352160722e-06, + "loss": 0.7941, + "step": 7955 + }, + { + "epoch": 0.91, + "grad_norm": 0.6857527494430542, + "learning_rate": 4.415186029901719e-06, + "loss": 0.7451, + "step": 7960 + }, + { + "epoch": 0.91, + "grad_norm": 0.7004684209823608, + "learning_rate": 4.362854607239652e-06, + "loss": 0.7884, + "step": 7965 + }, + { + "epoch": 0.91, + "grad_norm": 0.7447669506072998, + "learning_rate": 4.310828251103072e-06, + "loss": 0.7734, + "step": 7970 + }, + { + "epoch": 0.91, + "grad_norm": 0.6418143510818481, + "learning_rate": 4.259107127447348e-06, + "loss": 0.7259, + "step": 7975 + }, + { + "epoch": 0.91, + "grad_norm": 0.680642306804657, + "learning_rate": 4.20769140125421e-06, + "loss": 0.7835, + "step": 7980 + }, + { + "epoch": 0.91, + "grad_norm": 0.6749563217163086, + "learning_rate": 4.156581236531265e-06, + "loss": 0.7962, + "step": 7985 + }, + { + "epoch": 0.91, + "grad_norm": 0.6090324521064758, + "learning_rate": 4.1057767963113895e-06, + "loss": 0.7743, + "step": 7990 + }, + { + "epoch": 0.91, + "grad_norm": 0.5747905969619751, + "learning_rate": 4.055278242652272e-06, + "loss": 0.7332, + "step": 7995 + }, + { + "epoch": 0.91, + "grad_norm": 0.59056156873703, + "learning_rate": 4.00508573663585e-06, + "loss": 0.796, + "step": 8000 + }, + { + "epoch": 0.91, + "grad_norm": 0.7366644740104675, + "learning_rate": 3.955199438367874e-06, + "loss": 0.6988, + "step": 8005 + }, + { + "epoch": 0.91, + "grad_norm": 0.8058913946151733, + "learning_rate": 3.905619506977287e-06, + "loss": 0.7612, + "step": 8010 + }, + { + "epoch": 0.91, + "grad_norm": 0.696850061416626, + "learning_rate": 3.85634610061576e-06, + "loss": 0.7504, + "step": 8015 + }, + { + "epoch": 0.91, + "grad_norm": 0.636525571346283, + "learning_rate": 3.807379376457276e-06, + "loss": 0.8115, + "step": 8020 + }, + { + "epoch": 0.91, + "grad_norm": 0.6842799186706543, + "learning_rate": 3.7587194906974934e-06, + "loss": 0.7669, + "step": 8025 + }, + { + "epoch": 0.91, + "grad_norm": 0.665556788444519, + "learning_rate": 3.7103665985533275e-06, + "loss": 0.8176, + "step": 8030 + }, + { + "epoch": 0.91, + "grad_norm": 0.7209974527359009, + "learning_rate": 3.662320854262413e-06, + "loss": 0.7631, + "step": 8035 + }, + { + "epoch": 0.91, + "grad_norm": 0.6841431856155396, + "learning_rate": 3.61458241108269e-06, + "loss": 0.8565, + "step": 8040 + }, + { + "epoch": 0.91, + "grad_norm": 0.6740476489067078, + "learning_rate": 3.567151421291781e-06, + "loss": 0.7445, + "step": 8045 + }, + { + "epoch": 0.92, + "grad_norm": 0.6454640626907349, + "learning_rate": 3.5200280361866287e-06, + "loss": 0.7506, + "step": 8050 + }, + { + "epoch": 0.92, + "grad_norm": 0.696540355682373, + "learning_rate": 3.473212406082993e-06, + "loss": 0.7865, + "step": 8055 + }, + { + "epoch": 0.92, + "grad_norm": 0.6765589118003845, + "learning_rate": 3.426704680314896e-06, + "loss": 0.7273, + "step": 8060 + }, + { + "epoch": 0.92, + "grad_norm": 0.7231236100196838, + "learning_rate": 3.3805050072342246e-06, + "loss": 0.7769, + "step": 8065 + }, + { + "epoch": 0.92, + "grad_norm": 0.7834519147872925, + "learning_rate": 3.334613534210218e-06, + "loss": 0.7718, + "step": 8070 + }, + { + "epoch": 0.92, + "grad_norm": 0.7390478253364563, + "learning_rate": 3.2890304076290122e-06, + "loss": 0.7633, + "step": 8075 + }, + { + "epoch": 0.92, + "grad_norm": 0.6524391174316406, + "learning_rate": 3.2437557728931643e-06, + "loss": 0.7352, + "step": 8080 + }, + { + "epoch": 0.92, + "grad_norm": 0.7138876914978027, + "learning_rate": 3.1987897744212068e-06, + "loss": 0.7841, + "step": 8085 + }, + { + "epoch": 0.92, + "grad_norm": 0.6502974033355713, + "learning_rate": 3.1541325556471713e-06, + "loss": 0.8611, + "step": 8090 + }, + { + "epoch": 0.92, + "grad_norm": 0.7619243860244751, + "learning_rate": 3.1097842590201433e-06, + "loss": 0.8266, + "step": 8095 + }, + { + "epoch": 0.92, + "grad_norm": 0.6988756060600281, + "learning_rate": 3.06574502600373e-06, + "loss": 0.7378, + "step": 8100 + }, + { + "epoch": 0.92, + "grad_norm": 0.6371370553970337, + "learning_rate": 3.0220149970757947e-06, + "loss": 0.7703, + "step": 8105 + }, + { + "epoch": 0.92, + "grad_norm": 0.6489354372024536, + "learning_rate": 2.9785943117277893e-06, + "loss": 0.7766, + "step": 8110 + }, + { + "epoch": 0.92, + "grad_norm": 0.757959246635437, + "learning_rate": 2.9354831084644652e-06, + "loss": 0.8003, + "step": 8115 + }, + { + "epoch": 0.92, + "grad_norm": 0.7314863204956055, + "learning_rate": 2.8926815248033533e-06, + "loss": 0.8064, + "step": 8120 + }, + { + "epoch": 0.92, + "grad_norm": 0.6922609210014343, + "learning_rate": 2.8501896972743748e-06, + "loss": 0.6391, + "step": 8125 + }, + { + "epoch": 0.92, + "grad_norm": 0.7072837352752686, + "learning_rate": 2.8080077614193513e-06, + "loss": 0.6651, + "step": 8130 + }, + { + "epoch": 0.92, + "grad_norm": 0.6854616403579712, + "learning_rate": 2.766135851791607e-06, + "loss": 0.8429, + "step": 8135 + }, + { + "epoch": 0.93, + "grad_norm": 0.6851674318313599, + "learning_rate": 2.724574101955557e-06, + "loss": 0.7833, + "step": 8140 + }, + { + "epoch": 0.93, + "grad_norm": 0.7848556041717529, + "learning_rate": 2.6833226444862526e-06, + "loss": 0.7674, + "step": 8145 + }, + { + "epoch": 0.93, + "grad_norm": 0.6435510516166687, + "learning_rate": 2.6423816109689357e-06, + "loss": 0.6399, + "step": 8150 + }, + { + "epoch": 0.93, + "grad_norm": 0.6673296689987183, + "learning_rate": 2.6017511319986752e-06, + "loss": 0.7375, + "step": 8155 + }, + { + "epoch": 0.93, + "grad_norm": 0.7312006950378418, + "learning_rate": 2.56143133717992e-06, + "loss": 0.7741, + "step": 8160 + }, + { + "epoch": 0.93, + "grad_norm": 0.6996206641197205, + "learning_rate": 2.5214223551260686e-06, + "loss": 0.7002, + "step": 8165 + }, + { + "epoch": 0.93, + "grad_norm": 0.7395245432853699, + "learning_rate": 2.481724313459111e-06, + "loss": 0.7389, + "step": 8170 + }, + { + "epoch": 0.93, + "grad_norm": 0.8081804513931274, + "learning_rate": 2.4423373388091753e-06, + "loss": 0.7366, + "step": 8175 + }, + { + "epoch": 0.93, + "grad_norm": 0.7896558046340942, + "learning_rate": 2.4032615568141183e-06, + "loss": 0.7466, + "step": 8180 + }, + { + "epoch": 0.93, + "grad_norm": 0.6015843152999878, + "learning_rate": 2.3644970921191445e-06, + "loss": 0.701, + "step": 8185 + }, + { + "epoch": 0.93, + "grad_norm": 0.8499304056167603, + "learning_rate": 2.326044068376465e-06, + "loss": 0.7778, + "step": 8190 + }, + { + "epoch": 0.93, + "grad_norm": 0.5991072058677673, + "learning_rate": 2.287902608244774e-06, + "loss": 0.7927, + "step": 8195 + }, + { + "epoch": 0.93, + "grad_norm": 0.6677555441856384, + "learning_rate": 2.250072833388972e-06, + "loss": 0.7374, + "step": 8200 + }, + { + "epoch": 0.93, + "grad_norm": 1.1476563215255737, + "learning_rate": 2.2125548644797323e-06, + "loss": 0.6659, + "step": 8205 + }, + { + "epoch": 0.93, + "grad_norm": 0.6608490347862244, + "learning_rate": 2.1753488211931016e-06, + "loss": 0.8061, + "step": 8210 + }, + { + "epoch": 0.93, + "grad_norm": 0.7551729679107666, + "learning_rate": 2.1384548222101342e-06, + "loss": 0.7791, + "step": 8215 + }, + { + "epoch": 0.93, + "grad_norm": 0.7040895819664001, + "learning_rate": 2.1018729852165574e-06, + "loss": 0.7368, + "step": 8220 + }, + { + "epoch": 0.94, + "grad_norm": 0.8961385488510132, + "learning_rate": 2.065603426902296e-06, + "loss": 0.8227, + "step": 8225 + }, + { + "epoch": 0.94, + "grad_norm": 0.5928700566291809, + "learning_rate": 2.0296462629611934e-06, + "loss": 0.7019, + "step": 8230 + }, + { + "epoch": 0.94, + "grad_norm": 0.6719995737075806, + "learning_rate": 1.994001608090612e-06, + "loss": 0.6937, + "step": 8235 + }, + { + "epoch": 0.94, + "grad_norm": 0.6805534362792969, + "learning_rate": 1.9586695759910233e-06, + "loss": 0.6968, + "step": 8240 + }, + { + "epoch": 0.94, + "grad_norm": 0.6675357818603516, + "learning_rate": 1.92365027936573e-06, + "loss": 0.79, + "step": 8245 + }, + { + "epoch": 0.94, + "grad_norm": 0.7165231704711914, + "learning_rate": 1.888943829920431e-06, + "loss": 0.7975, + "step": 8250 + }, + { + "epoch": 0.94, + "grad_norm": 0.663772463798523, + "learning_rate": 1.8545503383629147e-06, + "loss": 0.64, + "step": 8255 + }, + { + "epoch": 0.94, + "grad_norm": 0.6648406982421875, + "learning_rate": 1.8204699144026893e-06, + "loss": 0.7558, + "step": 8260 + }, + { + "epoch": 0.94, + "grad_norm": 0.707249641418457, + "learning_rate": 1.7867026667505725e-06, + "loss": 0.8406, + "step": 8265 + }, + { + "epoch": 0.94, + "grad_norm": 0.6816351413726807, + "learning_rate": 1.7532487031184819e-06, + "loss": 0.8215, + "step": 8270 + }, + { + "epoch": 0.94, + "grad_norm": 0.7135679721832275, + "learning_rate": 1.720108130218967e-06, + "loss": 0.7175, + "step": 8275 + }, + { + "epoch": 0.94, + "grad_norm": 0.6640385389328003, + "learning_rate": 1.6872810537649331e-06, + "loss": 0.7476, + "step": 8280 + }, + { + "epoch": 0.94, + "grad_norm": 0.6773399114608765, + "learning_rate": 1.6547675784692517e-06, + "loss": 0.6793, + "step": 8285 + }, + { + "epoch": 0.94, + "grad_norm": 0.6559042930603027, + "learning_rate": 1.6225678080444951e-06, + "loss": 0.7233, + "step": 8290 + }, + { + "epoch": 0.94, + "grad_norm": 0.8715654015541077, + "learning_rate": 1.5906818452025463e-06, + "loss": 0.8084, + "step": 8295 + }, + { + "epoch": 0.94, + "grad_norm": 0.6095733046531677, + "learning_rate": 1.5591097916543006e-06, + "loss": 0.6426, + "step": 8300 + }, + { + "epoch": 0.94, + "grad_norm": 0.669862687587738, + "learning_rate": 1.5278517481093436e-06, + "loss": 0.6873, + "step": 8305 + }, + { + "epoch": 0.94, + "grad_norm": 0.6891058683395386, + "learning_rate": 1.4969078142756277e-06, + "loss": 0.8274, + "step": 8310 + }, + { + "epoch": 0.95, + "grad_norm": 0.6286734938621521, + "learning_rate": 1.4662780888591076e-06, + "loss": 0.7603, + "step": 8315 + }, + { + "epoch": 0.95, + "grad_norm": 0.6344169974327087, + "learning_rate": 1.4359626695635176e-06, + "loss": 0.7516, + "step": 8320 + }, + { + "epoch": 0.95, + "grad_norm": 0.7188591361045837, + "learning_rate": 1.405961653089971e-06, + "loss": 0.7821, + "step": 8325 + }, + { + "epoch": 0.95, + "grad_norm": 0.6320931911468506, + "learning_rate": 1.3762751351367064e-06, + "loss": 0.8255, + "step": 8330 + }, + { + "epoch": 0.95, + "grad_norm": 0.6263067126274109, + "learning_rate": 1.3469032103987534e-06, + "loss": 0.8558, + "step": 8335 + }, + { + "epoch": 0.95, + "grad_norm": 0.7200011014938354, + "learning_rate": 1.317845972567655e-06, + "loss": 0.7277, + "step": 8340 + }, + { + "epoch": 0.95, + "grad_norm": 0.8542875647544861, + "learning_rate": 1.289103514331147e-06, + "loss": 0.7618, + "step": 8345 + }, + { + "epoch": 0.95, + "grad_norm": 0.6276410818099976, + "learning_rate": 1.2606759273728564e-06, + "loss": 0.7309, + "step": 8350 + }, + { + "epoch": 0.95, + "grad_norm": 0.7522603273391724, + "learning_rate": 1.2325633023720695e-06, + "loss": 0.7636, + "step": 8355 + }, + { + "epoch": 0.95, + "grad_norm": 0.7600675225257874, + "learning_rate": 1.204765729003332e-06, + "loss": 0.8992, + "step": 8360 + }, + { + "epoch": 0.95, + "grad_norm": 0.6711683869361877, + "learning_rate": 1.1772832959362933e-06, + "loss": 0.806, + "step": 8365 + }, + { + "epoch": 0.95, + "grad_norm": 0.5430208444595337, + "learning_rate": 1.150116090835307e-06, + "loss": 0.7333, + "step": 8370 + }, + { + "epoch": 0.95, + "grad_norm": 0.7371984720230103, + "learning_rate": 1.1232642003592197e-06, + "loss": 0.772, + "step": 8375 + }, + { + "epoch": 0.95, + "grad_norm": 0.6525667905807495, + "learning_rate": 1.096727710161094e-06, + "loss": 0.7436, + "step": 8380 + }, + { + "epoch": 0.95, + "grad_norm": 0.7685186862945557, + "learning_rate": 1.070506704887886e-06, + "loss": 0.829, + "step": 8385 + }, + { + "epoch": 0.95, + "grad_norm": 0.6457349061965942, + "learning_rate": 1.0446012681802343e-06, + "loss": 0.7674, + "step": 8390 + }, + { + "epoch": 0.95, + "grad_norm": 0.5994265079498291, + "learning_rate": 1.0190114826721497e-06, + "loss": 0.7031, + "step": 8395 + }, + { + "epoch": 0.96, + "grad_norm": 0.7038202881813049, + "learning_rate": 9.937374299907931e-07, + "loss": 0.7039, + "step": 8400 + }, + { + "epoch": 0.96, + "grad_norm": 0.593492329120636, + "learning_rate": 9.687791907561527e-07, + "loss": 0.6959, + "step": 8405 + }, + { + "epoch": 0.96, + "grad_norm": 0.60867840051651, + "learning_rate": 9.441368445808451e-07, + "loss": 0.6178, + "step": 8410 + }, + { + "epoch": 0.96, + "grad_norm": 0.7278956174850464, + "learning_rate": 9.198104700698595e-07, + "loss": 0.8133, + "step": 8415 + }, + { + "epoch": 0.96, + "grad_norm": 1.0709000825881958, + "learning_rate": 8.958001448202357e-07, + "loss": 0.912, + "step": 8420 + }, + { + "epoch": 0.96, + "grad_norm": 0.7433140277862549, + "learning_rate": 8.721059454209424e-07, + "loss": 0.7567, + "step": 8425 + }, + { + "epoch": 0.96, + "grad_norm": 0.7910411953926086, + "learning_rate": 8.487279474524989e-07, + "loss": 0.8209, + "step": 8430 + }, + { + "epoch": 0.96, + "grad_norm": 0.6875026226043701, + "learning_rate": 8.256662254867986e-07, + "loss": 0.7594, + "step": 8435 + }, + { + "epoch": 0.96, + "grad_norm": 0.8132088780403137, + "learning_rate": 8.029208530869081e-07, + "loss": 0.9099, + "step": 8440 + }, + { + "epoch": 0.96, + "grad_norm": 0.7695709466934204, + "learning_rate": 7.804919028067681e-07, + "loss": 0.7659, + "step": 8445 + }, + { + "epoch": 0.96, + "grad_norm": 0.8422206044197083, + "learning_rate": 7.58379446190971e-07, + "loss": 0.7784, + "step": 8450 + }, + { + "epoch": 0.96, + "grad_norm": 0.6463632583618164, + "learning_rate": 7.365835537745725e-07, + "loss": 0.7772, + "step": 8455 + }, + { + "epoch": 0.96, + "grad_norm": 0.6620475649833679, + "learning_rate": 7.151042950828246e-07, + "loss": 0.7338, + "step": 8460 + }, + { + "epoch": 0.96, + "grad_norm": 0.6927075386047363, + "learning_rate": 6.939417386309766e-07, + "loss": 0.7169, + "step": 8465 + }, + { + "epoch": 0.96, + "grad_norm": 0.6566410064697266, + "learning_rate": 6.730959519240409e-07, + "loss": 0.7196, + "step": 8470 + }, + { + "epoch": 0.96, + "grad_norm": 0.6217153072357178, + "learning_rate": 6.525670014566166e-07, + "loss": 0.6497, + "step": 8475 + }, + { + "epoch": 0.96, + "grad_norm": 0.6876888871192932, + "learning_rate": 6.323549527126216e-07, + "loss": 0.7666, + "step": 8480 + }, + { + "epoch": 0.96, + "grad_norm": 0.736946702003479, + "learning_rate": 6.124598701651052e-07, + "loss": 0.8501, + "step": 8485 + }, + { + "epoch": 0.97, + "grad_norm": 0.5957123041152954, + "learning_rate": 5.928818172760697e-07, + "loss": 0.7375, + "step": 8490 + }, + { + "epoch": 0.97, + "grad_norm": 0.6869728565216064, + "learning_rate": 5.736208564962265e-07, + "loss": 0.7024, + "step": 8495 + }, + { + "epoch": 0.97, + "grad_norm": 0.7970830798149109, + "learning_rate": 5.546770492648401e-07, + "loss": 0.7898, + "step": 8500 + }, + { + "epoch": 0.97, + "grad_norm": 0.7111850380897522, + "learning_rate": 5.360504560094736e-07, + "loss": 0.7083, + "step": 8505 + }, + { + "epoch": 0.97, + "grad_norm": 0.6761876344680786, + "learning_rate": 5.177411361458661e-07, + "loss": 0.7067, + "step": 8510 + }, + { + "epoch": 0.97, + "grad_norm": 0.6835991144180298, + "learning_rate": 4.997491480776773e-07, + "loss": 0.695, + "step": 8515 + }, + { + "epoch": 0.97, + "grad_norm": 0.6578661203384399, + "learning_rate": 4.82074549196343e-07, + "loss": 0.7607, + "step": 8520 + }, + { + "epoch": 0.97, + "grad_norm": 0.684698224067688, + "learning_rate": 4.6471739588089814e-07, + "loss": 0.7665, + "step": 8525 + }, + { + "epoch": 0.97, + "grad_norm": 0.7736423015594482, + "learning_rate": 4.476777434977653e-07, + "loss": 0.8594, + "step": 8530 + }, + { + "epoch": 0.97, + "grad_norm": 0.6832173466682434, + "learning_rate": 4.30955646400566e-07, + "loss": 0.6579, + "step": 8535 + }, + { + "epoch": 0.97, + "grad_norm": 0.731162965297699, + "learning_rate": 4.14551157930021e-07, + "loss": 0.8462, + "step": 8540 + }, + { + "epoch": 0.97, + "grad_norm": 0.7852889895439148, + "learning_rate": 3.984643304136948e-07, + "loss": 0.8533, + "step": 8545 + }, + { + "epoch": 0.97, + "grad_norm": 0.8519006967544556, + "learning_rate": 3.826952151658958e-07, + "loss": 0.6815, + "step": 8550 + }, + { + "epoch": 0.97, + "grad_norm": 0.6394432783126831, + "learning_rate": 3.6724386248745415e-07, + "loss": 0.7208, + "step": 8555 + }, + { + "epoch": 0.97, + "grad_norm": 0.602070152759552, + "learning_rate": 3.5211032166561077e-07, + "loss": 0.7506, + "step": 8560 + }, + { + "epoch": 0.97, + "grad_norm": 0.6670829653739929, + "learning_rate": 3.372946409738398e-07, + "loss": 0.7156, + "step": 8565 + }, + { + "epoch": 0.97, + "grad_norm": 0.7526081204414368, + "learning_rate": 3.2279686767168196e-07, + "loss": 0.8094, + "step": 8570 + }, + { + "epoch": 0.97, + "grad_norm": 0.68255215883255, + "learning_rate": 3.086170480046113e-07, + "loss": 0.7417, + "step": 8575 + }, + { + "epoch": 0.98, + "grad_norm": 0.6421539187431335, + "learning_rate": 2.947552272038911e-07, + "loss": 0.7067, + "step": 8580 + }, + { + "epoch": 0.98, + "grad_norm": 0.7133721113204956, + "learning_rate": 2.812114494864182e-07, + "loss": 0.7699, + "step": 8585 + }, + { + "epoch": 0.98, + "grad_norm": 0.7376065254211426, + "learning_rate": 2.679857580545786e-07, + "loss": 0.8313, + "step": 8590 + }, + { + "epoch": 0.98, + "grad_norm": 0.7178567051887512, + "learning_rate": 2.550781950961034e-07, + "loss": 0.7967, + "step": 8595 + }, + { + "epoch": 0.98, + "grad_norm": 0.7155488133430481, + "learning_rate": 2.42488801783991e-07, + "loss": 0.7509, + "step": 8600 + }, + { + "epoch": 0.98, + "grad_norm": 0.7433626055717468, + "learning_rate": 2.3021761827628496e-07, + "loss": 0.7372, + "step": 8605 + }, + { + "epoch": 0.98, + "grad_norm": 0.7234955430030823, + "learning_rate": 2.182646837160185e-07, + "loss": 0.6896, + "step": 8610 + }, + { + "epoch": 0.98, + "grad_norm": 0.790782630443573, + "learning_rate": 2.0663003623105914e-07, + "loss": 0.7696, + "step": 8615 + }, + { + "epoch": 0.98, + "grad_norm": 0.852401614189148, + "learning_rate": 1.953137129339977e-07, + "loss": 0.7851, + "step": 8620 + }, + { + "epoch": 0.98, + "grad_norm": 0.7792094349861145, + "learning_rate": 1.8431574992199275e-07, + "loss": 0.7594, + "step": 8625 + }, + { + "epoch": 0.98, + "grad_norm": 0.7497335076332092, + "learning_rate": 1.7363618227672628e-07, + "loss": 0.7123, + "step": 8630 + }, + { + "epoch": 0.98, + "grad_norm": 0.612318217754364, + "learning_rate": 1.632750440642261e-07, + "loss": 0.7277, + "step": 8635 + }, + { + "epoch": 0.98, + "grad_norm": 0.6249239444732666, + "learning_rate": 1.5323236833479916e-07, + "loss": 0.7046, + "step": 8640 + }, + { + "epoch": 0.98, + "grad_norm": 0.5679818987846375, + "learning_rate": 1.4350818712292048e-07, + "loss": 0.6912, + "step": 8645 + }, + { + "epoch": 0.98, + "grad_norm": 0.6267695426940918, + "learning_rate": 1.3410253144707785e-07, + "loss": 0.6389, + "step": 8650 + }, + { + "epoch": 0.98, + "grad_norm": 0.7306914925575256, + "learning_rate": 1.2501543130974959e-07, + "loss": 0.7294, + "step": 8655 + }, + { + "epoch": 0.98, + "grad_norm": 0.7982718348503113, + "learning_rate": 1.162469156972712e-07, + "loss": 0.753, + "step": 8660 + }, + { + "epoch": 0.99, + "grad_norm": 0.658090353012085, + "learning_rate": 1.0779701257974672e-07, + "loss": 0.6063, + "step": 8665 + }, + { + "epoch": 0.99, + "grad_norm": 0.6100469827651978, + "learning_rate": 9.96657489109487e-08, + "loss": 0.8564, + "step": 8670 + }, + { + "epoch": 0.99, + "grad_norm": 1.1241860389709473, + "learning_rate": 9.185315062826272e-08, + "loss": 0.7727, + "step": 8675 + }, + { + "epoch": 0.99, + "grad_norm": 0.6518518328666687, + "learning_rate": 8.435924265256523e-08, + "loss": 0.7119, + "step": 8680 + }, + { + "epoch": 0.99, + "grad_norm": 0.6026988625526428, + "learning_rate": 7.718404888816811e-08, + "loss": 0.7094, + "step": 8685 + }, + { + "epoch": 0.99, + "grad_norm": 0.6683136820793152, + "learning_rate": 7.032759222274087e-08, + "loss": 0.7913, + "step": 8690 + }, + { + "epoch": 0.99, + "grad_norm": 0.6700591444969177, + "learning_rate": 6.378989452724416e-08, + "loss": 0.794, + "step": 8695 + }, + { + "epoch": 0.99, + "grad_norm": 0.6720278859138489, + "learning_rate": 5.757097665584077e-08, + "loss": 0.6934, + "step": 8700 + }, + { + "epoch": 0.99, + "grad_norm": 0.7080987691879272, + "learning_rate": 5.1670858445829195e-08, + "loss": 0.8342, + "step": 8705 + }, + { + "epoch": 0.99, + "grad_norm": 0.7693775296211243, + "learning_rate": 4.6089558717610226e-08, + "loss": 0.7174, + "step": 8710 + }, + { + "epoch": 0.99, + "grad_norm": 0.664599597454071, + "learning_rate": 4.082709527459816e-08, + "loss": 0.7675, + "step": 8715 + }, + { + "epoch": 0.99, + "grad_norm": 0.6770430207252502, + "learning_rate": 3.588348490317639e-08, + "loss": 0.7371, + "step": 8720 + }, + { + "epoch": 0.99, + "grad_norm": 0.7018636465072632, + "learning_rate": 3.125874337261969e-08, + "loss": 0.7199, + "step": 8725 + }, + { + "epoch": 0.99, + "grad_norm": 0.6643563508987427, + "learning_rate": 2.6952885435105323e-08, + "loss": 0.6487, + "step": 8730 + }, + { + "epoch": 0.99, + "grad_norm": 0.6865366697311401, + "learning_rate": 2.2965924825579797e-08, + "loss": 0.7733, + "step": 8735 + }, + { + "epoch": 0.99, + "grad_norm": 0.6612882614135742, + "learning_rate": 1.9297874261792193e-08, + "loss": 0.737, + "step": 8740 + }, + { + "epoch": 0.99, + "grad_norm": 0.62270188331604, + "learning_rate": 1.5948745444216428e-08, + "loss": 0.7636, + "step": 8745 + }, + { + "epoch": 0.99, + "grad_norm": 0.6600854396820068, + "learning_rate": 1.2918549056006867e-08, + "loss": 0.6694, + "step": 8750 + }, + { + "epoch": 1.0, + "grad_norm": 0.7353793382644653, + "learning_rate": 1.0207294762987208e-08, + "loss": 0.7041, + "step": 8755 + }, + { + "epoch": 1.0, + "grad_norm": 0.7038155794143677, + "learning_rate": 7.81499121359497e-09, + "loss": 0.689, + "step": 8760 + }, + { + "epoch": 1.0, + "grad_norm": 0.8333893418312073, + "learning_rate": 5.7416460388926004e-09, + "loss": 0.8068, + "step": 8765 + }, + { + "epoch": 1.0, + "grad_norm": 0.6727174520492554, + "learning_rate": 3.9872658525008655e-09, + "loss": 0.7088, + "step": 8770 + }, + { + "epoch": 1.0, + "grad_norm": 0.6914416551589966, + "learning_rate": 2.5518562505988386e-09, + "loss": 0.6869, + "step": 8775 + }, + { + "epoch": 1.0, + "grad_norm": 0.6037778258323669, + "learning_rate": 1.435421811901705e-09, + "loss": 0.7188, + "step": 8780 + }, + { + "epoch": 1.0, + "grad_norm": 0.7393903732299805, + "learning_rate": 6.37966097649656e-10, + "loss": 0.7448, + "step": 8785 + }, + { + "epoch": 1.0, + "grad_norm": 0.6320146322250366, + "learning_rate": 1.5949165159678813e-10, + "loss": 0.7623, + "step": 8790 + }, + { + "epoch": 1.0, + "grad_norm": 0.6219347715377808, + "learning_rate": 0.0, + "loss": 0.7163, + "step": 8795 + }, + { + "epoch": 1.0, + "step": 8795, + "total_flos": 4.313969195378278e+16, + "train_loss": 0.0, + "train_runtime": 0.0081, + "train_samples_per_second": 1092292.541, + "train_steps_per_second": 1092292.541 + } + ], + "logging_steps": 5, + "max_steps": 8795, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 4.313969195378278e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}