{ "best_metric": 0.9710622429847717, "best_model_checkpoint": "/scratch/czm5kz/NEW_finetuned_llama27b32_1_0.0003_alternate/checkpoint-1400", "epoch": 0.9975062344139651, "eval_steps": 20, "global_step": 1400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.9070518016815186, "learning_rate": 0.0002989308624376336, "loss": 3.4521, "step": 5 }, { "epoch": 0.01, "grad_norm": 2.083425998687744, "learning_rate": 0.00029786172487526725, "loss": 2.6104, "step": 10 }, { "epoch": 0.01, "grad_norm": 1.840538740158081, "learning_rate": 0.0002967925873129009, "loss": 2.0815, "step": 15 }, { "epoch": 0.01, "grad_norm": 1.4302124977111816, "learning_rate": 0.00029572344975053457, "loss": 1.7791, "step": 20 }, { "epoch": 0.01, "eval_loss": 1.6359353065490723, "eval_runtime": 233.285, "eval_samples_per_second": 48.134, "eval_steps_per_second": 6.018, "step": 20 }, { "epoch": 0.02, "grad_norm": 1.1669527292251587, "learning_rate": 0.00029465431218816815, "loss": 1.6289, "step": 25 }, { "epoch": 0.02, "grad_norm": 1.1892757415771484, "learning_rate": 0.00029358517462580184, "loss": 1.4635, "step": 30 }, { "epoch": 0.02, "grad_norm": 0.9651042819023132, "learning_rate": 0.0002925160370634355, "loss": 1.3605, "step": 35 }, { "epoch": 0.03, "grad_norm": 1.3424170017242432, "learning_rate": 0.0002914468995010691, "loss": 1.3334, "step": 40 }, { "epoch": 0.03, "eval_loss": 1.2492247819900513, "eval_runtime": 233.1335, "eval_samples_per_second": 48.166, "eval_steps_per_second": 6.022, "step": 40 }, { "epoch": 0.03, "grad_norm": 0.857738196849823, "learning_rate": 0.00029037776193870275, "loss": 1.1967, "step": 45 }, { "epoch": 0.04, "grad_norm": 0.9371346235275269, "learning_rate": 0.0002893086243763364, "loss": 1.192, "step": 50 }, { "epoch": 0.04, "grad_norm": 0.9375379681587219, "learning_rate": 0.00028823948681397, "loss": 1.1894, "step": 55 }, { "epoch": 0.04, "grad_norm": 0.9225698113441467, "learning_rate": 0.0002871703492516037, "loss": 1.2074, "step": 60 }, { "epoch": 0.04, "eval_loss": 1.1329890489578247, "eval_runtime": 233.4936, "eval_samples_per_second": 48.091, "eval_steps_per_second": 6.013, "step": 60 }, { "epoch": 0.05, "grad_norm": 0.7983834147453308, "learning_rate": 0.0002861012116892373, "loss": 1.1401, "step": 65 }, { "epoch": 0.05, "grad_norm": 0.8089339733123779, "learning_rate": 0.000285032074126871, "loss": 1.0932, "step": 70 }, { "epoch": 0.05, "grad_norm": 0.9041706323623657, "learning_rate": 0.0002839629365645046, "loss": 1.081, "step": 75 }, { "epoch": 0.06, "grad_norm": 0.7338858842849731, "learning_rate": 0.00028289379900213826, "loss": 1.1233, "step": 80 }, { "epoch": 0.06, "eval_loss": 1.091849446296692, "eval_runtime": 233.0902, "eval_samples_per_second": 48.174, "eval_steps_per_second": 6.023, "step": 80 }, { "epoch": 0.06, "grad_norm": 0.816347062587738, "learning_rate": 0.0002818246614397719, "loss": 1.1108, "step": 85 }, { "epoch": 0.06, "grad_norm": 0.6140106916427612, "learning_rate": 0.00028075552387740553, "loss": 1.1325, "step": 90 }, { "epoch": 0.07, "grad_norm": 0.6668590903282166, "learning_rate": 0.00027968638631503917, "loss": 1.0967, "step": 95 }, { "epoch": 0.07, "grad_norm": 0.6617318391799927, "learning_rate": 0.0002786172487526728, "loss": 1.0699, "step": 100 }, { "epoch": 0.07, "eval_loss": 1.0730466842651367, "eval_runtime": 233.2743, "eval_samples_per_second": 48.136, "eval_steps_per_second": 6.019, "step": 100 }, { "epoch": 0.07, "grad_norm": 0.7199958562850952, "learning_rate": 0.00027754811119030644, "loss": 1.1094, "step": 105 }, { "epoch": 0.08, "grad_norm": 0.5779143571853638, "learning_rate": 0.0002764789736279401, "loss": 1.0849, "step": 110 }, { "epoch": 0.08, "grad_norm": 0.5297559499740601, "learning_rate": 0.00027540983606557377, "loss": 1.059, "step": 115 }, { "epoch": 0.09, "grad_norm": 0.794755220413208, "learning_rate": 0.0002743406985032074, "loss": 1.0519, "step": 120 }, { "epoch": 0.09, "eval_loss": 1.0585691928863525, "eval_runtime": 233.4391, "eval_samples_per_second": 48.102, "eval_steps_per_second": 6.014, "step": 120 }, { "epoch": 0.09, "grad_norm": 0.5636884570121765, "learning_rate": 0.00027327156094084104, "loss": 1.0797, "step": 125 }, { "epoch": 0.09, "grad_norm": 0.760064423084259, "learning_rate": 0.0002722024233784747, "loss": 1.0599, "step": 130 }, { "epoch": 0.1, "grad_norm": 0.4716905951499939, "learning_rate": 0.0002711332858161083, "loss": 1.0662, "step": 135 }, { "epoch": 0.1, "grad_norm": 1.0044249296188354, "learning_rate": 0.00027006414825374195, "loss": 1.0562, "step": 140 }, { "epoch": 0.1, "eval_loss": 1.0480350255966187, "eval_runtime": 232.9379, "eval_samples_per_second": 48.206, "eval_steps_per_second": 6.027, "step": 140 }, { "epoch": 0.1, "grad_norm": 0.5207091569900513, "learning_rate": 0.00026899501069137564, "loss": 1.0867, "step": 145 }, { "epoch": 0.11, "grad_norm": 0.5663835406303406, "learning_rate": 0.0002679258731290092, "loss": 1.0417, "step": 150 }, { "epoch": 0.11, "grad_norm": 0.6968746185302734, "learning_rate": 0.0002668567355666429, "loss": 1.0675, "step": 155 }, { "epoch": 0.11, "grad_norm": 0.5393630862236023, "learning_rate": 0.00026578759800427654, "loss": 1.0537, "step": 160 }, { "epoch": 0.11, "eval_loss": 1.0404675006866455, "eval_runtime": 233.2342, "eval_samples_per_second": 48.145, "eval_steps_per_second": 6.02, "step": 160 }, { "epoch": 0.12, "grad_norm": 0.6900107860565186, "learning_rate": 0.0002647184604419102, "loss": 1.028, "step": 165 }, { "epoch": 0.12, "grad_norm": 0.6130568981170654, "learning_rate": 0.0002636493228795438, "loss": 1.0394, "step": 170 }, { "epoch": 0.12, "grad_norm": 0.5123544931411743, "learning_rate": 0.00026258018531717745, "loss": 1.0185, "step": 175 }, { "epoch": 0.13, "grad_norm": 0.5719022750854492, "learning_rate": 0.0002615110477548111, "loss": 1.0661, "step": 180 }, { "epoch": 0.13, "eval_loss": 1.0330013036727905, "eval_runtime": 232.9041, "eval_samples_per_second": 48.213, "eval_steps_per_second": 6.028, "step": 180 }, { "epoch": 0.13, "grad_norm": 0.5089533925056458, "learning_rate": 0.0002604419101924447, "loss": 1.0173, "step": 185 }, { "epoch": 0.14, "grad_norm": 0.4675886332988739, "learning_rate": 0.00025937277263007836, "loss": 0.9911, "step": 190 }, { "epoch": 0.14, "grad_norm": 0.4334254264831543, "learning_rate": 0.000258303635067712, "loss": 1.047, "step": 195 }, { "epoch": 0.14, "grad_norm": 0.4760352671146393, "learning_rate": 0.0002572344975053457, "loss": 1.0267, "step": 200 }, { "epoch": 0.14, "eval_loss": 1.027119755744934, "eval_runtime": 232.824, "eval_samples_per_second": 48.23, "eval_steps_per_second": 6.03, "step": 200 }, { "epoch": 0.15, "grad_norm": 0.6061151027679443, "learning_rate": 0.0002561653599429793, "loss": 1.0051, "step": 205 }, { "epoch": 0.15, "grad_norm": 0.5186893939971924, "learning_rate": 0.00025509622238061296, "loss": 1.012, "step": 210 }, { "epoch": 0.15, "grad_norm": 0.5924214124679565, "learning_rate": 0.0002540270848182466, "loss": 1.0089, "step": 215 }, { "epoch": 0.16, "grad_norm": 0.5817935466766357, "learning_rate": 0.00025295794725588023, "loss": 1.0317, "step": 220 }, { "epoch": 0.16, "eval_loss": 1.0244611501693726, "eval_runtime": 233.3845, "eval_samples_per_second": 48.114, "eval_steps_per_second": 6.016, "step": 220 }, { "epoch": 0.16, "grad_norm": 0.5199680924415588, "learning_rate": 0.00025188880969351387, "loss": 0.9895, "step": 225 }, { "epoch": 0.16, "grad_norm": 0.5646855235099792, "learning_rate": 0.00025081967213114756, "loss": 1.0858, "step": 230 }, { "epoch": 0.17, "grad_norm": 0.47028979659080505, "learning_rate": 0.00024975053456878114, "loss": 1.0456, "step": 235 }, { "epoch": 0.17, "grad_norm": 0.528374433517456, "learning_rate": 0.00024868139700641483, "loss": 1.0268, "step": 240 }, { "epoch": 0.17, "eval_loss": 1.026130199432373, "eval_runtime": 233.4901, "eval_samples_per_second": 48.092, "eval_steps_per_second": 6.013, "step": 240 }, { "epoch": 0.17, "grad_norm": 0.557716429233551, "learning_rate": 0.00024761225944404847, "loss": 1.0249, "step": 245 }, { "epoch": 0.18, "grad_norm": 0.6109133362770081, "learning_rate": 0.0002465431218816821, "loss": 1.0497, "step": 250 }, { "epoch": 0.18, "grad_norm": 0.5170979499816895, "learning_rate": 0.00024547398431931574, "loss": 1.0874, "step": 255 }, { "epoch": 0.19, "grad_norm": 0.48054125905036926, "learning_rate": 0.0002444048467569494, "loss": 1.0261, "step": 260 }, { "epoch": 0.19, "eval_loss": 1.0192761421203613, "eval_runtime": 233.4201, "eval_samples_per_second": 48.106, "eval_steps_per_second": 6.015, "step": 260 }, { "epoch": 0.19, "grad_norm": 0.5639621019363403, "learning_rate": 0.000243335709194583, "loss": 1.001, "step": 265 }, { "epoch": 0.19, "grad_norm": 0.5815649032592773, "learning_rate": 0.00024226657163221665, "loss": 0.9984, "step": 270 }, { "epoch": 0.2, "grad_norm": 0.5705720782279968, "learning_rate": 0.0002411974340698503, "loss": 1.0214, "step": 275 }, { "epoch": 0.2, "grad_norm": 0.5422970652580261, "learning_rate": 0.00024012829650748392, "loss": 1.0365, "step": 280 }, { "epoch": 0.2, "eval_loss": 1.0164554119110107, "eval_runtime": 233.0466, "eval_samples_per_second": 48.183, "eval_steps_per_second": 6.025, "step": 280 }, { "epoch": 0.2, "grad_norm": 0.5650832653045654, "learning_rate": 0.00023905915894511758, "loss": 1.054, "step": 285 }, { "epoch": 0.21, "grad_norm": 0.5521314740180969, "learning_rate": 0.00023799002138275122, "loss": 1.0778, "step": 290 }, { "epoch": 0.21, "grad_norm": 0.5239119529724121, "learning_rate": 0.00023692088382038488, "loss": 0.9975, "step": 295 }, { "epoch": 0.21, "grad_norm": 0.43719667196273804, "learning_rate": 0.0002358517462580185, "loss": 1.035, "step": 300 }, { "epoch": 0.21, "eval_loss": 1.0123059749603271, "eval_runtime": 233.8927, "eval_samples_per_second": 48.009, "eval_steps_per_second": 6.003, "step": 300 }, { "epoch": 0.22, "grad_norm": 0.5832995772361755, "learning_rate": 0.00023478260869565215, "loss": 0.9933, "step": 305 }, { "epoch": 0.22, "grad_norm": 0.47054553031921387, "learning_rate": 0.0002337134711332858, "loss": 1.0032, "step": 310 }, { "epoch": 0.22, "grad_norm": 0.6376765370368958, "learning_rate": 0.00023264433357091945, "loss": 1.0231, "step": 315 }, { "epoch": 0.23, "grad_norm": 0.47957855463027954, "learning_rate": 0.0002315751960085531, "loss": 1.0272, "step": 320 }, { "epoch": 0.23, "eval_loss": 1.0109120607376099, "eval_runtime": 233.0467, "eval_samples_per_second": 48.183, "eval_steps_per_second": 6.025, "step": 320 }, { "epoch": 0.23, "grad_norm": 0.44474130868911743, "learning_rate": 0.00023050605844618672, "loss": 1.0308, "step": 325 }, { "epoch": 0.24, "grad_norm": 0.5061362981796265, "learning_rate": 0.00022943692088382036, "loss": 1.0351, "step": 330 }, { "epoch": 0.24, "grad_norm": 0.5571821331977844, "learning_rate": 0.00022836778332145402, "loss": 1.0549, "step": 335 }, { "epoch": 0.24, "grad_norm": 0.4734516739845276, "learning_rate": 0.00022729864575908766, "loss": 0.9756, "step": 340 }, { "epoch": 0.24, "eval_loss": 1.0074673891067505, "eval_runtime": 233.4351, "eval_samples_per_second": 48.103, "eval_steps_per_second": 6.015, "step": 340 }, { "epoch": 0.25, "grad_norm": 0.38644713163375854, "learning_rate": 0.00022622950819672127, "loss": 1.0128, "step": 345 }, { "epoch": 0.25, "grad_norm": 0.6090478301048279, "learning_rate": 0.00022516037063435493, "loss": 1.0577, "step": 350 }, { "epoch": 0.25, "grad_norm": 0.6028711795806885, "learning_rate": 0.00022409123307198857, "loss": 1.0256, "step": 355 }, { "epoch": 0.26, "grad_norm": 0.5490003228187561, "learning_rate": 0.00022302209550962223, "loss": 1.0364, "step": 360 }, { "epoch": 0.26, "eval_loss": 1.005479097366333, "eval_runtime": 233.0631, "eval_samples_per_second": 48.18, "eval_steps_per_second": 6.024, "step": 360 }, { "epoch": 0.26, "grad_norm": 0.4859737753868103, "learning_rate": 0.00022195295794725584, "loss": 1.0368, "step": 365 }, { "epoch": 0.26, "grad_norm": 0.5286576151847839, "learning_rate": 0.0002208838203848895, "loss": 0.9919, "step": 370 }, { "epoch": 0.27, "grad_norm": 0.5190165638923645, "learning_rate": 0.00021981468282252314, "loss": 1.0574, "step": 375 }, { "epoch": 0.27, "grad_norm": 0.5330237746238708, "learning_rate": 0.0002187455452601568, "loss": 0.9796, "step": 380 }, { "epoch": 0.27, "eval_loss": 1.0031410455703735, "eval_runtime": 233.5388, "eval_samples_per_second": 48.082, "eval_steps_per_second": 6.012, "step": 380 }, { "epoch": 0.27, "grad_norm": 0.598227858543396, "learning_rate": 0.0002176764076977904, "loss": 1.0521, "step": 385 }, { "epoch": 0.28, "grad_norm": 0.4702519178390503, "learning_rate": 0.00021660727013542407, "loss": 1.0167, "step": 390 }, { "epoch": 0.28, "grad_norm": 0.550994336605072, "learning_rate": 0.0002155381325730577, "loss": 1.0502, "step": 395 }, { "epoch": 0.29, "grad_norm": 0.5564207434654236, "learning_rate": 0.00021446899501069137, "loss": 1.0414, "step": 400 }, { "epoch": 0.29, "eval_loss": 1.0028647184371948, "eval_runtime": 233.0518, "eval_samples_per_second": 48.182, "eval_steps_per_second": 6.024, "step": 400 }, { "epoch": 0.29, "grad_norm": 0.4342661499977112, "learning_rate": 0.00021339985744832498, "loss": 0.9645, "step": 405 }, { "epoch": 0.29, "grad_norm": 2.041062593460083, "learning_rate": 0.00021233071988595865, "loss": 1.0252, "step": 410 }, { "epoch": 0.3, "grad_norm": 0.516720712184906, "learning_rate": 0.00021126158232359228, "loss": 0.976, "step": 415 }, { "epoch": 0.3, "grad_norm": 0.5575417876243591, "learning_rate": 0.00021019244476122595, "loss": 1.0176, "step": 420 }, { "epoch": 0.3, "eval_loss": 1.001124620437622, "eval_runtime": 233.1135, "eval_samples_per_second": 48.17, "eval_steps_per_second": 6.023, "step": 420 }, { "epoch": 0.3, "grad_norm": 0.4411332309246063, "learning_rate": 0.00020912330719885958, "loss": 1.0407, "step": 425 }, { "epoch": 0.31, "grad_norm": 0.3926582336425781, "learning_rate": 0.0002080541696364932, "loss": 1.0158, "step": 430 }, { "epoch": 0.31, "grad_norm": 0.4546189308166504, "learning_rate": 0.00020698503207412685, "loss": 1.0102, "step": 435 }, { "epoch": 0.31, "grad_norm": 0.5215502977371216, "learning_rate": 0.0002059158945117605, "loss": 1.0285, "step": 440 }, { "epoch": 0.31, "eval_loss": 0.9982317686080933, "eval_runtime": 233.2445, "eval_samples_per_second": 48.143, "eval_steps_per_second": 6.019, "step": 440 }, { "epoch": 0.32, "grad_norm": 0.6208043694496155, "learning_rate": 0.00020484675694939415, "loss": 1.0089, "step": 445 }, { "epoch": 0.32, "grad_norm": 0.5218257904052734, "learning_rate": 0.00020377761938702776, "loss": 0.9841, "step": 450 }, { "epoch": 0.32, "grad_norm": 0.5310038924217224, "learning_rate": 0.00020270848182466143, "loss": 0.9717, "step": 455 }, { "epoch": 0.33, "grad_norm": 0.5639017820358276, "learning_rate": 0.00020163934426229506, "loss": 1.0441, "step": 460 }, { "epoch": 0.33, "eval_loss": 0.9958406686782837, "eval_runtime": 233.4156, "eval_samples_per_second": 48.107, "eval_steps_per_second": 6.015, "step": 460 }, { "epoch": 0.33, "grad_norm": 0.5642719864845276, "learning_rate": 0.00020057020669992872, "loss": 1.055, "step": 465 }, { "epoch": 0.33, "grad_norm": 0.4470948576927185, "learning_rate": 0.00019950106913756233, "loss": 1.0306, "step": 470 }, { "epoch": 0.34, "grad_norm": 0.5050873160362244, "learning_rate": 0.000198431931575196, "loss": 1.0047, "step": 475 }, { "epoch": 0.34, "grad_norm": 0.3829382658004761, "learning_rate": 0.00019736279401282963, "loss": 1.0449, "step": 480 }, { "epoch": 0.34, "eval_loss": 0.9962899684906006, "eval_runtime": 233.2659, "eval_samples_per_second": 48.138, "eval_steps_per_second": 6.019, "step": 480 }, { "epoch": 0.35, "grad_norm": 0.36466968059539795, "learning_rate": 0.0001962936564504633, "loss": 1.067, "step": 485 }, { "epoch": 0.35, "grad_norm": 0.4244692623615265, "learning_rate": 0.0001952245188880969, "loss": 0.985, "step": 490 }, { "epoch": 0.35, "grad_norm": 0.38433513045310974, "learning_rate": 0.00019415538132573057, "loss": 0.988, "step": 495 }, { "epoch": 0.36, "grad_norm": 0.41374388337135315, "learning_rate": 0.0001930862437633642, "loss": 0.9988, "step": 500 }, { "epoch": 0.36, "eval_loss": 0.9949624538421631, "eval_runtime": 233.13, "eval_samples_per_second": 48.166, "eval_steps_per_second": 6.022, "step": 500 }, { "epoch": 0.36, "grad_norm": 0.4772859811782837, "learning_rate": 0.00019201710620099787, "loss": 1.0462, "step": 505 }, { "epoch": 0.36, "grad_norm": 0.48039695620536804, "learning_rate": 0.00019094796863863148, "loss": 1.0354, "step": 510 }, { "epoch": 0.37, "grad_norm": 0.5138806104660034, "learning_rate": 0.0001898788310762651, "loss": 0.9772, "step": 515 }, { "epoch": 0.37, "grad_norm": 0.4885447025299072, "learning_rate": 0.00018880969351389878, "loss": 1.0409, "step": 520 }, { "epoch": 0.37, "eval_loss": 0.9945592880249023, "eval_runtime": 233.5393, "eval_samples_per_second": 48.082, "eval_steps_per_second": 6.012, "step": 520 }, { "epoch": 0.37, "grad_norm": 0.44714003801345825, "learning_rate": 0.0001877405559515324, "loss": 0.9899, "step": 525 }, { "epoch": 0.38, "grad_norm": 0.4275243282318115, "learning_rate": 0.00018667141838916605, "loss": 0.9873, "step": 530 }, { "epoch": 0.38, "grad_norm": 0.4732975959777832, "learning_rate": 0.00018560228082679968, "loss": 1.006, "step": 535 }, { "epoch": 0.38, "grad_norm": 0.4305308163166046, "learning_rate": 0.00018453314326443335, "loss": 0.9871, "step": 540 }, { "epoch": 0.38, "eval_loss": 0.9933722019195557, "eval_runtime": 233.0714, "eval_samples_per_second": 48.178, "eval_steps_per_second": 6.024, "step": 540 }, { "epoch": 0.39, "grad_norm": 0.4475422203540802, "learning_rate": 0.00018346400570206698, "loss": 0.9713, "step": 545 }, { "epoch": 0.39, "grad_norm": 0.4801812767982483, "learning_rate": 0.00018239486813970065, "loss": 1.0059, "step": 550 }, { "epoch": 0.4, "grad_norm": 0.48028790950775146, "learning_rate": 0.00018132573057733425, "loss": 0.9594, "step": 555 }, { "epoch": 0.4, "grad_norm": 0.48806053400039673, "learning_rate": 0.00018025659301496792, "loss": 1.0144, "step": 560 }, { "epoch": 0.4, "eval_loss": 0.9934199452400208, "eval_runtime": 232.9538, "eval_samples_per_second": 48.203, "eval_steps_per_second": 6.027, "step": 560 }, { "epoch": 0.4, "grad_norm": 0.5405058860778809, "learning_rate": 0.00017918745545260155, "loss": 1.0135, "step": 565 }, { "epoch": 0.41, "grad_norm": 0.3488529622554779, "learning_rate": 0.00017811831789023522, "loss": 0.9895, "step": 570 }, { "epoch": 0.41, "grad_norm": 0.4228030741214752, "learning_rate": 0.00017704918032786883, "loss": 1.0373, "step": 575 }, { "epoch": 0.41, "grad_norm": 0.3694140613079071, "learning_rate": 0.0001759800427655025, "loss": 0.9522, "step": 580 }, { "epoch": 0.41, "eval_loss": 0.990608274936676, "eval_runtime": 233.3892, "eval_samples_per_second": 48.113, "eval_steps_per_second": 6.016, "step": 580 }, { "epoch": 0.42, "grad_norm": 0.4826660752296448, "learning_rate": 0.00017491090520313613, "loss": 1.0129, "step": 585 }, { "epoch": 0.42, "grad_norm": 0.40139585733413696, "learning_rate": 0.0001738417676407698, "loss": 0.9783, "step": 590 }, { "epoch": 0.42, "grad_norm": 0.5151722431182861, "learning_rate": 0.0001727726300784034, "loss": 1.0676, "step": 595 }, { "epoch": 0.43, "grad_norm": 0.37561264634132385, "learning_rate": 0.00017170349251603703, "loss": 1.0592, "step": 600 }, { "epoch": 0.43, "eval_loss": 0.9911465644836426, "eval_runtime": 233.5858, "eval_samples_per_second": 48.072, "eval_steps_per_second": 6.011, "step": 600 }, { "epoch": 0.43, "grad_norm": 0.4334346354007721, "learning_rate": 0.0001706343549536707, "loss": 0.9892, "step": 605 }, { "epoch": 0.43, "grad_norm": 0.3643469512462616, "learning_rate": 0.00016956521739130433, "loss": 1.0195, "step": 610 }, { "epoch": 0.44, "grad_norm": 0.4546561539173126, "learning_rate": 0.00016849607982893797, "loss": 0.9758, "step": 615 }, { "epoch": 0.44, "grad_norm": 0.48632940649986267, "learning_rate": 0.0001674269422665716, "loss": 1.0421, "step": 620 }, { "epoch": 0.44, "eval_loss": 0.9907721877098083, "eval_runtime": 233.4632, "eval_samples_per_second": 48.098, "eval_steps_per_second": 6.014, "step": 620 }, { "epoch": 0.45, "grad_norm": 0.3957265317440033, "learning_rate": 0.00016635780470420527, "loss": 0.9755, "step": 625 }, { "epoch": 0.45, "grad_norm": 0.5378085374832153, "learning_rate": 0.0001652886671418389, "loss": 1.0281, "step": 630 }, { "epoch": 0.45, "grad_norm": 0.556155800819397, "learning_rate": 0.00016421952957947254, "loss": 0.9628, "step": 635 }, { "epoch": 0.46, "grad_norm": 0.43633490800857544, "learning_rate": 0.00016315039201710618, "loss": 1.0179, "step": 640 }, { "epoch": 0.46, "eval_loss": 0.9871463775634766, "eval_runtime": 233.9947, "eval_samples_per_second": 47.988, "eval_steps_per_second": 6.0, "step": 640 }, { "epoch": 0.46, "grad_norm": 0.3990333378314972, "learning_rate": 0.00016208125445473984, "loss": 0.9964, "step": 645 }, { "epoch": 0.46, "grad_norm": 0.36454010009765625, "learning_rate": 0.00016101211689237348, "loss": 1.0073, "step": 650 }, { "epoch": 0.47, "grad_norm": 0.3725971579551697, "learning_rate": 0.00015994297933000714, "loss": 0.9967, "step": 655 }, { "epoch": 0.47, "grad_norm": 0.5494146943092346, "learning_rate": 0.00015887384176764075, "loss": 0.9956, "step": 660 }, { "epoch": 0.47, "eval_loss": 0.9869644641876221, "eval_runtime": 233.2617, "eval_samples_per_second": 48.139, "eval_steps_per_second": 6.019, "step": 660 }, { "epoch": 0.47, "grad_norm": 0.4139350652694702, "learning_rate": 0.0001578047042052744, "loss": 1.022, "step": 665 }, { "epoch": 0.48, "grad_norm": 0.44353967905044556, "learning_rate": 0.00015673556664290805, "loss": 1.0101, "step": 670 }, { "epoch": 0.48, "grad_norm": 0.4779268205165863, "learning_rate": 0.0001556664290805417, "loss": 1.0261, "step": 675 }, { "epoch": 0.48, "grad_norm": 0.37415191531181335, "learning_rate": 0.00015459729151817532, "loss": 0.9729, "step": 680 }, { "epoch": 0.48, "eval_loss": 0.9881226420402527, "eval_runtime": 233.1792, "eval_samples_per_second": 48.156, "eval_steps_per_second": 6.021, "step": 680 }, { "epoch": 0.49, "grad_norm": 0.4490317106246948, "learning_rate": 0.00015352815395580896, "loss": 0.9895, "step": 685 }, { "epoch": 0.49, "grad_norm": 0.4588363766670227, "learning_rate": 0.00015245901639344262, "loss": 1.0078, "step": 690 }, { "epoch": 0.5, "grad_norm": 0.4649104177951813, "learning_rate": 0.00015138987883107623, "loss": 0.9891, "step": 695 }, { "epoch": 0.5, "grad_norm": 0.47752678394317627, "learning_rate": 0.0001503207412687099, "loss": 0.9936, "step": 700 }, { "epoch": 0.5, "eval_loss": 0.9856240153312683, "eval_runtime": 233.0604, "eval_samples_per_second": 48.181, "eval_steps_per_second": 6.024, "step": 700 }, { "epoch": 0.5, "grad_norm": 0.5587136745452881, "learning_rate": 0.00014925160370634355, "loss": 1.0269, "step": 705 }, { "epoch": 0.51, "grad_norm": 0.4135577082633972, "learning_rate": 0.0001481824661439772, "loss": 0.9948, "step": 710 }, { "epoch": 0.51, "grad_norm": 0.4373073875904083, "learning_rate": 0.00014711332858161083, "loss": 0.9559, "step": 715 }, { "epoch": 0.51, "grad_norm": 0.4726034998893738, "learning_rate": 0.00014604419101924446, "loss": 0.9894, "step": 720 }, { "epoch": 0.51, "eval_loss": 0.984818160533905, "eval_runtime": 233.4048, "eval_samples_per_second": 48.11, "eval_steps_per_second": 6.015, "step": 720 }, { "epoch": 0.52, "grad_norm": 0.4481181800365448, "learning_rate": 0.0001449750534568781, "loss": 0.9723, "step": 725 }, { "epoch": 0.52, "grad_norm": 0.3406706154346466, "learning_rate": 0.00014390591589451173, "loss": 0.9888, "step": 730 }, { "epoch": 0.52, "grad_norm": 0.5130278468132019, "learning_rate": 0.0001428367783321454, "loss": 0.9928, "step": 735 }, { "epoch": 0.53, "grad_norm": 0.36185356974601746, "learning_rate": 0.00014176764076977903, "loss": 0.9778, "step": 740 }, { "epoch": 0.53, "eval_loss": 0.9835929274559021, "eval_runtime": 233.3843, "eval_samples_per_second": 48.114, "eval_steps_per_second": 6.016, "step": 740 }, { "epoch": 0.53, "grad_norm": 0.5359773635864258, "learning_rate": 0.00014069850320741267, "loss": 0.9865, "step": 745 }, { "epoch": 0.53, "grad_norm": 0.5931240320205688, "learning_rate": 0.0001396293656450463, "loss": 1.011, "step": 750 }, { "epoch": 0.54, "grad_norm": 0.44104117155075073, "learning_rate": 0.00013856022808267997, "loss": 1.0204, "step": 755 }, { "epoch": 0.54, "grad_norm": 0.4355945587158203, "learning_rate": 0.0001374910905203136, "loss": 0.9671, "step": 760 }, { "epoch": 0.54, "eval_loss": 0.9838274717330933, "eval_runtime": 233.2563, "eval_samples_per_second": 48.14, "eval_steps_per_second": 6.019, "step": 760 }, { "epoch": 0.55, "grad_norm": 0.39338964223861694, "learning_rate": 0.00013642195295794724, "loss": 1.0193, "step": 765 }, { "epoch": 0.55, "grad_norm": 0.43264439702033997, "learning_rate": 0.00013535281539558088, "loss": 0.9929, "step": 770 }, { "epoch": 0.55, "grad_norm": 0.4693204462528229, "learning_rate": 0.00013428367783321454, "loss": 1.0113, "step": 775 }, { "epoch": 0.56, "grad_norm": 0.3989047110080719, "learning_rate": 0.00013321454027084818, "loss": 1.0049, "step": 780 }, { "epoch": 0.56, "eval_loss": 0.9828758239746094, "eval_runtime": 233.0119, "eval_samples_per_second": 48.191, "eval_steps_per_second": 6.025, "step": 780 }, { "epoch": 0.56, "grad_norm": 0.3822576701641083, "learning_rate": 0.0001321454027084818, "loss": 0.998, "step": 785 }, { "epoch": 0.56, "grad_norm": 0.3883315920829773, "learning_rate": 0.00013107626514611545, "loss": 1.0123, "step": 790 }, { "epoch": 0.57, "grad_norm": 0.48141029477119446, "learning_rate": 0.0001300071275837491, "loss": 0.9956, "step": 795 }, { "epoch": 0.57, "grad_norm": 0.43159806728363037, "learning_rate": 0.00012893799002138275, "loss": 0.9517, "step": 800 }, { "epoch": 0.57, "eval_loss": 0.9815587401390076, "eval_runtime": 233.5763, "eval_samples_per_second": 48.074, "eval_steps_per_second": 6.011, "step": 800 }, { "epoch": 0.57, "grad_norm": 0.4243312180042267, "learning_rate": 0.00012786885245901638, "loss": 0.9875, "step": 805 }, { "epoch": 0.58, "grad_norm": 0.32536423206329346, "learning_rate": 0.00012679971489665002, "loss": 1.028, "step": 810 }, { "epoch": 0.58, "grad_norm": 0.3888213038444519, "learning_rate": 0.00012573057733428366, "loss": 0.9964, "step": 815 }, { "epoch": 0.58, "grad_norm": 0.35847336053848267, "learning_rate": 0.0001246614397719173, "loss": 0.9995, "step": 820 }, { "epoch": 0.58, "eval_loss": 0.9827173948287964, "eval_runtime": 233.1259, "eval_samples_per_second": 48.167, "eval_steps_per_second": 6.022, "step": 820 }, { "epoch": 0.59, "grad_norm": 0.43905025720596313, "learning_rate": 0.00012359230220955095, "loss": 1.0293, "step": 825 }, { "epoch": 0.59, "grad_norm": 0.44588425755500793, "learning_rate": 0.0001225231646471846, "loss": 0.962, "step": 830 }, { "epoch": 0.59, "grad_norm": 0.336024671792984, "learning_rate": 0.00012145402708481824, "loss": 1.0679, "step": 835 }, { "epoch": 0.6, "grad_norm": 0.30530259013175964, "learning_rate": 0.00012038488952245188, "loss": 1.0149, "step": 840 }, { "epoch": 0.6, "eval_loss": 0.9809694290161133, "eval_runtime": 232.8618, "eval_samples_per_second": 48.222, "eval_steps_per_second": 6.029, "step": 840 }, { "epoch": 0.6, "grad_norm": 0.4134106934070587, "learning_rate": 0.00011931575196008553, "loss": 0.9827, "step": 845 }, { "epoch": 0.61, "grad_norm": 0.4589226543903351, "learning_rate": 0.00011824661439771916, "loss": 0.9659, "step": 850 }, { "epoch": 0.61, "grad_norm": 0.4869944155216217, "learning_rate": 0.00011717747683535281, "loss": 0.9849, "step": 855 }, { "epoch": 0.61, "grad_norm": 0.4100353717803955, "learning_rate": 0.00011610833927298645, "loss": 0.9797, "step": 860 }, { "epoch": 0.61, "eval_loss": 0.9812436103820801, "eval_runtime": 233.0856, "eval_samples_per_second": 48.175, "eval_steps_per_second": 6.024, "step": 860 }, { "epoch": 0.62, "grad_norm": 0.4007159173488617, "learning_rate": 0.0001150392017106201, "loss": 1.0056, "step": 865 }, { "epoch": 0.62, "grad_norm": 0.4553622603416443, "learning_rate": 0.00011397006414825373, "loss": 0.9957, "step": 870 }, { "epoch": 0.62, "grad_norm": 0.3927219808101654, "learning_rate": 0.00011290092658588738, "loss": 0.9725, "step": 875 }, { "epoch": 0.63, "grad_norm": 0.363148033618927, "learning_rate": 0.00011183178902352102, "loss": 0.9903, "step": 880 }, { "epoch": 0.63, "eval_loss": 0.9790927171707153, "eval_runtime": 233.4773, "eval_samples_per_second": 48.095, "eval_steps_per_second": 6.013, "step": 880 }, { "epoch": 0.63, "grad_norm": 0.4857732653617859, "learning_rate": 0.00011076265146115467, "loss": 1.0079, "step": 885 }, { "epoch": 0.63, "grad_norm": 0.4485923647880554, "learning_rate": 0.00010969351389878829, "loss": 0.9998, "step": 890 }, { "epoch": 0.64, "grad_norm": 0.40661126375198364, "learning_rate": 0.00010862437633642194, "loss": 1.0143, "step": 895 }, { "epoch": 0.64, "grad_norm": 0.47750282287597656, "learning_rate": 0.00010755523877405558, "loss": 0.9909, "step": 900 }, { "epoch": 0.64, "eval_loss": 0.9794048070907593, "eval_runtime": 232.8011, "eval_samples_per_second": 48.234, "eval_steps_per_second": 6.031, "step": 900 }, { "epoch": 0.64, "grad_norm": 0.49147945642471313, "learning_rate": 0.00010648610121168923, "loss": 1.0004, "step": 905 }, { "epoch": 0.65, "grad_norm": 0.4777437448501587, "learning_rate": 0.00010541696364932286, "loss": 1.0, "step": 910 }, { "epoch": 0.65, "grad_norm": 0.43491894006729126, "learning_rate": 0.00010434782608695651, "loss": 1.0349, "step": 915 }, { "epoch": 0.66, "grad_norm": 0.4582345187664032, "learning_rate": 0.00010327868852459015, "loss": 1.0298, "step": 920 }, { "epoch": 0.66, "eval_loss": 0.9796016812324524, "eval_runtime": 233.4945, "eval_samples_per_second": 48.091, "eval_steps_per_second": 6.013, "step": 920 }, { "epoch": 0.66, "grad_norm": 0.4664052128791809, "learning_rate": 0.0001022095509622238, "loss": 1.0192, "step": 925 }, { "epoch": 0.66, "grad_norm": 0.4029270112514496, "learning_rate": 0.00010114041339985743, "loss": 0.9866, "step": 930 }, { "epoch": 0.67, "grad_norm": 0.5014646649360657, "learning_rate": 0.00010007127583749108, "loss": 0.9919, "step": 935 }, { "epoch": 0.67, "grad_norm": 0.4741760790348053, "learning_rate": 9.900213827512472e-05, "loss": 0.9845, "step": 940 }, { "epoch": 0.67, "eval_loss": 0.9778198599815369, "eval_runtime": 233.2505, "eval_samples_per_second": 48.141, "eval_steps_per_second": 6.019, "step": 940 }, { "epoch": 0.67, "grad_norm": 0.3994337320327759, "learning_rate": 9.793300071275837e-05, "loss": 0.992, "step": 945 }, { "epoch": 0.68, "grad_norm": 0.3798719048500061, "learning_rate": 9.686386315039202e-05, "loss": 1.0055, "step": 950 }, { "epoch": 0.68, "grad_norm": 0.5074355006217957, "learning_rate": 9.579472558802566e-05, "loss": 1.0449, "step": 955 }, { "epoch": 0.68, "grad_norm": 0.39967235922813416, "learning_rate": 9.47255880256593e-05, "loss": 0.9388, "step": 960 }, { "epoch": 0.68, "eval_loss": 0.978825569152832, "eval_runtime": 233.051, "eval_samples_per_second": 48.183, "eval_steps_per_second": 6.024, "step": 960 }, { "epoch": 0.69, "grad_norm": 0.47815588116645813, "learning_rate": 9.365645046329294e-05, "loss": 0.9669, "step": 965 }, { "epoch": 0.69, "grad_norm": 0.41418758034706116, "learning_rate": 9.258731290092659e-05, "loss": 1.0471, "step": 970 }, { "epoch": 0.69, "grad_norm": 0.47331690788269043, "learning_rate": 9.151817533856021e-05, "loss": 0.9924, "step": 975 }, { "epoch": 0.7, "grad_norm": 0.6328206062316895, "learning_rate": 9.044903777619385e-05, "loss": 0.9483, "step": 980 }, { "epoch": 0.7, "eval_loss": 0.9771455526351929, "eval_runtime": 233.1232, "eval_samples_per_second": 48.168, "eval_steps_per_second": 6.023, "step": 980 }, { "epoch": 0.7, "grad_norm": 0.33134448528289795, "learning_rate": 8.93799002138275e-05, "loss": 0.9692, "step": 985 }, { "epoch": 0.71, "grad_norm": 0.41730013489723206, "learning_rate": 8.831076265146115e-05, "loss": 0.97, "step": 990 }, { "epoch": 0.71, "grad_norm": 0.3953218460083008, "learning_rate": 8.724162508909478e-05, "loss": 0.9786, "step": 995 }, { "epoch": 0.71, "grad_norm": 0.5037208199501038, "learning_rate": 8.617248752672843e-05, "loss": 0.9988, "step": 1000 }, { "epoch": 0.71, "eval_loss": 0.9770392775535583, "eval_runtime": 233.7995, "eval_samples_per_second": 48.028, "eval_steps_per_second": 6.005, "step": 1000 }, { "epoch": 0.72, "grad_norm": 0.6080517172813416, "learning_rate": 8.510334996436207e-05, "loss": 0.9739, "step": 1005 }, { "epoch": 0.72, "grad_norm": 0.41217270493507385, "learning_rate": 8.403421240199572e-05, "loss": 0.9599, "step": 1010 }, { "epoch": 0.72, "grad_norm": 0.4043317139148712, "learning_rate": 8.296507483962936e-05, "loss": 0.958, "step": 1015 }, { "epoch": 0.73, "grad_norm": 0.42344191670417786, "learning_rate": 8.1895937277263e-05, "loss": 0.9474, "step": 1020 }, { "epoch": 0.73, "eval_loss": 0.9775366187095642, "eval_runtime": 233.032, "eval_samples_per_second": 48.187, "eval_steps_per_second": 6.025, "step": 1020 }, { "epoch": 0.73, "grad_norm": 0.4266872704029083, "learning_rate": 8.082679971489664e-05, "loss": 0.9744, "step": 1025 }, { "epoch": 0.73, "grad_norm": 0.53822261095047, "learning_rate": 7.975766215253029e-05, "loss": 1.0066, "step": 1030 }, { "epoch": 0.74, "grad_norm": 0.47148677706718445, "learning_rate": 7.868852459016393e-05, "loss": 1.0076, "step": 1035 }, { "epoch": 0.74, "grad_norm": 0.4493381977081299, "learning_rate": 7.761938702779758e-05, "loss": 1.0063, "step": 1040 }, { "epoch": 0.74, "eval_loss": 0.9757411479949951, "eval_runtime": 233.1816, "eval_samples_per_second": 48.156, "eval_steps_per_second": 6.021, "step": 1040 }, { "epoch": 0.74, "grad_norm": 0.3722420334815979, "learning_rate": 7.655024946543121e-05, "loss": 0.9636, "step": 1045 }, { "epoch": 0.75, "grad_norm": 0.3606567680835724, "learning_rate": 7.548111190306486e-05, "loss": 1.0114, "step": 1050 }, { "epoch": 0.75, "grad_norm": 0.520788311958313, "learning_rate": 7.44119743406985e-05, "loss": 1.0032, "step": 1055 }, { "epoch": 0.76, "grad_norm": 0.7319127321243286, "learning_rate": 7.334283677833213e-05, "loss": 1.0007, "step": 1060 }, { "epoch": 0.76, "eval_loss": 0.9754329323768616, "eval_runtime": 233.5812, "eval_samples_per_second": 48.073, "eval_steps_per_second": 6.011, "step": 1060 }, { "epoch": 0.76, "grad_norm": 0.4526929557323456, "learning_rate": 7.227369921596578e-05, "loss": 0.989, "step": 1065 }, { "epoch": 0.76, "grad_norm": 0.4485223889350891, "learning_rate": 7.120456165359942e-05, "loss": 0.9985, "step": 1070 }, { "epoch": 0.77, "grad_norm": 0.46079307794570923, "learning_rate": 7.013542409123307e-05, "loss": 1.0313, "step": 1075 }, { "epoch": 0.77, "grad_norm": 0.43368515372276306, "learning_rate": 6.90662865288667e-05, "loss": 0.9722, "step": 1080 }, { "epoch": 0.77, "eval_loss": 0.9747573137283325, "eval_runtime": 232.8905, "eval_samples_per_second": 48.216, "eval_steps_per_second": 6.029, "step": 1080 }, { "epoch": 0.77, "grad_norm": 0.30304279923439026, "learning_rate": 6.799714896650034e-05, "loss": 0.9795, "step": 1085 }, { "epoch": 0.78, "grad_norm": 0.46647265553474426, "learning_rate": 6.692801140413399e-05, "loss": 0.9787, "step": 1090 }, { "epoch": 0.78, "grad_norm": 0.502044677734375, "learning_rate": 6.585887384176763e-05, "loss": 0.9896, "step": 1095 }, { "epoch": 0.78, "grad_norm": 0.36911284923553467, "learning_rate": 6.478973627940128e-05, "loss": 0.9907, "step": 1100 }, { "epoch": 0.78, "eval_loss": 0.9756913185119629, "eval_runtime": 233.558, "eval_samples_per_second": 48.078, "eval_steps_per_second": 6.011, "step": 1100 }, { "epoch": 0.79, "grad_norm": 0.41324666142463684, "learning_rate": 6.372059871703493e-05, "loss": 0.9344, "step": 1105 }, { "epoch": 0.79, "grad_norm": 0.543196976184845, "learning_rate": 6.265146115466856e-05, "loss": 0.9829, "step": 1110 }, { "epoch": 0.79, "grad_norm": 0.45038723945617676, "learning_rate": 6.158232359230221e-05, "loss": 0.9591, "step": 1115 }, { "epoch": 0.8, "grad_norm": 0.3415057957172394, "learning_rate": 6.051318602993584e-05, "loss": 0.9425, "step": 1120 }, { "epoch": 0.8, "eval_loss": 0.9745773673057556, "eval_runtime": 233.0909, "eval_samples_per_second": 48.174, "eval_steps_per_second": 6.023, "step": 1120 }, { "epoch": 0.8, "grad_norm": 0.5441497564315796, "learning_rate": 5.9444048467569485e-05, "loss": 0.9796, "step": 1125 }, { "epoch": 0.81, "grad_norm": 0.42310476303100586, "learning_rate": 5.837491090520313e-05, "loss": 0.9625, "step": 1130 }, { "epoch": 0.81, "grad_norm": 0.42857611179351807, "learning_rate": 5.730577334283677e-05, "loss": 0.9652, "step": 1135 }, { "epoch": 0.81, "grad_norm": 0.3687695562839508, "learning_rate": 5.6236635780470413e-05, "loss": 0.9465, "step": 1140 }, { "epoch": 0.81, "eval_loss": 0.974454939365387, "eval_runtime": 233.5503, "eval_samples_per_second": 48.08, "eval_steps_per_second": 6.012, "step": 1140 }, { "epoch": 0.82, "grad_norm": 0.4901082217693329, "learning_rate": 5.5167498218104056e-05, "loss": 0.992, "step": 1145 }, { "epoch": 0.82, "grad_norm": 0.5039891600608826, "learning_rate": 5.40983606557377e-05, "loss": 0.9943, "step": 1150 }, { "epoch": 0.82, "grad_norm": 0.4841081202030182, "learning_rate": 5.302922309337134e-05, "loss": 0.9436, "step": 1155 }, { "epoch": 0.83, "grad_norm": 0.37224721908569336, "learning_rate": 5.196008553100499e-05, "loss": 0.997, "step": 1160 }, { "epoch": 0.83, "eval_loss": 0.9734376668930054, "eval_runtime": 233.22, "eval_samples_per_second": 48.148, "eval_steps_per_second": 6.02, "step": 1160 }, { "epoch": 0.83, "grad_norm": 0.41128888726234436, "learning_rate": 5.089094796863862e-05, "loss": 0.9735, "step": 1165 }, { "epoch": 0.83, "grad_norm": 0.406338095664978, "learning_rate": 4.9821810406272264e-05, "loss": 0.9955, "step": 1170 }, { "epoch": 0.84, "grad_norm": 0.551630973815918, "learning_rate": 4.875267284390591e-05, "loss": 1.0032, "step": 1175 }, { "epoch": 0.84, "grad_norm": 0.36810871958732605, "learning_rate": 4.7683535281539556e-05, "loss": 0.9371, "step": 1180 }, { "epoch": 0.84, "eval_loss": 0.9730872511863708, "eval_runtime": 233.3639, "eval_samples_per_second": 48.118, "eval_steps_per_second": 6.016, "step": 1180 }, { "epoch": 0.84, "grad_norm": 0.43923842906951904, "learning_rate": 4.66143977191732e-05, "loss": 0.9796, "step": 1185 }, { "epoch": 0.85, "grad_norm": 0.4551779329776764, "learning_rate": 4.554526015680684e-05, "loss": 0.9973, "step": 1190 }, { "epoch": 0.85, "grad_norm": 0.40367963910102844, "learning_rate": 4.4476122594440485e-05, "loss": 0.9833, "step": 1195 }, { "epoch": 0.86, "grad_norm": 0.41799184679985046, "learning_rate": 4.340698503207413e-05, "loss": 0.9681, "step": 1200 }, { "epoch": 0.86, "eval_loss": 0.9727602601051331, "eval_runtime": 233.417, "eval_samples_per_second": 48.107, "eval_steps_per_second": 6.015, "step": 1200 }, { "epoch": 0.86, "grad_norm": 0.6011839509010315, "learning_rate": 4.2337847469707764e-05, "loss": 1.0271, "step": 1205 }, { "epoch": 0.86, "grad_norm": 0.386572927236557, "learning_rate": 4.1268709907341407e-05, "loss": 0.9882, "step": 1210 }, { "epoch": 0.87, "grad_norm": 0.5409009456634521, "learning_rate": 4.019957234497505e-05, "loss": 0.9772, "step": 1215 }, { "epoch": 0.87, "grad_norm": 0.4123440980911255, "learning_rate": 3.913043478260869e-05, "loss": 0.988, "step": 1220 }, { "epoch": 0.87, "eval_loss": 0.9725460410118103, "eval_runtime": 233.2561, "eval_samples_per_second": 48.14, "eval_steps_per_second": 6.019, "step": 1220 }, { "epoch": 0.87, "grad_norm": 0.448880136013031, "learning_rate": 3.8061297220242335e-05, "loss": 0.9838, "step": 1225 }, { "epoch": 0.88, "grad_norm": 0.33402350544929504, "learning_rate": 3.699215965787598e-05, "loss": 0.9564, "step": 1230 }, { "epoch": 0.88, "grad_norm": 0.5954431891441345, "learning_rate": 3.592302209550962e-05, "loss": 0.9882, "step": 1235 }, { "epoch": 0.88, "grad_norm": 0.410119891166687, "learning_rate": 3.485388453314326e-05, "loss": 0.9635, "step": 1240 }, { "epoch": 0.88, "eval_loss": 0.9721638560295105, "eval_runtime": 233.3651, "eval_samples_per_second": 48.118, "eval_steps_per_second": 6.016, "step": 1240 }, { "epoch": 0.89, "grad_norm": 0.3925500214099884, "learning_rate": 3.3784746970776906e-05, "loss": 0.9998, "step": 1245 }, { "epoch": 0.89, "grad_norm": 0.4033336937427521, "learning_rate": 3.271560940841055e-05, "loss": 0.9943, "step": 1250 }, { "epoch": 0.89, "grad_norm": 0.45633894205093384, "learning_rate": 3.164647184604419e-05, "loss": 1.0161, "step": 1255 }, { "epoch": 0.9, "grad_norm": 0.44340837001800537, "learning_rate": 3.057733428367783e-05, "loss": 1.0065, "step": 1260 }, { "epoch": 0.9, "eval_loss": 0.9722057580947876, "eval_runtime": 233.2613, "eval_samples_per_second": 48.139, "eval_steps_per_second": 6.019, "step": 1260 }, { "epoch": 0.9, "grad_norm": 0.371636301279068, "learning_rate": 2.950819672131147e-05, "loss": 0.9746, "step": 1265 }, { "epoch": 0.9, "grad_norm": 0.35404834151268005, "learning_rate": 2.8439059158945114e-05, "loss": 1.0016, "step": 1270 }, { "epoch": 0.91, "grad_norm": 0.3891022503376007, "learning_rate": 2.736992159657876e-05, "loss": 0.9371, "step": 1275 }, { "epoch": 0.91, "grad_norm": 0.4783947765827179, "learning_rate": 2.63007840342124e-05, "loss": 0.9776, "step": 1280 }, { "epoch": 0.91, "eval_loss": 0.9719991087913513, "eval_runtime": 233.0799, "eval_samples_per_second": 48.177, "eval_steps_per_second": 6.024, "step": 1280 }, { "epoch": 0.92, "grad_norm": 0.5107392072677612, "learning_rate": 2.5231646471846042e-05, "loss": 0.9662, "step": 1285 }, { "epoch": 0.92, "grad_norm": 0.46247023344039917, "learning_rate": 2.4162508909479685e-05, "loss": 0.9845, "step": 1290 }, { "epoch": 0.92, "grad_norm": 0.42233213782310486, "learning_rate": 2.3093371347113328e-05, "loss": 0.9892, "step": 1295 }, { "epoch": 0.93, "grad_norm": 0.439802348613739, "learning_rate": 2.2024233784746968e-05, "loss": 1.0365, "step": 1300 }, { "epoch": 0.93, "eval_loss": 0.9716612696647644, "eval_runtime": 232.9368, "eval_samples_per_second": 48.206, "eval_steps_per_second": 6.027, "step": 1300 }, { "epoch": 0.93, "grad_norm": 0.3681885004043579, "learning_rate": 2.095509622238061e-05, "loss": 0.9908, "step": 1305 }, { "epoch": 0.93, "grad_norm": 0.49940550327301025, "learning_rate": 1.9885958660014253e-05, "loss": 1.0436, "step": 1310 }, { "epoch": 0.94, "grad_norm": 0.44985833764076233, "learning_rate": 1.8816821097647896e-05, "loss": 0.9962, "step": 1315 }, { "epoch": 0.94, "grad_norm": 0.44092249870300293, "learning_rate": 1.774768353528154e-05, "loss": 0.9739, "step": 1320 }, { "epoch": 0.94, "eval_loss": 0.9714374542236328, "eval_runtime": 232.9847, "eval_samples_per_second": 48.196, "eval_steps_per_second": 6.026, "step": 1320 }, { "epoch": 0.94, "grad_norm": 0.49097079038619995, "learning_rate": 1.6678545972915182e-05, "loss": 1.0104, "step": 1325 }, { "epoch": 0.95, "grad_norm": 0.40521782636642456, "learning_rate": 1.560940841054882e-05, "loss": 0.9506, "step": 1330 }, { "epoch": 0.95, "grad_norm": 0.40690892934799194, "learning_rate": 1.4540270848182466e-05, "loss": 0.9733, "step": 1335 }, { "epoch": 0.95, "grad_norm": 0.3609257638454437, "learning_rate": 1.3471133285816107e-05, "loss": 0.9915, "step": 1340 }, { "epoch": 0.95, "eval_loss": 0.9715495109558105, "eval_runtime": 233.3093, "eval_samples_per_second": 48.129, "eval_steps_per_second": 6.018, "step": 1340 }, { "epoch": 0.96, "grad_norm": 0.3654801547527313, "learning_rate": 1.240199572344975e-05, "loss": 0.9821, "step": 1345 }, { "epoch": 0.96, "grad_norm": 0.46048375964164734, "learning_rate": 1.1332858161083391e-05, "loss": 0.9414, "step": 1350 }, { "epoch": 0.97, "grad_norm": 0.7182672023773193, "learning_rate": 1.0263720598717034e-05, "loss": 0.9658, "step": 1355 }, { "epoch": 0.97, "grad_norm": 0.4436035752296448, "learning_rate": 9.194583036350677e-06, "loss": 0.9906, "step": 1360 }, { "epoch": 0.97, "eval_loss": 0.9712263941764832, "eval_runtime": 232.8136, "eval_samples_per_second": 48.232, "eval_steps_per_second": 6.031, "step": 1360 }, { "epoch": 0.97, "grad_norm": 0.4624147415161133, "learning_rate": 8.12544547398432e-06, "loss": 0.9995, "step": 1365 }, { "epoch": 0.98, "grad_norm": 0.36057719588279724, "learning_rate": 7.0563079116179615e-06, "loss": 0.9749, "step": 1370 }, { "epoch": 0.98, "grad_norm": 0.4826163351535797, "learning_rate": 5.9871703492516035e-06, "loss": 0.9691, "step": 1375 }, { "epoch": 0.98, "grad_norm": 0.46810442209243774, "learning_rate": 4.9180327868852455e-06, "loss": 0.9769, "step": 1380 }, { "epoch": 0.98, "eval_loss": 0.9711294174194336, "eval_runtime": 233.1267, "eval_samples_per_second": 48.167, "eval_steps_per_second": 6.022, "step": 1380 }, { "epoch": 0.99, "grad_norm": 0.4403376877307892, "learning_rate": 3.848895224518888e-06, "loss": 1.0044, "step": 1385 }, { "epoch": 0.99, "grad_norm": 0.3866402804851532, "learning_rate": 2.7797576621525303e-06, "loss": 0.9635, "step": 1390 }, { "epoch": 0.99, "grad_norm": 0.4226088523864746, "learning_rate": 1.7106200997861725e-06, "loss": 1.0251, "step": 1395 }, { "epoch": 1.0, "grad_norm": 0.48196887969970703, "learning_rate": 6.414825374198146e-07, "loss": 0.9691, "step": 1400 }, { "epoch": 1.0, "eval_loss": 0.9710622429847717, "eval_runtime": 233.1637, "eval_samples_per_second": 48.159, "eval_steps_per_second": 6.022, "step": 1400 } ], "logging_steps": 5, "max_steps": 1403, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "total_flos": 2.7134894097432576e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }