{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 2730, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 2.5403712464892836, "learning_rate": 7.326007326007327e-08, "loss": 1.379, "step": 1 }, { "epoch": 0.01, "grad_norm": 2.5059454164116177, "learning_rate": 3.6630036630036635e-07, "loss": 1.411, "step": 5 }, { "epoch": 0.01, "grad_norm": 2.403039830631443, "learning_rate": 7.326007326007327e-07, "loss": 1.3904, "step": 10 }, { "epoch": 0.02, "grad_norm": 2.187796179583603, "learning_rate": 1.098901098901099e-06, "loss": 1.3698, "step": 15 }, { "epoch": 0.02, "grad_norm": 1.4489435205825498, "learning_rate": 1.4652014652014654e-06, "loss": 1.2609, "step": 20 }, { "epoch": 0.03, "grad_norm": 1.483113856101743, "learning_rate": 1.8315018315018316e-06, "loss": 1.1341, "step": 25 }, { "epoch": 0.03, "grad_norm": 0.7051067484479403, "learning_rate": 2.197802197802198e-06, "loss": 1.0478, "step": 30 }, { "epoch": 0.04, "grad_norm": 0.7254079611231652, "learning_rate": 2.564102564102564e-06, "loss": 1.0312, "step": 35 }, { "epoch": 0.04, "grad_norm": 0.6389491910662315, "learning_rate": 2.930402930402931e-06, "loss": 0.9897, "step": 40 }, { "epoch": 0.05, "grad_norm": 0.500844994643637, "learning_rate": 3.2967032967032968e-06, "loss": 0.9549, "step": 45 }, { "epoch": 0.05, "grad_norm": 0.4201833928940666, "learning_rate": 3.663003663003663e-06, "loss": 0.9373, "step": 50 }, { "epoch": 0.06, "grad_norm": 0.370974019020061, "learning_rate": 4.0293040293040296e-06, "loss": 0.9118, "step": 55 }, { "epoch": 0.07, "grad_norm": 0.34743939582124356, "learning_rate": 4.395604395604396e-06, "loss": 0.8832, "step": 60 }, { "epoch": 0.07, "grad_norm": 0.30550288282826443, "learning_rate": 4.761904761904762e-06, "loss": 0.8905, "step": 65 }, { "epoch": 0.08, "grad_norm": 0.3877890435103539, "learning_rate": 5.128205128205128e-06, "loss": 0.8745, "step": 70 }, { "epoch": 0.08, "grad_norm": 0.4090389705350457, "learning_rate": 5.494505494505495e-06, "loss": 0.869, "step": 75 }, { "epoch": 0.09, "grad_norm": 0.28471599620620913, "learning_rate": 5.860805860805862e-06, "loss": 0.8701, "step": 80 }, { "epoch": 0.09, "grad_norm": 0.2835464196942085, "learning_rate": 6.227106227106228e-06, "loss": 0.8566, "step": 85 }, { "epoch": 0.1, "grad_norm": 0.3339230231336609, "learning_rate": 6.5934065934065935e-06, "loss": 0.8926, "step": 90 }, { "epoch": 0.1, "grad_norm": 0.26409030663411054, "learning_rate": 6.95970695970696e-06, "loss": 0.8639, "step": 95 }, { "epoch": 0.11, "grad_norm": 0.2548657900119433, "learning_rate": 7.326007326007326e-06, "loss": 0.8574, "step": 100 }, { "epoch": 0.12, "grad_norm": 0.33583619074613247, "learning_rate": 7.692307692307694e-06, "loss": 0.8371, "step": 105 }, { "epoch": 0.12, "grad_norm": 0.26284318740686946, "learning_rate": 8.058608058608059e-06, "loss": 0.8209, "step": 110 }, { "epoch": 0.13, "grad_norm": 0.27047354400674645, "learning_rate": 8.424908424908426e-06, "loss": 0.8125, "step": 115 }, { "epoch": 0.13, "grad_norm": 0.25311492465959046, "learning_rate": 8.791208791208792e-06, "loss": 0.8371, "step": 120 }, { "epoch": 0.14, "grad_norm": 0.27273984018778547, "learning_rate": 9.157509157509158e-06, "loss": 0.8361, "step": 125 }, { "epoch": 0.14, "grad_norm": 0.2789308175031694, "learning_rate": 9.523809523809525e-06, "loss": 0.7995, "step": 130 }, { "epoch": 0.15, "grad_norm": 0.26030914277390094, "learning_rate": 9.890109890109892e-06, "loss": 0.8137, "step": 135 }, { "epoch": 0.15, "grad_norm": 0.8625694710794981, "learning_rate": 1.0256410256410256e-05, "loss": 0.7968, "step": 140 }, { "epoch": 0.16, "grad_norm": 0.26526694267370393, "learning_rate": 1.0622710622710623e-05, "loss": 0.8011, "step": 145 }, { "epoch": 0.16, "grad_norm": 0.2786629865850713, "learning_rate": 1.098901098901099e-05, "loss": 0.7944, "step": 150 }, { "epoch": 0.17, "grad_norm": 0.2547160983307644, "learning_rate": 1.1355311355311356e-05, "loss": 0.8072, "step": 155 }, { "epoch": 0.18, "grad_norm": 0.252471949121061, "learning_rate": 1.1721611721611723e-05, "loss": 0.7941, "step": 160 }, { "epoch": 0.18, "grad_norm": 0.25858945532947464, "learning_rate": 1.2087912087912089e-05, "loss": 0.7951, "step": 165 }, { "epoch": 0.19, "grad_norm": 0.247980244139678, "learning_rate": 1.2454212454212456e-05, "loss": 0.7858, "step": 170 }, { "epoch": 0.19, "grad_norm": 0.255903929999623, "learning_rate": 1.2820512820512823e-05, "loss": 0.7878, "step": 175 }, { "epoch": 0.2, "grad_norm": 0.6019780019252425, "learning_rate": 1.3186813186813187e-05, "loss": 0.7795, "step": 180 }, { "epoch": 0.2, "grad_norm": 0.27779059252699495, "learning_rate": 1.3553113553113554e-05, "loss": 0.7754, "step": 185 }, { "epoch": 0.21, "grad_norm": 0.2710280200562755, "learning_rate": 1.391941391941392e-05, "loss": 0.7855, "step": 190 }, { "epoch": 0.21, "grad_norm": 0.2743116770469599, "learning_rate": 1.4285714285714287e-05, "loss": 0.7565, "step": 195 }, { "epoch": 0.22, "grad_norm": 0.26037793315040886, "learning_rate": 1.4652014652014653e-05, "loss": 0.7726, "step": 200 }, { "epoch": 0.23, "grad_norm": 0.26090200965795596, "learning_rate": 1.501831501831502e-05, "loss": 0.7653, "step": 205 }, { "epoch": 0.23, "grad_norm": 0.2632394726032374, "learning_rate": 1.5384615384615387e-05, "loss": 0.777, "step": 210 }, { "epoch": 0.24, "grad_norm": 0.2540381175124688, "learning_rate": 1.575091575091575e-05, "loss": 0.7433, "step": 215 }, { "epoch": 0.24, "grad_norm": 0.2627604876525692, "learning_rate": 1.6117216117216118e-05, "loss": 0.7597, "step": 220 }, { "epoch": 0.25, "grad_norm": 0.2651216093866056, "learning_rate": 1.6483516483516486e-05, "loss": 0.7609, "step": 225 }, { "epoch": 0.25, "grad_norm": 0.24401238946000964, "learning_rate": 1.6849816849816853e-05, "loss": 0.7469, "step": 230 }, { "epoch": 0.26, "grad_norm": 0.2547534354636469, "learning_rate": 1.721611721611722e-05, "loss": 0.7641, "step": 235 }, { "epoch": 0.26, "grad_norm": 0.24893607597108447, "learning_rate": 1.7582417582417584e-05, "loss": 0.7323, "step": 240 }, { "epoch": 0.27, "grad_norm": 0.2530974475033568, "learning_rate": 1.794871794871795e-05, "loss": 0.7533, "step": 245 }, { "epoch": 0.27, "grad_norm": 0.2733773460887857, "learning_rate": 1.8315018315018315e-05, "loss": 0.7538, "step": 250 }, { "epoch": 0.28, "grad_norm": 0.27130243796420767, "learning_rate": 1.8681318681318682e-05, "loss": 0.7665, "step": 255 }, { "epoch": 0.29, "grad_norm": 0.25889603349835133, "learning_rate": 1.904761904761905e-05, "loss": 0.737, "step": 260 }, { "epoch": 0.29, "grad_norm": 0.2758988715664599, "learning_rate": 1.9413919413919417e-05, "loss": 0.7587, "step": 265 }, { "epoch": 0.3, "grad_norm": 0.24591163232882832, "learning_rate": 1.9780219780219784e-05, "loss": 0.7503, "step": 270 }, { "epoch": 0.3, "grad_norm": 0.29213859652031626, "learning_rate": 1.9999967302150437e-05, "loss": 0.7544, "step": 275 }, { "epoch": 0.31, "grad_norm": 0.23841997418474895, "learning_rate": 1.9999599453798523e-05, "loss": 0.738, "step": 280 }, { "epoch": 0.31, "grad_norm": 0.23103339591842176, "learning_rate": 1.9998822899867633e-05, "loss": 0.7316, "step": 285 }, { "epoch": 0.32, "grad_norm": 0.24038013908936962, "learning_rate": 1.9997637672097222e-05, "loss": 0.7202, "step": 290 }, { "epoch": 0.32, "grad_norm": 0.24246381579537735, "learning_rate": 1.9996043818930153e-05, "loss": 0.7463, "step": 295 }, { "epoch": 0.33, "grad_norm": 0.24527189059683477, "learning_rate": 1.9994041405510705e-05, "loss": 0.755, "step": 300 }, { "epoch": 0.34, "grad_norm": 0.24371499098074792, "learning_rate": 1.999163051368191e-05, "loss": 0.7599, "step": 305 }, { "epoch": 0.34, "grad_norm": 0.2575400741876107, "learning_rate": 1.9988811241982206e-05, "loss": 0.7443, "step": 310 }, { "epoch": 0.35, "grad_norm": 0.2404592716601957, "learning_rate": 1.9985583705641418e-05, "loss": 0.7327, "step": 315 }, { "epoch": 0.35, "grad_norm": 0.24637921462827092, "learning_rate": 1.9981948036576045e-05, "loss": 0.7339, "step": 320 }, { "epoch": 0.36, "grad_norm": 0.24681007900489016, "learning_rate": 1.997790438338385e-05, "loss": 0.7265, "step": 325 }, { "epoch": 0.36, "grad_norm": 0.2690257655867855, "learning_rate": 1.997345291133783e-05, "loss": 0.7232, "step": 330 }, { "epoch": 0.37, "grad_norm": 0.2534124946734991, "learning_rate": 1.9968593802379405e-05, "loss": 0.7327, "step": 335 }, { "epoch": 0.37, "grad_norm": 0.23796973039683816, "learning_rate": 1.9963327255111033e-05, "loss": 0.7218, "step": 340 }, { "epoch": 0.38, "grad_norm": 0.23546876853519053, "learning_rate": 1.9957653484788054e-05, "loss": 0.7291, "step": 345 }, { "epoch": 0.38, "grad_norm": 0.2629306598176605, "learning_rate": 1.9951572723309918e-05, "loss": 0.7177, "step": 350 }, { "epoch": 0.39, "grad_norm": 0.2392128886783249, "learning_rate": 1.99450852192107e-05, "loss": 0.7082, "step": 355 }, { "epoch": 0.4, "grad_norm": 0.23159963577625967, "learning_rate": 1.9938191237648924e-05, "loss": 0.7031, "step": 360 }, { "epoch": 0.4, "grad_norm": 0.2522691194103305, "learning_rate": 1.9930891060396757e-05, "loss": 0.7094, "step": 365 }, { "epoch": 0.41, "grad_norm": 0.2708112490882325, "learning_rate": 1.992318498582846e-05, "loss": 0.7201, "step": 370 }, { "epoch": 0.41, "grad_norm": 0.2293891910969228, "learning_rate": 1.9915073328908217e-05, "loss": 0.7144, "step": 375 }, { "epoch": 0.42, "grad_norm": 0.25206778778834, "learning_rate": 1.9906556421177256e-05, "loss": 0.7234, "step": 380 }, { "epoch": 0.42, "grad_norm": 0.2518785514497742, "learning_rate": 1.989763461074029e-05, "loss": 0.7141, "step": 385 }, { "epoch": 0.43, "grad_norm": 0.2378420502334953, "learning_rate": 1.9888308262251286e-05, "loss": 0.7231, "step": 390 }, { "epoch": 0.43, "grad_norm": 0.24009865726485224, "learning_rate": 1.987857775689859e-05, "loss": 0.7187, "step": 395 }, { "epoch": 0.44, "grad_norm": 0.28274675300361035, "learning_rate": 1.9868443492389307e-05, "loss": 0.7044, "step": 400 }, { "epoch": 0.45, "grad_norm": 0.22963475241854425, "learning_rate": 1.985790588293308e-05, "loss": 0.7104, "step": 405 }, { "epoch": 0.45, "grad_norm": 0.2377445508195736, "learning_rate": 1.9846965359225127e-05, "loss": 0.6946, "step": 410 }, { "epoch": 0.46, "grad_norm": 0.2608944217378828, "learning_rate": 1.9835622368428673e-05, "loss": 0.7113, "step": 415 }, { "epoch": 0.46, "grad_norm": 0.24157694006779415, "learning_rate": 1.9823877374156647e-05, "loss": 0.698, "step": 420 }, { "epoch": 0.47, "grad_norm": 0.23398315907950198, "learning_rate": 1.9811730856452754e-05, "loss": 0.7257, "step": 425 }, { "epoch": 0.47, "grad_norm": 0.23894709403347192, "learning_rate": 1.9799183311771823e-05, "loss": 0.7184, "step": 430 }, { "epoch": 0.48, "grad_norm": 0.23431405547670192, "learning_rate": 1.9786235252959555e-05, "loss": 0.7128, "step": 435 }, { "epoch": 0.48, "grad_norm": 0.2376569203749501, "learning_rate": 1.977288720923153e-05, "loss": 0.7203, "step": 440 }, { "epoch": 0.49, "grad_norm": 0.23231143839603255, "learning_rate": 1.9759139726151597e-05, "loss": 0.7082, "step": 445 }, { "epoch": 0.49, "grad_norm": 0.22602078925455668, "learning_rate": 1.9744993365609563e-05, "loss": 0.6944, "step": 450 }, { "epoch": 0.5, "grad_norm": 0.22861362046539935, "learning_rate": 1.973044870579824e-05, "loss": 0.698, "step": 455 }, { "epoch": 0.51, "grad_norm": 0.24188463913379904, "learning_rate": 1.9715506341189795e-05, "loss": 0.7015, "step": 460 }, { "epoch": 0.51, "grad_norm": 0.2327706520398592, "learning_rate": 1.970016688251147e-05, "loss": 0.7054, "step": 465 }, { "epoch": 0.52, "grad_norm": 0.2298533583392459, "learning_rate": 1.9684430956720613e-05, "loss": 0.7005, "step": 470 }, { "epoch": 0.52, "grad_norm": 0.23290007131084822, "learning_rate": 1.966829920697905e-05, "loss": 0.7098, "step": 475 }, { "epoch": 0.53, "grad_norm": 0.23281363580583736, "learning_rate": 1.9651772292626804e-05, "loss": 0.6818, "step": 480 }, { "epoch": 0.53, "grad_norm": 0.23817560008326671, "learning_rate": 1.963485088915514e-05, "loss": 0.7088, "step": 485 }, { "epoch": 0.54, "grad_norm": 0.2350735880544215, "learning_rate": 1.961753568817896e-05, "loss": 0.7066, "step": 490 }, { "epoch": 0.54, "grad_norm": 0.23884064719455786, "learning_rate": 1.959982739740854e-05, "loss": 0.7042, "step": 495 }, { "epoch": 0.55, "grad_norm": 0.21790596438572063, "learning_rate": 1.9581726740620585e-05, "loss": 0.6757, "step": 500 }, { "epoch": 0.55, "grad_norm": 0.23917884378106077, "learning_rate": 1.9563234457628678e-05, "loss": 0.6921, "step": 505 }, { "epoch": 0.56, "grad_norm": 0.2399642009348881, "learning_rate": 1.954435130425301e-05, "loss": 0.7047, "step": 510 }, { "epoch": 0.57, "grad_norm": 0.23880300515157055, "learning_rate": 1.952507805228951e-05, "loss": 0.6884, "step": 515 }, { "epoch": 0.57, "grad_norm": 0.23179275898540247, "learning_rate": 1.9505415489478293e-05, "loss": 0.6932, "step": 520 }, { "epoch": 0.58, "grad_norm": 0.2216104142333809, "learning_rate": 1.9485364419471454e-05, "loss": 0.6728, "step": 525 }, { "epoch": 0.58, "grad_norm": 0.22858244743422157, "learning_rate": 1.9464925661800247e-05, "loss": 0.6809, "step": 530 }, { "epoch": 0.59, "grad_norm": 0.22424934505435848, "learning_rate": 1.9444100051841556e-05, "loss": 0.6967, "step": 535 }, { "epoch": 0.59, "grad_norm": 0.2895129667800956, "learning_rate": 1.9422888440783773e-05, "loss": 0.6989, "step": 540 }, { "epoch": 0.6, "grad_norm": 0.240457298966355, "learning_rate": 1.9401291695592e-05, "loss": 0.6818, "step": 545 }, { "epoch": 0.6, "grad_norm": 0.22825665925272676, "learning_rate": 1.9379310698972618e-05, "loss": 0.6922, "step": 550 }, { "epoch": 0.61, "grad_norm": 0.2172530625853638, "learning_rate": 1.935694634933721e-05, "loss": 0.6627, "step": 555 }, { "epoch": 0.62, "grad_norm": 0.22221082939742373, "learning_rate": 1.933419956076584e-05, "loss": 0.6744, "step": 560 }, { "epoch": 0.62, "grad_norm": 0.2264067795562855, "learning_rate": 1.9311071262969675e-05, "loss": 0.6641, "step": 565 }, { "epoch": 0.63, "grad_norm": 0.2254067456859091, "learning_rate": 1.9287562401253023e-05, "loss": 0.6892, "step": 570 }, { "epoch": 0.63, "grad_norm": 0.2370414298818578, "learning_rate": 1.9263673936474662e-05, "loss": 0.6779, "step": 575 }, { "epoch": 0.64, "grad_norm": 0.22087990811265876, "learning_rate": 1.9239406845008583e-05, "loss": 0.6805, "step": 580 }, { "epoch": 0.64, "grad_norm": 0.22405579155571978, "learning_rate": 1.921476211870408e-05, "loss": 0.6661, "step": 585 }, { "epoch": 0.65, "grad_norm": 0.22938270799667376, "learning_rate": 1.918974076484521e-05, "loss": 0.6773, "step": 590 }, { "epoch": 0.65, "grad_norm": 0.24245138817040104, "learning_rate": 1.916434380610963e-05, "loss": 0.6922, "step": 595 }, { "epoch": 0.66, "grad_norm": 0.22591029702471543, "learning_rate": 1.9138572280526795e-05, "loss": 0.6672, "step": 600 }, { "epoch": 0.66, "grad_norm": 0.22886503680106302, "learning_rate": 1.911242724143552e-05, "loss": 0.6574, "step": 605 }, { "epoch": 0.67, "grad_norm": 0.23592282593270214, "learning_rate": 1.908590975744094e-05, "loss": 0.6701, "step": 610 }, { "epoch": 0.68, "grad_norm": 0.2263018036424681, "learning_rate": 1.9059020912370836e-05, "loss": 0.6645, "step": 615 }, { "epoch": 0.68, "grad_norm": 0.23573388350757823, "learning_rate": 1.9031761805231322e-05, "loss": 0.6789, "step": 620 }, { "epoch": 0.69, "grad_norm": 0.2264306405740615, "learning_rate": 1.9004133550161953e-05, "loss": 0.6798, "step": 625 }, { "epoch": 0.69, "grad_norm": 0.23127140958344955, "learning_rate": 1.8976137276390145e-05, "loss": 0.6918, "step": 630 }, { "epoch": 0.7, "grad_norm": 0.22485640726090023, "learning_rate": 1.894777412818506e-05, "loss": 0.6579, "step": 635 }, { "epoch": 0.7, "grad_norm": 0.2164772947483346, "learning_rate": 1.891904526481083e-05, "loss": 0.6729, "step": 640 }, { "epoch": 0.71, "grad_norm": 0.21930341038212908, "learning_rate": 1.8889951860479165e-05, "loss": 0.6636, "step": 645 }, { "epoch": 0.71, "grad_norm": 0.2212836429863629, "learning_rate": 1.8860495104301346e-05, "loss": 0.7046, "step": 650 }, { "epoch": 0.72, "grad_norm": 0.236222638103693, "learning_rate": 1.8830676200239666e-05, "loss": 0.6544, "step": 655 }, { "epoch": 0.73, "grad_norm": 0.23689174767504043, "learning_rate": 1.8800496367058187e-05, "loss": 0.6619, "step": 660 }, { "epoch": 0.73, "grad_norm": 0.22794708045955256, "learning_rate": 1.8769956838272937e-05, "loss": 0.6536, "step": 665 }, { "epoch": 0.74, "grad_norm": 0.2284737343207547, "learning_rate": 1.8739058862101487e-05, "loss": 0.6716, "step": 670 }, { "epoch": 0.74, "grad_norm": 0.22074952318779162, "learning_rate": 1.8707803701411946e-05, "loss": 0.671, "step": 675 }, { "epoch": 0.75, "grad_norm": 0.2215061880912214, "learning_rate": 1.8676192633671342e-05, "loss": 0.6865, "step": 680 }, { "epoch": 0.75, "grad_norm": 0.23413437096611026, "learning_rate": 1.8644226950893394e-05, "loss": 0.6707, "step": 685 }, { "epoch": 0.76, "grad_norm": 0.22955684806244817, "learning_rate": 1.861190795958573e-05, "loss": 0.6835, "step": 690 }, { "epoch": 0.76, "grad_norm": 0.22476186793320183, "learning_rate": 1.857923698069646e-05, "loss": 0.6597, "step": 695 }, { "epoch": 0.77, "grad_norm": 0.22821748154651522, "learning_rate": 1.8546215349560204e-05, "loss": 0.6769, "step": 700 }, { "epoch": 0.77, "grad_norm": 0.22293250318189695, "learning_rate": 1.8512844415843514e-05, "loss": 0.6828, "step": 705 }, { "epoch": 0.78, "grad_norm": 0.22191425457543576, "learning_rate": 1.8479125543489694e-05, "loss": 0.6849, "step": 710 }, { "epoch": 0.79, "grad_norm": 0.22106154189199517, "learning_rate": 1.844506011066308e-05, "loss": 0.6877, "step": 715 }, { "epoch": 0.79, "grad_norm": 0.22662154758390868, "learning_rate": 1.841064950969268e-05, "loss": 0.6579, "step": 720 }, { "epoch": 0.8, "grad_norm": 0.22395783462295688, "learning_rate": 1.8375895147015285e-05, "loss": 0.6808, "step": 725 }, { "epoch": 0.8, "grad_norm": 0.22270568666818555, "learning_rate": 1.8340798443117992e-05, "loss": 0.6705, "step": 730 }, { "epoch": 0.81, "grad_norm": 0.2150504585631758, "learning_rate": 1.8305360832480118e-05, "loss": 0.6628, "step": 735 }, { "epoch": 0.81, "grad_norm": 0.21416406499964488, "learning_rate": 1.8269583763514603e-05, "loss": 0.6602, "step": 740 }, { "epoch": 0.82, "grad_norm": 0.22446619283908673, "learning_rate": 1.8233468698508786e-05, "loss": 0.6516, "step": 745 }, { "epoch": 0.82, "grad_norm": 0.22209820477583542, "learning_rate": 1.819701711356464e-05, "loss": 0.6719, "step": 750 }, { "epoch": 0.83, "grad_norm": 0.20930564484300637, "learning_rate": 1.8160230498538464e-05, "loss": 0.6462, "step": 755 }, { "epoch": 0.84, "grad_norm": 0.2157300926603618, "learning_rate": 1.8123110356979955e-05, "loss": 0.6386, "step": 760 }, { "epoch": 0.84, "grad_norm": 0.2278341187144425, "learning_rate": 1.808565820607078e-05, "loss": 0.6864, "step": 765 }, { "epoch": 0.85, "grad_norm": 0.22190749729143514, "learning_rate": 1.8047875576562556e-05, "loss": 0.6606, "step": 770 }, { "epoch": 0.85, "grad_norm": 0.22188524148964123, "learning_rate": 1.8009764012714283e-05, "loss": 0.6369, "step": 775 }, { "epoch": 0.86, "grad_norm": 0.24010299878120459, "learning_rate": 1.7971325072229227e-05, "loss": 0.6586, "step": 780 }, { "epoch": 0.86, "grad_norm": 0.2129875521525412, "learning_rate": 1.7932560326191265e-05, "loss": 0.6446, "step": 785 }, { "epoch": 0.87, "grad_norm": 0.2284930031799763, "learning_rate": 1.789347135900066e-05, "loss": 0.6704, "step": 790 }, { "epoch": 0.87, "grad_norm": 0.21164688299648138, "learning_rate": 1.7854059768309292e-05, "loss": 0.6501, "step": 795 }, { "epoch": 0.88, "grad_norm": 0.2185026582076693, "learning_rate": 1.7814327164955388e-05, "loss": 0.6504, "step": 800 }, { "epoch": 0.88, "grad_norm": 0.23885266134914798, "learning_rate": 1.777427517289766e-05, "loss": 0.6534, "step": 805 }, { "epoch": 0.89, "grad_norm": 0.24735644831293754, "learning_rate": 1.773390542914894e-05, "loss": 0.6593, "step": 810 }, { "epoch": 0.9, "grad_norm": 0.21801160079829443, "learning_rate": 1.7693219583709266e-05, "loss": 0.6538, "step": 815 }, { "epoch": 0.9, "grad_norm": 0.23033125474961877, "learning_rate": 1.765221929949845e-05, "loss": 0.6544, "step": 820 }, { "epoch": 0.91, "grad_norm": 0.21636906024225588, "learning_rate": 1.7610906252288097e-05, "loss": 0.6678, "step": 825 }, { "epoch": 0.91, "grad_norm": 0.22862735644017182, "learning_rate": 1.7569282130633137e-05, "loss": 0.6676, "step": 830 }, { "epoch": 0.92, "grad_norm": 0.22135157314088189, "learning_rate": 1.752734863580278e-05, "loss": 0.6463, "step": 835 }, { "epoch": 0.92, "grad_norm": 0.21472268874726408, "learning_rate": 1.7485107481711014e-05, "loss": 0.6525, "step": 840 }, { "epoch": 0.93, "grad_norm": 0.21461981313008696, "learning_rate": 1.7442560394846518e-05, "loss": 0.6484, "step": 845 }, { "epoch": 0.93, "grad_norm": 0.21774234081011679, "learning_rate": 1.739970911420213e-05, "loss": 0.651, "step": 850 }, { "epoch": 0.94, "grad_norm": 0.21598579831580747, "learning_rate": 1.7356555391203745e-05, "loss": 0.6785, "step": 855 }, { "epoch": 0.95, "grad_norm": 0.22611280288574187, "learning_rate": 1.7313100989638745e-05, "loss": 0.6579, "step": 860 }, { "epoch": 0.95, "grad_norm": 0.2118379153642089, "learning_rate": 1.7269347685583913e-05, "loss": 0.658, "step": 865 }, { "epoch": 0.96, "grad_norm": 0.2098318370691198, "learning_rate": 1.7225297267332815e-05, "loss": 0.6628, "step": 870 }, { "epoch": 0.96, "grad_norm": 0.2096338281124951, "learning_rate": 1.7180951535322742e-05, "loss": 0.6519, "step": 875 }, { "epoch": 0.97, "grad_norm": 0.2277800903608631, "learning_rate": 1.7136312302061097e-05, "loss": 0.6396, "step": 880 }, { "epoch": 0.97, "grad_norm": 0.2104470768411409, "learning_rate": 1.7091381392051333e-05, "loss": 0.6434, "step": 885 }, { "epoch": 0.98, "grad_norm": 0.2142332846578606, "learning_rate": 1.704616064171836e-05, "loss": 0.6384, "step": 890 }, { "epoch": 0.98, "grad_norm": 0.21219425658206706, "learning_rate": 1.7000651899333512e-05, "loss": 0.6346, "step": 895 }, { "epoch": 0.99, "grad_norm": 0.2208443696000645, "learning_rate": 1.6954857024938976e-05, "loss": 0.648, "step": 900 }, { "epoch": 0.99, "grad_norm": 0.21055402887891725, "learning_rate": 1.6908777890271794e-05, "loss": 0.6472, "step": 905 }, { "epoch": 1.0, "grad_norm": 0.2122526858833542, "learning_rate": 1.686241637868734e-05, "loss": 0.6422, "step": 910 }, { "epoch": 1.0, "eval_loss": 0.6909855604171753, "eval_runtime": 13.3743, "eval_samples_per_second": 103.034, "eval_steps_per_second": 0.822, "step": 910 }, { "epoch": 1.01, "grad_norm": 0.2376946716415055, "learning_rate": 1.6815774385082355e-05, "loss": 0.6025, "step": 915 }, { "epoch": 1.01, "grad_norm": 0.21981152940016552, "learning_rate": 1.6768853815817506e-05, "loss": 0.6149, "step": 920 }, { "epoch": 1.02, "grad_norm": 0.2114064615721331, "learning_rate": 1.6721656588639444e-05, "loss": 0.6084, "step": 925 }, { "epoch": 1.02, "grad_norm": 0.21343882616611637, "learning_rate": 1.6674184632602447e-05, "loss": 0.6192, "step": 930 }, { "epoch": 1.03, "grad_norm": 0.20964848533593644, "learning_rate": 1.6626439887989552e-05, "loss": 0.577, "step": 935 }, { "epoch": 1.03, "grad_norm": 0.21080943869356272, "learning_rate": 1.6578424306233282e-05, "loss": 0.5858, "step": 940 }, { "epoch": 1.04, "grad_norm": 0.21972421289491473, "learning_rate": 1.653013984983585e-05, "loss": 0.5907, "step": 945 }, { "epoch": 1.04, "grad_norm": 0.2148077134860289, "learning_rate": 1.6481588492288985e-05, "loss": 0.5974, "step": 950 }, { "epoch": 1.05, "grad_norm": 0.222495841900069, "learning_rate": 1.643277221799323e-05, "loss": 0.5979, "step": 955 }, { "epoch": 1.05, "grad_norm": 0.21883932168474696, "learning_rate": 1.638369302217687e-05, "loss": 0.6077, "step": 960 }, { "epoch": 1.06, "grad_norm": 0.20756222937087987, "learning_rate": 1.633435291081437e-05, "loss": 0.5886, "step": 965 }, { "epoch": 1.07, "grad_norm": 0.22034735404299097, "learning_rate": 1.6284753900544384e-05, "loss": 0.6023, "step": 970 }, { "epoch": 1.07, "grad_norm": 0.2178991358059654, "learning_rate": 1.6234898018587336e-05, "loss": 0.6077, "step": 975 }, { "epoch": 1.08, "grad_norm": 0.21185656324247812, "learning_rate": 1.618478730266255e-05, "loss": 0.5891, "step": 980 }, { "epoch": 1.08, "grad_norm": 0.2225195916998371, "learning_rate": 1.6134423800904985e-05, "loss": 0.6082, "step": 985 }, { "epoch": 1.09, "grad_norm": 0.21175099229044173, "learning_rate": 1.6083809571781498e-05, "loss": 0.6022, "step": 990 }, { "epoch": 1.09, "grad_norm": 0.21693130048041695, "learning_rate": 1.6032946684006745e-05, "loss": 0.5877, "step": 995 }, { "epoch": 1.1, "grad_norm": 0.20628677851436183, "learning_rate": 1.598183721645858e-05, "loss": 0.6025, "step": 1000 }, { "epoch": 1.1, "grad_norm": 0.21136866931809867, "learning_rate": 1.5930483258093144e-05, "loss": 0.6056, "step": 1005 }, { "epoch": 1.11, "grad_norm": 0.21628647967275302, "learning_rate": 1.5878886907859423e-05, "loss": 0.5973, "step": 1010 }, { "epoch": 1.12, "grad_norm": 0.21460811563007248, "learning_rate": 1.5827050274613512e-05, "loss": 0.6151, "step": 1015 }, { "epoch": 1.12, "grad_norm": 0.21019968808729894, "learning_rate": 1.57749754770324e-05, "loss": 0.5871, "step": 1020 }, { "epoch": 1.13, "grad_norm": 0.21602614964752764, "learning_rate": 1.5722664643527362e-05, "loss": 0.6088, "step": 1025 }, { "epoch": 1.13, "grad_norm": 0.21863820245310453, "learning_rate": 1.567011991215699e-05, "loss": 0.5968, "step": 1030 }, { "epoch": 1.14, "grad_norm": 0.21638329517362545, "learning_rate": 1.561734343053979e-05, "loss": 0.5879, "step": 1035 }, { "epoch": 1.14, "grad_norm": 0.21082524836877004, "learning_rate": 1.5564337355766412e-05, "loss": 0.583, "step": 1040 }, { "epoch": 1.15, "grad_norm": 0.2055779159254746, "learning_rate": 1.551110385431148e-05, "loss": 0.5934, "step": 1045 }, { "epoch": 1.15, "grad_norm": 0.22359885758488707, "learning_rate": 1.5457645101945046e-05, "loss": 0.5824, "step": 1050 }, { "epoch": 1.16, "grad_norm": 0.2194479570684916, "learning_rate": 1.540396328364367e-05, "loss": 0.6125, "step": 1055 }, { "epoch": 1.16, "grad_norm": 0.21187983654193793, "learning_rate": 1.5350060593501086e-05, "loss": 0.6028, "step": 1060 }, { "epoch": 1.17, "grad_norm": 0.21206606898137628, "learning_rate": 1.5295939234638566e-05, "loss": 0.5934, "step": 1065 }, { "epoch": 1.18, "grad_norm": 0.20258591515345375, "learning_rate": 1.5241601419114842e-05, "loss": 0.5775, "step": 1070 }, { "epoch": 1.18, "grad_norm": 0.2136809123465599, "learning_rate": 1.5187049367835709e-05, "loss": 0.5941, "step": 1075 }, { "epoch": 1.19, "grad_norm": 0.21728230317859182, "learning_rate": 1.5132285310463243e-05, "loss": 0.5832, "step": 1080 }, { "epoch": 1.19, "grad_norm": 0.2173442514170428, "learning_rate": 1.507731148532468e-05, "loss": 0.5896, "step": 1085 }, { "epoch": 1.2, "grad_norm": 0.2171859843513773, "learning_rate": 1.5022130139320916e-05, "loss": 0.6007, "step": 1090 }, { "epoch": 1.2, "grad_norm": 0.22340196606689453, "learning_rate": 1.4966743527834691e-05, "loss": 0.6034, "step": 1095 }, { "epoch": 1.21, "grad_norm": 0.20721993882081483, "learning_rate": 1.4911153914638388e-05, "loss": 0.6027, "step": 1100 }, { "epoch": 1.21, "grad_norm": 0.21607496950020452, "learning_rate": 1.4855363571801523e-05, "loss": 0.6128, "step": 1105 }, { "epoch": 1.22, "grad_norm": 0.21215405353798844, "learning_rate": 1.4799374779597866e-05, "loss": 0.583, "step": 1110 }, { "epoch": 1.23, "grad_norm": 0.21724719376448562, "learning_rate": 1.474318982641225e-05, "loss": 0.5913, "step": 1115 }, { "epoch": 1.23, "grad_norm": 0.22458471606601424, "learning_rate": 1.4686811008647037e-05, "loss": 0.612, "step": 1120 }, { "epoch": 1.24, "grad_norm": 0.21247586240459654, "learning_rate": 1.463024063062827e-05, "loss": 0.5855, "step": 1125 }, { "epoch": 1.24, "grad_norm": 0.22045045025828708, "learning_rate": 1.457348100451146e-05, "loss": 0.5883, "step": 1130 }, { "epoch": 1.25, "grad_norm": 0.214769725773024, "learning_rate": 1.4516534450187126e-05, "loss": 0.5877, "step": 1135 }, { "epoch": 1.25, "grad_norm": 0.2160947649465179, "learning_rate": 1.4459403295185933e-05, "loss": 0.5951, "step": 1140 }, { "epoch": 1.26, "grad_norm": 0.21080831461558835, "learning_rate": 1.4402089874583594e-05, "loss": 0.5616, "step": 1145 }, { "epoch": 1.26, "grad_norm": 0.3517838242091068, "learning_rate": 1.4344596530905412e-05, "loss": 0.5981, "step": 1150 }, { "epoch": 1.27, "grad_norm": 0.2170223068256742, "learning_rate": 1.4286925614030542e-05, "loss": 0.5962, "step": 1155 }, { "epoch": 1.27, "grad_norm": 0.20876572174386707, "learning_rate": 1.4229079481095949e-05, "loss": 0.5705, "step": 1160 }, { "epoch": 1.28, "grad_norm": 0.2114530881165322, "learning_rate": 1.4171060496400055e-05, "loss": 0.5831, "step": 1165 }, { "epoch": 1.29, "grad_norm": 0.2124316227899679, "learning_rate": 1.4112871031306118e-05, "loss": 0.5927, "step": 1170 }, { "epoch": 1.29, "grad_norm": 0.24069796832607096, "learning_rate": 1.4054513464145303e-05, "loss": 0.5843, "step": 1175 }, { "epoch": 1.3, "grad_norm": 0.21761542079259927, "learning_rate": 1.3995990180119478e-05, "loss": 0.5913, "step": 1180 }, { "epoch": 1.3, "grad_norm": 0.22376197027518, "learning_rate": 1.3937303571203718e-05, "loss": 0.5937, "step": 1185 }, { "epoch": 1.31, "grad_norm": 0.21270100949939988, "learning_rate": 1.387845603604855e-05, "loss": 0.6087, "step": 1190 }, { "epoch": 1.31, "grad_norm": 0.2061356457768136, "learning_rate": 1.3819449979881907e-05, "loss": 0.5913, "step": 1195 }, { "epoch": 1.32, "grad_norm": 0.21386723380022615, "learning_rate": 1.3760287814410822e-05, "loss": 0.5993, "step": 1200 }, { "epoch": 1.32, "grad_norm": 0.21228068552627186, "learning_rate": 1.3700971957722861e-05, "loss": 0.5957, "step": 1205 }, { "epoch": 1.33, "grad_norm": 0.22337829074807855, "learning_rate": 1.3641504834187288e-05, "loss": 0.5877, "step": 1210 }, { "epoch": 1.34, "grad_norm": 0.20603083898424746, "learning_rate": 1.3581888874355969e-05, "loss": 0.5925, "step": 1215 }, { "epoch": 1.34, "grad_norm": 0.21192778054346198, "learning_rate": 1.3522126514864047e-05, "loss": 0.5891, "step": 1220 }, { "epoch": 1.35, "grad_norm": 0.2084884476447279, "learning_rate": 1.346222019833033e-05, "loss": 0.5834, "step": 1225 }, { "epoch": 1.35, "grad_norm": 0.2147834275155995, "learning_rate": 1.3402172373257466e-05, "loss": 0.5699, "step": 1230 }, { "epoch": 1.36, "grad_norm": 0.2098959357243527, "learning_rate": 1.3341985493931877e-05, "loss": 0.5962, "step": 1235 }, { "epoch": 1.36, "grad_norm": 0.21524463993944057, "learning_rate": 1.3281662020323434e-05, "loss": 0.5732, "step": 1240 }, { "epoch": 1.37, "grad_norm": 0.21867167573117696, "learning_rate": 1.3221204417984907e-05, "loss": 0.5955, "step": 1245 }, { "epoch": 1.37, "grad_norm": 0.21413713721132682, "learning_rate": 1.3160615157951218e-05, "loss": 0.6075, "step": 1250 }, { "epoch": 1.38, "grad_norm": 0.21839276156580137, "learning_rate": 1.3099896716638414e-05, "loss": 0.6037, "step": 1255 }, { "epoch": 1.38, "grad_norm": 0.2083037589975683, "learning_rate": 1.303905157574247e-05, "loss": 0.5824, "step": 1260 }, { "epoch": 1.39, "grad_norm": 0.2113647393844558, "learning_rate": 1.297808222213785e-05, "loss": 0.583, "step": 1265 }, { "epoch": 1.4, "grad_norm": 0.21208258707761682, "learning_rate": 1.2916991147775867e-05, "loss": 0.5968, "step": 1270 }, { "epoch": 1.4, "grad_norm": 0.21585097977939674, "learning_rate": 1.2855780849582828e-05, "loss": 0.605, "step": 1275 }, { "epoch": 1.41, "grad_norm": 0.2177190382165568, "learning_rate": 1.2794453829357974e-05, "loss": 0.5917, "step": 1280 }, { "epoch": 1.41, "grad_norm": 0.20918133994979798, "learning_rate": 1.2733012593671235e-05, "loss": 0.5808, "step": 1285 }, { "epoch": 1.42, "grad_norm": 0.20727620915430647, "learning_rate": 1.2671459653760781e-05, "loss": 0.5848, "step": 1290 }, { "epoch": 1.42, "grad_norm": 0.21127807703832494, "learning_rate": 1.2609797525430374e-05, "loss": 0.5918, "step": 1295 }, { "epoch": 1.43, "grad_norm": 0.22995739968919704, "learning_rate": 1.2548028728946548e-05, "loss": 0.5788, "step": 1300 }, { "epoch": 1.43, "grad_norm": 0.283656419786389, "learning_rate": 1.2486155788935599e-05, "loss": 0.5696, "step": 1305 }, { "epoch": 1.44, "grad_norm": 0.2060037733197605, "learning_rate": 1.24241812342804e-05, "loss": 0.5679, "step": 1310 }, { "epoch": 1.45, "grad_norm": 0.20635796088681332, "learning_rate": 1.2362107598017037e-05, "loss": 0.5724, "step": 1315 }, { "epoch": 1.45, "grad_norm": 0.22467393881977182, "learning_rate": 1.2299937417231269e-05, "loss": 0.5955, "step": 1320 }, { "epoch": 1.46, "grad_norm": 0.2074247154565041, "learning_rate": 1.2237673232954854e-05, "loss": 0.5971, "step": 1325 }, { "epoch": 1.46, "grad_norm": 0.2424465336794506, "learning_rate": 1.2175317590061676e-05, "loss": 0.5781, "step": 1330 }, { "epoch": 1.47, "grad_norm": 0.22257500583547102, "learning_rate": 1.2112873037163728e-05, "loss": 0.5839, "step": 1335 }, { "epoch": 1.47, "grad_norm": 0.2066344256103609, "learning_rate": 1.2050342126506958e-05, "loss": 0.5739, "step": 1340 }, { "epoch": 1.48, "grad_norm": 0.22883829030621783, "learning_rate": 1.1987727413866936e-05, "loss": 0.5834, "step": 1345 }, { "epoch": 1.48, "grad_norm": 0.2078381748570218, "learning_rate": 1.1925031458444416e-05, "loss": 0.5987, "step": 1350 }, { "epoch": 1.49, "grad_norm": 0.19775167797301507, "learning_rate": 1.1862256822760704e-05, "loss": 0.6014, "step": 1355 }, { "epoch": 1.49, "grad_norm": 0.22122261973414395, "learning_rate": 1.1799406072552963e-05, "loss": 0.6051, "step": 1360 }, { "epoch": 1.5, "grad_norm": 0.2048271696065503, "learning_rate": 1.1736481776669307e-05, "loss": 0.5836, "step": 1365 }, { "epoch": 1.51, "grad_norm": 0.2089570875625985, "learning_rate": 1.1673486506963824e-05, "loss": 0.5969, "step": 1370 }, { "epoch": 1.51, "grad_norm": 0.20686459307988367, "learning_rate": 1.1610422838191473e-05, "loss": 0.5838, "step": 1375 }, { "epoch": 1.52, "grad_norm": 0.2011184871623206, "learning_rate": 1.1547293347902813e-05, "loss": 0.5809, "step": 1380 }, { "epoch": 1.52, "grad_norm": 0.20796307453083696, "learning_rate": 1.148410061633869e-05, "loss": 0.577, "step": 1385 }, { "epoch": 1.53, "grad_norm": 0.21285626663848575, "learning_rate": 1.1420847226324746e-05, "loss": 0.5842, "step": 1390 }, { "epoch": 1.53, "grad_norm": 0.21073449630208443, "learning_rate": 1.135753576316588e-05, "loss": 0.5631, "step": 1395 }, { "epoch": 1.54, "grad_norm": 0.21419628737948346, "learning_rate": 1.1294168814540554e-05, "loss": 0.5725, "step": 1400 }, { "epoch": 1.54, "grad_norm": 0.21145643817601928, "learning_rate": 1.1230748970395056e-05, "loss": 0.5841, "step": 1405 }, { "epoch": 1.55, "grad_norm": 0.21985931289420635, "learning_rate": 1.1167278822837621e-05, "loss": 0.5921, "step": 1410 }, { "epoch": 1.55, "grad_norm": 0.21769355752757327, "learning_rate": 1.1103760966032497e-05, "loss": 0.5969, "step": 1415 }, { "epoch": 1.56, "grad_norm": 0.205702651788512, "learning_rate": 1.1040197996093915e-05, "loss": 0.5998, "step": 1420 }, { "epoch": 1.57, "grad_norm": 0.2123850240700951, "learning_rate": 1.0976592510979982e-05, "loss": 0.602, "step": 1425 }, { "epoch": 1.57, "grad_norm": 0.2080343816532754, "learning_rate": 1.0912947110386484e-05, "loss": 0.5779, "step": 1430 }, { "epoch": 1.58, "grad_norm": 0.24850849809790562, "learning_rate": 1.084926439564065e-05, "loss": 0.6032, "step": 1435 }, { "epoch": 1.58, "grad_norm": 0.21564897773144823, "learning_rate": 1.0785546969594813e-05, "loss": 0.5869, "step": 1440 }, { "epoch": 1.59, "grad_norm": 0.20762144286700152, "learning_rate": 1.0721797436520044e-05, "loss": 0.5826, "step": 1445 }, { "epoch": 1.59, "grad_norm": 0.2063081386274062, "learning_rate": 1.0658018401999681e-05, "loss": 0.5704, "step": 1450 }, { "epoch": 1.6, "grad_norm": 0.20663776641786638, "learning_rate": 1.0594212472822865e-05, "loss": 0.5699, "step": 1455 }, { "epoch": 1.6, "grad_norm": 0.2041445625688445, "learning_rate": 1.053038225687798e-05, "loss": 0.5844, "step": 1460 }, { "epoch": 1.61, "grad_norm": 0.2130137089909296, "learning_rate": 1.0466530363046057e-05, "loss": 0.5863, "step": 1465 }, { "epoch": 1.62, "grad_norm": 0.20861757470403922, "learning_rate": 1.0402659401094154e-05, "loss": 0.5967, "step": 1470 }, { "epoch": 1.62, "grad_norm": 0.2027441783860282, "learning_rate": 1.033877198156868e-05, "loss": 0.5897, "step": 1475 }, { "epoch": 1.63, "grad_norm": 0.213846779001926, "learning_rate": 1.0274870715688713e-05, "loss": 0.6025, "step": 1480 }, { "epoch": 1.63, "grad_norm": 0.2127779802918096, "learning_rate": 1.0210958215239249e-05, "loss": 0.6056, "step": 1485 }, { "epoch": 1.64, "grad_norm": 0.20745193969187556, "learning_rate": 1.0147037092464469e-05, "loss": 0.577, "step": 1490 }, { "epoch": 1.64, "grad_norm": 0.21182819095935948, "learning_rate": 1.0083109959960974e-05, "loss": 0.5864, "step": 1495 }, { "epoch": 1.65, "grad_norm": 0.19696271189726422, "learning_rate": 1.0019179430570984e-05, "loss": 0.5928, "step": 1500 }, { "epoch": 1.65, "grad_norm": 0.20879291422466129, "learning_rate": 9.955248117275566e-06, "loss": 0.5759, "step": 1505 }, { "epoch": 1.66, "grad_norm": 0.20354233979841574, "learning_rate": 9.891318633087831e-06, "loss": 0.5752, "step": 1510 }, { "epoch": 1.66, "grad_norm": 0.20873959818273613, "learning_rate": 9.827393590946116e-06, "loss": 0.5781, "step": 1515 }, { "epoch": 1.67, "grad_norm": 0.20270163619729287, "learning_rate": 9.763475603607215e-06, "loss": 0.5766, "step": 1520 }, { "epoch": 1.68, "grad_norm": 0.20654005706496942, "learning_rate": 9.699567283539567e-06, "loss": 0.5681, "step": 1525 }, { "epoch": 1.68, "grad_norm": 0.20618557107103305, "learning_rate": 9.635671242816503e-06, "loss": 0.609, "step": 1530 }, { "epoch": 1.69, "grad_norm": 0.21061722996416557, "learning_rate": 9.571790093009445e-06, "loss": 0.5934, "step": 1535 }, { "epoch": 1.69, "grad_norm": 0.21093457717343486, "learning_rate": 9.50792644508122e-06, "loss": 0.581, "step": 1540 }, { "epoch": 1.7, "grad_norm": 0.21238370099560738, "learning_rate": 9.44408290927929e-06, "loss": 0.5739, "step": 1545 }, { "epoch": 1.7, "grad_norm": 0.2164905762404212, "learning_rate": 9.380262095029113e-06, "loss": 0.5927, "step": 1550 }, { "epoch": 1.71, "grad_norm": 0.20454952313454147, "learning_rate": 9.316466610827446e-06, "loss": 0.5873, "step": 1555 }, { "epoch": 1.71, "grad_norm": 0.20592385724986123, "learning_rate": 9.252699064135759e-06, "loss": 0.5671, "step": 1560 }, { "epoch": 1.72, "grad_norm": 0.2033785032533706, "learning_rate": 9.188962061273664e-06, "loss": 0.5658, "step": 1565 }, { "epoch": 1.73, "grad_norm": 0.21054461531122468, "learning_rate": 9.125258207312365e-06, "loss": 0.5792, "step": 1570 }, { "epoch": 1.73, "grad_norm": 0.21120511900125027, "learning_rate": 9.061590105968208e-06, "loss": 0.5854, "step": 1575 }, { "epoch": 1.74, "grad_norm": 0.20830877482379653, "learning_rate": 8.997960359496248e-06, "loss": 0.5826, "step": 1580 }, { "epoch": 1.74, "grad_norm": 0.20288306787814717, "learning_rate": 8.934371568583893e-06, "loss": 0.5706, "step": 1585 }, { "epoch": 1.75, "grad_norm": 0.21694924382055683, "learning_rate": 8.8708263322446e-06, "loss": 0.5951, "step": 1590 }, { "epoch": 1.75, "grad_norm": 0.20903099761718094, "learning_rate": 8.807327247711667e-06, "loss": 0.5824, "step": 1595 }, { "epoch": 1.76, "grad_norm": 0.20063759680689586, "learning_rate": 8.743876910332057e-06, "loss": 0.5614, "step": 1600 }, { "epoch": 1.76, "grad_norm": 0.20409842230229638, "learning_rate": 8.680477913460339e-06, "loss": 0.5946, "step": 1605 }, { "epoch": 1.77, "grad_norm": 0.20466612084153943, "learning_rate": 8.617132848352672e-06, "loss": 0.5652, "step": 1610 }, { "epoch": 1.77, "grad_norm": 0.20697633678427196, "learning_rate": 8.553844304060908e-06, "loss": 0.5812, "step": 1615 }, { "epoch": 1.78, "grad_norm": 0.21720588749316946, "learning_rate": 8.490614867326775e-06, "loss": 0.6117, "step": 1620 }, { "epoch": 1.79, "grad_norm": 0.20226043800686028, "learning_rate": 8.427447122476148e-06, "loss": 0.5813, "step": 1625 }, { "epoch": 1.79, "grad_norm": 0.20240033305212438, "learning_rate": 8.364343651313406e-06, "loss": 0.5838, "step": 1630 }, { "epoch": 1.8, "grad_norm": 0.20457722466303066, "learning_rate": 8.301307033015928e-06, "loss": 0.5838, "step": 1635 }, { "epoch": 1.8, "grad_norm": 0.21113002739789974, "learning_rate": 8.23833984402868e-06, "loss": 0.5642, "step": 1640 }, { "epoch": 1.81, "grad_norm": 0.20884646327430137, "learning_rate": 8.175444657958875e-06, "loss": 0.5887, "step": 1645 }, { "epoch": 1.81, "grad_norm": 0.2117374486270568, "learning_rate": 8.112624045470834e-06, "loss": 0.5586, "step": 1650 }, { "epoch": 1.82, "grad_norm": 0.2153924095751537, "learning_rate": 8.04988057418088e-06, "loss": 0.5586, "step": 1655 }, { "epoch": 1.82, "grad_norm": 0.2242880757841404, "learning_rate": 7.987216808552409e-06, "loss": 0.584, "step": 1660 }, { "epoch": 1.83, "grad_norm": 0.2052097826707033, "learning_rate": 7.924635309791065e-06, "loss": 0.5734, "step": 1665 }, { "epoch": 1.84, "grad_norm": 0.20970207818516165, "learning_rate": 7.862138635740078e-06, "loss": 0.5794, "step": 1670 }, { "epoch": 1.84, "grad_norm": 0.22854744935308324, "learning_rate": 7.799729340775688e-06, "loss": 0.5665, "step": 1675 }, { "epoch": 1.85, "grad_norm": 0.21044359070616686, "learning_rate": 7.73740997570278e-06, "loss": 0.5798, "step": 1680 }, { "epoch": 1.85, "grad_norm": 0.22708073388787278, "learning_rate": 7.675183087650592e-06, "loss": 0.5801, "step": 1685 }, { "epoch": 1.86, "grad_norm": 0.2033282205763091, "learning_rate": 7.613051219968624e-06, "loss": 0.5839, "step": 1690 }, { "epoch": 1.86, "grad_norm": 0.20524876955037163, "learning_rate": 7.551016912122692e-06, "loss": 0.5669, "step": 1695 }, { "epoch": 1.87, "grad_norm": 0.2084171272109649, "learning_rate": 7.489082699591128e-06, "loss": 0.5772, "step": 1700 }, { "epoch": 1.87, "grad_norm": 0.2045659246543415, "learning_rate": 7.4272511137611405e-06, "loss": 0.5888, "step": 1705 }, { "epoch": 1.88, "grad_norm": 0.2238357346707814, "learning_rate": 7.3655246818253626e-06, "loss": 0.5778, "step": 1710 }, { "epoch": 1.88, "grad_norm": 0.20675904524387453, "learning_rate": 7.303905926678565e-06, "loss": 0.5716, "step": 1715 }, { "epoch": 1.89, "grad_norm": 0.19722673110072383, "learning_rate": 7.242397366814516e-06, "loss": 0.5807, "step": 1720 }, { "epoch": 1.9, "grad_norm": 0.2048782845337188, "learning_rate": 7.181001516223074e-06, "loss": 0.5826, "step": 1725 }, { "epoch": 1.9, "grad_norm": 0.20915644349360044, "learning_rate": 7.1197208842874175e-06, "loss": 0.5604, "step": 1730 }, { "epoch": 1.91, "grad_norm": 0.20273160113226266, "learning_rate": 7.058557975681488e-06, "loss": 0.5333, "step": 1735 }, { "epoch": 1.91, "grad_norm": 0.20926689973709214, "learning_rate": 6.997515290267611e-06, "loss": 0.5946, "step": 1740 }, { "epoch": 1.92, "grad_norm": 0.21243148030222134, "learning_rate": 6.936595322994328e-06, "loss": 0.5704, "step": 1745 }, { "epoch": 1.92, "grad_norm": 0.2322671841870559, "learning_rate": 6.8758005637944245e-06, "loss": 0.5672, "step": 1750 }, { "epoch": 1.93, "grad_norm": 0.20191574871457757, "learning_rate": 6.815133497483157e-06, "loss": 0.5531, "step": 1755 }, { "epoch": 1.93, "grad_norm": 0.1979263709848421, "learning_rate": 6.754596603656687e-06, "loss": 0.5855, "step": 1760 }, { "epoch": 1.94, "grad_norm": 0.20297785849664604, "learning_rate": 6.694192356590743e-06, "loss": 0.5611, "step": 1765 }, { "epoch": 1.95, "grad_norm": 0.2057255315804723, "learning_rate": 6.633923225139498e-06, "loss": 0.5603, "step": 1770 }, { "epoch": 1.95, "grad_norm": 0.2039103225151876, "learning_rate": 6.573791672634638e-06, "loss": 0.564, "step": 1775 }, { "epoch": 1.96, "grad_norm": 0.20729859585281207, "learning_rate": 6.513800156784709e-06, "loss": 0.5665, "step": 1780 }, { "epoch": 1.96, "grad_norm": 0.21039586635611288, "learning_rate": 6.453951129574644e-06, "loss": 0.5731, "step": 1785 }, { "epoch": 1.97, "grad_norm": 0.2517268650469455, "learning_rate": 6.394247037165559e-06, "loss": 0.5895, "step": 1790 }, { "epoch": 1.97, "grad_norm": 0.20420292346101634, "learning_rate": 6.3346903197947564e-06, "loss": 0.5822, "step": 1795 }, { "epoch": 1.98, "grad_norm": 0.2005713051020868, "learning_rate": 6.275283411676008e-06, "loss": 0.5747, "step": 1800 }, { "epoch": 1.98, "grad_norm": 0.20383448396361142, "learning_rate": 6.216028740900042e-06, "loss": 0.564, "step": 1805 }, { "epoch": 1.99, "grad_norm": 0.20518916264594467, "learning_rate": 6.1569287293353274e-06, "loss": 0.569, "step": 1810 }, { "epoch": 1.99, "grad_norm": 0.20991517111604094, "learning_rate": 6.097985792529055e-06, "loss": 0.5776, "step": 1815 }, { "epoch": 2.0, "grad_norm": 0.20901449217066023, "learning_rate": 6.039202339608432e-06, "loss": 0.5701, "step": 1820 }, { "epoch": 2.0, "eval_loss": 0.6639273166656494, "eval_runtime": 13.2474, "eval_samples_per_second": 104.021, "eval_steps_per_second": 0.83, "step": 1820 }, { "epoch": 2.01, "grad_norm": 0.22019281423746326, "learning_rate": 5.980580773182214e-06, "loss": 0.522, "step": 1825 }, { "epoch": 2.01, "grad_norm": 0.20628500931618077, "learning_rate": 5.922123489242499e-06, "loss": 0.5217, "step": 1830 }, { "epoch": 2.02, "grad_norm": 0.21214052920599313, "learning_rate": 5.8638328770667905e-06, "loss": 0.5158, "step": 1835 }, { "epoch": 2.02, "grad_norm": 0.2125306506706992, "learning_rate": 5.805711319120358e-06, "loss": 0.5266, "step": 1840 }, { "epoch": 2.03, "grad_norm": 0.20497451303661474, "learning_rate": 5.747761190958859e-06, "loss": 0.5379, "step": 1845 }, { "epoch": 2.03, "grad_norm": 0.20908534530239564, "learning_rate": 5.689984861131221e-06, "loss": 0.5235, "step": 1850 }, { "epoch": 2.04, "grad_norm": 0.21454631325571177, "learning_rate": 5.632384691082874e-06, "loss": 0.559, "step": 1855 }, { "epoch": 2.04, "grad_norm": 0.20512681342425096, "learning_rate": 5.5749630350592e-06, "loss": 0.5112, "step": 1860 }, { "epoch": 2.05, "grad_norm": 0.21863556347579605, "learning_rate": 5.517722240009319e-06, "loss": 0.5252, "step": 1865 }, { "epoch": 2.05, "grad_norm": 0.2042884660329648, "learning_rate": 5.460664645490172e-06, "loss": 0.5319, "step": 1870 }, { "epoch": 2.06, "grad_norm": 0.20585363297531442, "learning_rate": 5.403792583570884e-06, "loss": 0.5486, "step": 1875 }, { "epoch": 2.07, "grad_norm": 0.21017174229067934, "learning_rate": 5.347108378737469e-06, "loss": 0.5219, "step": 1880 }, { "epoch": 2.07, "grad_norm": 0.20323326084962492, "learning_rate": 5.290614347797802e-06, "loss": 0.5304, "step": 1885 }, { "epoch": 2.08, "grad_norm": 0.20641023175360712, "learning_rate": 5.234312799786921e-06, "loss": 0.5078, "step": 1890 }, { "epoch": 2.08, "grad_norm": 0.20699085777952853, "learning_rate": 5.1782060358726885e-06, "loss": 0.541, "step": 1895 }, { "epoch": 2.09, "grad_norm": 0.21849299793363283, "learning_rate": 5.122296349261695e-06, "loss": 0.5382, "step": 1900 }, { "epoch": 2.09, "grad_norm": 0.2225675454375723, "learning_rate": 5.066586025105558e-06, "loss": 0.5222, "step": 1905 }, { "epoch": 2.1, "grad_norm": 0.21820736824894094, "learning_rate": 5.011077340407509e-06, "loss": 0.521, "step": 1910 }, { "epoch": 2.1, "grad_norm": 0.21040780570604573, "learning_rate": 4.955772563929334e-06, "loss": 0.5138, "step": 1915 }, { "epoch": 2.11, "grad_norm": 0.20328121606226746, "learning_rate": 4.900673956098644e-06, "loss": 0.5, "step": 1920 }, { "epoch": 2.12, "grad_norm": 0.20307162400180162, "learning_rate": 4.845783768916482e-06, "loss": 0.5205, "step": 1925 }, { "epoch": 2.12, "grad_norm": 0.2145754172951753, "learning_rate": 4.79110424586528e-06, "loss": 0.5361, "step": 1930 }, { "epoch": 2.13, "grad_norm": 0.22278545561346286, "learning_rate": 4.736637621817176e-06, "loss": 0.5415, "step": 1935 }, { "epoch": 2.13, "grad_norm": 0.20777015428265447, "learning_rate": 4.682386122942649e-06, "loss": 0.5466, "step": 1940 }, { "epoch": 2.14, "grad_norm": 0.20476960423981388, "learning_rate": 4.628351966619531e-06, "loss": 0.5356, "step": 1945 }, { "epoch": 2.14, "grad_norm": 0.2026935858356193, "learning_rate": 4.5745373613424075e-06, "loss": 0.5183, "step": 1950 }, { "epoch": 2.15, "grad_norm": 0.20396283481465377, "learning_rate": 4.520944506632314e-06, "loss": 0.5282, "step": 1955 }, { "epoch": 2.15, "grad_norm": 0.20723703288940273, "learning_rate": 4.467575592946865e-06, "loss": 0.5353, "step": 1960 }, { "epoch": 2.16, "grad_norm": 0.20283509715409914, "learning_rate": 4.414432801590703e-06, "loss": 0.5253, "step": 1965 }, { "epoch": 2.16, "grad_norm": 0.2052800409715183, "learning_rate": 4.361518304626366e-06, "loss": 0.5144, "step": 1970 }, { "epoch": 2.17, "grad_norm": 0.19650849856593194, "learning_rate": 4.308834264785483e-06, "loss": 0.515, "step": 1975 }, { "epoch": 2.18, "grad_norm": 0.20863894389929338, "learning_rate": 4.256382835380421e-06, "loss": 0.5202, "step": 1980 }, { "epoch": 2.18, "grad_norm": 0.20698871916210282, "learning_rate": 4.204166160216216e-06, "loss": 0.5341, "step": 1985 }, { "epoch": 2.19, "grad_norm": 0.21288394644934694, "learning_rate": 4.1521863735030065e-06, "loss": 0.5428, "step": 1990 }, { "epoch": 2.19, "grad_norm": 0.21106167476818527, "learning_rate": 4.100445599768774e-06, "loss": 0.522, "step": 1995 }, { "epoch": 2.2, "grad_norm": 0.20964196528234977, "learning_rate": 4.048945953772504e-06, "loss": 0.5294, "step": 2000 }, { "epoch": 2.2, "grad_norm": 0.2186729758235947, "learning_rate": 3.99768954041778e-06, "loss": 0.5258, "step": 2005 }, { "epoch": 2.21, "grad_norm": 0.21074477316001103, "learning_rate": 3.946678454666719e-06, "loss": 0.5296, "step": 2010 }, { "epoch": 2.21, "grad_norm": 0.2082023730818782, "learning_rate": 3.89591478145437e-06, "loss": 0.5232, "step": 2015 }, { "epoch": 2.22, "grad_norm": 0.211665046662059, "learning_rate": 3.845400595603482e-06, "loss": 0.5325, "step": 2020 }, { "epoch": 2.23, "grad_norm": 0.20452212904152248, "learning_rate": 3.79513796173971e-06, "loss": 0.5208, "step": 2025 }, { "epoch": 2.23, "grad_norm": 0.20097361934014168, "learning_rate": 3.745128934207225e-06, "loss": 0.5154, "step": 2030 }, { "epoch": 2.24, "grad_norm": 0.20835101950643772, "learning_rate": 3.695375556984764e-06, "loss": 0.5221, "step": 2035 }, { "epoch": 2.24, "grad_norm": 0.20751752347710595, "learning_rate": 3.6458798636020477e-06, "loss": 0.5208, "step": 2040 }, { "epoch": 2.25, "grad_norm": 0.2083029476627408, "learning_rate": 3.59664387705672e-06, "loss": 0.524, "step": 2045 }, { "epoch": 2.25, "grad_norm": 0.20547050805474043, "learning_rate": 3.5476696097316253e-06, "loss": 0.5224, "step": 2050 }, { "epoch": 2.26, "grad_norm": 0.20721249112618484, "learning_rate": 3.4989590633125583e-06, "loss": 0.5335, "step": 2055 }, { "epoch": 2.26, "grad_norm": 0.2084694829774017, "learning_rate": 3.450514228706482e-06, "loss": 0.5229, "step": 2060 }, { "epoch": 2.27, "grad_norm": 0.20216282130770613, "learning_rate": 3.4023370859601192e-06, "loss": 0.5375, "step": 2065 }, { "epoch": 2.27, "grad_norm": 0.2036489936294621, "learning_rate": 3.3544296041790457e-06, "loss": 0.5123, "step": 2070 }, { "epoch": 2.28, "grad_norm": 0.20447914104651402, "learning_rate": 3.3067937414471986e-06, "loss": 0.5283, "step": 2075 }, { "epoch": 2.29, "grad_norm": 0.20966243005652113, "learning_rate": 3.2594314447468457e-06, "loss": 0.5207, "step": 2080 }, { "epoch": 2.29, "grad_norm": 0.20579910881393176, "learning_rate": 3.2123446498790214e-06, "loss": 0.518, "step": 2085 }, { "epoch": 2.3, "grad_norm": 0.19982035514392868, "learning_rate": 3.1655352813843886e-06, "loss": 0.53, "step": 2090 }, { "epoch": 2.3, "grad_norm": 0.2114271078042674, "learning_rate": 3.1190052524645752e-06, "loss": 0.5297, "step": 2095 }, { "epoch": 2.31, "grad_norm": 0.20832244818822046, "learning_rate": 3.0727564649040066e-06, "loss": 0.5102, "step": 2100 }, { "epoch": 2.31, "grad_norm": 0.2008612540571781, "learning_rate": 3.0267908089921438e-06, "loss": 0.525, "step": 2105 }, { "epoch": 2.32, "grad_norm": 0.21335420821701767, "learning_rate": 2.9811101634462414e-06, "loss": 0.5161, "step": 2110 }, { "epoch": 2.32, "grad_norm": 0.20450727458510107, "learning_rate": 2.93571639533455e-06, "loss": 0.5139, "step": 2115 }, { "epoch": 2.33, "grad_norm": 0.213686615068082, "learning_rate": 2.8906113600000153e-06, "loss": 0.5203, "step": 2120 }, { "epoch": 2.34, "grad_norm": 0.20772690016178588, "learning_rate": 2.8457969009844354e-06, "loss": 0.5437, "step": 2125 }, { "epoch": 2.34, "grad_norm": 0.20632381814936407, "learning_rate": 2.8012748499531195e-06, "loss": 0.5318, "step": 2130 }, { "epoch": 2.35, "grad_norm": 0.2016664102243559, "learning_rate": 2.7570470266200177e-06, "loss": 0.5165, "step": 2135 }, { "epoch": 2.35, "grad_norm": 0.20520163431623825, "learning_rate": 2.713115238673356e-06, "loss": 0.5193, "step": 2140 }, { "epoch": 2.36, "grad_norm": 0.214071163971577, "learning_rate": 2.669481281701739e-06, "loss": 0.5268, "step": 2145 }, { "epoch": 2.36, "grad_norm": 0.20652575043829738, "learning_rate": 2.626146939120757e-06, "loss": 0.5176, "step": 2150 }, { "epoch": 2.37, "grad_norm": 0.20623703636177199, "learning_rate": 2.5831139821001184e-06, "loss": 0.5194, "step": 2155 }, { "epoch": 2.37, "grad_norm": 0.2073488736443259, "learning_rate": 2.5403841694912333e-06, "loss": 0.521, "step": 2160 }, { "epoch": 2.38, "grad_norm": 0.20428316103740377, "learning_rate": 2.497959247755335e-06, "loss": 0.522, "step": 2165 }, { "epoch": 2.38, "grad_norm": 0.20624815834176138, "learning_rate": 2.455840950892099e-06, "loss": 0.5338, "step": 2170 }, { "epoch": 2.39, "grad_norm": 0.2107922125349263, "learning_rate": 2.414031000368767e-06, "loss": 0.5381, "step": 2175 }, { "epoch": 2.4, "grad_norm": 0.20453939515184638, "learning_rate": 2.372531105049789e-06, "loss": 0.5183, "step": 2180 }, { "epoch": 2.4, "grad_norm": 0.19793223071462618, "learning_rate": 2.331342961126988e-06, "loss": 0.5113, "step": 2185 }, { "epoch": 2.41, "grad_norm": 0.21378964248804647, "learning_rate": 2.290468252050204e-06, "loss": 0.5296, "step": 2190 }, { "epoch": 2.41, "grad_norm": 0.22063310823848487, "learning_rate": 2.2499086484585255e-06, "loss": 0.5169, "step": 2195 }, { "epoch": 2.42, "grad_norm": 0.20381232643959812, "learning_rate": 2.2096658081119793e-06, "loss": 0.5196, "step": 2200 }, { "epoch": 2.42, "grad_norm": 0.21044163878960068, "learning_rate": 2.1697413758237785e-06, "loss": 0.5373, "step": 2205 }, { "epoch": 2.43, "grad_norm": 0.20577762743347522, "learning_rate": 2.130136983393112e-06, "loss": 0.5095, "step": 2210 }, { "epoch": 2.43, "grad_norm": 0.20351905534806264, "learning_rate": 2.0908542495384276e-06, "loss": 0.5238, "step": 2215 }, { "epoch": 2.44, "grad_norm": 0.19865856316493957, "learning_rate": 2.051894779831286e-06, "loss": 0.516, "step": 2220 }, { "epoch": 2.45, "grad_norm": 0.19763372174700705, "learning_rate": 2.0132601666307295e-06, "loss": 0.5073, "step": 2225 }, { "epoch": 2.45, "grad_norm": 0.19981546411123563, "learning_rate": 1.9749519890182035e-06, "loss": 0.5233, "step": 2230 }, { "epoch": 2.46, "grad_norm": 0.2033638338024947, "learning_rate": 1.936971812733012e-06, "loss": 0.5023, "step": 2235 }, { "epoch": 2.46, "grad_norm": 0.20822189802082794, "learning_rate": 1.8993211901083353e-06, "loss": 0.5009, "step": 2240 }, { "epoch": 2.47, "grad_norm": 0.2079466895136193, "learning_rate": 1.8620016600077516e-06, "loss": 0.5256, "step": 2245 }, { "epoch": 2.47, "grad_norm": 0.20439899215227353, "learning_rate": 1.8250147477623836e-06, "loss": 0.5365, "step": 2250 }, { "epoch": 2.48, "grad_norm": 0.22075705014027322, "learning_rate": 1.7883619651085194e-06, "loss": 0.5168, "step": 2255 }, { "epoch": 2.48, "grad_norm": 0.1964170868119619, "learning_rate": 1.7520448101258325e-06, "loss": 0.5173, "step": 2260 }, { "epoch": 2.49, "grad_norm": 0.2018340951265649, "learning_rate": 1.716064767176172e-06, "loss": 0.5208, "step": 2265 }, { "epoch": 2.49, "grad_norm": 0.20205110525587222, "learning_rate": 1.6804233068428678e-06, "loss": 0.5321, "step": 2270 }, { "epoch": 2.5, "grad_norm": 0.21357970053476708, "learning_rate": 1.6451218858706374e-06, "loss": 0.5203, "step": 2275 }, { "epoch": 2.51, "grad_norm": 0.2060738726003442, "learning_rate": 1.6101619471060415e-06, "loss": 0.5206, "step": 2280 }, { "epoch": 2.51, "grad_norm": 0.20422451049362417, "learning_rate": 1.5755449194385164e-06, "loss": 0.5183, "step": 2285 }, { "epoch": 2.52, "grad_norm": 0.2002944726641853, "learning_rate": 1.5412722177419658e-06, "loss": 0.5315, "step": 2290 }, { "epoch": 2.52, "grad_norm": 0.20080246126740545, "learning_rate": 1.5073452428169444e-06, "loss": 0.5302, "step": 2295 }, { "epoch": 2.53, "grad_norm": 0.20361752591083399, "learning_rate": 1.4737653813333774e-06, "loss": 0.5178, "step": 2300 }, { "epoch": 2.53, "grad_norm": 0.19999788848075653, "learning_rate": 1.4405340057739203e-06, "loss": 0.5192, "step": 2305 }, { "epoch": 2.54, "grad_norm": 0.1959282126715599, "learning_rate": 1.407652474377832e-06, "loss": 0.5156, "step": 2310 }, { "epoch": 2.54, "grad_norm": 0.20774381542552106, "learning_rate": 1.3751221310854778e-06, "loss": 0.5056, "step": 2315 }, { "epoch": 2.55, "grad_norm": 0.2038197815633597, "learning_rate": 1.3429443054833913e-06, "loss": 0.5265, "step": 2320 }, { "epoch": 2.55, "grad_norm": 0.20325668884167009, "learning_rate": 1.311120312749935e-06, "loss": 0.5234, "step": 2325 }, { "epoch": 2.56, "grad_norm": 0.20390134033380405, "learning_rate": 1.2796514536015492e-06, "loss": 0.5368, "step": 2330 }, { "epoch": 2.57, "grad_norm": 0.20166563167725796, "learning_rate": 1.2485390142395793e-06, "loss": 0.5205, "step": 2335 }, { "epoch": 2.57, "grad_norm": 0.2022982713597928, "learning_rate": 1.2177842662977136e-06, "loss": 0.5231, "step": 2340 }, { "epoch": 2.58, "grad_norm": 0.21218295817596197, "learning_rate": 1.1873884667900125e-06, "loss": 0.5439, "step": 2345 }, { "epoch": 2.58, "grad_norm": 0.20473864934409344, "learning_rate": 1.1573528580595195e-06, "loss": 0.521, "step": 2350 }, { "epoch": 2.59, "grad_norm": 0.20949092154655793, "learning_rate": 1.1276786677274866e-06, "loss": 0.5105, "step": 2355 }, { "epoch": 2.59, "grad_norm": 0.20094255278913592, "learning_rate": 1.0983671086432146e-06, "loss": 0.5128, "step": 2360 }, { "epoch": 2.6, "grad_norm": 0.2057662784043456, "learning_rate": 1.069419378834461e-06, "loss": 0.5385, "step": 2365 }, { "epoch": 2.6, "grad_norm": 0.20176044390537098, "learning_rate": 1.040836661458482e-06, "loss": 0.5171, "step": 2370 }, { "epoch": 2.61, "grad_norm": 0.21932275472178112, "learning_rate": 1.0126201247536783e-06, "loss": 0.528, "step": 2375 }, { "epoch": 2.62, "grad_norm": 0.19746784289323338, "learning_rate": 9.8477092199184e-07, "loss": 0.511, "step": 2380 }, { "epoch": 2.62, "grad_norm": 0.20034469208109756, "learning_rate": 9.57290191431013e-07, "loss": 0.5227, "step": 2385 }, { "epoch": 2.63, "grad_norm": 0.20104497306207902, "learning_rate": 9.301790562689794e-07, "loss": 0.5277, "step": 2390 }, { "epoch": 2.63, "grad_norm": 0.1972303787668993, "learning_rate": 9.034386245973359e-07, "loss": 0.5147, "step": 2395 }, { "epoch": 2.64, "grad_norm": 0.20641734055273483, "learning_rate": 8.770699893562273e-07, "loss": 0.5256, "step": 2400 }, { "epoch": 2.64, "grad_norm": 0.2083707616895028, "learning_rate": 8.510742282896545e-07, "loss": 0.5154, "step": 2405 }, { "epoch": 2.65, "grad_norm": 0.20003452385731113, "learning_rate": 8.254524039014289e-07, "loss": 0.5333, "step": 2410 }, { "epoch": 2.65, "grad_norm": 0.20109885864318555, "learning_rate": 8.002055634117578e-07, "loss": 0.5088, "step": 2415 }, { "epoch": 2.66, "grad_norm": 0.20645302721542735, "learning_rate": 7.753347387144294e-07, "loss": 0.5126, "step": 2420 }, { "epoch": 2.66, "grad_norm": 0.20323590104576408, "learning_rate": 7.508409463346389e-07, "loss": 0.5234, "step": 2425 }, { "epoch": 2.67, "grad_norm": 0.20483041641048474, "learning_rate": 7.26725187387446e-07, "loss": 0.5207, "step": 2430 }, { "epoch": 2.68, "grad_norm": 0.20363185231425762, "learning_rate": 7.029884475368542e-07, "loss": 0.5234, "step": 2435 }, { "epoch": 2.68, "grad_norm": 0.19585624352673048, "learning_rate": 6.796316969555205e-07, "loss": 0.516, "step": 2440 }, { "epoch": 2.69, "grad_norm": 0.21163098285472737, "learning_rate": 6.566558902851161e-07, "loss": 0.5382, "step": 2445 }, { "epoch": 2.69, "grad_norm": 0.2061275767234072, "learning_rate": 6.340619665972847e-07, "loss": 0.5252, "step": 2450 }, { "epoch": 2.7, "grad_norm": 0.2050927030534903, "learning_rate": 6.118508493552866e-07, "loss": 0.5081, "step": 2455 }, { "epoch": 2.7, "grad_norm": 0.20411513952463778, "learning_rate": 5.900234463762367e-07, "loss": 0.5248, "step": 2460 }, { "epoch": 2.71, "grad_norm": 0.20399794959320638, "learning_rate": 5.685806497940027e-07, "loss": 0.5163, "step": 2465 }, { "epoch": 2.71, "grad_norm": 0.20891752353282228, "learning_rate": 5.475233360227516e-07, "loss": 0.529, "step": 2470 }, { "epoch": 2.72, "grad_norm": 0.2005044795674912, "learning_rate": 5.268523657211188e-07, "loss": 0.5009, "step": 2475 }, { "epoch": 2.73, "grad_norm": 0.2046930603550785, "learning_rate": 5.065685837570312e-07, "loss": 0.5165, "step": 2480 }, { "epoch": 2.73, "grad_norm": 0.20175610856641593, "learning_rate": 4.866728191731829e-07, "loss": 0.5182, "step": 2485 }, { "epoch": 2.74, "grad_norm": 0.2030309299250094, "learning_rate": 4.671658851531424e-07, "loss": 0.5199, "step": 2490 }, { "epoch": 2.74, "grad_norm": 0.1914622288737903, "learning_rate": 4.480485789881217e-07, "loss": 0.5177, "step": 2495 }, { "epoch": 2.75, "grad_norm": 0.19668048668936855, "learning_rate": 4.293216820443891e-07, "loss": 0.5308, "step": 2500 }, { "epoch": 2.75, "grad_norm": 0.22039481521735169, "learning_rate": 4.109859597313237e-07, "loss": 0.5283, "step": 2505 }, { "epoch": 2.76, "grad_norm": 0.20757846010886538, "learning_rate": 3.9304216147014853e-07, "loss": 0.5347, "step": 2510 }, { "epoch": 2.76, "grad_norm": 0.20740377627342962, "learning_rate": 3.7549102066328226e-07, "loss": 0.5307, "step": 2515 }, { "epoch": 2.77, "grad_norm": 0.2100472925176084, "learning_rate": 3.5833325466437697e-07, "loss": 0.5223, "step": 2520 }, { "epoch": 2.77, "grad_norm": 0.20177360779386036, "learning_rate": 3.4156956474898805e-07, "loss": 0.5255, "step": 2525 }, { "epoch": 2.78, "grad_norm": 0.20707380368033076, "learning_rate": 3.2520063608592165e-07, "loss": 0.5122, "step": 2530 }, { "epoch": 2.79, "grad_norm": 0.19965994990493274, "learning_rate": 3.0922713770922155e-07, "loss": 0.5214, "step": 2535 }, { "epoch": 2.79, "grad_norm": 0.20324516199000714, "learning_rate": 2.9364972249082747e-07, "loss": 0.5145, "step": 2540 }, { "epoch": 2.8, "grad_norm": 0.20150097372860662, "learning_rate": 2.7846902711389236e-07, "loss": 0.5132, "step": 2545 }, { "epoch": 2.8, "grad_norm": 0.2051450132081192, "learning_rate": 2.636856720467573e-07, "loss": 0.5128, "step": 2550 }, { "epoch": 2.81, "grad_norm": 0.20266490344541718, "learning_rate": 2.493002615175977e-07, "loss": 0.5122, "step": 2555 }, { "epoch": 2.81, "grad_norm": 0.20236603858533658, "learning_rate": 2.3531338348971366e-07, "loss": 0.5327, "step": 2560 }, { "epoch": 2.82, "grad_norm": 0.20530608079603682, "learning_rate": 2.217256096375131e-07, "loss": 0.5099, "step": 2565 }, { "epoch": 2.82, "grad_norm": 0.2000522109375465, "learning_rate": 2.0853749532314006e-07, "loss": 0.5299, "step": 2570 }, { "epoch": 2.83, "grad_norm": 0.20808661956102248, "learning_rate": 1.9574957957377294e-07, "loss": 0.5225, "step": 2575 }, { "epoch": 2.84, "grad_norm": 0.2026575467689292, "learning_rate": 1.8336238505959892e-07, "loss": 0.5286, "step": 2580 }, { "epoch": 2.84, "grad_norm": 0.2043520049010144, "learning_rate": 1.7137641807244754e-07, "loss": 0.5218, "step": 2585 }, { "epoch": 2.85, "grad_norm": 0.20057729109993613, "learning_rate": 1.5979216850509848e-07, "loss": 0.5225, "step": 2590 }, { "epoch": 2.85, "grad_norm": 0.20086887199752657, "learning_rate": 1.4861010983126202e-07, "loss": 0.521, "step": 2595 }, { "epoch": 2.86, "grad_norm": 0.20600288196892624, "learning_rate": 1.3783069908621772e-07, "loss": 0.5233, "step": 2600 }, { "epoch": 2.86, "grad_norm": 0.2032590701020277, "learning_rate": 1.274543768481451e-07, "loss": 0.5298, "step": 2605 }, { "epoch": 2.87, "grad_norm": 0.19937401805144794, "learning_rate": 1.1748156722011128e-07, "loss": 0.5012, "step": 2610 }, { "epoch": 2.87, "grad_norm": 0.2021678865177686, "learning_rate": 1.0791267781273263e-07, "loss": 0.5242, "step": 2615 }, { "epoch": 2.88, "grad_norm": 0.20777988223157803, "learning_rate": 9.874809972752697e-08, "loss": 0.5293, "step": 2620 }, { "epoch": 2.88, "grad_norm": 0.20368366862991866, "learning_rate": 8.99882075409153e-08, "loss": 0.5153, "step": 2625 }, { "epoch": 2.89, "grad_norm": 0.20787994510400237, "learning_rate": 8.16333592889207e-08, "loss": 0.5345, "step": 2630 }, { "epoch": 2.9, "grad_norm": 0.20946081246812162, "learning_rate": 7.368389645252772e-08, "loss": 0.5229, "step": 2635 }, { "epoch": 2.9, "grad_norm": 0.20571639236209135, "learning_rate": 6.61401439437348e-08, "loss": 0.5151, "step": 2640 }, { "epoch": 2.91, "grad_norm": 0.21182777269953695, "learning_rate": 5.9002410092262593e-08, "loss": 0.5207, "step": 2645 }, { "epoch": 2.91, "grad_norm": 0.1955808209671619, "learning_rate": 5.227098663296404e-08, "loss": 0.5073, "step": 2650 }, { "epoch": 2.92, "grad_norm": 0.20179292685793432, "learning_rate": 4.594614869388947e-08, "loss": 0.5212, "step": 2655 }, { "epoch": 2.92, "grad_norm": 0.20236344715337046, "learning_rate": 4.002815478505007e-08, "loss": 0.5208, "step": 2660 }, { "epoch": 2.93, "grad_norm": 0.2112521255560089, "learning_rate": 3.451724678784518e-08, "loss": 0.5155, "step": 2665 }, { "epoch": 2.93, "grad_norm": 0.1988319336416073, "learning_rate": 2.9413649945182475e-08, "loss": 0.5149, "step": 2670 }, { "epoch": 2.94, "grad_norm": 0.2393477878018782, "learning_rate": 2.47175728522675e-08, "loss": 0.5196, "step": 2675 }, { "epoch": 2.95, "grad_norm": 0.1989296468037607, "learning_rate": 2.0429207448078302e-08, "loss": 0.4954, "step": 2680 }, { "epoch": 2.95, "grad_norm": 0.21328717756995305, "learning_rate": 1.654872900752169e-08, "loss": 0.5211, "step": 2685 }, { "epoch": 2.96, "grad_norm": 0.2043004896942241, "learning_rate": 1.3076296134271194e-08, "loss": 0.5298, "step": 2690 }, { "epoch": 2.96, "grad_norm": 0.20203087493728625, "learning_rate": 1.0012050754277802e-08, "loss": 0.5142, "step": 2695 }, { "epoch": 2.97, "grad_norm": 0.20708771065311932, "learning_rate": 7.356118109977939e-09, "loss": 0.5352, "step": 2700 }, { "epoch": 2.97, "grad_norm": 0.20602452029141277, "learning_rate": 5.108606755168666e-09, "loss": 0.528, "step": 2705 }, { "epoch": 2.98, "grad_norm": 0.20878635383712413, "learning_rate": 3.269608550571235e-09, "loss": 0.5155, "step": 2710 }, { "epoch": 2.98, "grad_norm": 0.1972058695515002, "learning_rate": 1.839198660079644e-09, "loss": 0.5121, "step": 2715 }, { "epoch": 2.99, "grad_norm": 0.2017859212074039, "learning_rate": 8.174355476864293e-10, "loss": 0.5193, "step": 2720 }, { "epoch": 2.99, "grad_norm": 0.20180951712274753, "learning_rate": 2.0436097509235475e-10, "loss": 0.5331, "step": 2725 }, { "epoch": 3.0, "grad_norm": 0.2006750130728712, "learning_rate": 0.0, "loss": 0.5227, "step": 2730 }, { "epoch": 3.0, "eval_loss": 0.6613607406616211, "eval_runtime": 13.2749, "eval_samples_per_second": 103.805, "eval_steps_per_second": 0.829, "step": 2730 }, { "epoch": 3.0, "step": 2730, "total_flos": 1.183665069490176e+16, "train_loss": 0.6159939037574517, "train_runtime": 13745.4884, "train_samples_per_second": 25.399, "train_steps_per_second": 0.199 } ], "logging_steps": 5, "max_steps": 2730, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 1.183665069490176e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }