{ "best_metric": 2.27061128616333, "best_model_checkpoint": "./output/training_results/C018_random_sample_llama3-8b-base_pretrain_20240504_182259/checkpoint-800", "epoch": 4.0, "eval_steps": 200, "global_step": 3660, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001092896174863388, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 2.5038, "step": 1 }, { "epoch": 0.00546448087431694, "grad_norm": 6.018359004510701, "learning_rate": 1.5e-06, "loss": 2.4907, "step": 5 }, { "epoch": 0.01092896174863388, "grad_norm": 3.4017007364457332, "learning_rate": 5.25e-06, "loss": 2.4315, "step": 10 }, { "epoch": 0.01639344262295082, "grad_norm": 2.6900944944121132, "learning_rate": 8.25e-06, "loss": 2.428, "step": 15 }, { "epoch": 0.02185792349726776, "grad_norm": 2.708090744472938, "learning_rate": 1.2e-05, "loss": 2.4387, "step": 20 }, { "epoch": 0.0273224043715847, "grad_norm": 3.032951222015636, "learning_rate": 1.4954883435929662e-05, "loss": 2.4481, "step": 25 }, { "epoch": 0.03278688524590164, "grad_norm": 2.4292246650027147, "learning_rate": 1.4731151665173554e-05, "loss": 2.466, "step": 30 }, { "epoch": 0.03825136612021858, "grad_norm": 2.621759707907051, "learning_rate": 1.4510477122963378e-05, "loss": 2.4376, "step": 35 }, { "epoch": 0.04371584699453552, "grad_norm": 2.4177283273408454, "learning_rate": 1.4292822159268742e-05, "loss": 2.4828, "step": 40 }, { "epoch": 0.04918032786885246, "grad_norm": 2.8501680697628307, "learning_rate": 1.4078149536769946e-05, "loss": 2.4589, "step": 45 }, { "epoch": 0.0546448087431694, "grad_norm": 2.479759465440778, "learning_rate": 1.386642242689401e-05, "loss": 2.4807, "step": 50 }, { "epoch": 0.060109289617486336, "grad_norm": 2.8217394805300566, "learning_rate": 1.3657604405883384e-05, "loss": 2.4267, "step": 55 }, { "epoch": 0.06557377049180328, "grad_norm": 2.081431814197302, "learning_rate": 1.3451659450897103e-05, "loss": 2.4302, "step": 60 }, { "epoch": 0.07103825136612021, "grad_norm": 2.4806889496772477, "learning_rate": 1.3248551936144194e-05, "loss": 2.4394, "step": 65 }, { "epoch": 0.07650273224043716, "grad_norm": 2.440876359920224, "learning_rate": 1.3048246629049058e-05, "loss": 2.4125, "step": 70 }, { "epoch": 0.08196721311475409, "grad_norm": 2.0932998526603517, "learning_rate": 1.2889996545293194e-05, "loss": 2.4529, "step": 75 }, { "epoch": 0.08743169398907104, "grad_norm": 2.1065047597686135, "learning_rate": 1.2694647666488102e-05, "loss": 2.4064, "step": 80 }, { "epoch": 0.09289617486338798, "grad_norm": 2.547164860929408, "learning_rate": 1.2502004403786172e-05, "loss": 2.4265, "step": 85 }, { "epoch": 0.09836065573770492, "grad_norm": 2.1438020912476348, "learning_rate": 1.2312032984416495e-05, "loss": 2.4098, "step": 90 }, { "epoch": 0.10382513661202186, "grad_norm": 2.068160218649763, "learning_rate": 1.212470001085604e-05, "loss": 2.4031, "step": 95 }, { "epoch": 0.1092896174863388, "grad_norm": 1.9843644588123759, "learning_rate": 1.1939972457176422e-05, "loss": 2.405, "step": 100 }, { "epoch": 0.11475409836065574, "grad_norm": 2.1172473045661224, "learning_rate": 1.175781766542116e-05, "loss": 2.3911, "step": 105 }, { "epoch": 0.12021857923497267, "grad_norm": 2.3445926712239196, "learning_rate": 1.1613924507166693e-05, "loss": 2.3869, "step": 110 }, { "epoch": 0.12568306010928962, "grad_norm": 2.025043424580075, "learning_rate": 1.1436319549649206e-05, "loss": 2.391, "step": 115 }, { "epoch": 0.13114754098360656, "grad_norm": 1.9766925379731093, "learning_rate": 1.1261197838924792e-05, "loss": 2.4065, "step": 120 }, { "epoch": 0.1366120218579235, "grad_norm": 2.0255472574517768, "learning_rate": 1.1088528082822625e-05, "loss": 2.4301, "step": 125 }, { "epoch": 0.14207650273224043, "grad_norm": 2.2637369635110933, "learning_rate": 1.0918279340172864e-05, "loss": 2.3594, "step": 130 }, { "epoch": 0.14754098360655737, "grad_norm": 2.053999551417616, "learning_rate": 1.0750421017356817e-05, "loss": 2.3738, "step": 135 }, { "epoch": 0.15300546448087432, "grad_norm": 2.305037126151136, "learning_rate": 1.0584922864886185e-05, "loss": 2.3832, "step": 140 }, { "epoch": 0.15846994535519127, "grad_norm": 2.1180967831426787, "learning_rate": 1.0421754974011241e-05, "loss": 2.413, "step": 145 }, { "epoch": 0.16393442622950818, "grad_norm": 2.1528001939587402, "learning_rate": 1.026088777335768e-05, "loss": 2.3649, "step": 150 }, { "epoch": 0.16939890710382513, "grad_norm": 1.9416004094979256, "learning_rate": 1.0102292025591967e-05, "loss": 2.3733, "step": 155 }, { "epoch": 0.17486338797814208, "grad_norm": 2.2225363194253847, "learning_rate": 9.945938824114975e-06, "loss": 2.385, "step": 160 }, { "epoch": 0.18032786885245902, "grad_norm": 2.492677416034468, "learning_rate": 9.791799589783724e-06, "loss": 2.3586, "step": 165 }, { "epoch": 0.18579234972677597, "grad_norm": 1.9241743990594526, "learning_rate": 9.639846067661005e-06, "loss": 2.3548, "step": 170 }, { "epoch": 0.1912568306010929, "grad_norm": 1.9740476668210596, "learning_rate": 9.490050323792687e-06, "loss": 2.3768, "step": 175 }, { "epoch": 0.19672131147540983, "grad_norm": 2.0445759366709106, "learning_rate": 9.342384742012546e-06, "loss": 2.4061, "step": 180 }, { "epoch": 0.20218579234972678, "grad_norm": 1.9639271133424887, "learning_rate": 9.19682202077437e-06, "loss": 2.3726, "step": 185 }, { "epoch": 0.20765027322404372, "grad_norm": 1.9663469004265115, "learning_rate": 9.053335170011187e-06, "loss": 2.3515, "step": 190 }, { "epoch": 0.21311475409836064, "grad_norm": 1.9350297245856483, "learning_rate": 8.911897508021392e-06, "loss": 2.3408, "step": 195 }, { "epoch": 0.2185792349726776, "grad_norm": 1.8993527753581194, "learning_rate": 8.77248265838164e-06, "loss": 2.3701, "step": 200 }, { "epoch": 0.2185792349726776, "eval_loss": 2.3701858520507812, "eval_runtime": 75.1482, "eval_samples_per_second": 86.589, "eval_steps_per_second": 0.679, "step": 200 }, { "epoch": 0.22404371584699453, "grad_norm": 2.203924855542508, "learning_rate": 8.635064546886168e-06, "loss": 2.3966, "step": 205 }, { "epoch": 0.22950819672131148, "grad_norm": 2.114129121333607, "learning_rate": 8.499617398512568e-06, "loss": 2.3397, "step": 210 }, { "epoch": 0.23497267759562843, "grad_norm": 1.924587792624844, "learning_rate": 8.366115734413646e-06, "loss": 2.3665, "step": 215 }, { "epoch": 0.24043715846994534, "grad_norm": 1.8589409579909668, "learning_rate": 8.234534368935251e-06, "loss": 2.3603, "step": 220 }, { "epoch": 0.2459016393442623, "grad_norm": 2.045586546826662, "learning_rate": 8.104848406659907e-06, "loss": 2.3569, "step": 225 }, { "epoch": 0.25136612021857924, "grad_norm": 1.9470161431434365, "learning_rate": 7.97703323947598e-06, "loss": 2.322, "step": 230 }, { "epoch": 0.2568306010928962, "grad_norm": 1.8292713581809432, "learning_rate": 7.85106454367231e-06, "loss": 2.3516, "step": 235 }, { "epoch": 0.26229508196721313, "grad_norm": 1.8494090625326762, "learning_rate": 7.72691827705802e-06, "loss": 2.375, "step": 240 }, { "epoch": 0.2677595628415301, "grad_norm": 2.161027732489493, "learning_rate": 7.604570676107382e-06, "loss": 2.3498, "step": 245 }, { "epoch": 0.273224043715847, "grad_norm": 2.1707598899295357, "learning_rate": 7.483998253129525e-06, "loss": 2.3503, "step": 250 }, { "epoch": 0.2786885245901639, "grad_norm": 1.8564170426077466, "learning_rate": 7.365177793462842e-06, "loss": 2.3285, "step": 255 }, { "epoch": 0.28415300546448086, "grad_norm": 1.9463620290299803, "learning_rate": 7.248086352693862e-06, "loss": 2.3287, "step": 260 }, { "epoch": 0.2896174863387978, "grad_norm": 2.0448861366457924, "learning_rate": 7.132701253900465e-06, "loss": 2.3307, "step": 265 }, { "epoch": 0.29508196721311475, "grad_norm": 2.0011605708324685, "learning_rate": 7.019000084919226e-06, "loss": 2.3445, "step": 270 }, { "epoch": 0.3005464480874317, "grad_norm": 1.7859540910895997, "learning_rate": 6.906960695636718e-06, "loss": 2.3176, "step": 275 }, { "epoch": 0.30601092896174864, "grad_norm": 2.0882334857896554, "learning_rate": 6.796561195304612e-06, "loss": 2.3152, "step": 280 }, { "epoch": 0.3114754098360656, "grad_norm": 1.8550717237355474, "learning_rate": 6.687779949878386e-06, "loss": 2.3072, "step": 285 }, { "epoch": 0.31693989071038253, "grad_norm": 1.9231867190142091, "learning_rate": 6.580595579379473e-06, "loss": 2.3527, "step": 290 }, { "epoch": 0.3224043715846995, "grad_norm": 2.2210554738167056, "learning_rate": 6.474986955280685e-06, "loss": 2.3422, "step": 295 }, { "epoch": 0.32786885245901637, "grad_norm": 2.0430624582463506, "learning_rate": 6.370933197914722e-06, "loss": 2.3153, "step": 300 }, { "epoch": 0.3333333333333333, "grad_norm": 2.0678711431760104, "learning_rate": 6.268413673905618e-06, "loss": 2.3097, "step": 305 }, { "epoch": 0.33879781420765026, "grad_norm": 1.8084581993894073, "learning_rate": 6.167407993622935e-06, "loss": 2.3256, "step": 310 }, { "epoch": 0.3442622950819672, "grad_norm": 1.9468221957558098, "learning_rate": 6.067896008658554e-06, "loss": 2.3447, "step": 315 }, { "epoch": 0.34972677595628415, "grad_norm": 1.995128197802868, "learning_rate": 5.9698578093258756e-06, "loss": 2.3063, "step": 320 }, { "epoch": 0.3551912568306011, "grad_norm": 1.9717788395704754, "learning_rate": 5.873273722181316e-06, "loss": 2.3468, "step": 325 }, { "epoch": 0.36065573770491804, "grad_norm": 1.7044787526539047, "learning_rate": 5.778124307567816e-06, "loss": 2.3458, "step": 330 }, { "epoch": 0.366120218579235, "grad_norm": 2.0310677469626994, "learning_rate": 5.68439035718035e-06, "loss": 2.3099, "step": 335 }, { "epoch": 0.37158469945355194, "grad_norm": 1.8797720229376973, "learning_rate": 5.592052891653163e-06, "loss": 2.3293, "step": 340 }, { "epoch": 0.3770491803278688, "grad_norm": 1.7420600285844794, "learning_rate": 5.5010931581686135e-06, "loss": 2.3347, "step": 345 }, { "epoch": 0.3825136612021858, "grad_norm": 1.923012658321935, "learning_rate": 5.411492628087456e-06, "loss": 2.2903, "step": 350 }, { "epoch": 0.3879781420765027, "grad_norm": 1.8521170693883549, "learning_rate": 5.3232329946004e-06, "loss": 2.3296, "step": 355 }, { "epoch": 0.39344262295081966, "grad_norm": 1.8510472078686617, "learning_rate": 5.2362961704007885e-06, "loss": 2.3372, "step": 360 }, { "epoch": 0.3989071038251366, "grad_norm": 2.0309505396989302, "learning_rate": 5.150664285378238e-06, "loss": 2.2872, "step": 365 }, { "epoch": 0.40437158469945356, "grad_norm": 1.8697259417175387, "learning_rate": 5.06631968433308e-06, "loss": 2.3182, "step": 370 }, { "epoch": 0.4098360655737705, "grad_norm": 1.829952724705, "learning_rate": 4.9832449247114525e-06, "loss": 2.2973, "step": 375 }, { "epoch": 0.41530054644808745, "grad_norm": 1.7902123068449143, "learning_rate": 4.901422774360872e-06, "loss": 2.3068, "step": 380 }, { "epoch": 0.4207650273224044, "grad_norm": 1.7833743483161062, "learning_rate": 4.8208362093061525e-06, "loss": 2.2842, "step": 385 }, { "epoch": 0.4262295081967213, "grad_norm": 1.9963838229648958, "learning_rate": 4.741468411545501e-06, "loss": 2.2788, "step": 390 }, { "epoch": 0.43169398907103823, "grad_norm": 2.165728407748183, "learning_rate": 4.6633027668666485e-06, "loss": 2.2629, "step": 395 }, { "epoch": 0.4371584699453552, "grad_norm": 1.8956899938550533, "learning_rate": 4.58632286268284e-06, "loss": 2.3183, "step": 400 }, { "epoch": 0.4371584699453552, "eval_loss": 2.316016674041748, "eval_runtime": 75.0612, "eval_samples_per_second": 86.689, "eval_steps_per_second": 0.679, "step": 400 }, { "epoch": 0.4426229508196721, "grad_norm": 1.9009361654227148, "learning_rate": 4.510512485888576e-06, "loss": 2.3128, "step": 405 }, { "epoch": 0.44808743169398907, "grad_norm": 1.8723633542124947, "learning_rate": 4.435855620734914e-06, "loss": 2.2849, "step": 410 }, { "epoch": 0.453551912568306, "grad_norm": 1.739990729696985, "learning_rate": 4.3623364467242e-06, "loss": 2.323, "step": 415 }, { "epoch": 0.45901639344262296, "grad_norm": 2.739418331731018, "learning_rate": 4.289939336524074e-06, "loss": 2.285, "step": 420 }, { "epoch": 0.4644808743169399, "grad_norm": 1.744787908955572, "learning_rate": 4.218648853900638e-06, "loss": 2.3438, "step": 425 }, { "epoch": 0.46994535519125685, "grad_norm": 2.108125431007958, "learning_rate": 4.148449751670545e-06, "loss": 2.2864, "step": 430 }, { "epoch": 0.47540983606557374, "grad_norm": 1.7777684575501653, "learning_rate": 4.0793269696719935e-06, "loss": 2.2953, "step": 435 }, { "epoch": 0.4808743169398907, "grad_norm": 1.9646082069769346, "learning_rate": 4.011265632754383e-06, "loss": 2.3371, "step": 440 }, { "epoch": 0.48633879781420764, "grad_norm": 1.9015798878951815, "learning_rate": 3.944251048786522e-06, "loss": 2.2647, "step": 445 }, { "epoch": 0.4918032786885246, "grad_norm": 1.946335104230161, "learning_rate": 3.878268706683258e-06, "loss": 2.2622, "step": 450 }, { "epoch": 0.4972677595628415, "grad_norm": 1.7407268985234177, "learning_rate": 3.8133042744503556e-06, "loss": 2.2978, "step": 455 }, { "epoch": 0.5027322404371585, "grad_norm": 2.0354514658677867, "learning_rate": 3.7493435972475156e-06, "loss": 2.3088, "step": 460 }, { "epoch": 0.5081967213114754, "grad_norm": 1.8511439849509024, "learning_rate": 3.686372695469369e-06, "loss": 2.3243, "step": 465 }, { "epoch": 0.5136612021857924, "grad_norm": 1.8563805349568043, "learning_rate": 3.6243777628443207e-06, "loss": 2.3126, "step": 470 }, { "epoch": 0.5191256830601093, "grad_norm": 1.8151947657786947, "learning_rate": 3.5633451645510976e-06, "loss": 2.3406, "step": 475 }, { "epoch": 0.5245901639344263, "grad_norm": 1.8105166181198042, "learning_rate": 3.5032614353528692e-06, "loss": 2.3148, "step": 480 }, { "epoch": 0.5300546448087432, "grad_norm": 1.8770594853800158, "learning_rate": 3.4441132777487983e-06, "loss": 2.2673, "step": 485 }, { "epoch": 0.5355191256830601, "grad_norm": 1.8047846783432062, "learning_rate": 3.385887560142889e-06, "loss": 2.2999, "step": 490 }, { "epoch": 0.5409836065573771, "grad_norm": 1.8251621882251348, "learning_rate": 3.3285713150299956e-06, "loss": 2.2806, "step": 495 }, { "epoch": 0.546448087431694, "grad_norm": 1.871841118264576, "learning_rate": 3.27215173719886e-06, "loss": 2.2755, "step": 500 }, { "epoch": 0.5519125683060109, "grad_norm": 1.7071197085633982, "learning_rate": 3.216616181952041e-06, "loss": 2.3256, "step": 505 }, { "epoch": 0.5573770491803278, "grad_norm": 1.8202076520109158, "learning_rate": 3.161952163342607e-06, "loss": 2.2326, "step": 510 }, { "epoch": 0.5628415300546448, "grad_norm": 2.101153989436264, "learning_rate": 3.1081473524274575e-06, "loss": 2.2992, "step": 515 }, { "epoch": 0.5683060109289617, "grad_norm": 2.8542639158592804, "learning_rate": 3.0551895755371417e-06, "loss": 2.2662, "step": 520 }, { "epoch": 0.5737704918032787, "grad_norm": 1.9673021719695818, "learning_rate": 3.00306681256205e-06, "loss": 2.3003, "step": 525 }, { "epoch": 0.5792349726775956, "grad_norm": 2.010083775622308, "learning_rate": 2.9517671952548357e-06, "loss": 2.3146, "step": 530 }, { "epoch": 0.5846994535519126, "grad_norm": 1.8194888309578177, "learning_rate": 2.9012790055489625e-06, "loss": 2.2817, "step": 535 }, { "epoch": 0.5901639344262295, "grad_norm": 3.4489573911262608, "learning_rate": 2.8515906738932173e-06, "loss": 2.2923, "step": 540 }, { "epoch": 0.5956284153005464, "grad_norm": 1.7960363620990365, "learning_rate": 2.8026907776020966e-06, "loss": 2.2396, "step": 545 }, { "epoch": 0.6010928961748634, "grad_norm": 1.9801171472834103, "learning_rate": 2.7545680392219096e-06, "loss": 2.2668, "step": 550 }, { "epoch": 0.6065573770491803, "grad_norm": 1.7618650836948095, "learning_rate": 2.7072113249124913e-06, "loss": 2.2449, "step": 555 }, { "epoch": 0.6120218579234973, "grad_norm": 1.730914258843425, "learning_rate": 2.660609642844413e-06, "loss": 2.2918, "step": 560 }, { "epoch": 0.6174863387978142, "grad_norm": 1.7646145040322634, "learning_rate": 2.6147521416115106e-06, "loss": 2.2862, "step": 565 }, { "epoch": 0.6229508196721312, "grad_norm": 1.9128717333080465, "learning_rate": 2.5696281086586865e-06, "loss": 2.2657, "step": 570 }, { "epoch": 0.6284153005464481, "grad_norm": 1.956303743152218, "learning_rate": 2.5252269687248056e-06, "loss": 2.3029, "step": 575 }, { "epoch": 0.6338797814207651, "grad_norm": 1.7226771008230806, "learning_rate": 2.4815382823005854e-06, "loss": 2.2454, "step": 580 }, { "epoch": 0.639344262295082, "grad_norm": 1.8222023652295238, "learning_rate": 2.4385517441013565e-06, "loss": 2.3003, "step": 585 }, { "epoch": 0.644808743169399, "grad_norm": 1.8097760740890172, "learning_rate": 2.3962571815545747e-06, "loss": 2.3239, "step": 590 }, { "epoch": 0.6502732240437158, "grad_norm": 1.7469489003388072, "learning_rate": 2.3546445533019647e-06, "loss": 2.289, "step": 595 }, { "epoch": 0.6557377049180327, "grad_norm": 1.8488648855172372, "learning_rate": 2.31370394771618e-06, "loss": 2.2634, "step": 600 }, { "epoch": 0.6557377049180327, "eval_loss": 2.2862629890441895, "eval_runtime": 75.0848, "eval_samples_per_second": 86.662, "eval_steps_per_second": 0.679, "step": 600 }, { "epoch": 0.6612021857923497, "grad_norm": 1.775242640587005, "learning_rate": 2.2734255814318526e-06, "loss": 2.2729, "step": 605 }, { "epoch": 0.6666666666666666, "grad_norm": 1.7202766576440889, "learning_rate": 2.233799797890934e-06, "loss": 2.2784, "step": 610 }, { "epoch": 0.6721311475409836, "grad_norm": 1.808717147716665, "learning_rate": 2.1948170659021868e-06, "loss": 2.2501, "step": 615 }, { "epoch": 0.6775956284153005, "grad_norm": 1.7433893633830992, "learning_rate": 2.1564679782147374e-06, "loss": 2.2937, "step": 620 }, { "epoch": 0.6830601092896175, "grad_norm": 1.7254327868818564, "learning_rate": 2.1187432501055544e-06, "loss": 2.3049, "step": 625 }, { "epoch": 0.6885245901639344, "grad_norm": 1.7514572806831676, "learning_rate": 2.0816337179807527e-06, "loss": 2.2563, "step": 630 }, { "epoch": 0.6939890710382514, "grad_norm": 2.517555263627969, "learning_rate": 2.0451303379906046e-06, "loss": 2.2915, "step": 635 }, { "epoch": 0.6994535519125683, "grad_norm": 1.841614406691522, "learning_rate": 2.0092241846581427e-06, "loss": 2.2846, "step": 640 }, { "epoch": 0.7049180327868853, "grad_norm": 2.117519428379047, "learning_rate": 1.973906449521264e-06, "loss": 2.2822, "step": 645 }, { "epoch": 0.7103825136612022, "grad_norm": 1.775749159266723, "learning_rate": 1.9391684397881756e-06, "loss": 2.2472, "step": 650 }, { "epoch": 0.7158469945355191, "grad_norm": 1.7229676762831452, "learning_rate": 1.9050015770061387e-06, "loss": 2.2924, "step": 655 }, { "epoch": 0.7213114754098361, "grad_norm": 1.7135336336426077, "learning_rate": 1.8713973957433444e-06, "loss": 2.2932, "step": 660 }, { "epoch": 0.726775956284153, "grad_norm": 1.746651653634065, "learning_rate": 1.838347542283849e-06, "loss": 2.2625, "step": 665 }, { "epoch": 0.73224043715847, "grad_norm": 1.8692885279821523, "learning_rate": 1.8058437733354382e-06, "loss": 2.2856, "step": 670 }, { "epoch": 0.7377049180327869, "grad_norm": 1.8208277427071937, "learning_rate": 1.773877954750328e-06, "loss": 2.2477, "step": 675 }, { "epoch": 0.7431693989071039, "grad_norm": 1.6812537080705303, "learning_rate": 1.7424420602585894e-06, "loss": 2.3132, "step": 680 }, { "epoch": 0.7486338797814208, "grad_norm": 2.0801892129831256, "learning_rate": 1.7115281702141926e-06, "loss": 2.2575, "step": 685 }, { "epoch": 0.7540983606557377, "grad_norm": 1.8414116320368654, "learning_rate": 1.6811284703535634e-06, "loss": 2.2476, "step": 690 }, { "epoch": 0.7595628415300546, "grad_norm": 2.0566662495103483, "learning_rate": 1.651235250566554e-06, "loss": 2.2569, "step": 695 }, { "epoch": 0.7650273224043715, "grad_norm": 2.016755831922365, "learning_rate": 1.6218409036797155e-06, "loss": 2.2568, "step": 700 }, { "epoch": 0.7704918032786885, "grad_norm": 1.7180035270775444, "learning_rate": 1.592937924251778e-06, "loss": 2.2993, "step": 705 }, { "epoch": 0.7759562841530054, "grad_norm": 1.7480243896979724, "learning_rate": 1.5645189073812295e-06, "loss": 2.2602, "step": 710 }, { "epoch": 0.7814207650273224, "grad_norm": 1.7826578176545964, "learning_rate": 1.5365765475258971e-06, "loss": 2.2554, "step": 715 }, { "epoch": 0.7868852459016393, "grad_norm": 1.7135083462521725, "learning_rate": 1.5091036373344258e-06, "loss": 2.2941, "step": 720 }, { "epoch": 0.7923497267759563, "grad_norm": 1.829935840802774, "learning_rate": 1.4820930664895563e-06, "loss": 2.2986, "step": 725 }, { "epoch": 0.7978142076502732, "grad_norm": 1.731928027216758, "learning_rate": 1.455537820563104e-06, "loss": 2.249, "step": 730 }, { "epoch": 0.8032786885245902, "grad_norm": 1.7873474828840332, "learning_rate": 1.4294309798825372e-06, "loss": 2.2462, "step": 735 }, { "epoch": 0.8087431693989071, "grad_norm": 1.759568248731093, "learning_rate": 1.4037657184090597e-06, "loss": 2.2722, "step": 740 }, { "epoch": 0.8142076502732241, "grad_norm": 1.7115339550033273, "learning_rate": 1.3785353026270964e-06, "loss": 2.2739, "step": 745 }, { "epoch": 0.819672131147541, "grad_norm": 1.6802608563862464, "learning_rate": 1.3537330904450898e-06, "loss": 2.2312, "step": 750 }, { "epoch": 0.825136612021858, "grad_norm": 1.7819162358568228, "learning_rate": 1.3293525301075076e-06, "loss": 2.2691, "step": 755 }, { "epoch": 0.8306010928961749, "grad_norm": 1.7268343293878012, "learning_rate": 1.305387159117968e-06, "loss": 2.3017, "step": 760 }, { "epoch": 0.8360655737704918, "grad_norm": 1.9444235134875572, "learning_rate": 1.2818306031733856e-06, "loss": 2.2924, "step": 765 }, { "epoch": 0.8415300546448088, "grad_norm": 1.7510208764482034, "learning_rate": 1.258676575109047e-06, "loss": 2.2897, "step": 770 }, { "epoch": 0.8469945355191257, "grad_norm": 2.517375736052748, "learning_rate": 1.2359188738545197e-06, "loss": 2.2454, "step": 775 }, { "epoch": 0.8524590163934426, "grad_norm": 1.6966653908275375, "learning_rate": 1.2135513834003019e-06, "loss": 2.2569, "step": 780 }, { "epoch": 0.8579234972677595, "grad_norm": 1.91572460682662, "learning_rate": 1.1915680717751282e-06, "loss": 2.2454, "step": 785 }, { "epoch": 0.8633879781420765, "grad_norm": 1.7753619527615636, "learning_rate": 1.1699629900338182e-06, "loss": 2.271, "step": 790 }, { "epoch": 0.8688524590163934, "grad_norm": 1.7960345912349553, "learning_rate": 1.1487302712556065e-06, "loss": 2.2328, "step": 795 }, { "epoch": 0.8743169398907104, "grad_norm": 1.8256697032153515, "learning_rate": 1.1278641295528428e-06, "loss": 2.2522, "step": 800 }, { "epoch": 0.8743169398907104, "eval_loss": 2.27061128616333, "eval_runtime": 75.1121, "eval_samples_per_second": 86.631, "eval_steps_per_second": 0.679, "step": 800 }, { "epoch": 0.8797814207650273, "grad_norm": 1.755415386789429, "learning_rate": 1.1073588590899781e-06, "loss": 2.2794, "step": 805 }, { "epoch": 0.8852459016393442, "grad_norm": 1.82410498387524, "learning_rate": 1.087208833112751e-06, "loss": 2.285, "step": 810 }, { "epoch": 0.8907103825136612, "grad_norm": 1.7491199970554299, "learning_rate": 1.0674085029874798e-06, "loss": 2.2838, "step": 815 }, { "epoch": 0.8961748633879781, "grad_norm": 1.7620440534843038, "learning_rate": 1.0479523972503778e-06, "loss": 2.2571, "step": 820 }, { "epoch": 0.9016393442622951, "grad_norm": 1.7623871723124545, "learning_rate": 1.0288351206668029e-06, "loss": 2.2152, "step": 825 }, { "epoch": 0.907103825136612, "grad_norm": 1.7314270943428405, "learning_rate": 1.0100513533003527e-06, "loss": 2.2728, "step": 830 }, { "epoch": 0.912568306010929, "grad_norm": 1.7654507075500774, "learning_rate": 9.915958495917222e-07, "loss": 2.247, "step": 835 }, { "epoch": 0.9180327868852459, "grad_norm": 1.678925667557596, "learning_rate": 9.734634374472352e-07, "loss": 2.2616, "step": 840 }, { "epoch": 0.9234972677595629, "grad_norm": 1.7684124052868442, "learning_rate": 9.556490173369703e-07, "loss": 2.2862, "step": 845 }, { "epoch": 0.9289617486338798, "grad_norm": 3.0732219671025516, "learning_rate": 9.381475614023894e-07, "loss": 2.2431, "step": 850 }, { "epoch": 0.9344262295081968, "grad_norm": 1.7623273850966537, "learning_rate": 9.209541125733917e-07, "loss": 2.2347, "step": 855 }, { "epoch": 0.9398907103825137, "grad_norm": 1.7652819800497817, "learning_rate": 9.040637836947072e-07, "loss": 2.2397, "step": 860 }, { "epoch": 0.9453551912568307, "grad_norm": 2.309656697777325, "learning_rate": 8.874717566615452e-07, "loss": 2.2653, "step": 865 }, { "epoch": 0.9508196721311475, "grad_norm": 1.7843612739885204, "learning_rate": 8.711732815644269e-07, "loss": 2.2434, "step": 870 }, { "epoch": 0.9562841530054644, "grad_norm": 1.7109714272710808, "learning_rate": 8.551636758430965e-07, "loss": 2.2745, "step": 875 }, { "epoch": 0.9617486338797814, "grad_norm": 1.7579744867576292, "learning_rate": 8.394383234494619e-07, "loss": 2.2248, "step": 880 }, { "epoch": 0.9672131147540983, "grad_norm": 1.849160968341628, "learning_rate": 8.239926740194595e-07, "loss": 2.251, "step": 885 }, { "epoch": 0.9726775956284153, "grad_norm": 1.861581611214087, "learning_rate": 8.088222420537758e-07, "loss": 2.2483, "step": 890 }, { "epoch": 0.9781420765027322, "grad_norm": 1.7754689870317883, "learning_rate": 7.939226061073428e-07, "loss": 2.2332, "step": 895 }, { "epoch": 0.9836065573770492, "grad_norm": 1.7804277680548917, "learning_rate": 7.792894079875298e-07, "loss": 2.236, "step": 900 }, { "epoch": 0.9890710382513661, "grad_norm": 1.8706013663334191, "learning_rate": 7.649183519609543e-07, "loss": 2.2355, "step": 905 }, { "epoch": 0.994535519125683, "grad_norm": 2.1654744337173804, "learning_rate": 7.508052039688325e-07, "loss": 2.2716, "step": 910 }, { "epoch": 1.0, "grad_norm": 1.8923905206181715, "learning_rate": 7.369457908507959e-07, "loss": 2.2432, "step": 915 }, { "epoch": 1.005464480874317, "grad_norm": 1.8669783431369535, "learning_rate": 7.233359995770941e-07, "loss": 2.0815, "step": 920 }, { "epoch": 1.010928961748634, "grad_norm": 1.8664223098231116, "learning_rate": 7.09971776489111e-07, "loss": 2.1179, "step": 925 }, { "epoch": 1.0163934426229508, "grad_norm": 2.0035276022188526, "learning_rate": 6.968491265481181e-07, "loss": 2.0239, "step": 930 }, { "epoch": 1.0218579234972678, "grad_norm": 1.863776527509761, "learning_rate": 6.839641125921904e-07, "loss": 2.0409, "step": 935 }, { "epoch": 1.0273224043715847, "grad_norm": 1.8550407448868003, "learning_rate": 6.713128546012103e-07, "loss": 2.0766, "step": 940 }, { "epoch": 1.0327868852459017, "grad_norm": 1.836653625153128, "learning_rate": 6.588915289698876e-07, "loss": 2.0376, "step": 945 }, { "epoch": 1.0382513661202186, "grad_norm": 1.8030661735223916, "learning_rate": 6.466963677887208e-07, "loss": 2.0702, "step": 950 }, { "epoch": 1.0437158469945356, "grad_norm": 1.9561619861336457, "learning_rate": 6.347236581328288e-07, "loss": 2.0205, "step": 955 }, { "epoch": 1.0491803278688525, "grad_norm": 1.8564459874702657, "learning_rate": 6.229697413585796e-07, "loss": 1.9857, "step": 960 }, { "epoch": 1.0546448087431695, "grad_norm": 1.8936006989320295, "learning_rate": 6.114310124079459e-07, "loss": 2.0398, "step": 965 }, { "epoch": 1.0601092896174864, "grad_norm": 1.9195082055604684, "learning_rate": 6.001039191205155e-07, "loss": 2.1075, "step": 970 }, { "epoch": 1.0655737704918034, "grad_norm": 1.8455934194504435, "learning_rate": 5.88984961553089e-07, "loss": 2.0609, "step": 975 }, { "epoch": 1.0710382513661203, "grad_norm": 1.8016262977421866, "learning_rate": 5.780706913067893e-07, "loss": 2.0502, "step": 980 }, { "epoch": 1.0765027322404372, "grad_norm": 1.7677047836486395, "learning_rate": 5.673577108616207e-07, "loss": 2.051, "step": 985 }, { "epoch": 1.0819672131147542, "grad_norm": 1.8046971572225328, "learning_rate": 5.568426729184038e-07, "loss": 2.0531, "step": 990 }, { "epoch": 1.0874316939890711, "grad_norm": 1.82987759218833, "learning_rate": 5.465222797480186e-07, "loss": 2.0766, "step": 995 }, { "epoch": 1.092896174863388, "grad_norm": 1.9232052942988758, "learning_rate": 5.3639328254789e-07, "loss": 2.0306, "step": 1000 }, { "epoch": 1.092896174863388, "eval_loss": 2.277691602706909, "eval_runtime": 75.0135, "eval_samples_per_second": 86.744, "eval_steps_per_second": 0.68, "step": 1000 }, { "epoch": 1.098360655737705, "grad_norm": 2.0428068706971323, "learning_rate": 5.264524808056471e-07, "loss": 2.0239, "step": 1005 }, { "epoch": 1.1038251366120218, "grad_norm": 1.8366344916179231, "learning_rate": 5.166967216698893e-07, "loss": 2.0634, "step": 1010 }, { "epoch": 1.1092896174863387, "grad_norm": 1.7965628794594979, "learning_rate": 5.071228993279937e-07, "loss": 2.0611, "step": 1015 }, { "epoch": 1.1147540983606556, "grad_norm": 2.017245605772054, "learning_rate": 4.977279543908971e-07, "loss": 2.0588, "step": 1020 }, { "epoch": 1.1202185792349726, "grad_norm": 1.8313145099357355, "learning_rate": 4.885088732847877e-07, "loss": 2.0667, "step": 1025 }, { "epoch": 1.1256830601092895, "grad_norm": 1.8185093106173156, "learning_rate": 4.794626876496447e-07, "loss": 2.0602, "step": 1030 }, { "epoch": 1.1311475409836065, "grad_norm": 1.7971795633377927, "learning_rate": 4.705864737445532e-07, "loss": 2.0819, "step": 1035 }, { "epoch": 1.1366120218579234, "grad_norm": 1.804877775071399, "learning_rate": 4.6187735185974027e-07, "loss": 2.0733, "step": 1040 }, { "epoch": 1.1420765027322404, "grad_norm": 1.8882874062597697, "learning_rate": 4.53332485735264e-07, "loss": 2.0624, "step": 1045 }, { "epoch": 1.1475409836065573, "grad_norm": 1.8725308660946791, "learning_rate": 4.4494908198629223e-07, "loss": 2.0751, "step": 1050 }, { "epoch": 1.1530054644808743, "grad_norm": 1.832601726764631, "learning_rate": 4.3672438953490993e-07, "loss": 2.0633, "step": 1055 }, { "epoch": 1.1584699453551912, "grad_norm": 1.864300674472044, "learning_rate": 4.2865569904839347e-07, "loss": 2.0313, "step": 1060 }, { "epoch": 1.1639344262295082, "grad_norm": 1.8685178757975862, "learning_rate": 4.2074034238388927e-07, "loss": 2.0323, "step": 1065 }, { "epoch": 1.169398907103825, "grad_norm": 1.9710058409039382, "learning_rate": 4.129756920394366e-07, "loss": 2.0582, "step": 1070 }, { "epoch": 1.174863387978142, "grad_norm": 1.8687716914721453, "learning_rate": 4.0535916061127434e-07, "loss": 2.0985, "step": 1075 }, { "epoch": 1.180327868852459, "grad_norm": 1.8500938997613081, "learning_rate": 3.9788820025736986e-07, "loss": 2.0767, "step": 1080 }, { "epoch": 1.185792349726776, "grad_norm": 1.8312389667512146, "learning_rate": 3.905603021671151e-07, "loss": 2.0657, "step": 1085 }, { "epoch": 1.1912568306010929, "grad_norm": 1.867588510082228, "learning_rate": 3.833729960371216e-07, "loss": 2.0341, "step": 1090 }, { "epoch": 1.1967213114754098, "grad_norm": 2.0320023546596793, "learning_rate": 3.763238495530669e-07, "loss": 2.0428, "step": 1095 }, { "epoch": 1.2021857923497268, "grad_norm": 1.8098546698756057, "learning_rate": 3.6941046787752674e-07, "loss": 2.0333, "step": 1100 }, { "epoch": 1.2076502732240437, "grad_norm": 1.8386846652746143, "learning_rate": 3.626304931437368e-07, "loss": 2.0554, "step": 1105 }, { "epoch": 1.2131147540983607, "grad_norm": 1.8784877277085623, "learning_rate": 3.559816039552281e-07, "loss": 2.0227, "step": 1110 }, { "epoch": 1.2185792349726776, "grad_norm": 1.8657671721210465, "learning_rate": 3.494615148912776e-07, "loss": 2.0451, "step": 1115 }, { "epoch": 1.2240437158469946, "grad_norm": 1.8007719413939671, "learning_rate": 3.430679760181184e-07, "loss": 2.0583, "step": 1120 }, { "epoch": 1.2295081967213115, "grad_norm": 1.870766368370614, "learning_rate": 3.367987724058537e-07, "loss": 2.0488, "step": 1125 }, { "epoch": 1.2349726775956285, "grad_norm": 1.8322491897546949, "learning_rate": 3.3065172365101784e-07, "loss": 2.0705, "step": 1130 }, { "epoch": 1.2404371584699454, "grad_norm": 1.8536889671409005, "learning_rate": 3.2462468340473055e-07, "loss": 2.0704, "step": 1135 }, { "epoch": 1.2459016393442623, "grad_norm": 1.8359760782064882, "learning_rate": 3.1871553890638926e-07, "loss": 2.0451, "step": 1140 }, { "epoch": 1.2513661202185793, "grad_norm": 1.8344433175084502, "learning_rate": 3.129222105228447e-07, "loss": 2.0329, "step": 1145 }, { "epoch": 1.2568306010928962, "grad_norm": 1.8537487825008587, "learning_rate": 3.0724265129300667e-07, "loss": 2.0534, "step": 1150 }, { "epoch": 1.2622950819672132, "grad_norm": 1.8408591589072794, "learning_rate": 3.016748464778264e-07, "loss": 2.0942, "step": 1155 }, { "epoch": 1.2677595628415301, "grad_norm": 1.8078246234829183, "learning_rate": 2.962168131156018e-07, "loss": 2.1283, "step": 1160 }, { "epoch": 1.273224043715847, "grad_norm": 2.1814718317735906, "learning_rate": 2.9086659958255433e-07, "loss": 2.0702, "step": 1165 }, { "epoch": 1.278688524590164, "grad_norm": 1.8045654405880427, "learning_rate": 2.85622285158624e-07, "loss": 2.0367, "step": 1170 }, { "epoch": 1.2841530054644807, "grad_norm": 1.8472377696538738, "learning_rate": 2.804819795984313e-07, "loss": 2.0416, "step": 1175 }, { "epoch": 1.289617486338798, "grad_norm": 1.905366081667851, "learning_rate": 2.7544382270735544e-07, "loss": 2.0775, "step": 1180 }, { "epoch": 1.2950819672131146, "grad_norm": 1.9614873478866022, "learning_rate": 2.7050598392267637e-07, "loss": 2.0373, "step": 1185 }, { "epoch": 1.3005464480874318, "grad_norm": 1.8310655830774525, "learning_rate": 2.6566666189973166e-07, "loss": 1.9924, "step": 1190 }, { "epoch": 1.3060109289617485, "grad_norm": 1.7818072860023078, "learning_rate": 2.609240841030368e-07, "loss": 2.0684, "step": 1195 }, { "epoch": 1.3114754098360657, "grad_norm": 1.9094992756338325, "learning_rate": 2.5627650640232037e-07, "loss": 2.0095, "step": 1200 }, { "epoch": 1.3114754098360657, "eval_loss": 2.275972366333008, "eval_runtime": 75.1224, "eval_samples_per_second": 86.619, "eval_steps_per_second": 0.679, "step": 1200 }, { "epoch": 1.3169398907103824, "grad_norm": 1.8692781473355056, "learning_rate": 2.517222126734241e-07, "loss": 2.0688, "step": 1205 }, { "epoch": 1.3224043715846996, "grad_norm": 1.952487765852821, "learning_rate": 2.4725951440401845e-07, "loss": 2.0702, "step": 1210 }, { "epoch": 1.3278688524590163, "grad_norm": 1.8695789749802114, "learning_rate": 2.428867503040866e-07, "loss": 2.0588, "step": 1215 }, { "epoch": 1.3333333333333333, "grad_norm": 1.8261227342272521, "learning_rate": 2.386022859211273e-07, "loss": 2.0136, "step": 1220 }, { "epoch": 1.3387978142076502, "grad_norm": 1.7222392372978628, "learning_rate": 2.3440451326002926e-07, "loss": 2.0569, "step": 1225 }, { "epoch": 1.3442622950819672, "grad_norm": 1.9067634498498296, "learning_rate": 2.3029185040757038e-07, "loss": 2.0261, "step": 1230 }, { "epoch": 1.349726775956284, "grad_norm": 1.8423558641225324, "learning_rate": 2.262627411614938e-07, "loss": 2.0907, "step": 1235 }, { "epoch": 1.355191256830601, "grad_norm": 1.840981313277747, "learning_rate": 2.2231565466411502e-07, "loss": 2.0525, "step": 1240 }, { "epoch": 1.360655737704918, "grad_norm": 1.9131767631552514, "learning_rate": 2.184490850404133e-07, "loss": 2.0632, "step": 1245 }, { "epoch": 1.366120218579235, "grad_norm": 1.787761186102589, "learning_rate": 2.146615510405616e-07, "loss": 2.0723, "step": 1250 }, { "epoch": 1.3715846994535519, "grad_norm": 2.0011420142483685, "learning_rate": 2.1095159568685124e-07, "loss": 2.0347, "step": 1255 }, { "epoch": 1.3770491803278688, "grad_norm": 2.04737119019968, "learning_rate": 2.0731778592496148e-07, "loss": 2.0157, "step": 1260 }, { "epoch": 1.3825136612021858, "grad_norm": 1.8546271061398376, "learning_rate": 2.03758712279536e-07, "loss": 2.0558, "step": 1265 }, { "epoch": 1.3879781420765027, "grad_norm": 1.823598167691669, "learning_rate": 2.0027298851401635e-07, "loss": 2.0707, "step": 1270 }, { "epoch": 1.3934426229508197, "grad_norm": 1.8364088203878515, "learning_rate": 1.968592512946914e-07, "loss": 2.0616, "step": 1275 }, { "epoch": 1.3989071038251366, "grad_norm": 1.8504039969740431, "learning_rate": 1.935161598589178e-07, "loss": 2.0442, "step": 1280 }, { "epoch": 1.4043715846994536, "grad_norm": 1.8647380065818375, "learning_rate": 1.902423956874689e-07, "loss": 2.0309, "step": 1285 }, { "epoch": 1.4098360655737705, "grad_norm": 1.8378312471521248, "learning_rate": 1.870366621809691e-07, "loss": 2.0322, "step": 1290 }, { "epoch": 1.4153005464480874, "grad_norm": 1.925113709625938, "learning_rate": 1.8389768434037062e-07, "loss": 2.0688, "step": 1295 }, { "epoch": 1.4207650273224044, "grad_norm": 1.860586095258991, "learning_rate": 1.8082420845143144e-07, "loss": 2.0745, "step": 1300 }, { "epoch": 1.4262295081967213, "grad_norm": 1.8852545058774595, "learning_rate": 1.778150017731515e-07, "loss": 2.076, "step": 1305 }, { "epoch": 1.4316939890710383, "grad_norm": 1.941199221075769, "learning_rate": 1.7486885223012617e-07, "loss": 2.0019, "step": 1310 }, { "epoch": 1.4371584699453552, "grad_norm": 1.8446186191327532, "learning_rate": 1.719845681087774e-07, "loss": 2.0626, "step": 1315 }, { "epoch": 1.4426229508196722, "grad_norm": 1.9134786622014528, "learning_rate": 1.6916097775741735e-07, "loss": 2.0477, "step": 1320 }, { "epoch": 1.4480874316939891, "grad_norm": 1.8176316380129849, "learning_rate": 1.6639692929010962e-07, "loss": 2.0296, "step": 1325 }, { "epoch": 1.453551912568306, "grad_norm": 1.8469951284525707, "learning_rate": 1.636912902942842e-07, "loss": 2.0342, "step": 1330 }, { "epoch": 1.459016393442623, "grad_norm": 1.8740314655221872, "learning_rate": 1.6104294754206772e-07, "loss": 2.0445, "step": 1335 }, { "epoch": 1.46448087431694, "grad_norm": 2.003913447054603, "learning_rate": 1.5845080670528932e-07, "loss": 2.0545, "step": 1340 }, { "epoch": 1.469945355191257, "grad_norm": 1.856769949790638, "learning_rate": 1.559137920741231e-07, "loss": 2.0106, "step": 1345 }, { "epoch": 1.4754098360655736, "grad_norm": 1.9009712565408305, "learning_rate": 1.534308462793285e-07, "loss": 2.0312, "step": 1350 }, { "epoch": 1.4808743169398908, "grad_norm": 1.8649550487045021, "learning_rate": 1.5100093001805e-07, "loss": 2.058, "step": 1355 }, { "epoch": 1.4863387978142075, "grad_norm": 1.8282766283581593, "learning_rate": 1.486230217831383e-07, "loss": 2.0109, "step": 1360 }, { "epoch": 1.4918032786885247, "grad_norm": 1.8341029485175546, "learning_rate": 1.462961175959548e-07, "loss": 2.0767, "step": 1365 }, { "epoch": 1.4972677595628414, "grad_norm": 1.9503794173682378, "learning_rate": 1.4401923074262253e-07, "loss": 2.0394, "step": 1370 }, { "epoch": 1.5027322404371586, "grad_norm": 1.8750212125931591, "learning_rate": 1.417913915136858e-07, "loss": 2.061, "step": 1375 }, { "epoch": 1.5081967213114753, "grad_norm": 1.850380238557527, "learning_rate": 1.3961164694714208e-07, "loss": 2.1208, "step": 1380 }, { "epoch": 1.5136612021857925, "grad_norm": 1.883450204664466, "learning_rate": 1.3747906057481e-07, "loss": 2.041, "step": 1385 }, { "epoch": 1.5191256830601092, "grad_norm": 2.0647139754404673, "learning_rate": 1.3539271217199617e-07, "loss": 2.0448, "step": 1390 }, { "epoch": 1.5245901639344264, "grad_norm": 1.8198932393101204, "learning_rate": 1.3335169751042653e-07, "loss": 2.0706, "step": 1395 }, { "epoch": 1.530054644808743, "grad_norm": 1.8262850198089926, "learning_rate": 1.3135512811440523e-07, "loss": 2.0539, "step": 1400 }, { "epoch": 1.530054644808743, "eval_loss": 2.274564743041992, "eval_runtime": 75.0617, "eval_samples_per_second": 86.689, "eval_steps_per_second": 0.679, "step": 1400 }, { "epoch": 1.5355191256830603, "grad_norm": 2.1787152604764377, "learning_rate": 1.294021310201668e-07, "loss": 2.0272, "step": 1405 }, { "epoch": 1.540983606557377, "grad_norm": 1.895178288065996, "learning_rate": 1.2749184853838634e-07, "loss": 2.0395, "step": 1410 }, { "epoch": 1.5464480874316942, "grad_norm": 1.8243874586884308, "learning_rate": 1.2562343801981296e-07, "loss": 2.0385, "step": 1415 }, { "epoch": 1.5519125683060109, "grad_norm": 1.8215546298276755, "learning_rate": 1.237960716239925e-07, "loss": 2.0299, "step": 1420 }, { "epoch": 1.5573770491803278, "grad_norm": 1.9144206068231184, "learning_rate": 1.2200893609104527e-07, "loss": 2.0693, "step": 1425 }, { "epoch": 1.5628415300546448, "grad_norm": 1.832800068410983, "learning_rate": 1.2026123251646523e-07, "loss": 2.0911, "step": 1430 }, { "epoch": 1.5683060109289617, "grad_norm": 1.862861787979993, "learning_rate": 1.1855217612890718e-07, "loss": 2.0475, "step": 1435 }, { "epoch": 1.5737704918032787, "grad_norm": 2.0333731738009293, "learning_rate": 1.1688099607092871e-07, "loss": 2.0482, "step": 1440 }, { "epoch": 1.5792349726775956, "grad_norm": 1.8091303840445014, "learning_rate": 1.1524693518265448e-07, "loss": 2.0482, "step": 1445 }, { "epoch": 1.5846994535519126, "grad_norm": 1.864510400120361, "learning_rate": 1.136492497883297e-07, "loss": 2.0948, "step": 1450 }, { "epoch": 1.5901639344262295, "grad_norm": 1.9652692377864456, "learning_rate": 1.1208720948573126e-07, "loss": 2.0189, "step": 1455 }, { "epoch": 1.5956284153005464, "grad_norm": 1.9202417329675314, "learning_rate": 1.1056009693840394e-07, "loss": 2.078, "step": 1460 }, { "epoch": 1.6010928961748634, "grad_norm": 1.8209643705209526, "learning_rate": 1.0906720767069055e-07, "loss": 2.0417, "step": 1465 }, { "epoch": 1.6065573770491803, "grad_norm": 1.8079220297452976, "learning_rate": 1.0760784986552422e-07, "loss": 2.041, "step": 1470 }, { "epoch": 1.6120218579234973, "grad_norm": 1.9197478181290593, "learning_rate": 1.0618134416495201e-07, "loss": 2.0091, "step": 1475 }, { "epoch": 1.6174863387978142, "grad_norm": 1.8520224557231018, "learning_rate": 1.0478702347335883e-07, "loss": 2.0082, "step": 1480 }, { "epoch": 1.6229508196721312, "grad_norm": 1.8273112792953872, "learning_rate": 1.0342423276336188e-07, "loss": 2.0446, "step": 1485 }, { "epoch": 1.6284153005464481, "grad_norm": 1.8630686076964935, "learning_rate": 1.0209232888434338e-07, "loss": 2.0629, "step": 1490 }, { "epoch": 1.633879781420765, "grad_norm": 1.8271338340219678, "learning_rate": 1.0079068037359431e-07, "loss": 2.0609, "step": 1495 }, { "epoch": 1.639344262295082, "grad_norm": 1.8144459145882181, "learning_rate": 9.951866727003745e-08, "loss": 2.0364, "step": 1500 }, { "epoch": 1.644808743169399, "grad_norm": 1.8507906861527859, "learning_rate": 9.827568093050098e-08, "loss": 2.0506, "step": 1505 }, { "epoch": 1.650273224043716, "grad_norm": 1.8319335425047658, "learning_rate": 9.706112384851353e-08, "loss": 2.0253, "step": 1510 }, { "epoch": 1.6557377049180326, "grad_norm": 1.8984240206563825, "learning_rate": 9.587440947559151e-08, "loss": 2.0648, "step": 1515 }, { "epoch": 1.6612021857923498, "grad_norm": 1.8507118687181, "learning_rate": 9.471496204499047e-08, "loss": 2.0231, "step": 1520 }, { "epoch": 1.6666666666666665, "grad_norm": 1.9702444306141391, "learning_rate": 9.358221639789162e-08, "loss": 2.0409, "step": 1525 }, { "epoch": 1.6721311475409837, "grad_norm": 1.8247902367319633, "learning_rate": 9.247561781199593e-08, "loss": 2.0205, "step": 1530 }, { "epoch": 1.6775956284153004, "grad_norm": 1.8563007653589343, "learning_rate": 9.139462183249743e-08, "loss": 2.0488, "step": 1535 }, { "epoch": 1.6830601092896176, "grad_norm": 1.8206552163382879, "learning_rate": 9.033869410540892e-08, "loss": 2.0166, "step": 1540 }, { "epoch": 1.6885245901639343, "grad_norm": 1.9126349973731116, "learning_rate": 8.930731021321133e-08, "loss": 2.0486, "step": 1545 }, { "epoch": 1.6939890710382515, "grad_norm": 1.8565348783702142, "learning_rate": 8.829995551280143e-08, "loss": 2.0342, "step": 1550 }, { "epoch": 1.6994535519125682, "grad_norm": 1.909638179979103, "learning_rate": 8.731612497570976e-08, "loss": 2.073, "step": 1555 }, { "epoch": 1.7049180327868854, "grad_norm": 1.8690803406902856, "learning_rate": 8.635532303056259e-08, "loss": 2.0231, "step": 1560 }, { "epoch": 1.710382513661202, "grad_norm": 1.8909253306804354, "learning_rate": 8.541706340776192e-08, "loss": 2.0341, "step": 1565 }, { "epoch": 1.7158469945355193, "grad_norm": 1.8844764835255978, "learning_rate": 8.450086898635676e-08, "loss": 2.0347, "step": 1570 }, { "epoch": 1.721311475409836, "grad_norm": 1.75571408467022, "learning_rate": 8.360627164308056e-08, "loss": 2.0801, "step": 1575 }, { "epoch": 1.7267759562841531, "grad_norm": 1.8436023942890172, "learning_rate": 8.273281210352872e-08, "loss": 2.0365, "step": 1580 }, { "epoch": 1.7322404371584699, "grad_norm": 1.9085196731178369, "learning_rate": 8.188003979545094e-08, "loss": 2.0531, "step": 1585 }, { "epoch": 1.737704918032787, "grad_norm": 1.8822253588875573, "learning_rate": 8.104751270413362e-08, "loss": 2.0784, "step": 1590 }, { "epoch": 1.7431693989071038, "grad_norm": 1.788484127481047, "learning_rate": 8.02347972298469e-08, "loss": 2.0478, "step": 1595 }, { "epoch": 1.748633879781421, "grad_norm": 1.8239121636996685, "learning_rate": 7.944146804733213e-08, "loss": 2.0338, "step": 1600 }, { "epoch": 1.748633879781421, "eval_loss": 2.2742836475372314, "eval_runtime": 75.0576, "eval_samples_per_second": 86.693, "eval_steps_per_second": 0.679, "step": 1600 }, { "epoch": 1.7540983606557377, "grad_norm": 1.9239534582352587, "learning_rate": 7.866710796730526e-08, "loss": 2.0631, "step": 1605 }, { "epoch": 1.7595628415300546, "grad_norm": 1.8903502091457296, "learning_rate": 7.791130779995196e-08, "loss": 2.0572, "step": 1610 }, { "epoch": 1.7650273224043715, "grad_norm": 1.8293603401943201, "learning_rate": 7.717366622039046e-08, "loss": 2.0668, "step": 1615 }, { "epoch": 1.7704918032786885, "grad_norm": 1.9173015833072757, "learning_rate": 7.64537896360787e-08, "loss": 2.0435, "step": 1620 }, { "epoch": 1.7759562841530054, "grad_norm": 1.8793531033612623, "learning_rate": 7.575129205614193e-08, "loss": 2.0722, "step": 1625 }, { "epoch": 1.7814207650273224, "grad_norm": 2.0902523194542084, "learning_rate": 7.50657949625979e-08, "loss": 2.0433, "step": 1630 }, { "epoch": 1.7868852459016393, "grad_norm": 1.9107654817346211, "learning_rate": 7.439692718345629e-08, "loss": 2.0456, "step": 1635 }, { "epoch": 1.7923497267759563, "grad_norm": 1.9360239043323952, "learning_rate": 7.374432476766986e-08, "loss": 2.006, "step": 1640 }, { "epoch": 1.7978142076502732, "grad_norm": 1.9192977806152298, "learning_rate": 7.310763086191462e-08, "loss": 2.0468, "step": 1645 }, { "epoch": 1.8032786885245902, "grad_norm": 1.8243879474864746, "learning_rate": 7.248649558917661e-08, "loss": 2.0798, "step": 1650 }, { "epoch": 1.8087431693989071, "grad_norm": 1.8320967907842092, "learning_rate": 7.18805759291233e-08, "loss": 2.0515, "step": 1655 }, { "epoch": 1.814207650273224, "grad_norm": 1.8532616512840305, "learning_rate": 7.128953560023773e-08, "loss": 2.0775, "step": 1660 }, { "epoch": 1.819672131147541, "grad_norm": 1.838552382273461, "learning_rate": 7.071304494369334e-08, "loss": 2.0479, "step": 1665 }, { "epoch": 1.825136612021858, "grad_norm": 1.924941490211915, "learning_rate": 7.015078080894855e-08, "loss": 2.0786, "step": 1670 }, { "epoch": 1.830601092896175, "grad_norm": 2.143894042689188, "learning_rate": 6.960242644103938e-08, "loss": 2.0834, "step": 1675 }, { "epoch": 1.8360655737704918, "grad_norm": 1.8651804657911415, "learning_rate": 6.906767136954927e-08, "loss": 2.0642, "step": 1680 }, { "epoch": 1.8415300546448088, "grad_norm": 1.9275400611989582, "learning_rate": 6.854621129923514e-08, "loss": 2.0485, "step": 1685 }, { "epoch": 1.8469945355191257, "grad_norm": 1.861313763790637, "learning_rate": 6.803774800228914e-08, "loss": 2.0999, "step": 1690 }, { "epoch": 1.8524590163934427, "grad_norm": 1.8930724854627998, "learning_rate": 6.754198921221566e-08, "loss": 2.0448, "step": 1695 }, { "epoch": 1.8579234972677594, "grad_norm": 1.8993885693049763, "learning_rate": 6.705864851930317e-08, "loss": 2.0511, "step": 1700 }, { "epoch": 1.8633879781420766, "grad_norm": 1.883117813333527, "learning_rate": 6.658744526767117e-08, "loss": 2.0503, "step": 1705 }, { "epoch": 1.8688524590163933, "grad_norm": 1.848538549934253, "learning_rate": 6.612810445387236e-08, "loss": 2.0636, "step": 1710 }, { "epoch": 1.8743169398907105, "grad_norm": 1.8938252963914626, "learning_rate": 6.568035662702993e-08, "loss": 2.0718, "step": 1715 }, { "epoch": 1.8797814207650272, "grad_norm": 1.941319110309079, "learning_rate": 6.524393779049134e-08, "loss": 2.0647, "step": 1720 }, { "epoch": 1.8852459016393444, "grad_norm": 2.0070472308658207, "learning_rate": 6.481858930497878e-08, "loss": 2.0546, "step": 1725 }, { "epoch": 1.890710382513661, "grad_norm": 2.054630166123197, "learning_rate": 6.440405779321743e-08, "loss": 2.0349, "step": 1730 }, { "epoch": 1.8961748633879782, "grad_norm": 1.8560973783317283, "learning_rate": 6.40000950460228e-08, "loss": 2.053, "step": 1735 }, { "epoch": 1.901639344262295, "grad_norm": 1.8560379830723175, "learning_rate": 6.360645792982822e-08, "loss": 2.0397, "step": 1740 }, { "epoch": 1.9071038251366121, "grad_norm": 1.8759906071094705, "learning_rate": 6.322290829563445e-08, "loss": 2.0582, "step": 1745 }, { "epoch": 1.9125683060109289, "grad_norm": 1.8523523069150685, "learning_rate": 6.284921288936269e-08, "loss": 2.0589, "step": 1750 }, { "epoch": 1.918032786885246, "grad_norm": 1.7917256365306369, "learning_rate": 6.248514326359321e-08, "loss": 2.0742, "step": 1755 }, { "epoch": 1.9234972677595628, "grad_norm": 1.841924086545583, "learning_rate": 6.213047569067165e-08, "loss": 2.0714, "step": 1760 }, { "epoch": 1.92896174863388, "grad_norm": 1.8696658012304237, "learning_rate": 6.178499107716513e-08, "loss": 2.0, "step": 1765 }, { "epoch": 1.9344262295081966, "grad_norm": 1.8879810882710348, "learning_rate": 6.144847487965106e-08, "loss": 2.0584, "step": 1770 }, { "epoch": 1.9398907103825138, "grad_norm": 1.819319260545883, "learning_rate": 6.112071702182056e-08, "loss": 2.0353, "step": 1775 }, { "epoch": 1.9453551912568305, "grad_norm": 1.8742671379299753, "learning_rate": 6.080151181288026e-08, "loss": 2.0478, "step": 1780 }, { "epoch": 1.9508196721311475, "grad_norm": 1.8684517998018801, "learning_rate": 6.049065786723472e-08, "loss": 2.0565, "step": 1785 }, { "epoch": 1.9562841530054644, "grad_norm": 1.8109987887119923, "learning_rate": 6.018795802543315e-08, "loss": 2.0587, "step": 1790 }, { "epoch": 1.9617486338797814, "grad_norm": 1.9407341077135385, "learning_rate": 5.98932192763636e-08, "loss": 2.048, "step": 1795 }, { "epoch": 1.9672131147540983, "grad_norm": 1.8354866267003231, "learning_rate": 5.960625268067816e-08, "loss": 2.0648, "step": 1800 }, { "epoch": 1.9672131147540983, "eval_loss": 2.2736637592315674, "eval_runtime": 75.0951, "eval_samples_per_second": 86.65, "eval_steps_per_second": 0.679, "step": 1800 }, { "epoch": 1.9726775956284153, "grad_norm": 1.7987669216918003, "learning_rate": 5.9326873295433023e-08, "loss": 2.0055, "step": 1805 }, { "epoch": 1.9781420765027322, "grad_norm": 1.8771206541798455, "learning_rate": 5.905490009992716e-08, "loss": 2.0875, "step": 1810 }, { "epoch": 1.9836065573770492, "grad_norm": 1.798209591995569, "learning_rate": 5.8790155922723804e-08, "loss": 2.0414, "step": 1815 }, { "epoch": 1.989071038251366, "grad_norm": 1.8140574162134413, "learning_rate": 5.8532467369838935e-08, "loss": 2.0476, "step": 1820 }, { "epoch": 1.994535519125683, "grad_norm": 1.9205008465204905, "learning_rate": 5.82816647540811e-08, "loss": 2.0414, "step": 1825 }, { "epoch": 2.0, "grad_norm": 1.900514133765418, "learning_rate": 5.803758202552724e-08, "loss": 2.0637, "step": 1830 }, { "epoch": 2.0054644808743167, "grad_norm": 1.812710905593721, "learning_rate": 5.780005670311929e-08, "loss": 2.0017, "step": 1835 }, { "epoch": 2.010928961748634, "grad_norm": 1.8787871615638423, "learning_rate": 5.756892980736625e-08, "loss": 1.9808, "step": 1840 }, { "epoch": 2.0163934426229506, "grad_norm": 1.9008500311802838, "learning_rate": 5.7344045794137134e-08, "loss": 2.0183, "step": 1845 }, { "epoch": 2.021857923497268, "grad_norm": 1.8439766004122011, "learning_rate": 5.7125252489529687e-08, "loss": 2.0492, "step": 1850 }, { "epoch": 2.0273224043715845, "grad_norm": 1.8093023853453647, "learning_rate": 5.6912401025800444e-08, "loss": 2.0498, "step": 1855 }, { "epoch": 2.0327868852459017, "grad_norm": 1.8621731843549314, "learning_rate": 5.670534577834171e-08, "loss": 2.0566, "step": 1860 }, { "epoch": 2.0382513661202184, "grad_norm": 1.7945188171488486, "learning_rate": 5.6503944303690994e-08, "loss": 2.0399, "step": 1865 }, { "epoch": 2.0437158469945356, "grad_norm": 1.8231871269338034, "learning_rate": 5.630805727855896e-08, "loss": 2.0348, "step": 1870 }, { "epoch": 2.0491803278688523, "grad_norm": 1.9219456613473263, "learning_rate": 5.611754843986178e-08, "loss": 2.0056, "step": 1875 }, { "epoch": 2.0546448087431695, "grad_norm": 1.7850350529775676, "learning_rate": 5.5932284525744105e-08, "loss": 2.0062, "step": 1880 }, { "epoch": 2.060109289617486, "grad_norm": 1.9708916029467265, "learning_rate": 5.5752135217578976e-08, "loss": 2.0024, "step": 1885 }, { "epoch": 2.0655737704918034, "grad_norm": 1.913183828308229, "learning_rate": 5.55769730829312e-08, "loss": 2.0277, "step": 1890 }, { "epoch": 2.07103825136612, "grad_norm": 1.8542316127529779, "learning_rate": 5.5406673519470675e-08, "loss": 2.0015, "step": 1895 }, { "epoch": 2.0765027322404372, "grad_norm": 1.87389845276975, "learning_rate": 5.5241114699822666e-08, "loss": 2.0709, "step": 1900 }, { "epoch": 2.081967213114754, "grad_norm": 1.980231294589721, "learning_rate": 5.508017751734168e-08, "loss": 2.008, "step": 1905 }, { "epoch": 2.087431693989071, "grad_norm": 1.8517464515604878, "learning_rate": 5.492374553279633e-08, "loss": 2.0203, "step": 1910 }, { "epoch": 2.092896174863388, "grad_norm": 1.84630325467075, "learning_rate": 5.477170492195204e-08, "loss": 2.0385, "step": 1915 }, { "epoch": 2.098360655737705, "grad_norm": 1.8768394234332548, "learning_rate": 5.46239444240393e-08, "loss": 2.0187, "step": 1920 }, { "epoch": 2.1038251366120218, "grad_norm": 1.8943060640364853, "learning_rate": 5.4480355291094704e-08, "loss": 2.0574, "step": 1925 }, { "epoch": 2.109289617486339, "grad_norm": 1.8863483174705893, "learning_rate": 5.4340831238162615e-08, "loss": 2.0217, "step": 1930 }, { "epoch": 2.1147540983606556, "grad_norm": 1.8885742771135787, "learning_rate": 5.420526839434506e-08, "loss": 2.0538, "step": 1935 }, { "epoch": 2.120218579234973, "grad_norm": 1.8210903752712588, "learning_rate": 5.4073565254687946e-08, "loss": 2.0324, "step": 1940 }, { "epoch": 2.1256830601092895, "grad_norm": 1.8278501741427702, "learning_rate": 5.3945622632891495e-08, "loss": 2.0376, "step": 1945 }, { "epoch": 2.1311475409836067, "grad_norm": 1.8851985007280183, "learning_rate": 5.382134361483329e-08, "loss": 2.0602, "step": 1950 }, { "epoch": 2.1366120218579234, "grad_norm": 1.8672742611841686, "learning_rate": 5.370063351289204e-08, "loss": 2.0443, "step": 1955 }, { "epoch": 2.1420765027322406, "grad_norm": 1.9023532236989618, "learning_rate": 5.358339982106074e-08, "loss": 2.0178, "step": 1960 }, { "epoch": 2.1475409836065573, "grad_norm": 1.8533754595108112, "learning_rate": 5.346955217083767e-08, "loss": 2.0289, "step": 1965 }, { "epoch": 2.1530054644808745, "grad_norm": 1.8751406718039245, "learning_rate": 5.335900228788407e-08, "loss": 2.0258, "step": 1970 }, { "epoch": 2.158469945355191, "grad_norm": 1.911401329876507, "learning_rate": 5.3251663949437266e-08, "loss": 2.0621, "step": 1975 }, { "epoch": 2.1639344262295084, "grad_norm": 1.8780553903992336, "learning_rate": 5.3147452942468386e-08, "loss": 1.9947, "step": 1980 }, { "epoch": 2.169398907103825, "grad_norm": 1.9417700354104075, "learning_rate": 5.3046287022573567e-08, "loss": 2.0627, "step": 1985 }, { "epoch": 2.1748633879781423, "grad_norm": 1.9335794456687536, "learning_rate": 5.2948085873588114e-08, "loss": 2.0621, "step": 1990 }, { "epoch": 2.180327868852459, "grad_norm": 1.8441134776475825, "learning_rate": 5.2852771067912865e-08, "loss": 2.0741, "step": 1995 }, { "epoch": 2.185792349726776, "grad_norm": 1.93007244053263, "learning_rate": 5.276026602754233e-08, "loss": 2.0297, "step": 2000 }, { "epoch": 2.185792349726776, "eval_loss": 2.2766480445861816, "eval_runtime": 75.0721, "eval_samples_per_second": 86.677, "eval_steps_per_second": 0.679, "step": 2000 }, { "epoch": 2.191256830601093, "grad_norm": 1.8271236407081135, "learning_rate": 5.267049598578416e-08, "loss": 1.998, "step": 2005 }, { "epoch": 2.19672131147541, "grad_norm": 1.9080141032090714, "learning_rate": 5.258338794965976e-08, "loss": 2.0317, "step": 2010 }, { "epoch": 2.202185792349727, "grad_norm": 1.9296381326844356, "learning_rate": 5.2498870662975855e-08, "loss": 2.0527, "step": 2015 }, { "epoch": 2.2076502732240435, "grad_norm": 1.8649383667959, "learning_rate": 5.241687457005712e-08, "loss": 2.0167, "step": 2020 }, { "epoch": 2.2131147540983607, "grad_norm": 1.8733130363096773, "learning_rate": 5.233733178012981e-08, "loss": 2.0553, "step": 2025 }, { "epoch": 2.2185792349726774, "grad_norm": 1.8744906953041462, "learning_rate": 5.226017603234672e-08, "loss": 2.0345, "step": 2030 }, { "epoch": 2.2240437158469946, "grad_norm": 1.853006039099042, "learning_rate": 5.2185342661443896e-08, "loss": 1.9966, "step": 2035 }, { "epoch": 2.2295081967213113, "grad_norm": 1.8910360796325498, "learning_rate": 5.211276856401939e-08, "loss": 2.0135, "step": 2040 }, { "epoch": 2.2349726775956285, "grad_norm": 1.8514291560164504, "learning_rate": 5.2042392165424757e-08, "loss": 2.0205, "step": 2045 }, { "epoch": 2.240437158469945, "grad_norm": 1.8446701052221985, "learning_rate": 5.197415338725999e-08, "loss": 2.0301, "step": 2050 }, { "epoch": 2.2459016393442623, "grad_norm": 1.8588932533873443, "learning_rate": 5.1907993615462615e-08, "loss": 2.0287, "step": 2055 }, { "epoch": 2.251366120218579, "grad_norm": 1.8963789966982134, "learning_rate": 5.1843855668982e-08, "loss": 2.0719, "step": 2060 }, { "epoch": 2.2568306010928962, "grad_norm": 1.9044908060597479, "learning_rate": 5.17816837690297e-08, "loss": 1.9721, "step": 2065 }, { "epoch": 2.262295081967213, "grad_norm": 1.9295350690511475, "learning_rate": 5.172142350889727e-08, "loss": 2.0225, "step": 2070 }, { "epoch": 2.26775956284153, "grad_norm": 1.9304743860423463, "learning_rate": 5.166302182433254e-08, "loss": 2.0263, "step": 2075 }, { "epoch": 2.273224043715847, "grad_norm": 1.8671651062857888, "learning_rate": 5.160642696446577e-08, "loss": 2.0241, "step": 2080 }, { "epoch": 2.278688524590164, "grad_norm": 1.8420650638603713, "learning_rate": 5.155158846327734e-08, "loss": 2.0206, "step": 2085 }, { "epoch": 2.2841530054644807, "grad_norm": 1.8005272409919932, "learning_rate": 5.149845711159822e-08, "loss": 2.0365, "step": 2090 }, { "epoch": 2.289617486338798, "grad_norm": 1.857808778071221, "learning_rate": 5.144698492963522e-08, "loss": 2.0911, "step": 2095 }, { "epoch": 2.2950819672131146, "grad_norm": 1.9093915008013214, "learning_rate": 5.139712514001258e-08, "loss": 2.0428, "step": 2100 }, { "epoch": 2.300546448087432, "grad_norm": 1.8578270044645933, "learning_rate": 5.134883214132186e-08, "loss": 2.0124, "step": 2105 }, { "epoch": 2.3060109289617485, "grad_norm": 1.822793058548045, "learning_rate": 5.130206148217218e-08, "loss": 2.0746, "step": 2110 }, { "epoch": 2.3114754098360657, "grad_norm": 1.8442935114306909, "learning_rate": 5.12567698357328e-08, "loss": 2.0444, "step": 2115 }, { "epoch": 2.3169398907103824, "grad_norm": 1.9739163571989773, "learning_rate": 5.1212914974760244e-08, "loss": 2.0435, "step": 2120 }, { "epoch": 2.3224043715846996, "grad_norm": 1.8469690810911081, "learning_rate": 5.117045574710235e-08, "loss": 2.0545, "step": 2125 }, { "epoch": 2.3278688524590163, "grad_norm": 1.8661244475654946, "learning_rate": 5.112935205167153e-08, "loss": 2.0058, "step": 2130 }, { "epoch": 2.3333333333333335, "grad_norm": 1.8937750326566802, "learning_rate": 5.108956481487976e-08, "loss": 2.0293, "step": 2135 }, { "epoch": 2.33879781420765, "grad_norm": 1.9208419450347225, "learning_rate": 5.105105596752788e-08, "loss": 2.0414, "step": 2140 }, { "epoch": 2.3442622950819674, "grad_norm": 2.1121282228006555, "learning_rate": 5.101378842214193e-08, "loss": 2.0869, "step": 2145 }, { "epoch": 2.349726775956284, "grad_norm": 1.8848256373264323, "learning_rate": 5.0977726050749185e-08, "loss": 2.0614, "step": 2150 }, { "epoch": 2.3551912568306013, "grad_norm": 1.8507097430278243, "learning_rate": 5.094283366308685e-08, "loss": 2.0249, "step": 2155 }, { "epoch": 2.360655737704918, "grad_norm": 1.8890593768953334, "learning_rate": 5.0909076985236385e-08, "loss": 2.0068, "step": 2160 }, { "epoch": 2.366120218579235, "grad_norm": 1.8651383954059584, "learning_rate": 5.0876422638676395e-08, "loss": 2.0044, "step": 2165 }, { "epoch": 2.371584699453552, "grad_norm": 1.860146100854827, "learning_rate": 5.084483811974733e-08, "loss": 2.054, "step": 2170 }, { "epoch": 2.3770491803278686, "grad_norm": 1.7761767522433785, "learning_rate": 5.0814291779521236e-08, "loss": 2.0229, "step": 2175 }, { "epoch": 2.3825136612021858, "grad_norm": 1.8386788502881166, "learning_rate": 5.078475280406979e-08, "loss": 2.0662, "step": 2180 }, { "epoch": 2.387978142076503, "grad_norm": 1.990375014859749, "learning_rate": 5.075619119512409e-08, "loss": 2.0393, "step": 2185 }, { "epoch": 2.3934426229508197, "grad_norm": 2.2419457883038314, "learning_rate": 5.0728577751119725e-08, "loss": 2.0523, "step": 2190 }, { "epoch": 2.3989071038251364, "grad_norm": 1.9472851198204904, "learning_rate": 5.0701884048620594e-08, "loss": 2.0433, "step": 2195 }, { "epoch": 2.4043715846994536, "grad_norm": 1.8641814407570831, "learning_rate": 5.067608242411532e-08, "loss": 2.0487, "step": 2200 }, { "epoch": 2.4043715846994536, "eval_loss": 2.276731014251709, "eval_runtime": 75.0853, "eval_samples_per_second": 86.661, "eval_steps_per_second": 0.679, "step": 2200 }, { "epoch": 2.4098360655737707, "grad_norm": 1.8849638877894628, "learning_rate": 5.065114595617981e-08, "loss": 2.0449, "step": 2205 }, { "epoch": 2.4153005464480874, "grad_norm": 1.897559223397183, "learning_rate": 5.0627048448e-08, "loss": 2.0172, "step": 2210 }, { "epoch": 2.420765027322404, "grad_norm": 1.8881686050860271, "learning_rate": 5.060376441024851e-08, "loss": 2.0104, "step": 2215 }, { "epoch": 2.4262295081967213, "grad_norm": 1.8760582543927924, "learning_rate": 5.0581269044309416e-08, "loss": 2.0514, "step": 2220 }, { "epoch": 2.431693989071038, "grad_norm": 1.8590736579277904, "learning_rate": 5.055953822584505e-08, "loss": 2.0065, "step": 2225 }, { "epoch": 2.4371584699453552, "grad_norm": 2.014653467507829, "learning_rate": 5.0538548488699095e-08, "loss": 2.0011, "step": 2230 }, { "epoch": 2.442622950819672, "grad_norm": 2.0013649829205202, "learning_rate": 5.0518277009130157e-08, "loss": 2.0858, "step": 2235 }, { "epoch": 2.448087431693989, "grad_norm": 1.8662711132468726, "learning_rate": 5.0498701590370246e-08, "loss": 2.0186, "step": 2240 }, { "epoch": 2.453551912568306, "grad_norm": 1.9004909274072246, "learning_rate": 5.047980064750245e-08, "loss": 2.0112, "step": 2245 }, { "epoch": 2.459016393442623, "grad_norm": 1.8857990524183288, "learning_rate": 5.04615531926523e-08, "loss": 2.0886, "step": 2250 }, { "epoch": 2.4644808743169397, "grad_norm": 1.8001269753111797, "learning_rate": 5.04439388204875e-08, "loss": 1.9974, "step": 2255 }, { "epoch": 2.469945355191257, "grad_norm": 1.8470988845468073, "learning_rate": 5.042693769402049e-08, "loss": 1.9826, "step": 2260 }, { "epoch": 2.4754098360655736, "grad_norm": 1.8736758608587534, "learning_rate": 5.041053053070867e-08, "loss": 2.0697, "step": 2265 }, { "epoch": 2.480874316939891, "grad_norm": 1.9221103936145996, "learning_rate": 5.039469858884701e-08, "loss": 2.0596, "step": 2270 }, { "epoch": 2.4863387978142075, "grad_norm": 1.8924072367922147, "learning_rate": 5.037942365424796e-08, "loss": 2.0233, "step": 2275 }, { "epoch": 2.4918032786885247, "grad_norm": 1.8504333814599807, "learning_rate": 5.036468802720349e-08, "loss": 2.0577, "step": 2280 }, { "epoch": 2.4972677595628414, "grad_norm": 1.846494429919382, "learning_rate": 5.035047450972435e-08, "loss": 2.0249, "step": 2285 }, { "epoch": 2.5027322404371586, "grad_norm": 1.8525565278611498, "learning_rate": 5.033676639305158e-08, "loss": 2.0432, "step": 2290 }, { "epoch": 2.5081967213114753, "grad_norm": 1.988134266951729, "learning_rate": 5.0323547445435455e-08, "loss": 2.0604, "step": 2295 }, { "epoch": 2.5136612021857925, "grad_norm": 1.858473570561303, "learning_rate": 5.0310801900177e-08, "loss": 2.0029, "step": 2300 }, { "epoch": 2.519125683060109, "grad_norm": 1.8897075095235156, "learning_rate": 5.029851444392739e-08, "loss": 2.0182, "step": 2305 }, { "epoch": 2.5245901639344264, "grad_norm": 1.9818990480911667, "learning_rate": 5.028667020524067e-08, "loss": 1.9902, "step": 2310 }, { "epoch": 2.530054644808743, "grad_norm": 1.8548937980299227, "learning_rate": 5.027525474337505e-08, "loss": 2.0113, "step": 2315 }, { "epoch": 2.5355191256830603, "grad_norm": 1.9033985343889175, "learning_rate": 5.0264254037338365e-08, "loss": 2.0591, "step": 2320 }, { "epoch": 2.540983606557377, "grad_norm": 1.8954918019108078, "learning_rate": 5.025365447517326e-08, "loss": 2.0424, "step": 2325 }, { "epoch": 2.546448087431694, "grad_norm": 1.8869766835100785, "learning_rate": 5.024344284347762e-08, "loss": 2.03, "step": 2330 }, { "epoch": 2.551912568306011, "grad_norm": 1.8663624978318183, "learning_rate": 5.023360631715606e-08, "loss": 1.976, "step": 2335 }, { "epoch": 2.557377049180328, "grad_norm": 1.8371733503594865, "learning_rate": 5.0224132449398005e-08, "loss": 2.0441, "step": 2340 }, { "epoch": 2.5628415300546448, "grad_norm": 1.9433496190704163, "learning_rate": 5.0215009161878455e-08, "loss": 2.0678, "step": 2345 }, { "epoch": 2.5683060109289615, "grad_norm": 1.9523689339991457, "learning_rate": 5.020622473517704e-08, "loss": 2.0311, "step": 2350 }, { "epoch": 2.5737704918032787, "grad_norm": 1.8890575883757943, "learning_rate": 5.0197767799411424e-08, "loss": 2.0454, "step": 2355 }, { "epoch": 2.579234972677596, "grad_norm": 1.9102594514962234, "learning_rate": 5.0189627325081046e-08, "loss": 2.0324, "step": 2360 }, { "epoch": 2.5846994535519126, "grad_norm": 1.8390946791204932, "learning_rate": 5.018179261411716e-08, "loss": 2.0238, "step": 2365 }, { "epoch": 2.5901639344262293, "grad_norm": 1.9030836331353156, "learning_rate": 5.0174253291135456e-08, "loss": 2.0424, "step": 2370 }, { "epoch": 2.5956284153005464, "grad_norm": 1.9051341411136902, "learning_rate": 5.016699929488718e-08, "loss": 2.0464, "step": 2375 }, { "epoch": 2.6010928961748636, "grad_norm": 1.8905549632374719, "learning_rate": 5.016002086990525e-08, "loss": 2.0401, "step": 2380 }, { "epoch": 2.6065573770491803, "grad_norm": 1.8720851254061621, "learning_rate": 5.015330855834148e-08, "loss": 2.0313, "step": 2385 }, { "epoch": 2.612021857923497, "grad_norm": 1.8630940777989557, "learning_rate": 5.014685319199122e-08, "loss": 2.0418, "step": 2390 }, { "epoch": 2.6174863387978142, "grad_norm": 1.9536600782037399, "learning_rate": 5.014064588450203e-08, "loss": 2.0331, "step": 2395 }, { "epoch": 2.6229508196721314, "grad_norm": 1.8537695794419704, "learning_rate": 5.013467802376257e-08, "loss": 2.0329, "step": 2400 }, { "epoch": 2.6229508196721314, "eval_loss": 2.276965618133545, "eval_runtime": 75.085, "eval_samples_per_second": 86.662, "eval_steps_per_second": 0.679, "step": 2400 }, { "epoch": 2.628415300546448, "grad_norm": 1.856446433994958, "learning_rate": 5.0128941264468425e-08, "loss": 2.059, "step": 2405 }, { "epoch": 2.633879781420765, "grad_norm": 1.8864099698250834, "learning_rate": 5.012342752086127e-08, "loss": 2.0366, "step": 2410 }, { "epoch": 2.639344262295082, "grad_norm": 1.8965062954857936, "learning_rate": 5.011812895963815e-08, "loss": 2.0178, "step": 2415 }, { "epoch": 2.644808743169399, "grad_norm": 1.9283596089907955, "learning_rate": 5.011303799302737e-08, "loss": 2.0664, "step": 2420 }, { "epoch": 2.650273224043716, "grad_norm": 1.8806907065516518, "learning_rate": 5.0108147272027865e-08, "loss": 2.0187, "step": 2425 }, { "epoch": 2.6557377049180326, "grad_norm": 1.9156510863376972, "learning_rate": 5.0103449679808754e-08, "loss": 2.0101, "step": 2430 }, { "epoch": 2.66120218579235, "grad_norm": 2.126231497464025, "learning_rate": 5.009893832526587e-08, "loss": 1.9974, "step": 2435 }, { "epoch": 2.6666666666666665, "grad_norm": 1.9052967091959634, "learning_rate": 5.0094606536732234e-08, "loss": 2.0565, "step": 2440 }, { "epoch": 2.6721311475409837, "grad_norm": 1.8394704411303013, "learning_rate": 5.009044785583931e-08, "loss": 2.0296, "step": 2445 }, { "epoch": 2.6775956284153004, "grad_norm": 1.9096372006865225, "learning_rate": 5.008645603152607e-08, "loss": 2.0317, "step": 2450 }, { "epoch": 2.6830601092896176, "grad_norm": 1.8523989798638827, "learning_rate": 5.0082625014192866e-08, "loss": 2.0261, "step": 2455 }, { "epoch": 2.6885245901639343, "grad_norm": 1.8569921839621404, "learning_rate": 5.007894894999717e-08, "loss": 2.005, "step": 2460 }, { "epoch": 2.6939890710382515, "grad_norm": 1.8836826290467388, "learning_rate": 5.0075422175288365e-08, "loss": 2.0464, "step": 2465 }, { "epoch": 2.699453551912568, "grad_norm": 1.820199895319608, "learning_rate": 5.007203921117863e-08, "loss": 1.9825, "step": 2470 }, { "epoch": 2.7049180327868854, "grad_norm": 1.904515026457721, "learning_rate": 5.006879475824728e-08, "loss": 2.0278, "step": 2475 }, { "epoch": 2.710382513661202, "grad_norm": 1.943681983281218, "learning_rate": 5.006568369137572e-08, "loss": 2.0353, "step": 2480 }, { "epoch": 2.7158469945355193, "grad_norm": 1.8593463289638106, "learning_rate": 5.00627010547103e-08, "loss": 2.0444, "step": 2485 }, { "epoch": 2.721311475409836, "grad_norm": 1.8449163921598035, "learning_rate": 5.005984205675053e-08, "loss": 2.0289, "step": 2490 }, { "epoch": 2.726775956284153, "grad_norm": 1.9258062804823874, "learning_rate": 5.005710206555992e-08, "loss": 1.9806, "step": 2495 }, { "epoch": 2.73224043715847, "grad_norm": 1.886984272428234, "learning_rate": 5.0054476604096995e-08, "loss": 2.0158, "step": 2500 }, { "epoch": 2.737704918032787, "grad_norm": 2.0315158822636548, "learning_rate": 5.0051961345663824e-08, "loss": 2.0218, "step": 2505 }, { "epoch": 2.7431693989071038, "grad_norm": 1.8060895670908574, "learning_rate": 5.0049552109469755e-08, "loss": 2.0242, "step": 2510 }, { "epoch": 2.748633879781421, "grad_norm": 1.8838071016430706, "learning_rate": 5.004724485630778e-08, "loss": 2.0522, "step": 2515 }, { "epoch": 2.7540983606557377, "grad_norm": 1.8537277873432774, "learning_rate": 5.004503568434121e-08, "loss": 1.9872, "step": 2520 }, { "epoch": 2.7595628415300544, "grad_norm": 1.9906937494224455, "learning_rate": 5.004292082499825e-08, "loss": 2.0369, "step": 2525 }, { "epoch": 2.7650273224043715, "grad_norm": 1.8315661557574987, "learning_rate": 5.0040896638972245e-08, "loss": 2.0347, "step": 2530 }, { "epoch": 2.7704918032786887, "grad_norm": 1.9006118838371153, "learning_rate": 5.00389596123252e-08, "loss": 2.0747, "step": 2535 }, { "epoch": 2.7759562841530054, "grad_norm": 1.9282485545253067, "learning_rate": 5.003710635269248e-08, "loss": 2.0238, "step": 2540 }, { "epoch": 2.781420765027322, "grad_norm": 1.8801002631728025, "learning_rate": 5.0035333585586396e-08, "loss": 2.0089, "step": 2545 }, { "epoch": 2.7868852459016393, "grad_norm": 1.8790975179378258, "learning_rate": 5.0033638150796495e-08, "loss": 2.0503, "step": 2550 }, { "epoch": 2.7923497267759565, "grad_norm": 1.8446856614496407, "learning_rate": 5.0032016998884586e-08, "loss": 2.0306, "step": 2555 }, { "epoch": 2.797814207650273, "grad_norm": 1.9023341790959656, "learning_rate": 5.003046718777224e-08, "loss": 2.0464, "step": 2560 }, { "epoch": 2.80327868852459, "grad_norm": 1.8356837164563038, "learning_rate": 5.002898587941882e-08, "loss": 2.0674, "step": 2565 }, { "epoch": 2.808743169398907, "grad_norm": 1.9008745679241117, "learning_rate": 5.002757033658803e-08, "loss": 2.0508, "step": 2570 }, { "epoch": 2.8142076502732243, "grad_norm": 1.8556179817129685, "learning_rate": 5.0026217919700956e-08, "loss": 2.0161, "step": 2575 }, { "epoch": 2.819672131147541, "grad_norm": 1.8898728370320337, "learning_rate": 5.0024926083773705e-08, "loss": 2.0484, "step": 2580 }, { "epoch": 2.8251366120218577, "grad_norm": 1.8869809093319543, "learning_rate": 5.002369237543775e-08, "loss": 2.0164, "step": 2585 }, { "epoch": 2.830601092896175, "grad_norm": 2.0230326546469355, "learning_rate": 5.0022514430041064e-08, "loss": 2.035, "step": 2590 }, { "epoch": 2.836065573770492, "grad_norm": 1.8582179431821813, "learning_rate": 5.002138996882823e-08, "loss": 2.0064, "step": 2595 }, { "epoch": 2.841530054644809, "grad_norm": 1.942996732626371, "learning_rate": 5.002031679619775e-08, "loss": 2.0213, "step": 2600 }, { "epoch": 2.841530054644809, "eval_loss": 2.276575803756714, "eval_runtime": 75.1591, "eval_samples_per_second": 86.576, "eval_steps_per_second": 0.679, "step": 2600 }, { "epoch": 2.8469945355191255, "grad_norm": 1.8406548791619128, "learning_rate": 5.0019292797034756e-08, "loss": 2.0239, "step": 2605 }, { "epoch": 2.8524590163934427, "grad_norm": 1.8433943845894334, "learning_rate": 5.001831593411739e-08, "loss": 2.0306, "step": 2610 }, { "epoch": 2.8579234972677594, "grad_norm": 1.9214309861779986, "learning_rate": 5.0017384245595145e-08, "loss": 2.0792, "step": 2615 }, { "epoch": 2.8633879781420766, "grad_norm": 1.892919325287394, "learning_rate": 5.001649584253754e-08, "loss": 2.0389, "step": 2620 }, { "epoch": 2.8688524590163933, "grad_norm": 1.8352505329010071, "learning_rate": 5.001564890655143e-08, "loss": 2.0385, "step": 2625 }, { "epoch": 2.8743169398907105, "grad_norm": 1.9094968998206336, "learning_rate": 5.001484168746532e-08, "loss": 2.0307, "step": 2630 }, { "epoch": 2.879781420765027, "grad_norm": 1.913583135594377, "learning_rate": 5.001407250107926e-08, "loss": 2.0251, "step": 2635 }, { "epoch": 2.8852459016393444, "grad_norm": 1.8983404483614361, "learning_rate": 5.001333972697852e-08, "loss": 2.0251, "step": 2640 }, { "epoch": 2.890710382513661, "grad_norm": 1.9615347509758865, "learning_rate": 5.001264180640978e-08, "loss": 2.0367, "step": 2645 }, { "epoch": 2.8961748633879782, "grad_norm": 1.9624200439383404, "learning_rate": 5.001197724021815e-08, "loss": 2.062, "step": 2650 }, { "epoch": 2.901639344262295, "grad_norm": 1.8696575109999418, "learning_rate": 5.001134458684368e-08, "loss": 2.0521, "step": 2655 }, { "epoch": 2.907103825136612, "grad_norm": 1.8848143146406755, "learning_rate": 5.001074246037584e-08, "loss": 2.0034, "step": 2660 }, { "epoch": 2.912568306010929, "grad_norm": 1.973139426778756, "learning_rate": 5.001016952866467e-08, "loss": 1.9532, "step": 2665 }, { "epoch": 2.918032786885246, "grad_norm": 1.9504580432497, "learning_rate": 5.000962451148704e-08, "loss": 2.048, "step": 2670 }, { "epoch": 2.9234972677595628, "grad_norm": 1.953413058357899, "learning_rate": 5.0009106178766914e-08, "loss": 2.0661, "step": 2675 }, { "epoch": 2.92896174863388, "grad_norm": 1.881395664536309, "learning_rate": 5.000861334884807e-08, "loss": 2.022, "step": 2680 }, { "epoch": 2.9344262295081966, "grad_norm": 1.8914817780033801, "learning_rate": 5.0008144886818085e-08, "loss": 1.9874, "step": 2685 }, { "epoch": 2.939890710382514, "grad_norm": 2.017787415229173, "learning_rate": 5.000769970288234e-08, "loss": 2.0318, "step": 2690 }, { "epoch": 2.9453551912568305, "grad_norm": 1.8421207610475552, "learning_rate": 5.000727675078668e-08, "loss": 2.0521, "step": 2695 }, { "epoch": 2.9508196721311473, "grad_norm": 1.8659084624986955, "learning_rate": 5.0006875026287623e-08, "loss": 2.0089, "step": 2700 }, { "epoch": 2.9562841530054644, "grad_norm": 1.9075873541304413, "learning_rate": 5.0006493565668884e-08, "loss": 2.0478, "step": 2705 }, { "epoch": 2.9617486338797816, "grad_norm": 1.9745047266267015, "learning_rate": 5.0006131444302976e-08, "loss": 2.0439, "step": 2710 }, { "epoch": 2.9672131147540983, "grad_norm": 1.8923014776736973, "learning_rate": 5.000578777525686e-08, "loss": 2.0554, "step": 2715 }, { "epoch": 2.972677595628415, "grad_norm": 1.9080716218498992, "learning_rate": 5.0005461707940365e-08, "loss": 2.0322, "step": 2720 }, { "epoch": 2.978142076502732, "grad_norm": 1.9393226464756443, "learning_rate": 5.0005152426796475e-08, "loss": 2.0324, "step": 2725 }, { "epoch": 2.9836065573770494, "grad_norm": 1.8889314625477465, "learning_rate": 5.000485915003216e-08, "loss": 2.0421, "step": 2730 }, { "epoch": 2.989071038251366, "grad_norm": 1.9331494885327474, "learning_rate": 5.0004581128388925e-08, "loss": 2.0398, "step": 2735 }, { "epoch": 2.994535519125683, "grad_norm": 1.8595278802194335, "learning_rate": 5.000431764395187e-08, "loss": 2.0376, "step": 2740 }, { "epoch": 3.0, "grad_norm": 1.8505999974369438, "learning_rate": 5.000406800899633e-08, "loss": 2.0272, "step": 2745 }, { "epoch": 3.0054644808743167, "grad_norm": 1.8128081448976874, "learning_rate": 5.00038315648711e-08, "loss": 2.0134, "step": 2750 }, { "epoch": 3.010928961748634, "grad_norm": 1.849783427721221, "learning_rate": 5.000360768091725e-08, "loss": 1.962, "step": 2755 }, { "epoch": 3.0163934426229506, "grad_norm": 1.896562041216816, "learning_rate": 5.0003395753421604e-08, "loss": 2.0457, "step": 2760 }, { "epoch": 3.021857923497268, "grad_norm": 1.9734010112151688, "learning_rate": 5.0003195204603886e-08, "loss": 2.0289, "step": 2765 }, { "epoch": 3.0273224043715845, "grad_norm": 1.94494245556754, "learning_rate": 5.000300548163672e-08, "loss": 2.0502, "step": 2770 }, { "epoch": 3.0327868852459017, "grad_norm": 1.853549834674588, "learning_rate": 5.0002826055697557e-08, "loss": 2.0073, "step": 2775 }, { "epoch": 3.0382513661202184, "grad_norm": 1.9550895805921849, "learning_rate": 5.000265642105161e-08, "loss": 2.0578, "step": 2780 }, { "epoch": 3.0437158469945356, "grad_norm": 1.9832050839540076, "learning_rate": 5.0002496094165e-08, "loss": 2.0593, "step": 2785 }, { "epoch": 3.0491803278688523, "grad_norm": 1.8801978736078537, "learning_rate": 5.000234461284729e-08, "loss": 2.0796, "step": 2790 }, { "epoch": 3.0546448087431695, "grad_norm": 1.880459663640695, "learning_rate": 5.000220153542248e-08, "loss": 2.0813, "step": 2795 }, { "epoch": 3.060109289617486, "grad_norm": 1.8766167117396393, "learning_rate": 5.000206643992788e-08, "loss": 2.0559, "step": 2800 }, { "epoch": 3.060109289617486, "eval_loss": 2.277146816253662, "eval_runtime": 75.0877, "eval_samples_per_second": 86.659, "eval_steps_per_second": 0.679, "step": 2800 }, { "epoch": 3.0655737704918034, "grad_norm": 1.9254106923649494, "learning_rate": 5.000193892333986e-08, "loss": 2.0661, "step": 2805 }, { "epoch": 3.07103825136612, "grad_norm": 1.822299900106021, "learning_rate": 5.000181860082585e-08, "loss": 2.0499, "step": 2810 }, { "epoch": 3.0765027322404372, "grad_norm": 1.8735243038644225, "learning_rate": 5.0001705105021744e-08, "loss": 2.0296, "step": 2815 }, { "epoch": 3.081967213114754, "grad_norm": 1.9563218452911402, "learning_rate": 5.000159808533418e-08, "loss": 1.9812, "step": 2820 }, { "epoch": 3.087431693989071, "grad_norm": 1.8334084094484262, "learning_rate": 5.00014972072667e-08, "loss": 2.0074, "step": 2825 }, { "epoch": 3.092896174863388, "grad_norm": 1.8655682558825502, "learning_rate": 5.000140215176936e-08, "loss": 2.0072, "step": 2830 }, { "epoch": 3.098360655737705, "grad_norm": 1.9205939797923823, "learning_rate": 5.000131261461091e-08, "loss": 1.9616, "step": 2835 }, { "epoch": 3.1038251366120218, "grad_norm": 2.14258246365134, "learning_rate": 5.0001228305773056e-08, "loss": 2.0388, "step": 2840 }, { "epoch": 3.109289617486339, "grad_norm": 1.909977704305264, "learning_rate": 5.000114894886601e-08, "loss": 2.0023, "step": 2845 }, { "epoch": 3.1147540983606556, "grad_norm": 2.0206991852732394, "learning_rate": 5.000107428056477e-08, "loss": 2.0111, "step": 2850 }, { "epoch": 3.120218579234973, "grad_norm": 1.8666014246751432, "learning_rate": 5.000100405006557e-08, "loss": 2.0219, "step": 2855 }, { "epoch": 3.1256830601092895, "grad_norm": 1.9352070214880581, "learning_rate": 5.0000938018561714e-08, "loss": 2.029, "step": 2860 }, { "epoch": 3.1311475409836067, "grad_norm": 1.8998730338754464, "learning_rate": 5.0000875958738443e-08, "loss": 2.014, "step": 2865 }, { "epoch": 3.1366120218579234, "grad_norm": 1.93622910502082, "learning_rate": 5.000081765428609e-08, "loss": 2.0348, "step": 2870 }, { "epoch": 3.1420765027322406, "grad_norm": 1.8895366176405546, "learning_rate": 5.000076289943102e-08, "loss": 2.0577, "step": 2875 }, { "epoch": 3.1475409836065573, "grad_norm": 1.942718295521934, "learning_rate": 5.0000711498483816e-08, "loss": 2.0452, "step": 2880 }, { "epoch": 3.1530054644808745, "grad_norm": 1.8568483287237603, "learning_rate": 5.00006632654042e-08, "loss": 2.0405, "step": 2885 }, { "epoch": 3.158469945355191, "grad_norm": 1.8966452464630115, "learning_rate": 5.00006180233821e-08, "loss": 2.0307, "step": 2890 }, { "epoch": 3.1639344262295084, "grad_norm": 1.8844492467485716, "learning_rate": 5.000057560443445e-08, "loss": 2.038, "step": 2895 }, { "epoch": 3.169398907103825, "grad_norm": 1.9541049062507123, "learning_rate": 5.000053584901716e-08, "loss": 2.0324, "step": 2900 }, { "epoch": 3.1748633879781423, "grad_norm": 1.8762220421293871, "learning_rate": 5.0000498605651776e-08, "loss": 2.0117, "step": 2905 }, { "epoch": 3.180327868852459, "grad_norm": 1.8972364762038987, "learning_rate": 5.000046373056645e-08, "loss": 2.0539, "step": 2910 }, { "epoch": 3.185792349726776, "grad_norm": 1.8830419259378766, "learning_rate": 5.000043108735063e-08, "loss": 2.0143, "step": 2915 }, { "epoch": 3.191256830601093, "grad_norm": 1.8933082226852906, "learning_rate": 5.000040054662314e-08, "loss": 2.0245, "step": 2920 }, { "epoch": 3.19672131147541, "grad_norm": 1.8584858417385766, "learning_rate": 5.000037198571318e-08, "loss": 1.9939, "step": 2925 }, { "epoch": 3.202185792349727, "grad_norm": 1.9078023286100567, "learning_rate": 5.000034528835373e-08, "loss": 2.0418, "step": 2930 }, { "epoch": 3.2076502732240435, "grad_norm": 1.902956383213903, "learning_rate": 5.00003203443872e-08, "loss": 2.0302, "step": 2935 }, { "epoch": 3.2131147540983607, "grad_norm": 1.8818748470466278, "learning_rate": 5.000029704948257e-08, "loss": 2.0637, "step": 2940 }, { "epoch": 3.2185792349726774, "grad_norm": 1.914518786096776, "learning_rate": 5.0000275304863995e-08, "loss": 2.014, "step": 2945 }, { "epoch": 3.2240437158469946, "grad_norm": 1.9857131146213522, "learning_rate": 5.000025501705019e-08, "loss": 2.0159, "step": 2950 }, { "epoch": 3.2295081967213113, "grad_norm": 1.8481404227503944, "learning_rate": 5.000023609760444e-08, "loss": 2.0345, "step": 2955 }, { "epoch": 3.2349726775956285, "grad_norm": 1.9248498594561754, "learning_rate": 5.00002184628948e-08, "loss": 1.9741, "step": 2960 }, { "epoch": 3.240437158469945, "grad_norm": 1.9138227507681809, "learning_rate": 5.000020203386406e-08, "loss": 1.9825, "step": 2965 }, { "epoch": 3.2459016393442623, "grad_norm": 1.9553377832252659, "learning_rate": 5.000018673580931e-08, "loss": 2.0348, "step": 2970 }, { "epoch": 3.251366120218579, "grad_norm": 1.923854238806126, "learning_rate": 5.0000172498170615e-08, "loss": 2.033, "step": 2975 }, { "epoch": 3.2568306010928962, "grad_norm": 1.8966593579783744, "learning_rate": 5.000015925432853e-08, "loss": 2.0051, "step": 2980 }, { "epoch": 3.262295081967213, "grad_norm": 1.8885418073350184, "learning_rate": 5.000014694141023e-08, "loss": 2.0325, "step": 2985 }, { "epoch": 3.26775956284153, "grad_norm": 1.9005648234764283, "learning_rate": 5.000013550010379e-08, "loss": 2.0387, "step": 2990 }, { "epoch": 3.273224043715847, "grad_norm": 1.8497186687415175, "learning_rate": 5.0000124874480465e-08, "loss": 1.9916, "step": 2995 }, { "epoch": 3.278688524590164, "grad_norm": 1.9311355275570043, "learning_rate": 5.000011501182461e-08, "loss": 2.0543, "step": 3000 }, { "epoch": 3.278688524590164, "eval_loss": 2.2772867679595947, "eval_runtime": 75.0871, "eval_samples_per_second": 86.659, "eval_steps_per_second": 0.679, "step": 3000 }, { "epoch": 3.2841530054644807, "grad_norm": 1.8490487579130825, "learning_rate": 5.000010586247099e-08, "loss": 2.0141, "step": 3005 }, { "epoch": 3.289617486338798, "grad_norm": 1.8722847979242898, "learning_rate": 5.0000097379649185e-08, "loss": 2.0399, "step": 3010 }, { "epoch": 3.2950819672131146, "grad_norm": 1.8920328829395436, "learning_rate": 5.000008951933488e-08, "loss": 2.0403, "step": 3015 }, { "epoch": 3.300546448087432, "grad_norm": 1.859765946380407, "learning_rate": 5.000008224010771e-08, "loss": 2.0231, "step": 3020 }, { "epoch": 3.3060109289617485, "grad_norm": 1.889873157845456, "learning_rate": 5.0000075503015504e-08, "loss": 2.0029, "step": 3025 }, { "epoch": 3.3114754098360657, "grad_norm": 1.9194945076344194, "learning_rate": 5.000006927144461e-08, "loss": 2.0375, "step": 3030 }, { "epoch": 3.3169398907103824, "grad_norm": 1.8949475106036582, "learning_rate": 5.000006351099609e-08, "loss": 2.0234, "step": 3035 }, { "epoch": 3.3224043715846996, "grad_norm": 1.925413901133648, "learning_rate": 5.0000058189367665e-08, "loss": 2.0335, "step": 3040 }, { "epoch": 3.3278688524590163, "grad_norm": 1.8637852431158481, "learning_rate": 5.0000053276240954e-08, "loss": 2.0339, "step": 3045 }, { "epoch": 3.3333333333333335, "grad_norm": 1.990488877814686, "learning_rate": 5.0000048743174075e-08, "loss": 2.0116, "step": 3050 }, { "epoch": 3.33879781420765, "grad_norm": 1.9066583759059983, "learning_rate": 5.0000044563499215e-08, "loss": 2.0752, "step": 3055 }, { "epoch": 3.3442622950819674, "grad_norm": 1.9013799438501833, "learning_rate": 5.0000040712225024e-08, "loss": 2.0225, "step": 3060 }, { "epoch": 3.349726775956284, "grad_norm": 1.8226817121910608, "learning_rate": 5.000003716594369e-08, "loss": 2.0035, "step": 3065 }, { "epoch": 3.3551912568306013, "grad_norm": 1.8532253234195688, "learning_rate": 5.000003390274239e-08, "loss": 2.0492, "step": 3070 }, { "epoch": 3.360655737704918, "grad_norm": 1.8666444656750065, "learning_rate": 5.0000030902119114e-08, "loss": 1.9977, "step": 3075 }, { "epoch": 3.366120218579235, "grad_norm": 1.883761246140252, "learning_rate": 5.000002814490251e-08, "loss": 2.0615, "step": 3080 }, { "epoch": 3.371584699453552, "grad_norm": 1.952894075205677, "learning_rate": 5.000002561317571e-08, "loss": 2.0141, "step": 3085 }, { "epoch": 3.3770491803278686, "grad_norm": 1.8928059074027184, "learning_rate": 5.000002329020387e-08, "loss": 2.0403, "step": 3090 }, { "epoch": 3.3825136612021858, "grad_norm": 2.0098225664920224, "learning_rate": 5.0000021160365414e-08, "loss": 2.0737, "step": 3095 }, { "epoch": 3.387978142076503, "grad_norm": 1.8582907329607212, "learning_rate": 5.000001920908665e-08, "loss": 2.0323, "step": 3100 }, { "epoch": 3.3934426229508197, "grad_norm": 1.9004514759105224, "learning_rate": 5.000001742277974e-08, "loss": 2.0378, "step": 3105 }, { "epoch": 3.3989071038251364, "grad_norm": 1.9251008716345306, "learning_rate": 5.0000015788783874e-08, "loss": 1.9869, "step": 3110 }, { "epoch": 3.4043715846994536, "grad_norm": 1.8991213194710543, "learning_rate": 5.000001429530941e-08, "loss": 2.0395, "step": 3115 }, { "epoch": 3.4098360655737707, "grad_norm": 1.8526036080467823, "learning_rate": 5.000001293138501e-08, "loss": 2.0095, "step": 3120 }, { "epoch": 3.4153005464480874, "grad_norm": 1.8346508560296197, "learning_rate": 5.0000011686807445e-08, "loss": 2.0067, "step": 3125 }, { "epoch": 3.420765027322404, "grad_norm": 1.8407135510103516, "learning_rate": 5.000001055209419e-08, "loss": 2.0252, "step": 3130 }, { "epoch": 3.4262295081967213, "grad_norm": 1.8721531260003674, "learning_rate": 5.000000951843842e-08, "loss": 2.0432, "step": 3135 }, { "epoch": 3.431693989071038, "grad_norm": 1.9072265607352163, "learning_rate": 5.0000008577666524e-08, "loss": 2.0312, "step": 3140 }, { "epoch": 3.4371584699453552, "grad_norm": 1.9177737469818847, "learning_rate": 5.000000772219792e-08, "loss": 2.0066, "step": 3145 }, { "epoch": 3.442622950819672, "grad_norm": 1.9149658715997013, "learning_rate": 5.000000694500704e-08, "loss": 2.0064, "step": 3150 }, { "epoch": 3.448087431693989, "grad_norm": 1.9406268887306055, "learning_rate": 5.000000623958742e-08, "loss": 2.0253, "step": 3155 }, { "epoch": 3.453551912568306, "grad_norm": 1.93322985142905, "learning_rate": 5.000000559991787e-08, "loss": 2.0296, "step": 3160 }, { "epoch": 3.459016393442623, "grad_norm": 2.006884694469749, "learning_rate": 5.000000502043047e-08, "loss": 2.015, "step": 3165 }, { "epoch": 3.4644808743169397, "grad_norm": 1.973665285115433, "learning_rate": 5.0000004495980446e-08, "loss": 2.0621, "step": 3170 }, { "epoch": 3.469945355191257, "grad_norm": 1.9098826344464872, "learning_rate": 5.000000402181774e-08, "loss": 2.0137, "step": 3175 }, { "epoch": 3.4754098360655736, "grad_norm": 1.900637639917567, "learning_rate": 5.000000359356028e-08, "loss": 2.0411, "step": 3180 }, { "epoch": 3.480874316939891, "grad_norm": 1.9657694744447054, "learning_rate": 5.0000003207168756e-08, "loss": 2.0667, "step": 3185 }, { "epoch": 3.4863387978142075, "grad_norm": 1.8794891535447487, "learning_rate": 5.000000285892296e-08, "loss": 2.0421, "step": 3190 }, { "epoch": 3.4918032786885247, "grad_norm": 1.9073660767776919, "learning_rate": 5.000000254539948e-08, "loss": 2.0722, "step": 3195 }, { "epoch": 3.4972677595628414, "grad_norm": 1.9968851234028737, "learning_rate": 5.000000226345078e-08, "loss": 2.0317, "step": 3200 }, { "epoch": 3.4972677595628414, "eval_loss": 2.2772328853607178, "eval_runtime": 75.1937, "eval_samples_per_second": 86.536, "eval_steps_per_second": 0.678, "step": 3200 }, { "epoch": 3.5027322404371586, "grad_norm": 1.9363915857414498, "learning_rate": 5.000000201018557e-08, "loss": 2.0378, "step": 3205 }, { "epoch": 3.5081967213114753, "grad_norm": 1.9493487909740663, "learning_rate": 5.0000001782950314e-08, "loss": 2.0429, "step": 3210 }, { "epoch": 3.5136612021857925, "grad_norm": 1.8974490684659184, "learning_rate": 5.000000157931199e-08, "loss": 2.0341, "step": 3215 }, { "epoch": 3.519125683060109, "grad_norm": 1.8750804355544737, "learning_rate": 5.000000139704186e-08, "loss": 2.0143, "step": 3220 }, { "epoch": 3.5245901639344264, "grad_norm": 1.8835309853885958, "learning_rate": 5.0000001234100294e-08, "loss": 2.0252, "step": 3225 }, { "epoch": 3.530054644808743, "grad_norm": 1.9036966438501197, "learning_rate": 5.000000108862262e-08, "loss": 2.0031, "step": 3230 }, { "epoch": 3.5355191256830603, "grad_norm": 1.8701728772297301, "learning_rate": 5.0000000958905794e-08, "loss": 2.0028, "step": 3235 }, { "epoch": 3.540983606557377, "grad_norm": 1.8785086675187268, "learning_rate": 5.000000084339605e-08, "loss": 1.9671, "step": 3240 }, { "epoch": 3.546448087431694, "grad_norm": 1.9287901930232905, "learning_rate": 5.0000000740677285e-08, "loss": 2.0464, "step": 3245 }, { "epoch": 3.551912568306011, "grad_norm": 1.925166946388218, "learning_rate": 5.00000006494603e-08, "loss": 1.9629, "step": 3250 }, { "epoch": 3.557377049180328, "grad_norm": 1.9147306157624264, "learning_rate": 5.000000056857271e-08, "loss": 2.0377, "step": 3255 }, { "epoch": 3.5628415300546448, "grad_norm": 2.00912227135468, "learning_rate": 5.0000000496949596e-08, "loss": 2.0519, "step": 3260 }, { "epoch": 3.5683060109289615, "grad_norm": 1.914393129097604, "learning_rate": 5.000000043362476e-08, "loss": 1.9921, "step": 3265 }, { "epoch": 3.5737704918032787, "grad_norm": 1.8986536102948053, "learning_rate": 5.000000037772264e-08, "loss": 2.037, "step": 3270 }, { "epoch": 3.579234972677596, "grad_norm": 2.1302629939845272, "learning_rate": 5.000000032845078e-08, "loss": 2.0352, "step": 3275 }, { "epoch": 3.5846994535519126, "grad_norm": 1.9083903824546993, "learning_rate": 5.0000000285092845e-08, "loss": 2.0432, "step": 3280 }, { "epoch": 3.5901639344262293, "grad_norm": 1.9795975235944003, "learning_rate": 5.000000024700213e-08, "loss": 2.0047, "step": 3285 }, { "epoch": 3.5956284153005464, "grad_norm": 1.909947661859089, "learning_rate": 5.000000021359558e-08, "loss": 2.031, "step": 3290 }, { "epoch": 3.6010928961748636, "grad_norm": 1.873647300121296, "learning_rate": 5.000000018434823e-08, "loss": 2.0427, "step": 3295 }, { "epoch": 3.6065573770491803, "grad_norm": 1.8870603921175668, "learning_rate": 5.000000015878808e-08, "loss": 1.9943, "step": 3300 }, { "epoch": 3.612021857923497, "grad_norm": 1.83764062220617, "learning_rate": 5.000000013649137e-08, "loss": 2.0278, "step": 3305 }, { "epoch": 3.6174863387978142, "grad_norm": 1.8700657377845233, "learning_rate": 5.0000000117078175e-08, "loss": 2.016, "step": 3310 }, { "epoch": 3.6229508196721314, "grad_norm": 1.9024345479748699, "learning_rate": 5.000000010020843e-08, "loss": 2.0335, "step": 3315 }, { "epoch": 3.628415300546448, "grad_norm": 1.8905483742070606, "learning_rate": 5.000000008557818e-08, "loss": 2.018, "step": 3320 }, { "epoch": 3.633879781420765, "grad_norm": 1.9223349161654961, "learning_rate": 5.0000000072916214e-08, "loss": 2.0213, "step": 3325 }, { "epoch": 3.639344262295082, "grad_norm": 1.823567332382863, "learning_rate": 5.000000006198092e-08, "loss": 1.987, "step": 3330 }, { "epoch": 3.644808743169399, "grad_norm": 1.8810678051906216, "learning_rate": 5.00000000525574e-08, "loss": 1.9769, "step": 3335 }, { "epoch": 3.650273224043716, "grad_norm": 1.9204431005146232, "learning_rate": 5.0000000044454894e-08, "loss": 2.0674, "step": 3340 }, { "epoch": 3.6557377049180326, "grad_norm": 1.8872295947781799, "learning_rate": 5.000000003750432e-08, "loss": 2.0109, "step": 3345 }, { "epoch": 3.66120218579235, "grad_norm": 1.8893937179160833, "learning_rate": 5.000000003155614e-08, "loss": 2.0475, "step": 3350 }, { "epoch": 3.6666666666666665, "grad_norm": 1.9582368602914404, "learning_rate": 5.000000002647831e-08, "loss": 2.0292, "step": 3355 }, { "epoch": 3.6721311475409837, "grad_norm": 1.8759259337994865, "learning_rate": 5.000000002215448e-08, "loss": 2.0248, "step": 3360 }, { "epoch": 3.6775956284153004, "grad_norm": 1.815903984549811, "learning_rate": 5.0000000018482356e-08, "loss": 2.0287, "step": 3365 }, { "epoch": 3.6830601092896176, "grad_norm": 1.8747014733431713, "learning_rate": 5.000000001537216e-08, "loss": 2.0457, "step": 3370 }, { "epoch": 3.6885245901639343, "grad_norm": 1.9458767620629445, "learning_rate": 5.000000001274526e-08, "loss": 2.0515, "step": 3375 }, { "epoch": 3.6939890710382515, "grad_norm": 1.9268929400448993, "learning_rate": 5.0000000010533005e-08, "loss": 2.0511, "step": 3380 }, { "epoch": 3.699453551912568, "grad_norm": 1.902962520090544, "learning_rate": 5.0000000008675514e-08, "loss": 2.0558, "step": 3385 }, { "epoch": 3.7049180327868854, "grad_norm": 1.8229295875308797, "learning_rate": 5.000000000712075e-08, "loss": 2.0166, "step": 3390 }, { "epoch": 3.710382513661202, "grad_norm": 1.836606346535709, "learning_rate": 5.0000000005823554e-08, "loss": 2.0403, "step": 3395 }, { "epoch": 3.7158469945355193, "grad_norm": 1.9116742151193724, "learning_rate": 5.0000000004744865e-08, "loss": 1.988, "step": 3400 }, { "epoch": 3.7158469945355193, "eval_loss": 2.2769548892974854, "eval_runtime": 74.9791, "eval_samples_per_second": 86.784, "eval_steps_per_second": 0.68, "step": 3400 }, { "epoch": 3.721311475409836, "grad_norm": 1.8962679275361203, "learning_rate": 5.000000000385098e-08, "loss": 2.0077, "step": 3405 }, { "epoch": 3.726775956284153, "grad_norm": 1.8893473180089084, "learning_rate": 5.0000000003112903e-08, "loss": 2.0275, "step": 3410 }, { "epoch": 3.73224043715847, "grad_norm": 1.9059398028165249, "learning_rate": 5.0000000002505746e-08, "loss": 2.0248, "step": 3415 }, { "epoch": 3.737704918032787, "grad_norm": 1.9284706550848763, "learning_rate": 5.000000000200822e-08, "loss": 2.0841, "step": 3420 }, { "epoch": 3.7431693989071038, "grad_norm": 1.8971305679426038, "learning_rate": 5.000000000160219e-08, "loss": 2.0205, "step": 3425 }, { "epoch": 3.748633879781421, "grad_norm": 1.9639122926123413, "learning_rate": 5.000000000127221e-08, "loss": 2.0438, "step": 3430 }, { "epoch": 3.7540983606557377, "grad_norm": 1.9660777954794344, "learning_rate": 5.000000000100521e-08, "loss": 2.0285, "step": 3435 }, { "epoch": 3.7595628415300544, "grad_norm": 1.915135487255815, "learning_rate": 5.000000000079017e-08, "loss": 1.9938, "step": 3440 }, { "epoch": 3.7650273224043715, "grad_norm": 1.9104288280645758, "learning_rate": 5.000000000061779e-08, "loss": 2.0109, "step": 3445 }, { "epoch": 3.7704918032786887, "grad_norm": 1.9163223330431955, "learning_rate": 5.0000000000480305e-08, "loss": 2.0479, "step": 3450 }, { "epoch": 3.7759562841530054, "grad_norm": 1.8942590608094447, "learning_rate": 5.0000000000371217e-08, "loss": 2.0265, "step": 3455 }, { "epoch": 3.781420765027322, "grad_norm": 1.8979108959832878, "learning_rate": 5.0000000000285143e-08, "loss": 2.0483, "step": 3460 }, { "epoch": 3.7868852459016393, "grad_norm": 2.0082325735504205, "learning_rate": 5.000000000021761e-08, "loss": 2.0249, "step": 3465 }, { "epoch": 3.7923497267759565, "grad_norm": 1.9319079542553508, "learning_rate": 5.0000000000164944e-08, "loss": 2.039, "step": 3470 }, { "epoch": 3.797814207650273, "grad_norm": 1.8962147679193577, "learning_rate": 5.0000000000124134e-08, "loss": 1.9694, "step": 3475 }, { "epoch": 3.80327868852459, "grad_norm": 1.9119145259888968, "learning_rate": 5.0000000000092715e-08, "loss": 2.0416, "step": 3480 }, { "epoch": 3.808743169398907, "grad_norm": 1.8560975935966715, "learning_rate": 5.00000000000687e-08, "loss": 2.0174, "step": 3485 }, { "epoch": 3.8142076502732243, "grad_norm": 1.9664941712397381, "learning_rate": 5.000000000005048e-08, "loss": 2.0503, "step": 3490 }, { "epoch": 3.819672131147541, "grad_norm": 1.8658008767578975, "learning_rate": 5.0000000000036764e-08, "loss": 2.0293, "step": 3495 }, { "epoch": 3.8251366120218577, "grad_norm": 1.9253208530977064, "learning_rate": 5.000000000002653e-08, "loss": 2.0429, "step": 3500 }, { "epoch": 3.830601092896175, "grad_norm": 1.8810005361263469, "learning_rate": 5.000000000001895e-08, "loss": 2.035, "step": 3505 }, { "epoch": 3.836065573770492, "grad_norm": 1.893681875957613, "learning_rate": 5.000000000001339e-08, "loss": 2.0732, "step": 3510 }, { "epoch": 3.841530054644809, "grad_norm": 1.9417607113095643, "learning_rate": 5.0000000000009355e-08, "loss": 2.002, "step": 3515 }, { "epoch": 3.8469945355191255, "grad_norm": 1.9989745014112892, "learning_rate": 5.000000000000646e-08, "loss": 2.0525, "step": 3520 }, { "epoch": 3.8524590163934427, "grad_norm": 1.8555402595578698, "learning_rate": 5.0000000000004405e-08, "loss": 2.0228, "step": 3525 }, { "epoch": 3.8579234972677594, "grad_norm": 1.9137513054849469, "learning_rate": 5.0000000000002956e-08, "loss": 2.0657, "step": 3530 }, { "epoch": 3.8633879781420766, "grad_norm": 1.9157898282989583, "learning_rate": 5.0000000000001957e-08, "loss": 2.0173, "step": 3535 }, { "epoch": 3.8688524590163933, "grad_norm": 1.9273730542054064, "learning_rate": 5.0000000000001275e-08, "loss": 2.0529, "step": 3540 }, { "epoch": 3.8743169398907105, "grad_norm": 1.8997473790640476, "learning_rate": 5.000000000000082e-08, "loss": 2.0464, "step": 3545 }, { "epoch": 3.879781420765027, "grad_norm": 1.9722252114630803, "learning_rate": 5.0000000000000514e-08, "loss": 2.0219, "step": 3550 }, { "epoch": 3.8852459016393444, "grad_norm": 1.9361513715190686, "learning_rate": 5.0000000000000315e-08, "loss": 2.0549, "step": 3555 }, { "epoch": 3.890710382513661, "grad_norm": 1.9521279215167433, "learning_rate": 5.000000000000019e-08, "loss": 2.0019, "step": 3560 }, { "epoch": 3.8961748633879782, "grad_norm": 1.8990353241401117, "learning_rate": 5.000000000000011e-08, "loss": 2.037, "step": 3565 }, { "epoch": 3.901639344262295, "grad_norm": 1.8490026777793342, "learning_rate": 5.0000000000000064e-08, "loss": 2.0528, "step": 3570 }, { "epoch": 3.907103825136612, "grad_norm": 2.004168101245223, "learning_rate": 5.000000000000003e-08, "loss": 2.0062, "step": 3575 }, { "epoch": 3.912568306010929, "grad_norm": 1.8584030836568644, "learning_rate": 5.000000000000002e-08, "loss": 2.0026, "step": 3580 }, { "epoch": 3.918032786885246, "grad_norm": 1.8750862900064005, "learning_rate": 5.0000000000000004e-08, "loss": 2.0304, "step": 3585 }, { "epoch": 3.9234972677595628, "grad_norm": 1.9298592977310705, "learning_rate": 5.0000000000000004e-08, "loss": 2.0262, "step": 3590 }, { "epoch": 3.92896174863388, "grad_norm": 1.9261861281030954, "learning_rate": 5e-08, "loss": 2.0747, "step": 3595 }, { "epoch": 3.9344262295081966, "grad_norm": 1.9012633598619333, "learning_rate": 5e-08, "loss": 2.0355, "step": 3600 }, { "epoch": 3.9344262295081966, "eval_loss": 2.277177333831787, "eval_runtime": 75.1005, "eval_samples_per_second": 86.644, "eval_steps_per_second": 0.679, "step": 3600 }, { "epoch": 3.939890710382514, "grad_norm": 1.9662605743553438, "learning_rate": 5e-08, "loss": 2.0272, "step": 3605 }, { "epoch": 3.9453551912568305, "grad_norm": 1.8777765308314378, "learning_rate": 5e-08, "loss": 2.0574, "step": 3610 }, { "epoch": 3.9508196721311473, "grad_norm": 1.9697643417177255, "learning_rate": 5e-08, "loss": 2.0504, "step": 3615 }, { "epoch": 3.9562841530054644, "grad_norm": 1.91285486557523, "learning_rate": 5e-08, "loss": 2.0216, "step": 3620 }, { "epoch": 3.9617486338797816, "grad_norm": 1.894324240473093, "learning_rate": 5e-08, "loss": 2.0108, "step": 3625 }, { "epoch": 3.9672131147540983, "grad_norm": 1.9284412363816936, "learning_rate": 5e-08, "loss": 2.0038, "step": 3630 }, { "epoch": 3.972677595628415, "grad_norm": 1.8376681173174465, "learning_rate": 5e-08, "loss": 2.021, "step": 3635 }, { "epoch": 3.978142076502732, "grad_norm": 1.8629566090204688, "learning_rate": 5e-08, "loss": 2.0236, "step": 3640 }, { "epoch": 3.9836065573770494, "grad_norm": 1.9846522235537283, "learning_rate": 5e-08, "loss": 2.024, "step": 3645 }, { "epoch": 3.989071038251366, "grad_norm": 1.9025611361991746, "learning_rate": 5e-08, "loss": 2.0281, "step": 3650 }, { "epoch": 3.994535519125683, "grad_norm": 1.9351822472092162, "learning_rate": 5e-08, "loss": 2.0184, "step": 3655 }, { "epoch": 4.0, "grad_norm": 1.9623534464978543, "learning_rate": 5e-08, "loss": 1.9875, "step": 3660 }, { "epoch": 4.0, "step": 3660, "total_flos": 382536630927360.0, "train_loss": 2.107023582497581, "train_runtime": 13273.308, "train_samples_per_second": 17.646, "train_steps_per_second": 0.276 } ], "logging_steps": 5, "max_steps": 3660, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 200, "total_flos": 382536630927360.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }