{ "best_metric": 0.6760911345481873, "best_model_checkpoint": "vit_epochs5_batch32_lr5e-05_size224_tiles1_seed1_vit_lr\\checkpoint-2345", "epoch": 5.0, "eval_steps": 500, "global_step": 2345, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 0.5828699469566345, "learning_rate": 4.989339019189766e-05, "loss": 0.8221, "step": 5 }, { "epoch": 0.02, "grad_norm": 0.39033976197242737, "learning_rate": 4.978678038379531e-05, "loss": 0.8309, "step": 10 }, { "epoch": 0.03, "grad_norm": 0.5242815613746643, "learning_rate": 4.9680170575692967e-05, "loss": 0.7594, "step": 15 }, { "epoch": 0.04, "grad_norm": 0.8361539244651794, "learning_rate": 4.957356076759062e-05, "loss": 0.8066, "step": 20 }, { "epoch": 0.05, "grad_norm": 0.1711714118719101, "learning_rate": 4.9466950959488276e-05, "loss": 0.7071, "step": 25 }, { "epoch": 0.06, "grad_norm": 0.19998708367347717, "learning_rate": 4.936034115138593e-05, "loss": 0.6991, "step": 30 }, { "epoch": 0.07, "grad_norm": 0.30334341526031494, "learning_rate": 4.9253731343283586e-05, "loss": 0.7404, "step": 35 }, { "epoch": 0.09, "grad_norm": 0.3605813682079315, "learning_rate": 4.914712153518124e-05, "loss": 0.7351, "step": 40 }, { "epoch": 0.1, "grad_norm": 0.06580419093370438, "learning_rate": 4.904051172707889e-05, "loss": 0.6959, "step": 45 }, { "epoch": 0.11, "grad_norm": 0.05234284698963165, "learning_rate": 4.893390191897655e-05, "loss": 0.6917, "step": 50 }, { "epoch": 0.12, "grad_norm": 0.10359219461679459, "learning_rate": 4.88272921108742e-05, "loss": 0.6982, "step": 55 }, { "epoch": 0.13, "grad_norm": 0.10600591450929642, "learning_rate": 4.872068230277186e-05, "loss": 0.7052, "step": 60 }, { "epoch": 0.14, "grad_norm": 0.14884132146835327, "learning_rate": 4.861407249466951e-05, "loss": 0.6971, "step": 65 }, { "epoch": 0.15, "grad_norm": 0.16608068346977234, "learning_rate": 4.850746268656717e-05, "loss": 0.696, "step": 70 }, { "epoch": 0.16, "grad_norm": 0.1025952398777008, "learning_rate": 4.840085287846482e-05, "loss": 0.6952, "step": 75 }, { "epoch": 0.17, "grad_norm": 0.032446928322315216, "learning_rate": 4.829424307036248e-05, "loss": 0.7, "step": 80 }, { "epoch": 0.18, "grad_norm": 0.20473094284534454, "learning_rate": 4.8187633262260126e-05, "loss": 0.6982, "step": 85 }, { "epoch": 0.19, "grad_norm": 0.24259920418262482, "learning_rate": 4.808102345415779e-05, "loss": 0.6941, "step": 90 }, { "epoch": 0.2, "grad_norm": 0.13239941000938416, "learning_rate": 4.7974413646055436e-05, "loss": 0.6893, "step": 95 }, { "epoch": 0.21, "grad_norm": 0.11972621828317642, "learning_rate": 4.78678038379531e-05, "loss": 0.689, "step": 100 }, { "epoch": 0.22, "grad_norm": 0.014825105667114258, "learning_rate": 4.7761194029850745e-05, "loss": 0.6935, "step": 105 }, { "epoch": 0.23, "grad_norm": 0.2749171555042267, "learning_rate": 4.765458422174841e-05, "loss": 0.7003, "step": 110 }, { "epoch": 0.25, "grad_norm": 0.05277346074581146, "learning_rate": 4.7547974413646055e-05, "loss": 0.692, "step": 115 }, { "epoch": 0.26, "grad_norm": 0.06357314437627792, "learning_rate": 4.7441364605543716e-05, "loss": 0.696, "step": 120 }, { "epoch": 0.27, "grad_norm": 0.18096879124641418, "learning_rate": 4.7334754797441364e-05, "loss": 0.7011, "step": 125 }, { "epoch": 0.28, "grad_norm": 0.1589221954345703, "learning_rate": 4.7228144989339026e-05, "loss": 0.6944, "step": 130 }, { "epoch": 0.29, "grad_norm": 0.1216045394539833, "learning_rate": 4.7121535181236674e-05, "loss": 0.6947, "step": 135 }, { "epoch": 0.3, "grad_norm": 0.022791074588894844, "learning_rate": 4.7014925373134335e-05, "loss": 0.6942, "step": 140 }, { "epoch": 0.31, "grad_norm": 0.07503971457481384, "learning_rate": 4.690831556503198e-05, "loss": 0.6937, "step": 145 }, { "epoch": 0.32, "grad_norm": 0.22517453134059906, "learning_rate": 4.6801705756929645e-05, "loss": 0.6958, "step": 150 }, { "epoch": 0.33, "grad_norm": 0.006116226315498352, "learning_rate": 4.669509594882729e-05, "loss": 0.693, "step": 155 }, { "epoch": 0.34, "grad_norm": 0.04895751550793648, "learning_rate": 4.658848614072495e-05, "loss": 0.6929, "step": 160 }, { "epoch": 0.35, "grad_norm": 0.3163030743598938, "learning_rate": 4.64818763326226e-05, "loss": 0.694, "step": 165 }, { "epoch": 0.36, "grad_norm": 0.04843531921505928, "learning_rate": 4.637526652452026e-05, "loss": 0.6926, "step": 170 }, { "epoch": 0.37, "grad_norm": 0.05857136473059654, "learning_rate": 4.626865671641791e-05, "loss": 0.6943, "step": 175 }, { "epoch": 0.38, "grad_norm": 0.05212704837322235, "learning_rate": 4.6162046908315566e-05, "loss": 0.694, "step": 180 }, { "epoch": 0.39, "grad_norm": 0.057463593780994415, "learning_rate": 4.605543710021322e-05, "loss": 0.6937, "step": 185 }, { "epoch": 0.41, "grad_norm": 0.11181072145700455, "learning_rate": 4.5948827292110876e-05, "loss": 0.6927, "step": 190 }, { "epoch": 0.42, "grad_norm": 0.16910992562770844, "learning_rate": 4.584221748400853e-05, "loss": 0.694, "step": 195 }, { "epoch": 0.43, "grad_norm": 0.16462725400924683, "learning_rate": 4.5735607675906185e-05, "loss": 0.6938, "step": 200 }, { "epoch": 0.44, "grad_norm": 0.04628448933362961, "learning_rate": 4.562899786780384e-05, "loss": 0.6932, "step": 205 }, { "epoch": 0.45, "grad_norm": 0.2542133629322052, "learning_rate": 4.5522388059701495e-05, "loss": 0.6933, "step": 210 }, { "epoch": 0.46, "grad_norm": 0.022086847573518753, "learning_rate": 4.541577825159915e-05, "loss": 0.6929, "step": 215 }, { "epoch": 0.47, "grad_norm": 0.1755961775779724, "learning_rate": 4.5309168443496804e-05, "loss": 0.695, "step": 220 }, { "epoch": 0.48, "grad_norm": 0.1452350616455078, "learning_rate": 4.520255863539446e-05, "loss": 0.6953, "step": 225 }, { "epoch": 0.49, "grad_norm": 0.2042374312877655, "learning_rate": 4.5095948827292114e-05, "loss": 0.6929, "step": 230 }, { "epoch": 0.5, "grad_norm": 0.1066068783402443, "learning_rate": 4.498933901918977e-05, "loss": 0.692, "step": 235 }, { "epoch": 0.51, "grad_norm": 0.15991215407848358, "learning_rate": 4.488272921108742e-05, "loss": 0.6944, "step": 240 }, { "epoch": 0.52, "grad_norm": 0.20871728658676147, "learning_rate": 4.477611940298508e-05, "loss": 0.693, "step": 245 }, { "epoch": 0.53, "grad_norm": 0.2562817335128784, "learning_rate": 4.466950959488273e-05, "loss": 0.6933, "step": 250 }, { "epoch": 0.54, "grad_norm": 0.04552245885133743, "learning_rate": 4.456289978678039e-05, "loss": 0.6929, "step": 255 }, { "epoch": 0.55, "grad_norm": 0.012926601804792881, "learning_rate": 4.445628997867804e-05, "loss": 0.6921, "step": 260 }, { "epoch": 0.57, "grad_norm": 0.06915324926376343, "learning_rate": 4.43496801705757e-05, "loss": 0.6936, "step": 265 }, { "epoch": 0.58, "grad_norm": 0.11693533509969711, "learning_rate": 4.424307036247335e-05, "loss": 0.6975, "step": 270 }, { "epoch": 0.59, "grad_norm": 0.25377050042152405, "learning_rate": 4.4136460554371006e-05, "loss": 0.6928, "step": 275 }, { "epoch": 0.6, "grad_norm": 0.22589242458343506, "learning_rate": 4.402985074626866e-05, "loss": 0.6936, "step": 280 }, { "epoch": 0.61, "grad_norm": 0.17468225955963135, "learning_rate": 4.3923240938166316e-05, "loss": 0.6934, "step": 285 }, { "epoch": 0.62, "grad_norm": 0.041436854749917984, "learning_rate": 4.381663113006397e-05, "loss": 0.6949, "step": 290 }, { "epoch": 0.63, "grad_norm": 0.16202151775360107, "learning_rate": 4.3710021321961625e-05, "loss": 0.6925, "step": 295 }, { "epoch": 0.64, "grad_norm": 0.10329734534025192, "learning_rate": 4.360341151385928e-05, "loss": 0.6947, "step": 300 }, { "epoch": 0.65, "grad_norm": 0.06689364463090897, "learning_rate": 4.3496801705756935e-05, "loss": 0.6923, "step": 305 }, { "epoch": 0.66, "grad_norm": 0.03244592249393463, "learning_rate": 4.339019189765459e-05, "loss": 0.6943, "step": 310 }, { "epoch": 0.67, "grad_norm": 0.08199141174554825, "learning_rate": 4.328358208955224e-05, "loss": 0.6926, "step": 315 }, { "epoch": 0.68, "grad_norm": 0.07944226264953613, "learning_rate": 4.31769722814499e-05, "loss": 0.6915, "step": 320 }, { "epoch": 0.69, "grad_norm": 0.08416552096605301, "learning_rate": 4.307036247334755e-05, "loss": 0.6913, "step": 325 }, { "epoch": 0.7, "grad_norm": 0.1907459795475006, "learning_rate": 4.29637526652452e-05, "loss": 0.6964, "step": 330 }, { "epoch": 0.71, "grad_norm": 0.08609720319509506, "learning_rate": 4.2857142857142856e-05, "loss": 0.6878, "step": 335 }, { "epoch": 0.72, "grad_norm": 0.036432862281799316, "learning_rate": 4.275053304904051e-05, "loss": 0.6942, "step": 340 }, { "epoch": 0.74, "grad_norm": 0.031249074265360832, "learning_rate": 4.2643923240938166e-05, "loss": 0.6932, "step": 345 }, { "epoch": 0.75, "grad_norm": 0.1355859935283661, "learning_rate": 4.253731343283582e-05, "loss": 0.6911, "step": 350 }, { "epoch": 0.76, "grad_norm": 0.1345302313566208, "learning_rate": 4.2430703624733475e-05, "loss": 0.6947, "step": 355 }, { "epoch": 0.77, "grad_norm": 0.07437537610530853, "learning_rate": 4.232409381663113e-05, "loss": 0.695, "step": 360 }, { "epoch": 0.78, "grad_norm": 0.01497673336416483, "learning_rate": 4.2217484008528785e-05, "loss": 0.694, "step": 365 }, { "epoch": 0.79, "grad_norm": 0.14993418753147125, "learning_rate": 4.211087420042644e-05, "loss": 0.6929, "step": 370 }, { "epoch": 0.8, "grad_norm": 0.017090369015932083, "learning_rate": 4.2004264392324094e-05, "loss": 0.6923, "step": 375 }, { "epoch": 0.81, "grad_norm": 0.07324916124343872, "learning_rate": 4.189765458422175e-05, "loss": 0.6925, "step": 380 }, { "epoch": 0.82, "grad_norm": 0.035330481827259064, "learning_rate": 4.1791044776119404e-05, "loss": 0.6965, "step": 385 }, { "epoch": 0.83, "grad_norm": 0.2187144011259079, "learning_rate": 4.168443496801706e-05, "loss": 0.6933, "step": 390 }, { "epoch": 0.84, "grad_norm": 0.05486568436026573, "learning_rate": 4.157782515991471e-05, "loss": 0.6934, "step": 395 }, { "epoch": 0.85, "grad_norm": 0.20385077595710754, "learning_rate": 4.147121535181237e-05, "loss": 0.6928, "step": 400 }, { "epoch": 0.86, "grad_norm": 0.011761588044464588, "learning_rate": 4.136460554371002e-05, "loss": 0.6928, "step": 405 }, { "epoch": 0.87, "grad_norm": 0.34647056460380554, "learning_rate": 4.125799573560768e-05, "loss": 0.6924, "step": 410 }, { "epoch": 0.88, "grad_norm": 0.24146543443202972, "learning_rate": 4.115138592750533e-05, "loss": 0.6909, "step": 415 }, { "epoch": 0.9, "grad_norm": 0.07559261471033096, "learning_rate": 4.104477611940299e-05, "loss": 0.6914, "step": 420 }, { "epoch": 0.91, "grad_norm": 0.031942449510097504, "learning_rate": 4.093816631130064e-05, "loss": 0.6911, "step": 425 }, { "epoch": 0.92, "grad_norm": 0.033947765827178955, "learning_rate": 4.0831556503198296e-05, "loss": 0.6923, "step": 430 }, { "epoch": 0.93, "grad_norm": 0.08389558643102646, "learning_rate": 4.072494669509595e-05, "loss": 0.6969, "step": 435 }, { "epoch": 0.94, "grad_norm": 0.12675201892852783, "learning_rate": 4.0618336886993606e-05, "loss": 0.6897, "step": 440 }, { "epoch": 0.95, "grad_norm": 0.02003253623843193, "learning_rate": 4.051172707889126e-05, "loss": 0.691, "step": 445 }, { "epoch": 0.96, "grad_norm": 0.18815450370311737, "learning_rate": 4.0405117270788915e-05, "loss": 0.693, "step": 450 }, { "epoch": 0.97, "grad_norm": 0.02639641985297203, "learning_rate": 4.029850746268657e-05, "loss": 0.6902, "step": 455 }, { "epoch": 0.98, "grad_norm": 0.021094592288136482, "learning_rate": 4.0191897654584225e-05, "loss": 0.6958, "step": 460 }, { "epoch": 0.99, "grad_norm": 0.12382938712835312, "learning_rate": 4.008528784648188e-05, "loss": 0.6952, "step": 465 }, { "epoch": 1.0, "eval_accuracy": 0.49706666666666666, "eval_loss": 0.6928426027297974, "eval_runtime": 17.2988, "eval_samples_per_second": 216.778, "eval_steps_per_second": 6.821, "step": 469 }, { "epoch": 1.0, "grad_norm": 0.1660253256559372, "learning_rate": 3.997867803837953e-05, "loss": 0.6912, "step": 470 }, { "epoch": 1.01, "grad_norm": 0.2098858803510666, "learning_rate": 3.987206823027719e-05, "loss": 0.6937, "step": 475 }, { "epoch": 1.02, "grad_norm": 0.2579794228076935, "learning_rate": 3.976545842217484e-05, "loss": 0.6929, "step": 480 }, { "epoch": 1.03, "grad_norm": 0.15190201997756958, "learning_rate": 3.96588486140725e-05, "loss": 0.6913, "step": 485 }, { "epoch": 1.04, "grad_norm": 0.11310646682977676, "learning_rate": 3.9552238805970146e-05, "loss": 0.6922, "step": 490 }, { "epoch": 1.06, "grad_norm": 0.13565397262573242, "learning_rate": 3.944562899786781e-05, "loss": 0.6916, "step": 495 }, { "epoch": 1.07, "grad_norm": 0.028749778866767883, "learning_rate": 3.9339019189765456e-05, "loss": 0.6919, "step": 500 }, { "epoch": 1.08, "grad_norm": 0.03068220429122448, "learning_rate": 3.923240938166312e-05, "loss": 0.6904, "step": 505 }, { "epoch": 1.09, "grad_norm": 0.38801059126853943, "learning_rate": 3.9125799573560765e-05, "loss": 0.699, "step": 510 }, { "epoch": 1.1, "grad_norm": 0.17091982066631317, "learning_rate": 3.901918976545843e-05, "loss": 0.692, "step": 515 }, { "epoch": 1.11, "grad_norm": 0.19640736281871796, "learning_rate": 3.8912579957356075e-05, "loss": 0.6906, "step": 520 }, { "epoch": 1.12, "grad_norm": 0.11164979636669159, "learning_rate": 3.8805970149253736e-05, "loss": 0.6919, "step": 525 }, { "epoch": 1.13, "grad_norm": 0.09658452868461609, "learning_rate": 3.8699360341151384e-05, "loss": 0.6907, "step": 530 }, { "epoch": 1.14, "grad_norm": 0.2699227035045624, "learning_rate": 3.8592750533049046e-05, "loss": 0.6921, "step": 535 }, { "epoch": 1.15, "grad_norm": 0.16162264347076416, "learning_rate": 3.8486140724946694e-05, "loss": 0.694, "step": 540 }, { "epoch": 1.16, "grad_norm": 0.10174128413200378, "learning_rate": 3.8379530916844355e-05, "loss": 0.6909, "step": 545 }, { "epoch": 1.17, "grad_norm": 0.04992598667740822, "learning_rate": 3.8272921108742e-05, "loss": 0.692, "step": 550 }, { "epoch": 1.18, "grad_norm": 0.054032549262046814, "learning_rate": 3.8166311300639665e-05, "loss": 0.6913, "step": 555 }, { "epoch": 1.19, "grad_norm": 0.11708575487136841, "learning_rate": 3.805970149253731e-05, "loss": 0.692, "step": 560 }, { "epoch": 1.2, "grad_norm": 0.012999237515032291, "learning_rate": 3.7953091684434974e-05, "loss": 0.6931, "step": 565 }, { "epoch": 1.22, "grad_norm": 0.3079542815685272, "learning_rate": 3.784648187633262e-05, "loss": 0.6913, "step": 570 }, { "epoch": 1.23, "grad_norm": 0.006371928378939629, "learning_rate": 3.7739872068230284e-05, "loss": 0.6927, "step": 575 }, { "epoch": 1.24, "grad_norm": 0.10761117190122604, "learning_rate": 3.763326226012793e-05, "loss": 0.6913, "step": 580 }, { "epoch": 1.25, "grad_norm": 0.05754609405994415, "learning_rate": 3.752665245202559e-05, "loss": 0.6923, "step": 585 }, { "epoch": 1.26, "grad_norm": 0.11597250401973724, "learning_rate": 3.742004264392324e-05, "loss": 0.6932, "step": 590 }, { "epoch": 1.27, "grad_norm": 0.06282239407300949, "learning_rate": 3.73134328358209e-05, "loss": 0.6926, "step": 595 }, { "epoch": 1.28, "grad_norm": 0.21621769666671753, "learning_rate": 3.720682302771855e-05, "loss": 0.6913, "step": 600 }, { "epoch": 1.29, "grad_norm": 0.042546581476926804, "learning_rate": 3.710021321961621e-05, "loss": 0.6919, "step": 605 }, { "epoch": 1.3, "grad_norm": 0.06986606121063232, "learning_rate": 3.699360341151386e-05, "loss": 0.6913, "step": 610 }, { "epoch": 1.31, "grad_norm": 0.0846039429306984, "learning_rate": 3.6886993603411515e-05, "loss": 0.6899, "step": 615 }, { "epoch": 1.32, "grad_norm": 0.08857711404561996, "learning_rate": 3.678038379530917e-05, "loss": 0.6932, "step": 620 }, { "epoch": 1.33, "grad_norm": 0.17316599190235138, "learning_rate": 3.6673773987206824e-05, "loss": 0.6908, "step": 625 }, { "epoch": 1.34, "grad_norm": 0.05025525391101837, "learning_rate": 3.656716417910448e-05, "loss": 0.6927, "step": 630 }, { "epoch": 1.35, "grad_norm": 0.2874080240726471, "learning_rate": 3.6460554371002134e-05, "loss": 0.6909, "step": 635 }, { "epoch": 1.36, "grad_norm": 0.19556842744350433, "learning_rate": 3.635394456289979e-05, "loss": 0.694, "step": 640 }, { "epoch": 1.38, "grad_norm": 0.0742514356970787, "learning_rate": 3.624733475479744e-05, "loss": 0.6935, "step": 645 }, { "epoch": 1.39, "grad_norm": 0.09203707426786423, "learning_rate": 3.61407249466951e-05, "loss": 0.6863, "step": 650 }, { "epoch": 1.4, "grad_norm": 0.022830668836832047, "learning_rate": 3.603411513859275e-05, "loss": 0.6968, "step": 655 }, { "epoch": 1.41, "grad_norm": 0.10433661937713623, "learning_rate": 3.592750533049041e-05, "loss": 0.6915, "step": 660 }, { "epoch": 1.42, "grad_norm": 0.266999751329422, "learning_rate": 3.582089552238806e-05, "loss": 0.6929, "step": 665 }, { "epoch": 1.43, "grad_norm": 0.21351861953735352, "learning_rate": 3.571428571428572e-05, "loss": 0.6935, "step": 670 }, { "epoch": 1.44, "grad_norm": 0.21409828960895538, "learning_rate": 3.560767590618337e-05, "loss": 0.6916, "step": 675 }, { "epoch": 1.45, "grad_norm": 0.10528093576431274, "learning_rate": 3.5501066098081026e-05, "loss": 0.6892, "step": 680 }, { "epoch": 1.46, "grad_norm": 0.05427803099155426, "learning_rate": 3.539445628997868e-05, "loss": 0.6904, "step": 685 }, { "epoch": 1.47, "grad_norm": 0.10534835606813431, "learning_rate": 3.5287846481876336e-05, "loss": 0.6916, "step": 690 }, { "epoch": 1.48, "grad_norm": 0.21933074295520782, "learning_rate": 3.518123667377399e-05, "loss": 0.6911, "step": 695 }, { "epoch": 1.49, "grad_norm": 0.09845433384180069, "learning_rate": 3.5074626865671645e-05, "loss": 0.6905, "step": 700 }, { "epoch": 1.5, "grad_norm": 0.06464946269989014, "learning_rate": 3.496801705756929e-05, "loss": 0.6925, "step": 705 }, { "epoch": 1.51, "grad_norm": 0.1835612803697586, "learning_rate": 3.4861407249466955e-05, "loss": 0.6882, "step": 710 }, { "epoch": 1.52, "grad_norm": 0.043870821595191956, "learning_rate": 3.47547974413646e-05, "loss": 0.6909, "step": 715 }, { "epoch": 1.54, "grad_norm": 0.19715048372745514, "learning_rate": 3.4648187633262264e-05, "loss": 0.6881, "step": 720 }, { "epoch": 1.55, "grad_norm": 0.08701707422733307, "learning_rate": 3.454157782515991e-05, "loss": 0.689, "step": 725 }, { "epoch": 1.56, "grad_norm": 0.026389161124825478, "learning_rate": 3.4434968017057574e-05, "loss": 0.6923, "step": 730 }, { "epoch": 1.57, "grad_norm": 0.15285784006118774, "learning_rate": 3.432835820895522e-05, "loss": 0.69, "step": 735 }, { "epoch": 1.58, "grad_norm": 0.050972145050764084, "learning_rate": 3.422174840085288e-05, "loss": 0.6866, "step": 740 }, { "epoch": 1.59, "grad_norm": 0.10428329557180405, "learning_rate": 3.411513859275053e-05, "loss": 0.692, "step": 745 }, { "epoch": 1.6, "grad_norm": 0.19904804229736328, "learning_rate": 3.400852878464819e-05, "loss": 0.6953, "step": 750 }, { "epoch": 1.61, "grad_norm": 0.2752636671066284, "learning_rate": 3.390191897654584e-05, "loss": 0.6895, "step": 755 }, { "epoch": 1.62, "grad_norm": 0.07439960539340973, "learning_rate": 3.37953091684435e-05, "loss": 0.6841, "step": 760 }, { "epoch": 1.63, "grad_norm": 0.16306805610656738, "learning_rate": 3.368869936034115e-05, "loss": 0.687, "step": 765 }, { "epoch": 1.64, "grad_norm": 0.10966607183218002, "learning_rate": 3.358208955223881e-05, "loss": 0.6883, "step": 770 }, { "epoch": 1.65, "grad_norm": 0.05831507220864296, "learning_rate": 3.347547974413646e-05, "loss": 0.6852, "step": 775 }, { "epoch": 1.66, "grad_norm": 0.31338346004486084, "learning_rate": 3.336886993603412e-05, "loss": 0.6906, "step": 780 }, { "epoch": 1.67, "grad_norm": 0.10868186503648758, "learning_rate": 3.326226012793177e-05, "loss": 0.6948, "step": 785 }, { "epoch": 1.68, "grad_norm": 0.12757889926433563, "learning_rate": 3.3155650319829424e-05, "loss": 0.6776, "step": 790 }, { "epoch": 1.7, "grad_norm": 0.07820712774991989, "learning_rate": 3.304904051172708e-05, "loss": 0.6944, "step": 795 }, { "epoch": 1.71, "grad_norm": 0.25035345554351807, "learning_rate": 3.294243070362473e-05, "loss": 0.6879, "step": 800 }, { "epoch": 1.72, "grad_norm": 0.15776149928569794, "learning_rate": 3.283582089552239e-05, "loss": 0.6788, "step": 805 }, { "epoch": 1.73, "grad_norm": 0.10203266143798828, "learning_rate": 3.272921108742004e-05, "loss": 0.6847, "step": 810 }, { "epoch": 1.74, "grad_norm": 0.19664335250854492, "learning_rate": 3.26226012793177e-05, "loss": 0.6888, "step": 815 }, { "epoch": 1.75, "grad_norm": 0.17209216952323914, "learning_rate": 3.251599147121535e-05, "loss": 0.6995, "step": 820 }, { "epoch": 1.76, "grad_norm": 0.06759248673915863, "learning_rate": 3.240938166311301e-05, "loss": 0.6901, "step": 825 }, { "epoch": 1.77, "grad_norm": 0.18248839676380157, "learning_rate": 3.230277185501066e-05, "loss": 0.6906, "step": 830 }, { "epoch": 1.78, "grad_norm": 0.049247272312641144, "learning_rate": 3.2196162046908317e-05, "loss": 0.6822, "step": 835 }, { "epoch": 1.79, "grad_norm": 0.0517529733479023, "learning_rate": 3.208955223880597e-05, "loss": 0.6778, "step": 840 }, { "epoch": 1.8, "grad_norm": 0.43367257714271545, "learning_rate": 3.1982942430703626e-05, "loss": 0.7034, "step": 845 }, { "epoch": 1.81, "grad_norm": 0.37519243359565735, "learning_rate": 3.187633262260128e-05, "loss": 0.6829, "step": 850 }, { "epoch": 1.82, "grad_norm": 0.07908283174037933, "learning_rate": 3.1769722814498935e-05, "loss": 0.6873, "step": 855 }, { "epoch": 1.83, "grad_norm": 0.22837957739830017, "learning_rate": 3.166311300639659e-05, "loss": 0.6907, "step": 860 }, { "epoch": 1.84, "grad_norm": 0.2421358823776245, "learning_rate": 3.1556503198294245e-05, "loss": 0.6893, "step": 865 }, { "epoch": 1.86, "grad_norm": 0.26317983865737915, "learning_rate": 3.14498933901919e-05, "loss": 0.687, "step": 870 }, { "epoch": 1.87, "grad_norm": 0.3738718032836914, "learning_rate": 3.1343283582089554e-05, "loss": 0.6857, "step": 875 }, { "epoch": 1.88, "grad_norm": 0.14564202725887299, "learning_rate": 3.123667377398721e-05, "loss": 0.6821, "step": 880 }, { "epoch": 1.89, "grad_norm": 0.13704021275043488, "learning_rate": 3.1130063965884864e-05, "loss": 0.6951, "step": 885 }, { "epoch": 1.9, "grad_norm": 0.10348877310752869, "learning_rate": 3.102345415778252e-05, "loss": 0.6793, "step": 890 }, { "epoch": 1.91, "grad_norm": 0.10818944126367569, "learning_rate": 3.0916844349680173e-05, "loss": 0.6884, "step": 895 }, { "epoch": 1.92, "grad_norm": 0.17710749804973602, "learning_rate": 3.081023454157783e-05, "loss": 0.6861, "step": 900 }, { "epoch": 1.93, "grad_norm": 0.11066386103630066, "learning_rate": 3.070362473347548e-05, "loss": 0.6761, "step": 905 }, { "epoch": 1.94, "grad_norm": 0.14435610175132751, "learning_rate": 3.059701492537314e-05, "loss": 0.6747, "step": 910 }, { "epoch": 1.95, "grad_norm": 0.15541020035743713, "learning_rate": 3.0490405117270792e-05, "loss": 0.6806, "step": 915 }, { "epoch": 1.96, "grad_norm": 0.08867447078227997, "learning_rate": 3.0383795309168444e-05, "loss": 0.69, "step": 920 }, { "epoch": 1.97, "grad_norm": 0.32354313135147095, "learning_rate": 3.0277185501066102e-05, "loss": 0.6812, "step": 925 }, { "epoch": 1.98, "grad_norm": 0.32110169529914856, "learning_rate": 3.0170575692963753e-05, "loss": 0.6733, "step": 930 }, { "epoch": 1.99, "grad_norm": 0.0616983063519001, "learning_rate": 3.006396588486141e-05, "loss": 0.6736, "step": 935 }, { "epoch": 2.0, "eval_accuracy": 0.5552, "eval_loss": 0.6843511462211609, "eval_runtime": 18.2604, "eval_samples_per_second": 205.363, "eval_steps_per_second": 6.462, "step": 938 }, { "epoch": 2.0, "grad_norm": 0.13492348790168762, "learning_rate": 2.9957356076759063e-05, "loss": 0.6925, "step": 940 }, { "epoch": 2.01, "grad_norm": 0.2928030490875244, "learning_rate": 2.9850746268656714e-05, "loss": 0.6677, "step": 945 }, { "epoch": 2.03, "grad_norm": 0.11364587396383286, "learning_rate": 2.9744136460554372e-05, "loss": 0.6838, "step": 950 }, { "epoch": 2.04, "grad_norm": 0.16877992451190948, "learning_rate": 2.9637526652452023e-05, "loss": 0.6952, "step": 955 }, { "epoch": 2.05, "grad_norm": 0.38469552993774414, "learning_rate": 2.953091684434968e-05, "loss": 0.6981, "step": 960 }, { "epoch": 2.06, "grad_norm": 0.2534121572971344, "learning_rate": 2.9424307036247333e-05, "loss": 0.6846, "step": 965 }, { "epoch": 2.07, "grad_norm": 0.23564128577709198, "learning_rate": 2.931769722814499e-05, "loss": 0.6697, "step": 970 }, { "epoch": 2.08, "grad_norm": 0.1798560619354248, "learning_rate": 2.9211087420042642e-05, "loss": 0.681, "step": 975 }, { "epoch": 2.09, "grad_norm": 0.2816617786884308, "learning_rate": 2.91044776119403e-05, "loss": 0.6721, "step": 980 }, { "epoch": 2.1, "grad_norm": 0.15949612855911255, "learning_rate": 2.8997867803837952e-05, "loss": 0.6684, "step": 985 }, { "epoch": 2.11, "grad_norm": 0.3336601257324219, "learning_rate": 2.889125799573561e-05, "loss": 0.6789, "step": 990 }, { "epoch": 2.12, "grad_norm": 0.15128879249095917, "learning_rate": 2.878464818763326e-05, "loss": 0.6679, "step": 995 }, { "epoch": 2.13, "grad_norm": 0.2831324636936188, "learning_rate": 2.867803837953092e-05, "loss": 0.683, "step": 1000 }, { "epoch": 2.14, "grad_norm": 0.09263738244771957, "learning_rate": 2.857142857142857e-05, "loss": 0.6769, "step": 1005 }, { "epoch": 2.15, "grad_norm": 0.10221715271472931, "learning_rate": 2.846481876332623e-05, "loss": 0.6633, "step": 1010 }, { "epoch": 2.16, "grad_norm": 0.3450409471988678, "learning_rate": 2.835820895522388e-05, "loss": 0.6859, "step": 1015 }, { "epoch": 2.17, "grad_norm": 0.15574760735034943, "learning_rate": 2.825159914712154e-05, "loss": 0.6703, "step": 1020 }, { "epoch": 2.19, "grad_norm": 0.18141822516918182, "learning_rate": 2.814498933901919e-05, "loss": 0.6829, "step": 1025 }, { "epoch": 2.2, "grad_norm": 0.2733962833881378, "learning_rate": 2.8038379530916848e-05, "loss": 0.6668, "step": 1030 }, { "epoch": 2.21, "grad_norm": 0.406516969203949, "learning_rate": 2.79317697228145e-05, "loss": 0.6932, "step": 1035 }, { "epoch": 2.22, "grad_norm": 0.26090744137763977, "learning_rate": 2.7825159914712157e-05, "loss": 0.6624, "step": 1040 }, { "epoch": 2.23, "grad_norm": 0.30359649658203125, "learning_rate": 2.771855010660981e-05, "loss": 0.678, "step": 1045 }, { "epoch": 2.24, "grad_norm": 0.6121388077735901, "learning_rate": 2.7611940298507467e-05, "loss": 0.6777, "step": 1050 }, { "epoch": 2.25, "grad_norm": 0.25791430473327637, "learning_rate": 2.7505330490405118e-05, "loss": 0.6562, "step": 1055 }, { "epoch": 2.26, "grad_norm": 0.28375235199928284, "learning_rate": 2.7398720682302776e-05, "loss": 0.6836, "step": 1060 }, { "epoch": 2.27, "grad_norm": 0.34015530347824097, "learning_rate": 2.7292110874200428e-05, "loss": 0.6845, "step": 1065 }, { "epoch": 2.28, "grad_norm": 0.4710615277290344, "learning_rate": 2.7185501066098086e-05, "loss": 0.6593, "step": 1070 }, { "epoch": 2.29, "grad_norm": 0.2846826910972595, "learning_rate": 2.7078891257995737e-05, "loss": 0.664, "step": 1075 }, { "epoch": 2.3, "grad_norm": 0.599051296710968, "learning_rate": 2.6972281449893395e-05, "loss": 0.6747, "step": 1080 }, { "epoch": 2.31, "grad_norm": 0.26196974515914917, "learning_rate": 2.6865671641791047e-05, "loss": 0.6693, "step": 1085 }, { "epoch": 2.32, "grad_norm": 0.6016497611999512, "learning_rate": 2.6759061833688705e-05, "loss": 0.6736, "step": 1090 }, { "epoch": 2.33, "grad_norm": 0.21997526288032532, "learning_rate": 2.6652452025586356e-05, "loss": 0.7005, "step": 1095 }, { "epoch": 2.35, "grad_norm": 0.10501424223184586, "learning_rate": 2.6545842217484007e-05, "loss": 0.6859, "step": 1100 }, { "epoch": 2.36, "grad_norm": 0.4104955196380615, "learning_rate": 2.6439232409381666e-05, "loss": 0.6939, "step": 1105 }, { "epoch": 2.37, "grad_norm": 0.16587625443935394, "learning_rate": 2.6332622601279317e-05, "loss": 0.6703, "step": 1110 }, { "epoch": 2.38, "grad_norm": 0.15244054794311523, "learning_rate": 2.6226012793176975e-05, "loss": 0.6841, "step": 1115 }, { "epoch": 2.39, "grad_norm": 0.14376255869865417, "learning_rate": 2.6119402985074626e-05, "loss": 0.6817, "step": 1120 }, { "epoch": 2.4, "grad_norm": 0.28233468532562256, "learning_rate": 2.6012793176972285e-05, "loss": 0.6702, "step": 1125 }, { "epoch": 2.41, "grad_norm": 0.21459686756134033, "learning_rate": 2.5906183368869936e-05, "loss": 0.6552, "step": 1130 }, { "epoch": 2.42, "grad_norm": 0.29934316873550415, "learning_rate": 2.5799573560767594e-05, "loss": 0.6684, "step": 1135 }, { "epoch": 2.43, "grad_norm": 0.1567791849374771, "learning_rate": 2.5692963752665245e-05, "loss": 0.6877, "step": 1140 }, { "epoch": 2.44, "grad_norm": 0.7527381777763367, "learning_rate": 2.5586353944562904e-05, "loss": 0.6815, "step": 1145 }, { "epoch": 2.45, "grad_norm": 0.2388136237859726, "learning_rate": 2.5479744136460555e-05, "loss": 0.7029, "step": 1150 }, { "epoch": 2.46, "grad_norm": 0.7899766564369202, "learning_rate": 2.537313432835821e-05, "loss": 0.6938, "step": 1155 }, { "epoch": 2.47, "grad_norm": 0.5360927581787109, "learning_rate": 2.5266524520255864e-05, "loss": 0.6813, "step": 1160 }, { "epoch": 2.48, "grad_norm": 0.2651243805885315, "learning_rate": 2.515991471215352e-05, "loss": 0.6877, "step": 1165 }, { "epoch": 2.49, "grad_norm": 0.40308693051338196, "learning_rate": 2.5053304904051174e-05, "loss": 0.6936, "step": 1170 }, { "epoch": 2.51, "grad_norm": 0.25783565640449524, "learning_rate": 2.494669509594883e-05, "loss": 0.6626, "step": 1175 }, { "epoch": 2.52, "grad_norm": 0.2718556523323059, "learning_rate": 2.4840085287846483e-05, "loss": 0.6858, "step": 1180 }, { "epoch": 2.53, "grad_norm": 0.139323428273201, "learning_rate": 2.4733475479744138e-05, "loss": 0.6612, "step": 1185 }, { "epoch": 2.54, "grad_norm": 0.256630539894104, "learning_rate": 2.4626865671641793e-05, "loss": 0.6875, "step": 1190 }, { "epoch": 2.55, "grad_norm": 0.2941284477710724, "learning_rate": 2.4520255863539444e-05, "loss": 0.6905, "step": 1195 }, { "epoch": 2.56, "grad_norm": 0.2886035144329071, "learning_rate": 2.44136460554371e-05, "loss": 0.6565, "step": 1200 }, { "epoch": 2.57, "grad_norm": 0.18019695580005646, "learning_rate": 2.4307036247334754e-05, "loss": 0.6781, "step": 1205 }, { "epoch": 2.58, "grad_norm": 0.4069792628288269, "learning_rate": 2.420042643923241e-05, "loss": 0.6899, "step": 1210 }, { "epoch": 2.59, "grad_norm": 0.4986726641654968, "learning_rate": 2.4093816631130063e-05, "loss": 0.6843, "step": 1215 }, { "epoch": 2.6, "grad_norm": 0.34947189688682556, "learning_rate": 2.3987206823027718e-05, "loss": 0.6828, "step": 1220 }, { "epoch": 2.61, "grad_norm": 0.12795394659042358, "learning_rate": 2.3880597014925373e-05, "loss": 0.6805, "step": 1225 }, { "epoch": 2.62, "grad_norm": 0.3556659519672394, "learning_rate": 2.3773987206823027e-05, "loss": 0.6659, "step": 1230 }, { "epoch": 2.63, "grad_norm": 0.2913447618484497, "learning_rate": 2.3667377398720682e-05, "loss": 0.661, "step": 1235 }, { "epoch": 2.64, "grad_norm": 0.33494627475738525, "learning_rate": 2.3560767590618337e-05, "loss": 0.6621, "step": 1240 }, { "epoch": 2.65, "grad_norm": 0.19486567378044128, "learning_rate": 2.345415778251599e-05, "loss": 0.6631, "step": 1245 }, { "epoch": 2.67, "grad_norm": 0.16531342267990112, "learning_rate": 2.3347547974413646e-05, "loss": 0.689, "step": 1250 }, { "epoch": 2.68, "grad_norm": 0.4831390976905823, "learning_rate": 2.32409381663113e-05, "loss": 0.6789, "step": 1255 }, { "epoch": 2.69, "grad_norm": 0.15912608802318573, "learning_rate": 2.3134328358208956e-05, "loss": 0.6605, "step": 1260 }, { "epoch": 2.7, "grad_norm": 0.21471765637397766, "learning_rate": 2.302771855010661e-05, "loss": 0.6865, "step": 1265 }, { "epoch": 2.71, "grad_norm": 0.333564817905426, "learning_rate": 2.2921108742004265e-05, "loss": 0.6757, "step": 1270 }, { "epoch": 2.72, "grad_norm": 0.23908694088459015, "learning_rate": 2.281449893390192e-05, "loss": 0.7025, "step": 1275 }, { "epoch": 2.73, "grad_norm": 0.22186516225337982, "learning_rate": 2.2707889125799575e-05, "loss": 0.6907, "step": 1280 }, { "epoch": 2.74, "grad_norm": 0.7879774570465088, "learning_rate": 2.260127931769723e-05, "loss": 0.685, "step": 1285 }, { "epoch": 2.75, "grad_norm": 0.562380850315094, "learning_rate": 2.2494669509594884e-05, "loss": 0.6728, "step": 1290 }, { "epoch": 2.76, "grad_norm": 0.22762958705425262, "learning_rate": 2.238805970149254e-05, "loss": 0.6782, "step": 1295 }, { "epoch": 2.77, "grad_norm": 0.26743292808532715, "learning_rate": 2.2281449893390194e-05, "loss": 0.663, "step": 1300 }, { "epoch": 2.78, "grad_norm": 0.21805542707443237, "learning_rate": 2.217484008528785e-05, "loss": 0.6609, "step": 1305 }, { "epoch": 2.79, "grad_norm": 0.2946370542049408, "learning_rate": 2.2068230277185503e-05, "loss": 0.6802, "step": 1310 }, { "epoch": 2.8, "grad_norm": 0.23653313517570496, "learning_rate": 2.1961620469083158e-05, "loss": 0.6943, "step": 1315 }, { "epoch": 2.81, "grad_norm": 0.27479249238967896, "learning_rate": 2.1855010660980813e-05, "loss": 0.6599, "step": 1320 }, { "epoch": 2.83, "grad_norm": 0.3639926314353943, "learning_rate": 2.1748400852878467e-05, "loss": 0.6773, "step": 1325 }, { "epoch": 2.84, "grad_norm": 0.17922058701515198, "learning_rate": 2.164179104477612e-05, "loss": 0.6758, "step": 1330 }, { "epoch": 2.85, "grad_norm": 0.39532744884490967, "learning_rate": 2.1535181236673773e-05, "loss": 0.6746, "step": 1335 }, { "epoch": 2.86, "grad_norm": 0.4862898290157318, "learning_rate": 2.1428571428571428e-05, "loss": 0.675, "step": 1340 }, { "epoch": 2.87, "grad_norm": 0.46271321177482605, "learning_rate": 2.1321961620469083e-05, "loss": 0.6846, "step": 1345 }, { "epoch": 2.88, "grad_norm": 0.15888762474060059, "learning_rate": 2.1215351812366738e-05, "loss": 0.6843, "step": 1350 }, { "epoch": 2.89, "grad_norm": 0.45253995060920715, "learning_rate": 2.1108742004264392e-05, "loss": 0.658, "step": 1355 }, { "epoch": 2.9, "grad_norm": 0.1543508768081665, "learning_rate": 2.1002132196162047e-05, "loss": 0.6722, "step": 1360 }, { "epoch": 2.91, "grad_norm": 0.2126993089914322, "learning_rate": 2.0895522388059702e-05, "loss": 0.6526, "step": 1365 }, { "epoch": 2.92, "grad_norm": 0.25720474123954773, "learning_rate": 2.0788912579957357e-05, "loss": 0.6596, "step": 1370 }, { "epoch": 2.93, "grad_norm": 0.3520115613937378, "learning_rate": 2.068230277185501e-05, "loss": 0.6623, "step": 1375 }, { "epoch": 2.94, "grad_norm": 0.3493128716945648, "learning_rate": 2.0575692963752666e-05, "loss": 0.666, "step": 1380 }, { "epoch": 2.95, "grad_norm": 0.44602665305137634, "learning_rate": 2.046908315565032e-05, "loss": 0.6703, "step": 1385 }, { "epoch": 2.96, "grad_norm": 0.5327634215354919, "learning_rate": 2.0362473347547976e-05, "loss": 0.6953, "step": 1390 }, { "epoch": 2.97, "grad_norm": 0.7421027421951294, "learning_rate": 2.025586353944563e-05, "loss": 0.6827, "step": 1395 }, { "epoch": 2.99, "grad_norm": 0.516767144203186, "learning_rate": 2.0149253731343285e-05, "loss": 0.6975, "step": 1400 }, { "epoch": 3.0, "grad_norm": 0.417506605386734, "learning_rate": 2.004264392324094e-05, "loss": 0.66, "step": 1405 }, { "epoch": 3.0, "eval_accuracy": 0.5522666666666667, "eval_loss": 0.6899899840354919, "eval_runtime": 17.8871, "eval_samples_per_second": 209.648, "eval_steps_per_second": 6.597, "step": 1407 }, { "epoch": 3.01, "grad_norm": 0.26411086320877075, "learning_rate": 1.9936034115138594e-05, "loss": 0.6743, "step": 1410 }, { "epoch": 3.02, "grad_norm": 0.2531265318393707, "learning_rate": 1.982942430703625e-05, "loss": 0.6764, "step": 1415 }, { "epoch": 3.03, "grad_norm": 0.3299374580383301, "learning_rate": 1.9722814498933904e-05, "loss": 0.6946, "step": 1420 }, { "epoch": 3.04, "grad_norm": 1.1010500192642212, "learning_rate": 1.961620469083156e-05, "loss": 0.6612, "step": 1425 }, { "epoch": 3.05, "grad_norm": 0.5194435715675354, "learning_rate": 1.9509594882729213e-05, "loss": 0.6768, "step": 1430 }, { "epoch": 3.06, "grad_norm": 0.32416439056396484, "learning_rate": 1.9402985074626868e-05, "loss": 0.691, "step": 1435 }, { "epoch": 3.07, "grad_norm": 0.1599058210849762, "learning_rate": 1.9296375266524523e-05, "loss": 0.6603, "step": 1440 }, { "epoch": 3.08, "grad_norm": 0.2524496018886566, "learning_rate": 1.9189765458422178e-05, "loss": 0.688, "step": 1445 }, { "epoch": 3.09, "grad_norm": 0.12636564671993256, "learning_rate": 1.9083155650319832e-05, "loss": 0.679, "step": 1450 }, { "epoch": 3.1, "grad_norm": 0.26544949412345886, "learning_rate": 1.8976545842217487e-05, "loss": 0.6691, "step": 1455 }, { "epoch": 3.11, "grad_norm": 0.22614602744579315, "learning_rate": 1.8869936034115142e-05, "loss": 0.6858, "step": 1460 }, { "epoch": 3.12, "grad_norm": 0.39157307147979736, "learning_rate": 1.8763326226012797e-05, "loss": 0.6774, "step": 1465 }, { "epoch": 3.13, "grad_norm": 0.7258807420730591, "learning_rate": 1.865671641791045e-05, "loss": 0.6577, "step": 1470 }, { "epoch": 3.14, "grad_norm": 0.3102707862854004, "learning_rate": 1.8550106609808106e-05, "loss": 0.6719, "step": 1475 }, { "epoch": 3.16, "grad_norm": 0.6749600768089294, "learning_rate": 1.8443496801705757e-05, "loss": 0.6728, "step": 1480 }, { "epoch": 3.17, "grad_norm": 0.3821292519569397, "learning_rate": 1.8336886993603412e-05, "loss": 0.6486, "step": 1485 }, { "epoch": 3.18, "grad_norm": 0.3780907094478607, "learning_rate": 1.8230277185501067e-05, "loss": 0.6672, "step": 1490 }, { "epoch": 3.19, "grad_norm": 0.23514017462730408, "learning_rate": 1.812366737739872e-05, "loss": 0.6668, "step": 1495 }, { "epoch": 3.2, "grad_norm": 0.18084169924259186, "learning_rate": 1.8017057569296376e-05, "loss": 0.6618, "step": 1500 }, { "epoch": 3.21, "grad_norm": 0.5182689428329468, "learning_rate": 1.791044776119403e-05, "loss": 0.6417, "step": 1505 }, { "epoch": 3.22, "grad_norm": 0.801882266998291, "learning_rate": 1.7803837953091686e-05, "loss": 0.6645, "step": 1510 }, { "epoch": 3.23, "grad_norm": 0.18955937027931213, "learning_rate": 1.769722814498934e-05, "loss": 0.6887, "step": 1515 }, { "epoch": 3.24, "grad_norm": 0.33656734228134155, "learning_rate": 1.7590618336886995e-05, "loss": 0.6597, "step": 1520 }, { "epoch": 3.25, "grad_norm": 0.19224336743354797, "learning_rate": 1.7484008528784647e-05, "loss": 0.6939, "step": 1525 }, { "epoch": 3.26, "grad_norm": 0.30128347873687744, "learning_rate": 1.73773987206823e-05, "loss": 0.6853, "step": 1530 }, { "epoch": 3.27, "grad_norm": 0.4165492355823517, "learning_rate": 1.7270788912579956e-05, "loss": 0.6606, "step": 1535 }, { "epoch": 3.28, "grad_norm": 0.38350632786750793, "learning_rate": 1.716417910447761e-05, "loss": 0.6526, "step": 1540 }, { "epoch": 3.29, "grad_norm": 0.5553402900695801, "learning_rate": 1.7057569296375266e-05, "loss": 0.6917, "step": 1545 }, { "epoch": 3.3, "grad_norm": 0.38175731897354126, "learning_rate": 1.695095948827292e-05, "loss": 0.637, "step": 1550 }, { "epoch": 3.32, "grad_norm": 0.6655339598655701, "learning_rate": 1.6844349680170575e-05, "loss": 0.6645, "step": 1555 }, { "epoch": 3.33, "grad_norm": 0.31345900893211365, "learning_rate": 1.673773987206823e-05, "loss": 0.6746, "step": 1560 }, { "epoch": 3.34, "grad_norm": 0.5527360439300537, "learning_rate": 1.6631130063965885e-05, "loss": 0.6596, "step": 1565 }, { "epoch": 3.35, "grad_norm": 0.2297668755054474, "learning_rate": 1.652452025586354e-05, "loss": 0.6798, "step": 1570 }, { "epoch": 3.36, "grad_norm": 0.4209613502025604, "learning_rate": 1.6417910447761194e-05, "loss": 0.6866, "step": 1575 }, { "epoch": 3.37, "grad_norm": 0.1880345642566681, "learning_rate": 1.631130063965885e-05, "loss": 0.6687, "step": 1580 }, { "epoch": 3.38, "grad_norm": 0.2493944764137268, "learning_rate": 1.6204690831556504e-05, "loss": 0.673, "step": 1585 }, { "epoch": 3.39, "grad_norm": 0.1546977460384369, "learning_rate": 1.6098081023454158e-05, "loss": 0.6654, "step": 1590 }, { "epoch": 3.4, "grad_norm": 0.4542850852012634, "learning_rate": 1.5991471215351813e-05, "loss": 0.6897, "step": 1595 }, { "epoch": 3.41, "grad_norm": 0.30015575885772705, "learning_rate": 1.5884861407249468e-05, "loss": 0.6784, "step": 1600 }, { "epoch": 3.42, "grad_norm": 0.19910399615764618, "learning_rate": 1.5778251599147122e-05, "loss": 0.6695, "step": 1605 }, { "epoch": 3.43, "grad_norm": 1.0947527885437012, "learning_rate": 1.5671641791044777e-05, "loss": 0.6715, "step": 1610 }, { "epoch": 3.44, "grad_norm": 0.29615241289138794, "learning_rate": 1.5565031982942432e-05, "loss": 0.6926, "step": 1615 }, { "epoch": 3.45, "grad_norm": 0.7357563376426697, "learning_rate": 1.5458422174840087e-05, "loss": 0.676, "step": 1620 }, { "epoch": 3.46, "grad_norm": 0.3963560461997986, "learning_rate": 1.535181236673774e-05, "loss": 0.6943, "step": 1625 }, { "epoch": 3.48, "grad_norm": 0.7890651226043701, "learning_rate": 1.5245202558635396e-05, "loss": 0.6674, "step": 1630 }, { "epoch": 3.49, "grad_norm": 0.34833666682243347, "learning_rate": 1.5138592750533051e-05, "loss": 0.6803, "step": 1635 }, { "epoch": 3.5, "grad_norm": 0.3659195899963379, "learning_rate": 1.5031982942430706e-05, "loss": 0.6559, "step": 1640 }, { "epoch": 3.51, "grad_norm": 0.4654899835586548, "learning_rate": 1.4925373134328357e-05, "loss": 0.6859, "step": 1645 }, { "epoch": 3.52, "grad_norm": 0.29908713698387146, "learning_rate": 1.4818763326226012e-05, "loss": 0.6877, "step": 1650 }, { "epoch": 3.53, "grad_norm": 0.20764610171318054, "learning_rate": 1.4712153518123666e-05, "loss": 0.6714, "step": 1655 }, { "epoch": 3.54, "grad_norm": 0.3057771623134613, "learning_rate": 1.4605543710021321e-05, "loss": 0.68, "step": 1660 }, { "epoch": 3.55, "grad_norm": 0.4783516824245453, "learning_rate": 1.4498933901918976e-05, "loss": 0.6582, "step": 1665 }, { "epoch": 3.56, "grad_norm": 0.44634905457496643, "learning_rate": 1.439232409381663e-05, "loss": 0.6602, "step": 1670 }, { "epoch": 3.57, "grad_norm": 0.23565933108329773, "learning_rate": 1.4285714285714285e-05, "loss": 0.6701, "step": 1675 }, { "epoch": 3.58, "grad_norm": 0.3964284658432007, "learning_rate": 1.417910447761194e-05, "loss": 0.6803, "step": 1680 }, { "epoch": 3.59, "grad_norm": 0.1792701780796051, "learning_rate": 1.4072494669509595e-05, "loss": 0.69, "step": 1685 }, { "epoch": 3.6, "grad_norm": 0.2967628538608551, "learning_rate": 1.396588486140725e-05, "loss": 0.6374, "step": 1690 }, { "epoch": 3.61, "grad_norm": 0.2924692928791046, "learning_rate": 1.3859275053304904e-05, "loss": 0.6933, "step": 1695 }, { "epoch": 3.62, "grad_norm": 0.44005072116851807, "learning_rate": 1.3752665245202559e-05, "loss": 0.6913, "step": 1700 }, { "epoch": 3.64, "grad_norm": 0.27338188886642456, "learning_rate": 1.3646055437100214e-05, "loss": 0.6607, "step": 1705 }, { "epoch": 3.65, "grad_norm": 0.54692143201828, "learning_rate": 1.3539445628997869e-05, "loss": 0.6474, "step": 1710 }, { "epoch": 3.66, "grad_norm": 0.2899147570133209, "learning_rate": 1.3432835820895523e-05, "loss": 0.6617, "step": 1715 }, { "epoch": 3.67, "grad_norm": 0.853637158870697, "learning_rate": 1.3326226012793178e-05, "loss": 0.6652, "step": 1720 }, { "epoch": 3.68, "grad_norm": 0.16075211763381958, "learning_rate": 1.3219616204690833e-05, "loss": 0.6644, "step": 1725 }, { "epoch": 3.69, "grad_norm": 0.5768300890922546, "learning_rate": 1.3113006396588488e-05, "loss": 0.6407, "step": 1730 }, { "epoch": 3.7, "grad_norm": 0.36618074774742126, "learning_rate": 1.3006396588486142e-05, "loss": 0.631, "step": 1735 }, { "epoch": 3.71, "grad_norm": 0.393250048160553, "learning_rate": 1.2899786780383797e-05, "loss": 0.6894, "step": 1740 }, { "epoch": 3.72, "grad_norm": 0.16580410301685333, "learning_rate": 1.2793176972281452e-05, "loss": 0.6921, "step": 1745 }, { "epoch": 3.73, "grad_norm": 0.3606794476509094, "learning_rate": 1.2686567164179105e-05, "loss": 0.6484, "step": 1750 }, { "epoch": 3.74, "grad_norm": 0.28691309690475464, "learning_rate": 1.257995735607676e-05, "loss": 0.6781, "step": 1755 }, { "epoch": 3.75, "grad_norm": 0.6777899861335754, "learning_rate": 1.2473347547974414e-05, "loss": 0.6987, "step": 1760 }, { "epoch": 3.76, "grad_norm": 0.2805548310279846, "learning_rate": 1.2366737739872069e-05, "loss": 0.661, "step": 1765 }, { "epoch": 3.77, "grad_norm": 0.44818803668022156, "learning_rate": 1.2260127931769722e-05, "loss": 0.6558, "step": 1770 }, { "epoch": 3.78, "grad_norm": 0.564558207988739, "learning_rate": 1.2153518123667377e-05, "loss": 0.6883, "step": 1775 }, { "epoch": 3.8, "grad_norm": 0.49266985058784485, "learning_rate": 1.2046908315565032e-05, "loss": 0.6723, "step": 1780 }, { "epoch": 3.81, "grad_norm": 0.5589272379875183, "learning_rate": 1.1940298507462686e-05, "loss": 0.6727, "step": 1785 }, { "epoch": 3.82, "grad_norm": 0.4831165373325348, "learning_rate": 1.1833688699360341e-05, "loss": 0.6545, "step": 1790 }, { "epoch": 3.83, "grad_norm": 0.2775633931159973, "learning_rate": 1.1727078891257996e-05, "loss": 0.6586, "step": 1795 }, { "epoch": 3.84, "grad_norm": 0.43390023708343506, "learning_rate": 1.162046908315565e-05, "loss": 0.671, "step": 1800 }, { "epoch": 3.85, "grad_norm": 0.1472308337688446, "learning_rate": 1.1513859275053305e-05, "loss": 0.6629, "step": 1805 }, { "epoch": 3.86, "grad_norm": 0.21258753538131714, "learning_rate": 1.140724946695096e-05, "loss": 0.7005, "step": 1810 }, { "epoch": 3.87, "grad_norm": 0.13851375877857208, "learning_rate": 1.1300639658848615e-05, "loss": 0.6705, "step": 1815 }, { "epoch": 3.88, "grad_norm": 0.2994244694709778, "learning_rate": 1.119402985074627e-05, "loss": 0.6877, "step": 1820 }, { "epoch": 3.89, "grad_norm": 0.3223628103733063, "learning_rate": 1.1087420042643924e-05, "loss": 0.6503, "step": 1825 }, { "epoch": 3.9, "grad_norm": 1.3159940242767334, "learning_rate": 1.0980810234541579e-05, "loss": 0.6736, "step": 1830 }, { "epoch": 3.91, "grad_norm": 0.47484922409057617, "learning_rate": 1.0874200426439234e-05, "loss": 0.6631, "step": 1835 }, { "epoch": 3.92, "grad_norm": 1.052727222442627, "learning_rate": 1.0767590618336887e-05, "loss": 0.6596, "step": 1840 }, { "epoch": 3.93, "grad_norm": 0.315295934677124, "learning_rate": 1.0660980810234541e-05, "loss": 0.686, "step": 1845 }, { "epoch": 3.94, "grad_norm": 0.2225022166967392, "learning_rate": 1.0554371002132196e-05, "loss": 0.6945, "step": 1850 }, { "epoch": 3.96, "grad_norm": 0.28455567359924316, "learning_rate": 1.0447761194029851e-05, "loss": 0.6731, "step": 1855 }, { "epoch": 3.97, "grad_norm": 0.34978070855140686, "learning_rate": 1.0341151385927506e-05, "loss": 0.6743, "step": 1860 }, { "epoch": 3.98, "grad_norm": 0.44299939274787903, "learning_rate": 1.023454157782516e-05, "loss": 0.7111, "step": 1865 }, { "epoch": 3.99, "grad_norm": 0.7539863586425781, "learning_rate": 1.0127931769722815e-05, "loss": 0.6464, "step": 1870 }, { "epoch": 4.0, "grad_norm": 0.5005848407745361, "learning_rate": 1.002132196162047e-05, "loss": 0.6937, "step": 1875 }, { "epoch": 4.0, "eval_accuracy": 0.5789333333333333, "eval_loss": 0.6769688129425049, "eval_runtime": 17.8906, "eval_samples_per_second": 209.608, "eval_steps_per_second": 6.596, "step": 1876 }, { "epoch": 4.01, "grad_norm": 0.39526981115341187, "learning_rate": 9.914712153518125e-06, "loss": 0.6529, "step": 1880 }, { "epoch": 4.02, "grad_norm": 0.5626300573348999, "learning_rate": 9.80810234541578e-06, "loss": 0.6536, "step": 1885 }, { "epoch": 4.03, "grad_norm": 1.011904239654541, "learning_rate": 9.701492537313434e-06, "loss": 0.6832, "step": 1890 }, { "epoch": 4.04, "grad_norm": 0.20950503647327423, "learning_rate": 9.594882729211089e-06, "loss": 0.6743, "step": 1895 }, { "epoch": 4.05, "grad_norm": 0.4118889570236206, "learning_rate": 9.488272921108744e-06, "loss": 0.6909, "step": 1900 }, { "epoch": 4.06, "grad_norm": 0.18626834452152252, "learning_rate": 9.381663113006398e-06, "loss": 0.705, "step": 1905 }, { "epoch": 4.07, "grad_norm": 0.2255084216594696, "learning_rate": 9.275053304904053e-06, "loss": 0.6689, "step": 1910 }, { "epoch": 4.08, "grad_norm": 0.32827380299568176, "learning_rate": 9.168443496801706e-06, "loss": 0.6667, "step": 1915 }, { "epoch": 4.09, "grad_norm": 0.452370285987854, "learning_rate": 9.06183368869936e-06, "loss": 0.6925, "step": 1920 }, { "epoch": 4.1, "grad_norm": 0.42745348811149597, "learning_rate": 8.955223880597016e-06, "loss": 0.6615, "step": 1925 }, { "epoch": 4.12, "grad_norm": 0.1782078742980957, "learning_rate": 8.84861407249467e-06, "loss": 0.6755, "step": 1930 }, { "epoch": 4.13, "grad_norm": 1.2486790418624878, "learning_rate": 8.742004264392323e-06, "loss": 0.6598, "step": 1935 }, { "epoch": 4.14, "grad_norm": 0.19132310152053833, "learning_rate": 8.635394456289978e-06, "loss": 0.6543, "step": 1940 }, { "epoch": 4.15, "grad_norm": 0.796686589717865, "learning_rate": 8.528784648187633e-06, "loss": 0.6576, "step": 1945 }, { "epoch": 4.16, "grad_norm": 0.17676232755184174, "learning_rate": 8.422174840085288e-06, "loss": 0.6948, "step": 1950 }, { "epoch": 4.17, "grad_norm": 0.22592103481292725, "learning_rate": 8.315565031982942e-06, "loss": 0.678, "step": 1955 }, { "epoch": 4.18, "grad_norm": 0.3258933126926422, "learning_rate": 8.208955223880597e-06, "loss": 0.6767, "step": 1960 }, { "epoch": 4.19, "grad_norm": 0.2841905653476715, "learning_rate": 8.102345415778252e-06, "loss": 0.6907, "step": 1965 }, { "epoch": 4.2, "grad_norm": 0.19143372774124146, "learning_rate": 7.995735607675907e-06, "loss": 0.6424, "step": 1970 }, { "epoch": 4.21, "grad_norm": 0.5144029259681702, "learning_rate": 7.889125799573561e-06, "loss": 0.6666, "step": 1975 }, { "epoch": 4.22, "grad_norm": 0.20798300206661224, "learning_rate": 7.782515991471216e-06, "loss": 0.6561, "step": 1980 }, { "epoch": 4.23, "grad_norm": 0.5507712960243225, "learning_rate": 7.67590618336887e-06, "loss": 0.6582, "step": 1985 }, { "epoch": 4.24, "grad_norm": 0.2972014546394348, "learning_rate": 7.5692963752665255e-06, "loss": 0.6903, "step": 1990 }, { "epoch": 4.25, "grad_norm": 0.5243217349052429, "learning_rate": 7.4626865671641785e-06, "loss": 0.6516, "step": 1995 }, { "epoch": 4.26, "grad_norm": 0.2954091727733612, "learning_rate": 7.356076759061833e-06, "loss": 0.6873, "step": 2000 }, { "epoch": 4.28, "grad_norm": 0.42244890332221985, "learning_rate": 7.249466950959488e-06, "loss": 0.6707, "step": 2005 }, { "epoch": 4.29, "grad_norm": 0.7245458364486694, "learning_rate": 7.142857142857143e-06, "loss": 0.6705, "step": 2010 }, { "epoch": 4.3, "grad_norm": 0.5897809863090515, "learning_rate": 7.0362473347547975e-06, "loss": 0.6536, "step": 2015 }, { "epoch": 4.31, "grad_norm": 0.7268871665000916, "learning_rate": 6.929637526652452e-06, "loss": 0.6987, "step": 2020 }, { "epoch": 4.32, "grad_norm": 0.7317787408828735, "learning_rate": 6.823027718550107e-06, "loss": 0.67, "step": 2025 }, { "epoch": 4.33, "grad_norm": 0.21886880695819855, "learning_rate": 6.716417910447762e-06, "loss": 0.667, "step": 2030 }, { "epoch": 4.34, "grad_norm": 0.3600420355796814, "learning_rate": 6.609808102345416e-06, "loss": 0.6832, "step": 2035 }, { "epoch": 4.35, "grad_norm": 0.708007276058197, "learning_rate": 6.503198294243071e-06, "loss": 0.6739, "step": 2040 }, { "epoch": 4.36, "grad_norm": 1.6473708152770996, "learning_rate": 6.396588486140726e-06, "loss": 0.6487, "step": 2045 }, { "epoch": 4.37, "grad_norm": 0.6396905779838562, "learning_rate": 6.28997867803838e-06, "loss": 0.6816, "step": 2050 }, { "epoch": 4.38, "grad_norm": 0.16320891678333282, "learning_rate": 6.1833688699360345e-06, "loss": 0.6707, "step": 2055 }, { "epoch": 4.39, "grad_norm": 0.5632389187812805, "learning_rate": 6.076759061833688e-06, "loss": 0.6798, "step": 2060 }, { "epoch": 4.4, "grad_norm": 0.322733610868454, "learning_rate": 5.970149253731343e-06, "loss": 0.6486, "step": 2065 }, { "epoch": 4.41, "grad_norm": 0.9542133212089539, "learning_rate": 5.863539445628998e-06, "loss": 0.6671, "step": 2070 }, { "epoch": 4.42, "grad_norm": 0.21736344695091248, "learning_rate": 5.756929637526653e-06, "loss": 0.6549, "step": 2075 }, { "epoch": 4.43, "grad_norm": 0.28586217761039734, "learning_rate": 5.650319829424307e-06, "loss": 0.6457, "step": 2080 }, { "epoch": 4.45, "grad_norm": 0.7068216800689697, "learning_rate": 5.543710021321962e-06, "loss": 0.6562, "step": 2085 }, { "epoch": 4.46, "grad_norm": 0.2828645408153534, "learning_rate": 5.437100213219617e-06, "loss": 0.6714, "step": 2090 }, { "epoch": 4.47, "grad_norm": 0.4406694173812866, "learning_rate": 5.330490405117271e-06, "loss": 0.6552, "step": 2095 }, { "epoch": 4.48, "grad_norm": 0.3138512670993805, "learning_rate": 5.2238805970149255e-06, "loss": 0.646, "step": 2100 }, { "epoch": 4.49, "grad_norm": 0.3833194971084595, "learning_rate": 5.11727078891258e-06, "loss": 0.6898, "step": 2105 }, { "epoch": 4.5, "grad_norm": 0.5406767129898071, "learning_rate": 5.010660980810235e-06, "loss": 0.6603, "step": 2110 }, { "epoch": 4.51, "grad_norm": 0.3065316379070282, "learning_rate": 4.90405117270789e-06, "loss": 0.688, "step": 2115 }, { "epoch": 4.52, "grad_norm": 0.17264671623706818, "learning_rate": 4.797441364605544e-06, "loss": 0.6723, "step": 2120 }, { "epoch": 4.53, "grad_norm": 0.2249528020620346, "learning_rate": 4.690831556503199e-06, "loss": 0.6521, "step": 2125 }, { "epoch": 4.54, "grad_norm": 0.21267692744731903, "learning_rate": 4.584221748400853e-06, "loss": 0.6599, "step": 2130 }, { "epoch": 4.55, "grad_norm": 0.24561788141727448, "learning_rate": 4.477611940298508e-06, "loss": 0.6713, "step": 2135 }, { "epoch": 4.56, "grad_norm": 1.0180352926254272, "learning_rate": 4.371002132196162e-06, "loss": 0.6565, "step": 2140 }, { "epoch": 4.57, "grad_norm": 0.5230167508125305, "learning_rate": 4.264392324093816e-06, "loss": 0.6575, "step": 2145 }, { "epoch": 4.58, "grad_norm": 0.6080590486526489, "learning_rate": 4.157782515991471e-06, "loss": 0.6781, "step": 2150 }, { "epoch": 4.59, "grad_norm": 0.394972026348114, "learning_rate": 4.051172707889126e-06, "loss": 0.6932, "step": 2155 }, { "epoch": 4.61, "grad_norm": 0.9278691411018372, "learning_rate": 3.944562899786781e-06, "loss": 0.6283, "step": 2160 }, { "epoch": 4.62, "grad_norm": 0.3256765305995941, "learning_rate": 3.837953091684435e-06, "loss": 0.6811, "step": 2165 }, { "epoch": 4.63, "grad_norm": 0.43106502294540405, "learning_rate": 3.7313432835820893e-06, "loss": 0.6642, "step": 2170 }, { "epoch": 4.64, "grad_norm": 0.26512110233306885, "learning_rate": 3.624733475479744e-06, "loss": 0.6548, "step": 2175 }, { "epoch": 4.65, "grad_norm": 0.7567458152770996, "learning_rate": 3.5181236673773987e-06, "loss": 0.693, "step": 2180 }, { "epoch": 4.66, "grad_norm": 0.6813324689865112, "learning_rate": 3.4115138592750535e-06, "loss": 0.6453, "step": 2185 }, { "epoch": 4.67, "grad_norm": 0.5289140939712524, "learning_rate": 3.304904051172708e-06, "loss": 0.6952, "step": 2190 }, { "epoch": 4.68, "grad_norm": 0.32925984263420105, "learning_rate": 3.198294243070363e-06, "loss": 0.6705, "step": 2195 }, { "epoch": 4.69, "grad_norm": 0.15043401718139648, "learning_rate": 3.0916844349680173e-06, "loss": 0.6799, "step": 2200 }, { "epoch": 4.7, "grad_norm": 0.21513478457927704, "learning_rate": 2.9850746268656716e-06, "loss": 0.6346, "step": 2205 }, { "epoch": 4.71, "grad_norm": 0.5901110172271729, "learning_rate": 2.8784648187633263e-06, "loss": 0.6564, "step": 2210 }, { "epoch": 4.72, "grad_norm": 0.18733665347099304, "learning_rate": 2.771855010660981e-06, "loss": 0.6512, "step": 2215 }, { "epoch": 4.73, "grad_norm": 0.7170325517654419, "learning_rate": 2.6652452025586354e-06, "loss": 0.6693, "step": 2220 }, { "epoch": 4.74, "grad_norm": 0.5361260175704956, "learning_rate": 2.55863539445629e-06, "loss": 0.6719, "step": 2225 }, { "epoch": 4.75, "grad_norm": 0.5559740662574768, "learning_rate": 2.452025586353945e-06, "loss": 0.6548, "step": 2230 }, { "epoch": 4.77, "grad_norm": 0.4540775716304779, "learning_rate": 2.3454157782515996e-06, "loss": 0.6915, "step": 2235 }, { "epoch": 4.78, "grad_norm": 0.16333653032779694, "learning_rate": 2.238805970149254e-06, "loss": 0.661, "step": 2240 }, { "epoch": 4.79, "grad_norm": 1.2140674591064453, "learning_rate": 2.132196162046908e-06, "loss": 0.6485, "step": 2245 }, { "epoch": 4.8, "grad_norm": 0.48497453331947327, "learning_rate": 2.025586353944563e-06, "loss": 0.682, "step": 2250 }, { "epoch": 4.81, "grad_norm": 0.5769209265708923, "learning_rate": 1.9189765458422177e-06, "loss": 0.6674, "step": 2255 }, { "epoch": 4.82, "grad_norm": 0.3388761579990387, "learning_rate": 1.812366737739872e-06, "loss": 0.6814, "step": 2260 }, { "epoch": 4.83, "grad_norm": 0.380491703748703, "learning_rate": 1.7057569296375267e-06, "loss": 0.6464, "step": 2265 }, { "epoch": 4.84, "grad_norm": 0.4902530610561371, "learning_rate": 1.5991471215351815e-06, "loss": 0.6572, "step": 2270 }, { "epoch": 4.85, "grad_norm": 1.4383553266525269, "learning_rate": 1.4925373134328358e-06, "loss": 0.6834, "step": 2275 }, { "epoch": 4.86, "grad_norm": 0.4202342927455902, "learning_rate": 1.3859275053304905e-06, "loss": 0.6597, "step": 2280 }, { "epoch": 4.87, "grad_norm": 0.6906626224517822, "learning_rate": 1.279317697228145e-06, "loss": 0.6891, "step": 2285 }, { "epoch": 4.88, "grad_norm": 0.36534583568573, "learning_rate": 1.1727078891257998e-06, "loss": 0.6654, "step": 2290 }, { "epoch": 4.89, "grad_norm": 0.22818098962306976, "learning_rate": 1.066098081023454e-06, "loss": 0.641, "step": 2295 }, { "epoch": 4.9, "grad_norm": 1.2233145236968994, "learning_rate": 9.594882729211088e-07, "loss": 0.6369, "step": 2300 }, { "epoch": 4.91, "grad_norm": 0.2983638346195221, "learning_rate": 8.528784648187634e-07, "loss": 0.6589, "step": 2305 }, { "epoch": 4.93, "grad_norm": 0.5600255131721497, "learning_rate": 7.462686567164179e-07, "loss": 0.642, "step": 2310 }, { "epoch": 4.94, "grad_norm": 0.271295964717865, "learning_rate": 6.396588486140725e-07, "loss": 0.6882, "step": 2315 }, { "epoch": 4.95, "grad_norm": 0.3047320246696472, "learning_rate": 5.33049040511727e-07, "loss": 0.6622, "step": 2320 }, { "epoch": 4.96, "grad_norm": 0.2932121753692627, "learning_rate": 4.264392324093817e-07, "loss": 0.6658, "step": 2325 }, { "epoch": 4.97, "grad_norm": 0.9874823689460754, "learning_rate": 3.1982942430703626e-07, "loss": 0.6619, "step": 2330 }, { "epoch": 4.98, "grad_norm": 0.6264281272888184, "learning_rate": 2.1321961620469084e-07, "loss": 0.678, "step": 2335 }, { "epoch": 4.99, "grad_norm": 0.49118179082870483, "learning_rate": 1.0660980810234542e-07, "loss": 0.702, "step": 2340 }, { "epoch": 5.0, "grad_norm": 0.45167651772499084, "learning_rate": 0.0, "loss": 0.6723, "step": 2345 }, { "epoch": 5.0, "eval_accuracy": 0.5736, "eval_loss": 0.6760911345481873, "eval_runtime": 17.4911, "eval_samples_per_second": 214.395, "eval_steps_per_second": 6.746, "step": 2345 }, { "epoch": 5.0, "step": 2345, "total_flos": 5.8118992210944e+18, "train_loss": 0.6811762883973275, "train_runtime": 808.2731, "train_samples_per_second": 92.79, "train_steps_per_second": 2.901 } ], "logging_steps": 5, "max_steps": 2345, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 5.8118992210944e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }