{ "best_metric": 0.8714733542319749, "best_model_checkpoint": "vit-base-patch16-224-pure-ViT/checkpoint-2020", "epoch": 10.0, "eval_steps": 500, "global_step": 2020, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "grad_norm": 2.127864122390747, "learning_rate": 2.4752475247524753e-06, "loss": 0.7058, "step": 10 }, { "epoch": 0.1, "grad_norm": 1.910086750984192, "learning_rate": 4.950495049504951e-06, "loss": 0.6943, "step": 20 }, { "epoch": 0.15, "grad_norm": 2.0279719829559326, "learning_rate": 7.4257425742574256e-06, "loss": 0.6634, "step": 30 }, { "epoch": 0.2, "grad_norm": 2.858572006225586, "learning_rate": 9.900990099009901e-06, "loss": 0.6503, "step": 40 }, { "epoch": 0.25, "grad_norm": 2.1356213092803955, "learning_rate": 1.2376237623762377e-05, "loss": 0.6041, "step": 50 }, { "epoch": 0.3, "grad_norm": 2.2456114292144775, "learning_rate": 1.4851485148514851e-05, "loss": 0.5789, "step": 60 }, { "epoch": 0.35, "grad_norm": 1.5485836267471313, "learning_rate": 1.7326732673267325e-05, "loss": 0.5591, "step": 70 }, { "epoch": 0.4, "grad_norm": 2.320420742034912, "learning_rate": 1.9801980198019803e-05, "loss": 0.5324, "step": 80 }, { "epoch": 0.45, "grad_norm": 2.417330265045166, "learning_rate": 2.227722772277228e-05, "loss": 0.5268, "step": 90 }, { "epoch": 0.5, "grad_norm": 2.6199421882629395, "learning_rate": 2.4752475247524754e-05, "loss": 0.5218, "step": 100 }, { "epoch": 0.54, "grad_norm": 2.233579158782959, "learning_rate": 2.722772277227723e-05, "loss": 0.5062, "step": 110 }, { "epoch": 0.59, "grad_norm": 5.436655044555664, "learning_rate": 2.9702970297029702e-05, "loss": 0.5447, "step": 120 }, { "epoch": 0.64, "grad_norm": 2.305189371109009, "learning_rate": 3.217821782178218e-05, "loss": 0.5187, "step": 130 }, { "epoch": 0.69, "grad_norm": 2.4152462482452393, "learning_rate": 3.465346534653465e-05, "loss": 0.5011, "step": 140 }, { "epoch": 0.74, "grad_norm": 2.06062912940979, "learning_rate": 3.712871287128713e-05, "loss": 0.5123, "step": 150 }, { "epoch": 0.79, "grad_norm": 3.7219951152801514, "learning_rate": 3.9603960396039605e-05, "loss": 0.5124, "step": 160 }, { "epoch": 0.84, "grad_norm": 2.5283167362213135, "learning_rate": 4.207920792079208e-05, "loss": 0.4934, "step": 170 }, { "epoch": 0.89, "grad_norm": 2.957411050796509, "learning_rate": 4.455445544554456e-05, "loss": 0.5072, "step": 180 }, { "epoch": 0.94, "grad_norm": 2.1534793376922607, "learning_rate": 4.702970297029703e-05, "loss": 0.4785, "step": 190 }, { "epoch": 0.99, "grad_norm": 5.588723659515381, "learning_rate": 4.950495049504951e-05, "loss": 0.4676, "step": 200 }, { "epoch": 1.0, "eval_accuracy": 0.8094740508533612, "eval_loss": 0.4041886329650879, "eval_runtime": 13.1239, "eval_samples_per_second": 218.761, "eval_steps_per_second": 6.858, "step": 202 }, { "epoch": 1.04, "grad_norm": 2.034203290939331, "learning_rate": 4.977997799779978e-05, "loss": 0.4653, "step": 210 }, { "epoch": 1.09, "grad_norm": 1.753670573234558, "learning_rate": 4.950495049504951e-05, "loss": 0.4843, "step": 220 }, { "epoch": 1.14, "grad_norm": 2.03153920173645, "learning_rate": 4.9229922992299234e-05, "loss": 0.4707, "step": 230 }, { "epoch": 1.19, "grad_norm": 2.570380926132202, "learning_rate": 4.895489548954896e-05, "loss": 0.4922, "step": 240 }, { "epoch": 1.24, "grad_norm": 1.440894603729248, "learning_rate": 4.867986798679868e-05, "loss": 0.4689, "step": 250 }, { "epoch": 1.29, "grad_norm": 1.8321127891540527, "learning_rate": 4.8404840484048406e-05, "loss": 0.4417, "step": 260 }, { "epoch": 1.34, "grad_norm": 1.853463888168335, "learning_rate": 4.812981298129813e-05, "loss": 0.4847, "step": 270 }, { "epoch": 1.39, "grad_norm": 1.8001580238342285, "learning_rate": 4.785478547854786e-05, "loss": 0.4815, "step": 280 }, { "epoch": 1.44, "grad_norm": 1.7666268348693848, "learning_rate": 4.7579757975797585e-05, "loss": 0.4424, "step": 290 }, { "epoch": 1.49, "grad_norm": 1.373815655708313, "learning_rate": 4.730473047304731e-05, "loss": 0.4604, "step": 300 }, { "epoch": 1.53, "grad_norm": 1.9827806949615479, "learning_rate": 4.702970297029703e-05, "loss": 0.4472, "step": 310 }, { "epoch": 1.58, "grad_norm": 4.021182060241699, "learning_rate": 4.675467546754676e-05, "loss": 0.4614, "step": 320 }, { "epoch": 1.63, "grad_norm": 1.6225961446762085, "learning_rate": 4.647964796479648e-05, "loss": 0.4679, "step": 330 }, { "epoch": 1.68, "grad_norm": 2.3169050216674805, "learning_rate": 4.62046204620462e-05, "loss": 0.4268, "step": 340 }, { "epoch": 1.73, "grad_norm": 2.3600728511810303, "learning_rate": 4.592959295929593e-05, "loss": 0.4601, "step": 350 }, { "epoch": 1.78, "grad_norm": 2.267119884490967, "learning_rate": 4.5654565456545655e-05, "loss": 0.4535, "step": 360 }, { "epoch": 1.83, "grad_norm": 2.0034701824188232, "learning_rate": 4.537953795379538e-05, "loss": 0.4404, "step": 370 }, { "epoch": 1.88, "grad_norm": 2.4724442958831787, "learning_rate": 4.510451045104511e-05, "loss": 0.4393, "step": 380 }, { "epoch": 1.93, "grad_norm": 3.4427175521850586, "learning_rate": 4.4829482948294834e-05, "loss": 0.4411, "step": 390 }, { "epoch": 1.98, "grad_norm": 1.7875171899795532, "learning_rate": 4.455445544554456e-05, "loss": 0.4605, "step": 400 }, { "epoch": 2.0, "eval_accuracy": 0.8376872169975619, "eval_loss": 0.3675467073917389, "eval_runtime": 13.0452, "eval_samples_per_second": 220.081, "eval_steps_per_second": 6.899, "step": 404 }, { "epoch": 2.03, "grad_norm": 1.3155332803726196, "learning_rate": 4.427942794279428e-05, "loss": 0.4514, "step": 410 }, { "epoch": 2.08, "grad_norm": 1.455605149269104, "learning_rate": 4.4004400440044006e-05, "loss": 0.4405, "step": 420 }, { "epoch": 2.13, "grad_norm": 3.8905019760131836, "learning_rate": 4.372937293729373e-05, "loss": 0.418, "step": 430 }, { "epoch": 2.18, "grad_norm": 1.4562945365905762, "learning_rate": 4.345434543454346e-05, "loss": 0.4437, "step": 440 }, { "epoch": 2.23, "grad_norm": 1.218812108039856, "learning_rate": 4.3179317931793185e-05, "loss": 0.422, "step": 450 }, { "epoch": 2.28, "grad_norm": 1.373265266418457, "learning_rate": 4.2904290429042904e-05, "loss": 0.3849, "step": 460 }, { "epoch": 2.33, "grad_norm": 1.9199002981185913, "learning_rate": 4.262926292629263e-05, "loss": 0.4121, "step": 470 }, { "epoch": 2.38, "grad_norm": 1.7126922607421875, "learning_rate": 4.2354235423542356e-05, "loss": 0.4351, "step": 480 }, { "epoch": 2.43, "grad_norm": 1.6914509534835815, "learning_rate": 4.207920792079208e-05, "loss": 0.4296, "step": 490 }, { "epoch": 2.48, "grad_norm": 3.3303816318511963, "learning_rate": 4.18041804180418e-05, "loss": 0.4435, "step": 500 }, { "epoch": 2.52, "grad_norm": 1.5210704803466797, "learning_rate": 4.152915291529153e-05, "loss": 0.4341, "step": 510 }, { "epoch": 2.57, "grad_norm": 2.497227430343628, "learning_rate": 4.1254125412541255e-05, "loss": 0.4113, "step": 520 }, { "epoch": 2.62, "grad_norm": 1.4309931993484497, "learning_rate": 4.097909790979098e-05, "loss": 0.4353, "step": 530 }, { "epoch": 2.67, "grad_norm": 1.4119651317596436, "learning_rate": 4.070407040704071e-05, "loss": 0.4356, "step": 540 }, { "epoch": 2.72, "grad_norm": 1.926058292388916, "learning_rate": 4.042904290429043e-05, "loss": 0.4124, "step": 550 }, { "epoch": 2.77, "grad_norm": 1.4232733249664307, "learning_rate": 4.015401540154016e-05, "loss": 0.3981, "step": 560 }, { "epoch": 2.82, "grad_norm": 2.521343469619751, "learning_rate": 3.987898789878988e-05, "loss": 0.4113, "step": 570 }, { "epoch": 2.87, "grad_norm": 2.658799409866333, "learning_rate": 3.9603960396039605e-05, "loss": 0.4261, "step": 580 }, { "epoch": 2.92, "grad_norm": 1.6421788930892944, "learning_rate": 3.932893289328933e-05, "loss": 0.4342, "step": 590 }, { "epoch": 2.97, "grad_norm": 1.5527944564819336, "learning_rate": 3.905390539053906e-05, "loss": 0.4012, "step": 600 }, { "epoch": 3.0, "eval_accuracy": 0.8505747126436781, "eval_loss": 0.3486035168170929, "eval_runtime": 13.2008, "eval_samples_per_second": 217.487, "eval_steps_per_second": 6.818, "step": 606 }, { "epoch": 3.02, "grad_norm": 1.2584953308105469, "learning_rate": 3.877887788778878e-05, "loss": 0.4146, "step": 610 }, { "epoch": 3.07, "grad_norm": 2.232922315597534, "learning_rate": 3.8503850385038503e-05, "loss": 0.4259, "step": 620 }, { "epoch": 3.12, "grad_norm": 1.4297153949737549, "learning_rate": 3.822882288228823e-05, "loss": 0.3939, "step": 630 }, { "epoch": 3.17, "grad_norm": 1.4966973066329956, "learning_rate": 3.7953795379537956e-05, "loss": 0.4267, "step": 640 }, { "epoch": 3.22, "grad_norm": 2.102534055709839, "learning_rate": 3.767876787678768e-05, "loss": 0.4007, "step": 650 }, { "epoch": 3.27, "grad_norm": 1.261534571647644, "learning_rate": 3.74037403740374e-05, "loss": 0.3922, "step": 660 }, { "epoch": 3.32, "grad_norm": 1.6345826387405396, "learning_rate": 3.712871287128713e-05, "loss": 0.3908, "step": 670 }, { "epoch": 3.37, "grad_norm": 2.479656934738159, "learning_rate": 3.6853685368536854e-05, "loss": 0.4128, "step": 680 }, { "epoch": 3.42, "grad_norm": 1.147039532661438, "learning_rate": 3.657865786578658e-05, "loss": 0.3946, "step": 690 }, { "epoch": 3.47, "grad_norm": 1.3348190784454346, "learning_rate": 3.6303630363036307e-05, "loss": 0.382, "step": 700 }, { "epoch": 3.51, "grad_norm": 2.655010938644409, "learning_rate": 3.602860286028603e-05, "loss": 0.4011, "step": 710 }, { "epoch": 3.56, "grad_norm": 1.7104761600494385, "learning_rate": 3.575357535753576e-05, "loss": 0.362, "step": 720 }, { "epoch": 3.61, "grad_norm": 1.740533709526062, "learning_rate": 3.5478547854785485e-05, "loss": 0.3818, "step": 730 }, { "epoch": 3.66, "grad_norm": 1.6973657608032227, "learning_rate": 3.5203520352035205e-05, "loss": 0.3794, "step": 740 }, { "epoch": 3.71, "grad_norm": 1.7627190351486206, "learning_rate": 3.492849284928493e-05, "loss": 0.4216, "step": 750 }, { "epoch": 3.76, "grad_norm": 1.6848688125610352, "learning_rate": 3.465346534653465e-05, "loss": 0.4105, "step": 760 }, { "epoch": 3.81, "grad_norm": 1.3963148593902588, "learning_rate": 3.4378437843784377e-05, "loss": 0.4227, "step": 770 }, { "epoch": 3.86, "grad_norm": 1.3992750644683838, "learning_rate": 3.41034103410341e-05, "loss": 0.3937, "step": 780 }, { "epoch": 3.91, "grad_norm": 2.454942226409912, "learning_rate": 3.382838283828383e-05, "loss": 0.3917, "step": 790 }, { "epoch": 3.96, "grad_norm": 1.409183382987976, "learning_rate": 3.3553355335533555e-05, "loss": 0.3727, "step": 800 }, { "epoch": 4.0, "eval_accuracy": 0.8481365377917102, "eval_loss": 0.34128066897392273, "eval_runtime": 13.0732, "eval_samples_per_second": 219.609, "eval_steps_per_second": 6.884, "step": 808 }, { "epoch": 4.01, "grad_norm": 1.3102561235427856, "learning_rate": 3.327832783278328e-05, "loss": 0.3735, "step": 810 }, { "epoch": 4.06, "grad_norm": 1.6926517486572266, "learning_rate": 3.300330033003301e-05, "loss": 0.3925, "step": 820 }, { "epoch": 4.11, "grad_norm": 2.1403160095214844, "learning_rate": 3.272827282728273e-05, "loss": 0.38, "step": 830 }, { "epoch": 4.16, "grad_norm": 1.4817917346954346, "learning_rate": 3.2453245324532453e-05, "loss": 0.3588, "step": 840 }, { "epoch": 4.21, "grad_norm": 2.6763010025024414, "learning_rate": 3.217821782178218e-05, "loss": 0.3728, "step": 850 }, { "epoch": 4.26, "grad_norm": 2.9526965618133545, "learning_rate": 3.1903190319031906e-05, "loss": 0.3934, "step": 860 }, { "epoch": 4.31, "grad_norm": 1.8583509922027588, "learning_rate": 3.162816281628163e-05, "loss": 0.3603, "step": 870 }, { "epoch": 4.36, "grad_norm": 2.0919086933135986, "learning_rate": 3.135313531353136e-05, "loss": 0.3527, "step": 880 }, { "epoch": 4.41, "grad_norm": 2.18446946144104, "learning_rate": 3.1078107810781085e-05, "loss": 0.3735, "step": 890 }, { "epoch": 4.46, "grad_norm": 1.4681159257888794, "learning_rate": 3.0803080308030804e-05, "loss": 0.371, "step": 900 }, { "epoch": 4.5, "grad_norm": 1.7022130489349365, "learning_rate": 3.052805280528053e-05, "loss": 0.3642, "step": 910 }, { "epoch": 4.55, "grad_norm": 2.031651258468628, "learning_rate": 3.0253025302530253e-05, "loss": 0.3561, "step": 920 }, { "epoch": 4.6, "grad_norm": 2.2234113216400146, "learning_rate": 2.9977997799779976e-05, "loss": 0.3568, "step": 930 }, { "epoch": 4.65, "grad_norm": 1.4382578134536743, "learning_rate": 2.9702970297029702e-05, "loss": 0.3685, "step": 940 }, { "epoch": 4.7, "grad_norm": 1.374062180519104, "learning_rate": 2.942794279427943e-05, "loss": 0.3478, "step": 950 }, { "epoch": 4.75, "grad_norm": 1.6396955251693726, "learning_rate": 2.9152915291529155e-05, "loss": 0.3549, "step": 960 }, { "epoch": 4.8, "grad_norm": 2.2647740840911865, "learning_rate": 2.8877887788778878e-05, "loss": 0.3598, "step": 970 }, { "epoch": 4.85, "grad_norm": 1.7425304651260376, "learning_rate": 2.8602860286028604e-05, "loss": 0.3898, "step": 980 }, { "epoch": 4.9, "grad_norm": 1.8157033920288086, "learning_rate": 2.832783278327833e-05, "loss": 0.3747, "step": 990 }, { "epoch": 4.95, "grad_norm": 1.7114821672439575, "learning_rate": 2.8052805280528056e-05, "loss": 0.3655, "step": 1000 }, { "epoch": 5.0, "grad_norm": 2.7515342235565186, "learning_rate": 2.777777777777778e-05, "loss": 0.3482, "step": 1010 }, { "epoch": 5.0, "eval_accuracy": 0.8613723441309649, "eval_loss": 0.33393073081970215, "eval_runtime": 13.1135, "eval_samples_per_second": 218.934, "eval_steps_per_second": 6.863, "step": 1010 }, { "epoch": 5.05, "grad_norm": 2.1413614749908447, "learning_rate": 2.7502750275027505e-05, "loss": 0.3581, "step": 1020 }, { "epoch": 5.1, "grad_norm": 1.9296377897262573, "learning_rate": 2.722772277227723e-05, "loss": 0.3561, "step": 1030 }, { "epoch": 5.15, "grad_norm": 2.4745469093322754, "learning_rate": 2.6952695269526958e-05, "loss": 0.3617, "step": 1040 }, { "epoch": 5.2, "grad_norm": 2.292569160461426, "learning_rate": 2.667766776677668e-05, "loss": 0.353, "step": 1050 }, { "epoch": 5.25, "grad_norm": 2.5686182975769043, "learning_rate": 2.64026402640264e-05, "loss": 0.3437, "step": 1060 }, { "epoch": 5.3, "grad_norm": 1.8417789936065674, "learning_rate": 2.6127612761276126e-05, "loss": 0.3262, "step": 1070 }, { "epoch": 5.35, "grad_norm": 2.6065661907196045, "learning_rate": 2.5852585258525853e-05, "loss": 0.3617, "step": 1080 }, { "epoch": 5.4, "grad_norm": 2.017815113067627, "learning_rate": 2.557755775577558e-05, "loss": 0.34, "step": 1090 }, { "epoch": 5.45, "grad_norm": 1.7035880088806152, "learning_rate": 2.53025302530253e-05, "loss": 0.3282, "step": 1100 }, { "epoch": 5.5, "grad_norm": 2.314371347427368, "learning_rate": 2.5027502750275028e-05, "loss": 0.3385, "step": 1110 }, { "epoch": 5.54, "grad_norm": 1.7718466520309448, "learning_rate": 2.4752475247524754e-05, "loss": 0.3415, "step": 1120 }, { "epoch": 5.59, "grad_norm": 1.4826040267944336, "learning_rate": 2.447744774477448e-05, "loss": 0.3368, "step": 1130 }, { "epoch": 5.64, "grad_norm": 1.6457180976867676, "learning_rate": 2.4202420242024203e-05, "loss": 0.3774, "step": 1140 }, { "epoch": 5.69, "grad_norm": 2.1672353744506836, "learning_rate": 2.392739273927393e-05, "loss": 0.3556, "step": 1150 }, { "epoch": 5.74, "grad_norm": 1.5077142715454102, "learning_rate": 2.3652365236523656e-05, "loss": 0.34, "step": 1160 }, { "epoch": 5.79, "grad_norm": 1.6296086311340332, "learning_rate": 2.337733773377338e-05, "loss": 0.3325, "step": 1170 }, { "epoch": 5.84, "grad_norm": 2.324458599090576, "learning_rate": 2.31023102310231e-05, "loss": 0.3276, "step": 1180 }, { "epoch": 5.89, "grad_norm": 2.119600534439087, "learning_rate": 2.2827282728272828e-05, "loss": 0.3472, "step": 1190 }, { "epoch": 5.94, "grad_norm": 1.9447872638702393, "learning_rate": 2.2552255225522554e-05, "loss": 0.3702, "step": 1200 }, { "epoch": 5.99, "grad_norm": 1.564266562461853, "learning_rate": 2.227722772277228e-05, "loss": 0.354, "step": 1210 }, { "epoch": 6.0, "eval_accuracy": 0.8561476837338906, "eval_loss": 0.343646377325058, "eval_runtime": 13.2461, "eval_samples_per_second": 216.743, "eval_steps_per_second": 6.794, "step": 1212 }, { "epoch": 6.04, "grad_norm": 1.6272279024124146, "learning_rate": 2.2002200220022003e-05, "loss": 0.3351, "step": 1220 }, { "epoch": 6.09, "grad_norm": 1.7797123193740845, "learning_rate": 2.172717271727173e-05, "loss": 0.3326, "step": 1230 }, { "epoch": 6.14, "grad_norm": 1.8236846923828125, "learning_rate": 2.1452145214521452e-05, "loss": 0.3223, "step": 1240 }, { "epoch": 6.19, "grad_norm": 1.5644466876983643, "learning_rate": 2.1177117711771178e-05, "loss": 0.3195, "step": 1250 }, { "epoch": 6.24, "grad_norm": 1.7544844150543213, "learning_rate": 2.09020902090209e-05, "loss": 0.3086, "step": 1260 }, { "epoch": 6.29, "grad_norm": 1.6309467554092407, "learning_rate": 2.0627062706270627e-05, "loss": 0.3423, "step": 1270 }, { "epoch": 6.34, "grad_norm": 2.1340713500976562, "learning_rate": 2.0352035203520354e-05, "loss": 0.3361, "step": 1280 }, { "epoch": 6.39, "grad_norm": 1.5964475870132446, "learning_rate": 2.007700770077008e-05, "loss": 0.3254, "step": 1290 }, { "epoch": 6.44, "grad_norm": 1.533130168914795, "learning_rate": 1.9801980198019803e-05, "loss": 0.3294, "step": 1300 }, { "epoch": 6.49, "grad_norm": 1.8504540920257568, "learning_rate": 1.952695269526953e-05, "loss": 0.3146, "step": 1310 }, { "epoch": 6.53, "grad_norm": 2.3233487606048584, "learning_rate": 1.9251925192519252e-05, "loss": 0.3308, "step": 1320 }, { "epoch": 6.58, "grad_norm": 1.8644599914550781, "learning_rate": 1.8976897689768978e-05, "loss": 0.3213, "step": 1330 }, { "epoch": 6.63, "grad_norm": 1.5879623889923096, "learning_rate": 1.87018701870187e-05, "loss": 0.3121, "step": 1340 }, { "epoch": 6.68, "grad_norm": 1.4578536748886108, "learning_rate": 1.8426842684268427e-05, "loss": 0.3052, "step": 1350 }, { "epoch": 6.73, "grad_norm": 1.7176955938339233, "learning_rate": 1.8151815181518153e-05, "loss": 0.3236, "step": 1360 }, { "epoch": 6.78, "grad_norm": 2.3816351890563965, "learning_rate": 1.787678767876788e-05, "loss": 0.3168, "step": 1370 }, { "epoch": 6.83, "grad_norm": 2.024005889892578, "learning_rate": 1.7601760176017602e-05, "loss": 0.3594, "step": 1380 }, { "epoch": 6.88, "grad_norm": 1.6122448444366455, "learning_rate": 1.7326732673267325e-05, "loss": 0.3304, "step": 1390 }, { "epoch": 6.93, "grad_norm": 2.809938430786133, "learning_rate": 1.705170517051705e-05, "loss": 0.3376, "step": 1400 }, { "epoch": 6.98, "grad_norm": 2.1670022010803223, "learning_rate": 1.6776677667766778e-05, "loss": 0.3212, "step": 1410 }, { "epoch": 7.0, "eval_accuracy": 0.8533611981887844, "eval_loss": 0.3415059745311737, "eval_runtime": 13.0333, "eval_samples_per_second": 220.283, "eval_steps_per_second": 6.905, "step": 1414 }, { "epoch": 7.03, "grad_norm": 2.0041117668151855, "learning_rate": 1.6501650165016504e-05, "loss": 0.3216, "step": 1420 }, { "epoch": 7.08, "grad_norm": 1.5932248830795288, "learning_rate": 1.6226622662266227e-05, "loss": 0.2902, "step": 1430 }, { "epoch": 7.13, "grad_norm": 1.8543118238449097, "learning_rate": 1.5951595159515953e-05, "loss": 0.3383, "step": 1440 }, { "epoch": 7.18, "grad_norm": 2.0562777519226074, "learning_rate": 1.567656765676568e-05, "loss": 0.3155, "step": 1450 }, { "epoch": 7.23, "grad_norm": 2.037494421005249, "learning_rate": 1.5401540154015402e-05, "loss": 0.3104, "step": 1460 }, { "epoch": 7.28, "grad_norm": 1.9015072584152222, "learning_rate": 1.5126512651265127e-05, "loss": 0.3259, "step": 1470 }, { "epoch": 7.33, "grad_norm": 1.8102400302886963, "learning_rate": 1.4851485148514851e-05, "loss": 0.3087, "step": 1480 }, { "epoch": 7.38, "grad_norm": 2.07037353515625, "learning_rate": 1.4576457645764577e-05, "loss": 0.301, "step": 1490 }, { "epoch": 7.43, "grad_norm": 2.2085139751434326, "learning_rate": 1.4301430143014302e-05, "loss": 0.3115, "step": 1500 }, { "epoch": 7.48, "grad_norm": 2.438488721847534, "learning_rate": 1.4026402640264028e-05, "loss": 0.3406, "step": 1510 }, { "epoch": 7.52, "grad_norm": 1.43858003616333, "learning_rate": 1.3751375137513753e-05, "loss": 0.2928, "step": 1520 }, { "epoch": 7.57, "grad_norm": 1.8977525234222412, "learning_rate": 1.3476347634763479e-05, "loss": 0.3201, "step": 1530 }, { "epoch": 7.62, "grad_norm": 1.6580339670181274, "learning_rate": 1.32013201320132e-05, "loss": 0.2892, "step": 1540 }, { "epoch": 7.67, "grad_norm": 2.5153727531433105, "learning_rate": 1.2926292629262926e-05, "loss": 0.3087, "step": 1550 }, { "epoch": 7.72, "grad_norm": 2.4366941452026367, "learning_rate": 1.265126512651265e-05, "loss": 0.28, "step": 1560 }, { "epoch": 7.77, "grad_norm": 1.8391032218933105, "learning_rate": 1.2376237623762377e-05, "loss": 0.3104, "step": 1570 }, { "epoch": 7.82, "grad_norm": 3.4520180225372314, "learning_rate": 1.2101210121012102e-05, "loss": 0.3113, "step": 1580 }, { "epoch": 7.87, "grad_norm": 2.3579323291778564, "learning_rate": 1.1826182618261828e-05, "loss": 0.3308, "step": 1590 }, { "epoch": 7.92, "grad_norm": 2.1137523651123047, "learning_rate": 1.155115511551155e-05, "loss": 0.3024, "step": 1600 }, { "epoch": 7.97, "grad_norm": 2.0364956855773926, "learning_rate": 1.1276127612761277e-05, "loss": 0.3263, "step": 1610 }, { "epoch": 8.0, "eval_accuracy": 0.864158829676071, "eval_loss": 0.3281286954879761, "eval_runtime": 13.04, "eval_samples_per_second": 220.169, "eval_steps_per_second": 6.902, "step": 1616 }, { "epoch": 8.02, "grad_norm": 1.6843018531799316, "learning_rate": 1.1001100110011001e-05, "loss": 0.3053, "step": 1620 }, { "epoch": 8.07, "grad_norm": 1.6904422044754028, "learning_rate": 1.0726072607260726e-05, "loss": 0.2766, "step": 1630 }, { "epoch": 8.12, "grad_norm": 2.0085134506225586, "learning_rate": 1.045104510451045e-05, "loss": 0.2933, "step": 1640 }, { "epoch": 8.17, "grad_norm": 2.3377184867858887, "learning_rate": 1.0176017601760177e-05, "loss": 0.3185, "step": 1650 }, { "epoch": 8.22, "grad_norm": 1.9562610387802124, "learning_rate": 9.900990099009901e-06, "loss": 0.3282, "step": 1660 }, { "epoch": 8.27, "grad_norm": 1.9323431253433228, "learning_rate": 9.625962596259626e-06, "loss": 0.3042, "step": 1670 }, { "epoch": 8.32, "grad_norm": 1.8757867813110352, "learning_rate": 9.35093509350935e-06, "loss": 0.2838, "step": 1680 }, { "epoch": 8.37, "grad_norm": 1.5240260362625122, "learning_rate": 9.075907590759077e-06, "loss": 0.2996, "step": 1690 }, { "epoch": 8.42, "grad_norm": 1.236165165901184, "learning_rate": 8.800880088008801e-06, "loss": 0.2833, "step": 1700 }, { "epoch": 8.47, "grad_norm": 1.9543601274490356, "learning_rate": 8.525852585258526e-06, "loss": 0.2706, "step": 1710 }, { "epoch": 8.51, "grad_norm": 2.0328145027160645, "learning_rate": 8.250825082508252e-06, "loss": 0.2952, "step": 1720 }, { "epoch": 8.56, "grad_norm": 1.5521138906478882, "learning_rate": 7.975797579757976e-06, "loss": 0.2984, "step": 1730 }, { "epoch": 8.61, "grad_norm": 1.9917885065078735, "learning_rate": 7.700770077007701e-06, "loss": 0.2924, "step": 1740 }, { "epoch": 8.66, "grad_norm": 1.6940460205078125, "learning_rate": 7.4257425742574256e-06, "loss": 0.3079, "step": 1750 }, { "epoch": 8.71, "grad_norm": 1.891791582107544, "learning_rate": 7.150715071507151e-06, "loss": 0.3039, "step": 1760 }, { "epoch": 8.76, "grad_norm": 1.9249194860458374, "learning_rate": 6.875687568756876e-06, "loss": 0.2744, "step": 1770 }, { "epoch": 8.81, "grad_norm": 1.7590512037277222, "learning_rate": 6.6006600660066e-06, "loss": 0.3029, "step": 1780 }, { "epoch": 8.86, "grad_norm": 1.5572028160095215, "learning_rate": 6.325632563256325e-06, "loss": 0.2814, "step": 1790 }, { "epoch": 8.91, "grad_norm": 1.9127514362335205, "learning_rate": 6.050605060506051e-06, "loss": 0.2968, "step": 1800 }, { "epoch": 8.96, "grad_norm": 1.6464173793792725, "learning_rate": 5.775577557755775e-06, "loss": 0.285, "step": 1810 }, { "epoch": 9.0, "eval_accuracy": 0.8672936259143156, "eval_loss": 0.3263307809829712, "eval_runtime": 13.0994, "eval_samples_per_second": 219.17, "eval_steps_per_second": 6.871, "step": 1818 }, { "epoch": 9.01, "grad_norm": 2.2403345108032227, "learning_rate": 5.500550055005501e-06, "loss": 0.3191, "step": 1820 }, { "epoch": 9.06, "grad_norm": 2.0323376655578613, "learning_rate": 5.225522552255225e-06, "loss": 0.2864, "step": 1830 }, { "epoch": 9.11, "grad_norm": 2.5605928897857666, "learning_rate": 4.950495049504951e-06, "loss": 0.2973, "step": 1840 }, { "epoch": 9.16, "grad_norm": 2.1377811431884766, "learning_rate": 4.675467546754675e-06, "loss": 0.2711, "step": 1850 }, { "epoch": 9.21, "grad_norm": 1.5810461044311523, "learning_rate": 4.400440044004401e-06, "loss": 0.2816, "step": 1860 }, { "epoch": 9.26, "grad_norm": 2.0196871757507324, "learning_rate": 4.125412541254126e-06, "loss": 0.2867, "step": 1870 }, { "epoch": 9.31, "grad_norm": 1.4029825925827026, "learning_rate": 3.8503850385038505e-06, "loss": 0.2873, "step": 1880 }, { "epoch": 9.36, "grad_norm": 1.715185284614563, "learning_rate": 3.5753575357535755e-06, "loss": 0.2944, "step": 1890 }, { "epoch": 9.41, "grad_norm": 1.9352003335952759, "learning_rate": 3.3003300330033e-06, "loss": 0.2926, "step": 1900 }, { "epoch": 9.46, "grad_norm": 2.096611738204956, "learning_rate": 3.0253025302530254e-06, "loss": 0.2593, "step": 1910 }, { "epoch": 9.5, "grad_norm": 1.943051815032959, "learning_rate": 2.7502750275027504e-06, "loss": 0.2894, "step": 1920 }, { "epoch": 9.55, "grad_norm": 1.3575071096420288, "learning_rate": 2.4752475247524753e-06, "loss": 0.2958, "step": 1930 }, { "epoch": 9.6, "grad_norm": 1.547815203666687, "learning_rate": 2.2002200220022003e-06, "loss": 0.2707, "step": 1940 }, { "epoch": 9.65, "grad_norm": 2.169074773788452, "learning_rate": 1.9251925192519253e-06, "loss": 0.269, "step": 1950 }, { "epoch": 9.7, "grad_norm": 1.756319522857666, "learning_rate": 1.65016501650165e-06, "loss": 0.2827, "step": 1960 }, { "epoch": 9.75, "grad_norm": 1.895622730255127, "learning_rate": 1.3751375137513752e-06, "loss": 0.2791, "step": 1970 }, { "epoch": 9.8, "grad_norm": 1.7341846227645874, "learning_rate": 1.1001100110011001e-06, "loss": 0.2808, "step": 1980 }, { "epoch": 9.85, "grad_norm": 1.894599199295044, "learning_rate": 8.25082508250825e-07, "loss": 0.2961, "step": 1990 }, { "epoch": 9.9, "grad_norm": 2.3876936435699463, "learning_rate": 5.500550055005501e-07, "loss": 0.2771, "step": 2000 }, { "epoch": 9.95, "grad_norm": 1.6493264436721802, "learning_rate": 2.7502750275027504e-07, "loss": 0.2674, "step": 2010 }, { "epoch": 10.0, "grad_norm": 1.8131953477859497, "learning_rate": 0.0, "loss": 0.2779, "step": 2020 }, { "epoch": 10.0, "eval_accuracy": 0.8714733542319749, "eval_loss": 0.3270263373851776, "eval_runtime": 13.1626, "eval_samples_per_second": 218.117, "eval_steps_per_second": 6.838, "step": 2020 }, { "epoch": 10.0, "step": 2020, "total_flos": 2.0021605356722135e+19, "train_loss": 0.37628526250914773, "train_runtime": 2991.3974, "train_samples_per_second": 86.371, "train_steps_per_second": 0.675 } ], "logging_steps": 10, "max_steps": 2020, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 2.0021605356722135e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }