{ "best_metric": 0.842911877394636, "best_model_checkpoint": "vit-tiny-patch16-224-winkawaks/checkpoint-1818", "epoch": 10.0, "eval_steps": 500, "global_step": 2020, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "grad_norm": 11.655478477478027, "learning_rate": 2.4752475247524753e-06, "loss": 0.8812, "step": 10 }, { "epoch": 0.1, "grad_norm": 6.654660224914551, "learning_rate": 4.950495049504951e-06, "loss": 0.7531, "step": 20 }, { "epoch": 0.15, "grad_norm": 4.010664939880371, "learning_rate": 7.4257425742574256e-06, "loss": 0.7312, "step": 30 }, { "epoch": 0.2, "grad_norm": 3.320401191711426, "learning_rate": 9.900990099009901e-06, "loss": 0.6753, "step": 40 }, { "epoch": 0.25, "grad_norm": 4.002676010131836, "learning_rate": 1.2376237623762377e-05, "loss": 0.6656, "step": 50 }, { "epoch": 0.3, "grad_norm": 5.1362690925598145, "learning_rate": 1.4851485148514851e-05, "loss": 0.636, "step": 60 }, { "epoch": 0.35, "grad_norm": 4.089080333709717, "learning_rate": 1.7326732673267325e-05, "loss": 0.6258, "step": 70 }, { "epoch": 0.4, "grad_norm": 2.782809257507324, "learning_rate": 1.9801980198019803e-05, "loss": 0.6074, "step": 80 }, { "epoch": 0.45, "grad_norm": 4.684001922607422, "learning_rate": 2.227722772277228e-05, "loss": 0.6114, "step": 90 }, { "epoch": 0.5, "grad_norm": 8.801894187927246, "learning_rate": 2.4752475247524754e-05, "loss": 0.5888, "step": 100 }, { "epoch": 0.54, "grad_norm": 10.810859680175781, "learning_rate": 2.722772277227723e-05, "loss": 0.5858, "step": 110 }, { "epoch": 0.59, "grad_norm": 4.189158916473389, "learning_rate": 2.9702970297029702e-05, "loss": 0.5849, "step": 120 }, { "epoch": 0.64, "grad_norm": 6.533910274505615, "learning_rate": 3.217821782178218e-05, "loss": 0.5811, "step": 130 }, { "epoch": 0.69, "grad_norm": 15.806050300598145, "learning_rate": 3.465346534653465e-05, "loss": 0.6041, "step": 140 }, { "epoch": 0.74, "grad_norm": 3.087372064590454, "learning_rate": 3.712871287128713e-05, "loss": 0.5402, "step": 150 }, { "epoch": 0.79, "grad_norm": 6.852709770202637, "learning_rate": 3.9603960396039605e-05, "loss": 0.5435, "step": 160 }, { "epoch": 0.84, "grad_norm": 3.9751667976379395, "learning_rate": 4.207920792079208e-05, "loss": 0.5826, "step": 170 }, { "epoch": 0.89, "grad_norm": 8.571554183959961, "learning_rate": 4.455445544554456e-05, "loss": 0.5399, "step": 180 }, { "epoch": 0.94, "grad_norm": 2.956425905227661, "learning_rate": 4.702970297029703e-05, "loss": 0.5627, "step": 190 }, { "epoch": 0.99, "grad_norm": 10.44449520111084, "learning_rate": 4.950495049504951e-05, "loss": 0.5345, "step": 200 }, { "epoch": 1.0, "eval_accuracy": 0.7770811563915012, "eval_loss": 0.4689331650733948, "eval_runtime": 5.627, "eval_samples_per_second": 510.22, "eval_steps_per_second": 15.994, "step": 202 }, { "epoch": 1.04, "grad_norm": 5.914970397949219, "learning_rate": 4.977997799779978e-05, "loss": 0.533, "step": 210 }, { "epoch": 1.09, "grad_norm": 7.073808193206787, "learning_rate": 4.950495049504951e-05, "loss": 0.517, "step": 220 }, { "epoch": 1.14, "grad_norm": 3.3154642581939697, "learning_rate": 4.9229922992299234e-05, "loss": 0.5385, "step": 230 }, { "epoch": 1.19, "grad_norm": 3.500061273574829, "learning_rate": 4.895489548954896e-05, "loss": 0.5345, "step": 240 }, { "epoch": 1.24, "grad_norm": 8.62484359741211, "learning_rate": 4.867986798679868e-05, "loss": 0.5368, "step": 250 }, { "epoch": 1.29, "grad_norm": 9.027387619018555, "learning_rate": 4.8404840484048406e-05, "loss": 0.5247, "step": 260 }, { "epoch": 1.34, "grad_norm": 7.457286357879639, "learning_rate": 4.812981298129813e-05, "loss": 0.5044, "step": 270 }, { "epoch": 1.39, "grad_norm": 7.2032790184021, "learning_rate": 4.785478547854786e-05, "loss": 0.5156, "step": 280 }, { "epoch": 1.44, "grad_norm": 5.909176826477051, "learning_rate": 4.7579757975797585e-05, "loss": 0.5191, "step": 290 }, { "epoch": 1.49, "grad_norm": 4.56859827041626, "learning_rate": 4.730473047304731e-05, "loss": 0.4891, "step": 300 }, { "epoch": 1.53, "grad_norm": 5.181271553039551, "learning_rate": 4.702970297029703e-05, "loss": 0.5016, "step": 310 }, { "epoch": 1.58, "grad_norm": 6.948479175567627, "learning_rate": 4.675467546754676e-05, "loss": 0.5008, "step": 320 }, { "epoch": 1.63, "grad_norm": 7.155640125274658, "learning_rate": 4.647964796479648e-05, "loss": 0.5451, "step": 330 }, { "epoch": 1.68, "grad_norm": 3.5051815509796143, "learning_rate": 4.62046204620462e-05, "loss": 0.4998, "step": 340 }, { "epoch": 1.73, "grad_norm": 12.015359878540039, "learning_rate": 4.592959295929593e-05, "loss": 0.5302, "step": 350 }, { "epoch": 1.78, "grad_norm": 5.719058990478516, "learning_rate": 4.5654565456545655e-05, "loss": 0.5052, "step": 360 }, { "epoch": 1.83, "grad_norm": 6.292553424835205, "learning_rate": 4.537953795379538e-05, "loss": 0.5192, "step": 370 }, { "epoch": 1.88, "grad_norm": 2.8755393028259277, "learning_rate": 4.510451045104511e-05, "loss": 0.4966, "step": 380 }, { "epoch": 1.93, "grad_norm": 3.096843719482422, "learning_rate": 4.4829482948294834e-05, "loss": 0.5032, "step": 390 }, { "epoch": 1.98, "grad_norm": 3.9766151905059814, "learning_rate": 4.455445544554456e-05, "loss": 0.4936, "step": 400 }, { "epoch": 2.0, "eval_accuracy": 0.7485196795541623, "eval_loss": 0.5022404789924622, "eval_runtime": 5.6861, "eval_samples_per_second": 504.916, "eval_steps_per_second": 15.828, "step": 404 }, { "epoch": 2.03, "grad_norm": 7.3236870765686035, "learning_rate": 4.427942794279428e-05, "loss": 0.4828, "step": 410 }, { "epoch": 2.08, "grad_norm": 4.203598976135254, "learning_rate": 4.4004400440044006e-05, "loss": 0.5108, "step": 420 }, { "epoch": 2.13, "grad_norm": 4.851642608642578, "learning_rate": 4.372937293729373e-05, "loss": 0.474, "step": 430 }, { "epoch": 2.18, "grad_norm": 8.77710247039795, "learning_rate": 4.345434543454346e-05, "loss": 0.4902, "step": 440 }, { "epoch": 2.23, "grad_norm": 6.436154365539551, "learning_rate": 4.3179317931793185e-05, "loss": 0.4838, "step": 450 }, { "epoch": 2.28, "grad_norm": 4.809104919433594, "learning_rate": 4.2904290429042904e-05, "loss": 0.5064, "step": 460 }, { "epoch": 2.33, "grad_norm": 4.125964641571045, "learning_rate": 4.262926292629263e-05, "loss": 0.4819, "step": 470 }, { "epoch": 2.38, "grad_norm": 6.8852763175964355, "learning_rate": 4.2354235423542356e-05, "loss": 0.5011, "step": 480 }, { "epoch": 2.43, "grad_norm": 3.3697924613952637, "learning_rate": 4.207920792079208e-05, "loss": 0.4676, "step": 490 }, { "epoch": 2.48, "grad_norm": 5.354856967926025, "learning_rate": 4.18041804180418e-05, "loss": 0.507, "step": 500 }, { "epoch": 2.52, "grad_norm": 7.638093948364258, "learning_rate": 4.152915291529153e-05, "loss": 0.5046, "step": 510 }, { "epoch": 2.57, "grad_norm": 4.444268226623535, "learning_rate": 4.1254125412541255e-05, "loss": 0.4635, "step": 520 }, { "epoch": 2.62, "grad_norm": 8.037334442138672, "learning_rate": 4.097909790979098e-05, "loss": 0.4796, "step": 530 }, { "epoch": 2.67, "grad_norm": 6.52307653427124, "learning_rate": 4.070407040704071e-05, "loss": 0.4668, "step": 540 }, { "epoch": 2.72, "grad_norm": 7.490472316741943, "learning_rate": 4.042904290429043e-05, "loss": 0.4747, "step": 550 }, { "epoch": 2.77, "grad_norm": 3.393075942993164, "learning_rate": 4.015401540154016e-05, "loss": 0.4667, "step": 560 }, { "epoch": 2.82, "grad_norm": 4.454715251922607, "learning_rate": 3.987898789878988e-05, "loss": 0.4795, "step": 570 }, { "epoch": 2.87, "grad_norm": 5.316873550415039, "learning_rate": 3.9603960396039605e-05, "loss": 0.499, "step": 580 }, { "epoch": 2.92, "grad_norm": 2.8893849849700928, "learning_rate": 3.932893289328933e-05, "loss": 0.4913, "step": 590 }, { "epoch": 2.97, "grad_norm": 2.6538729667663574, "learning_rate": 3.905390539053906e-05, "loss": 0.4911, "step": 600 }, { "epoch": 3.0, "eval_accuracy": 0.82793451758969, "eval_loss": 0.3886525630950928, "eval_runtime": 5.5731, "eval_samples_per_second": 515.151, "eval_steps_per_second": 16.149, "step": 606 }, { "epoch": 3.02, "grad_norm": 9.823522567749023, "learning_rate": 3.877887788778878e-05, "loss": 0.4731, "step": 610 }, { "epoch": 3.07, "grad_norm": 5.899537086486816, "learning_rate": 3.8503850385038503e-05, "loss": 0.4621, "step": 620 }, { "epoch": 3.12, "grad_norm": 4.964333534240723, "learning_rate": 3.822882288228823e-05, "loss": 0.4762, "step": 630 }, { "epoch": 3.17, "grad_norm": 4.469970703125, "learning_rate": 3.7953795379537956e-05, "loss": 0.4774, "step": 640 }, { "epoch": 3.22, "grad_norm": 3.393183469772339, "learning_rate": 3.767876787678768e-05, "loss": 0.4501, "step": 650 }, { "epoch": 3.27, "grad_norm": 11.101873397827148, "learning_rate": 3.74037403740374e-05, "loss": 0.4596, "step": 660 }, { "epoch": 3.32, "grad_norm": 4.9833831787109375, "learning_rate": 3.712871287128713e-05, "loss": 0.4516, "step": 670 }, { "epoch": 3.37, "grad_norm": 5.423743724822998, "learning_rate": 3.6853685368536854e-05, "loss": 0.4887, "step": 680 }, { "epoch": 3.42, "grad_norm": 4.94615364074707, "learning_rate": 3.657865786578658e-05, "loss": 0.4538, "step": 690 }, { "epoch": 3.47, "grad_norm": 5.113562107086182, "learning_rate": 3.6303630363036307e-05, "loss": 0.4645, "step": 700 }, { "epoch": 3.51, "grad_norm": 7.3234357833862305, "learning_rate": 3.602860286028603e-05, "loss": 0.4645, "step": 710 }, { "epoch": 3.56, "grad_norm": 4.696035385131836, "learning_rate": 3.575357535753576e-05, "loss": 0.4584, "step": 720 }, { "epoch": 3.61, "grad_norm": 3.724952459335327, "learning_rate": 3.5478547854785485e-05, "loss": 0.427, "step": 730 }, { "epoch": 3.66, "grad_norm": 3.8439409732818604, "learning_rate": 3.5203520352035205e-05, "loss": 0.4554, "step": 740 }, { "epoch": 3.71, "grad_norm": 3.4024925231933594, "learning_rate": 3.492849284928493e-05, "loss": 0.4413, "step": 750 }, { "epoch": 3.76, "grad_norm": 8.046459197998047, "learning_rate": 3.465346534653465e-05, "loss": 0.4638, "step": 760 }, { "epoch": 3.81, "grad_norm": 5.1486358642578125, "learning_rate": 3.4378437843784377e-05, "loss": 0.4852, "step": 770 }, { "epoch": 3.86, "grad_norm": 3.408259868621826, "learning_rate": 3.41034103410341e-05, "loss": 0.4347, "step": 780 }, { "epoch": 3.91, "grad_norm": 5.650546073913574, "learning_rate": 3.382838283828383e-05, "loss": 0.4294, "step": 790 }, { "epoch": 3.96, "grad_norm": 3.8805394172668457, "learning_rate": 3.3553355335533555e-05, "loss": 0.4191, "step": 800 }, { "epoch": 4.0, "eval_accuracy": 0.8098223615464994, "eval_loss": 0.4121237099170685, "eval_runtime": 5.4396, "eval_samples_per_second": 527.797, "eval_steps_per_second": 16.545, "step": 808 }, { "epoch": 4.01, "grad_norm": 4.736340522766113, "learning_rate": 3.327832783278328e-05, "loss": 0.4824, "step": 810 }, { "epoch": 4.06, "grad_norm": 4.468814373016357, "learning_rate": 3.300330033003301e-05, "loss": 0.468, "step": 820 }, { "epoch": 4.11, "grad_norm": 3.311174154281616, "learning_rate": 3.272827282728273e-05, "loss": 0.442, "step": 830 }, { "epoch": 4.16, "grad_norm": 6.016156196594238, "learning_rate": 3.2453245324532453e-05, "loss": 0.4748, "step": 840 }, { "epoch": 4.21, "grad_norm": 6.195199966430664, "learning_rate": 3.217821782178218e-05, "loss": 0.4488, "step": 850 }, { "epoch": 4.26, "grad_norm": 3.6519343852996826, "learning_rate": 3.1903190319031906e-05, "loss": 0.4285, "step": 860 }, { "epoch": 4.31, "grad_norm": 5.461367130279541, "learning_rate": 3.162816281628163e-05, "loss": 0.4124, "step": 870 }, { "epoch": 4.36, "grad_norm": 3.7743630409240723, "learning_rate": 3.135313531353136e-05, "loss": 0.4361, "step": 880 }, { "epoch": 4.41, "grad_norm": 3.741748571395874, "learning_rate": 3.1078107810781085e-05, "loss": 0.4196, "step": 890 }, { "epoch": 4.46, "grad_norm": 5.458765983581543, "learning_rate": 3.0803080308030804e-05, "loss": 0.4814, "step": 900 }, { "epoch": 4.5, "grad_norm": 5.118380546569824, "learning_rate": 3.052805280528053e-05, "loss": 0.4441, "step": 910 }, { "epoch": 4.55, "grad_norm": 4.428480625152588, "learning_rate": 3.0253025302530253e-05, "loss": 0.4217, "step": 920 }, { "epoch": 4.6, "grad_norm": 3.2278711795806885, "learning_rate": 2.9977997799779976e-05, "loss": 0.4259, "step": 930 }, { "epoch": 4.65, "grad_norm": 3.5108141899108887, "learning_rate": 2.9702970297029702e-05, "loss": 0.4507, "step": 940 }, { "epoch": 4.7, "grad_norm": 3.02825665473938, "learning_rate": 2.942794279427943e-05, "loss": 0.4095, "step": 950 }, { "epoch": 4.75, "grad_norm": 3.8650879859924316, "learning_rate": 2.9152915291529155e-05, "loss": 0.4251, "step": 960 }, { "epoch": 4.8, "grad_norm": 5.335313320159912, "learning_rate": 2.8877887788778878e-05, "loss": 0.428, "step": 970 }, { "epoch": 4.85, "grad_norm": 5.564798831939697, "learning_rate": 2.8602860286028604e-05, "loss": 0.4713, "step": 980 }, { "epoch": 4.9, "grad_norm": 4.036501407623291, "learning_rate": 2.832783278327833e-05, "loss": 0.4152, "step": 990 }, { "epoch": 4.95, "grad_norm": 4.295027732849121, "learning_rate": 2.8052805280528056e-05, "loss": 0.4209, "step": 1000 }, { "epoch": 5.0, "grad_norm": 2.8911187648773193, "learning_rate": 2.777777777777778e-05, "loss": 0.4408, "step": 1010 }, { "epoch": 5.0, "eval_accuracy": 0.8254963427377221, "eval_loss": 0.38967224955558777, "eval_runtime": 5.6692, "eval_samples_per_second": 506.425, "eval_steps_per_second": 15.875, "step": 1010 }, { "epoch": 5.05, "grad_norm": 7.270760536193848, "learning_rate": 2.7502750275027505e-05, "loss": 0.4415, "step": 1020 }, { "epoch": 5.1, "grad_norm": 2.75962495803833, "learning_rate": 2.722772277227723e-05, "loss": 0.4282, "step": 1030 }, { "epoch": 5.15, "grad_norm": 3.6654067039489746, "learning_rate": 2.6952695269526958e-05, "loss": 0.4265, "step": 1040 }, { "epoch": 5.2, "grad_norm": 5.461548805236816, "learning_rate": 2.667766776677668e-05, "loss": 0.4188, "step": 1050 }, { "epoch": 5.25, "grad_norm": 3.629088878631592, "learning_rate": 2.64026402640264e-05, "loss": 0.438, "step": 1060 }, { "epoch": 5.3, "grad_norm": 5.7036614418029785, "learning_rate": 2.6127612761276126e-05, "loss": 0.4073, "step": 1070 }, { "epoch": 5.35, "grad_norm": 7.140379905700684, "learning_rate": 2.5852585258525853e-05, "loss": 0.4287, "step": 1080 }, { "epoch": 5.4, "grad_norm": 3.2449722290039062, "learning_rate": 2.557755775577558e-05, "loss": 0.425, "step": 1090 }, { "epoch": 5.45, "grad_norm": 4.488211631774902, "learning_rate": 2.53025302530253e-05, "loss": 0.4203, "step": 1100 }, { "epoch": 5.5, "grad_norm": 3.7270421981811523, "learning_rate": 2.5027502750275028e-05, "loss": 0.4174, "step": 1110 }, { "epoch": 5.54, "grad_norm": 4.254336357116699, "learning_rate": 2.4752475247524754e-05, "loss": 0.4533, "step": 1120 }, { "epoch": 5.59, "grad_norm": 3.1208856105804443, "learning_rate": 2.447744774477448e-05, "loss": 0.4221, "step": 1130 }, { "epoch": 5.64, "grad_norm": 6.634130001068115, "learning_rate": 2.4202420242024203e-05, "loss": 0.4377, "step": 1140 }, { "epoch": 5.69, "grad_norm": 5.457045078277588, "learning_rate": 2.392739273927393e-05, "loss": 0.4459, "step": 1150 }, { "epoch": 5.74, "grad_norm": 3.537189483642578, "learning_rate": 2.3652365236523656e-05, "loss": 0.4423, "step": 1160 }, { "epoch": 5.79, "grad_norm": 5.883282661437988, "learning_rate": 2.337733773377338e-05, "loss": 0.4121, "step": 1170 }, { "epoch": 5.84, "grad_norm": 4.0562744140625, "learning_rate": 2.31023102310231e-05, "loss": 0.4114, "step": 1180 }, { "epoch": 5.89, "grad_norm": 5.0152974128723145, "learning_rate": 2.2827282728272828e-05, "loss": 0.4287, "step": 1190 }, { "epoch": 5.94, "grad_norm": 4.672534465789795, "learning_rate": 2.2552255225522554e-05, "loss": 0.4124, "step": 1200 }, { "epoch": 5.99, "grad_norm": 5.874300479888916, "learning_rate": 2.227722772277228e-05, "loss": 0.4134, "step": 1210 }, { "epoch": 6.0, "eval_accuracy": 0.8331591779867642, "eval_loss": 0.3714359402656555, "eval_runtime": 5.6334, "eval_samples_per_second": 509.636, "eval_steps_per_second": 15.976, "step": 1212 }, { "epoch": 6.04, "grad_norm": 3.3733067512512207, "learning_rate": 2.2002200220022003e-05, "loss": 0.404, "step": 1220 }, { "epoch": 6.09, "grad_norm": 3.479614734649658, "learning_rate": 2.172717271727173e-05, "loss": 0.4102, "step": 1230 }, { "epoch": 6.14, "grad_norm": 6.636502742767334, "learning_rate": 2.1452145214521452e-05, "loss": 0.4261, "step": 1240 }, { "epoch": 6.19, "grad_norm": 3.6467645168304443, "learning_rate": 2.1177117711771178e-05, "loss": 0.4061, "step": 1250 }, { "epoch": 6.24, "grad_norm": 3.435811996459961, "learning_rate": 2.09020902090209e-05, "loss": 0.4198, "step": 1260 }, { "epoch": 6.29, "grad_norm": 5.085671424865723, "learning_rate": 2.0627062706270627e-05, "loss": 0.3963, "step": 1270 }, { "epoch": 6.34, "grad_norm": 9.153255462646484, "learning_rate": 2.0352035203520354e-05, "loss": 0.4239, "step": 1280 }, { "epoch": 6.39, "grad_norm": 5.1853251457214355, "learning_rate": 2.007700770077008e-05, "loss": 0.3908, "step": 1290 }, { "epoch": 6.44, "grad_norm": 7.626086235046387, "learning_rate": 1.9801980198019803e-05, "loss": 0.4132, "step": 1300 }, { "epoch": 6.49, "grad_norm": 5.670557022094727, "learning_rate": 1.952695269526953e-05, "loss": 0.4228, "step": 1310 }, { "epoch": 6.53, "grad_norm": 5.062951564788818, "learning_rate": 1.9251925192519252e-05, "loss": 0.4385, "step": 1320 }, { "epoch": 6.58, "grad_norm": 3.8000316619873047, "learning_rate": 1.8976897689768978e-05, "loss": 0.414, "step": 1330 }, { "epoch": 6.63, "grad_norm": 3.444866895675659, "learning_rate": 1.87018701870187e-05, "loss": 0.4246, "step": 1340 }, { "epoch": 6.68, "grad_norm": 3.809499740600586, "learning_rate": 1.8426842684268427e-05, "loss": 0.4198, "step": 1350 }, { "epoch": 6.73, "grad_norm": 4.515871047973633, "learning_rate": 1.8151815181518153e-05, "loss": 0.4051, "step": 1360 }, { "epoch": 6.78, "grad_norm": 4.38671350479126, "learning_rate": 1.787678767876788e-05, "loss": 0.4153, "step": 1370 }, { "epoch": 6.83, "grad_norm": 3.238205909729004, "learning_rate": 1.7601760176017602e-05, "loss": 0.3947, "step": 1380 }, { "epoch": 6.88, "grad_norm": 3.455275058746338, "learning_rate": 1.7326732673267325e-05, "loss": 0.4018, "step": 1390 }, { "epoch": 6.93, "grad_norm": 3.587259531021118, "learning_rate": 1.705170517051705e-05, "loss": 0.3983, "step": 1400 }, { "epoch": 6.98, "grad_norm": 4.972535610198975, "learning_rate": 1.6776677667766778e-05, "loss": 0.4117, "step": 1410 }, { "epoch": 7.0, "eval_accuracy": 0.8376872169975619, "eval_loss": 0.36851075291633606, "eval_runtime": 5.6001, "eval_samples_per_second": 512.666, "eval_steps_per_second": 16.071, "step": 1414 }, { "epoch": 7.03, "grad_norm": 4.680665493011475, "learning_rate": 1.6501650165016504e-05, "loss": 0.3791, "step": 1420 }, { "epoch": 7.08, "grad_norm": 4.995221138000488, "learning_rate": 1.6226622662266227e-05, "loss": 0.4119, "step": 1430 }, { "epoch": 7.13, "grad_norm": 3.969187021255493, "learning_rate": 1.5951595159515953e-05, "loss": 0.3864, "step": 1440 }, { "epoch": 7.18, "grad_norm": 3.4930386543273926, "learning_rate": 1.567656765676568e-05, "loss": 0.3914, "step": 1450 }, { "epoch": 7.23, "grad_norm": 4.228696823120117, "learning_rate": 1.5401540154015402e-05, "loss": 0.4072, "step": 1460 }, { "epoch": 7.28, "grad_norm": 3.5527050495147705, "learning_rate": 1.5126512651265127e-05, "loss": 0.3864, "step": 1470 }, { "epoch": 7.33, "grad_norm": 3.4274041652679443, "learning_rate": 1.4851485148514851e-05, "loss": 0.3957, "step": 1480 }, { "epoch": 7.38, "grad_norm": 4.775713920593262, "learning_rate": 1.4576457645764577e-05, "loss": 0.4034, "step": 1490 }, { "epoch": 7.43, "grad_norm": 4.741795539855957, "learning_rate": 1.4301430143014302e-05, "loss": 0.4042, "step": 1500 }, { "epoch": 7.48, "grad_norm": 6.42737340927124, "learning_rate": 1.4026402640264028e-05, "loss": 0.4197, "step": 1510 }, { "epoch": 7.52, "grad_norm": 5.1894121170043945, "learning_rate": 1.3751375137513753e-05, "loss": 0.3956, "step": 1520 }, { "epoch": 7.57, "grad_norm": 5.648491859436035, "learning_rate": 1.3476347634763479e-05, "loss": 0.4005, "step": 1530 }, { "epoch": 7.62, "grad_norm": 4.315338611602783, "learning_rate": 1.32013201320132e-05, "loss": 0.3784, "step": 1540 }, { "epoch": 7.67, "grad_norm": 4.007127285003662, "learning_rate": 1.2926292629262926e-05, "loss": 0.3948, "step": 1550 }, { "epoch": 7.72, "grad_norm": 4.685734748840332, "learning_rate": 1.265126512651265e-05, "loss": 0.4165, "step": 1560 }, { "epoch": 7.77, "grad_norm": 5.51500940322876, "learning_rate": 1.2376237623762377e-05, "loss": 0.3872, "step": 1570 }, { "epoch": 7.82, "grad_norm": 3.208813428878784, "learning_rate": 1.2101210121012102e-05, "loss": 0.3886, "step": 1580 }, { "epoch": 7.87, "grad_norm": 4.27180814743042, "learning_rate": 1.1826182618261828e-05, "loss": 0.3837, "step": 1590 }, { "epoch": 7.92, "grad_norm": 9.008807182312012, "learning_rate": 1.155115511551155e-05, "loss": 0.4174, "step": 1600 }, { "epoch": 7.97, "grad_norm": 3.7245564460754395, "learning_rate": 1.1276127612761277e-05, "loss": 0.3991, "step": 1610 }, { "epoch": 8.0, "eval_accuracy": 0.8411703239289446, "eval_loss": 0.3601776659488678, "eval_runtime": 5.6002, "eval_samples_per_second": 512.663, "eval_steps_per_second": 16.071, "step": 1616 }, { "epoch": 8.02, "grad_norm": 4.604074954986572, "learning_rate": 1.1001100110011001e-05, "loss": 0.4121, "step": 1620 }, { "epoch": 8.07, "grad_norm": 6.1971611976623535, "learning_rate": 1.0726072607260726e-05, "loss": 0.3925, "step": 1630 }, { "epoch": 8.12, "grad_norm": 3.589857816696167, "learning_rate": 1.045104510451045e-05, "loss": 0.4339, "step": 1640 }, { "epoch": 8.17, "grad_norm": 3.007169246673584, "learning_rate": 1.0176017601760177e-05, "loss": 0.3841, "step": 1650 }, { "epoch": 8.22, "grad_norm": 8.007019996643066, "learning_rate": 9.900990099009901e-06, "loss": 0.4005, "step": 1660 }, { "epoch": 8.27, "grad_norm": 4.514450550079346, "learning_rate": 9.625962596259626e-06, "loss": 0.4072, "step": 1670 }, { "epoch": 8.32, "grad_norm": 6.048130989074707, "learning_rate": 9.35093509350935e-06, "loss": 0.3756, "step": 1680 }, { "epoch": 8.37, "grad_norm": 6.211109638214111, "learning_rate": 9.075907590759077e-06, "loss": 0.3953, "step": 1690 }, { "epoch": 8.42, "grad_norm": 3.7817318439483643, "learning_rate": 8.800880088008801e-06, "loss": 0.3685, "step": 1700 }, { "epoch": 8.47, "grad_norm": 5.379549980163574, "learning_rate": 8.525852585258526e-06, "loss": 0.3713, "step": 1710 }, { "epoch": 8.51, "grad_norm": 4.123336315155029, "learning_rate": 8.250825082508252e-06, "loss": 0.3629, "step": 1720 }, { "epoch": 8.56, "grad_norm": 4.12966251373291, "learning_rate": 7.975797579757976e-06, "loss": 0.3884, "step": 1730 }, { "epoch": 8.61, "grad_norm": 4.9768967628479, "learning_rate": 7.700770077007701e-06, "loss": 0.4092, "step": 1740 }, { "epoch": 8.66, "grad_norm": 5.166417598724365, "learning_rate": 7.4257425742574256e-06, "loss": 0.3891, "step": 1750 }, { "epoch": 8.71, "grad_norm": 6.434201240539551, "learning_rate": 7.150715071507151e-06, "loss": 0.3592, "step": 1760 }, { "epoch": 8.76, "grad_norm": 7.199638843536377, "learning_rate": 6.875687568756876e-06, "loss": 0.3909, "step": 1770 }, { "epoch": 8.81, "grad_norm": 7.480076313018799, "learning_rate": 6.6006600660066e-06, "loss": 0.3895, "step": 1780 }, { "epoch": 8.86, "grad_norm": 5.124411582946777, "learning_rate": 6.325632563256325e-06, "loss": 0.391, "step": 1790 }, { "epoch": 8.91, "grad_norm": 9.751836776733398, "learning_rate": 6.050605060506051e-06, "loss": 0.3908, "step": 1800 }, { "epoch": 8.96, "grad_norm": 4.501916408538818, "learning_rate": 5.775577557755775e-06, "loss": 0.3936, "step": 1810 }, { "epoch": 9.0, "eval_accuracy": 0.842911877394636, "eval_loss": 0.3542243540287018, "eval_runtime": 5.623, "eval_samples_per_second": 510.58, "eval_steps_per_second": 16.006, "step": 1818 }, { "epoch": 9.01, "grad_norm": 4.195034027099609, "learning_rate": 5.500550055005501e-06, "loss": 0.4147, "step": 1820 }, { "epoch": 9.06, "grad_norm": 4.904818058013916, "learning_rate": 5.225522552255225e-06, "loss": 0.3822, "step": 1830 }, { "epoch": 9.11, "grad_norm": 4.105266094207764, "learning_rate": 4.950495049504951e-06, "loss": 0.401, "step": 1840 }, { "epoch": 9.16, "grad_norm": 4.364089488983154, "learning_rate": 4.675467546754675e-06, "loss": 0.398, "step": 1850 }, { "epoch": 9.21, "grad_norm": 5.043737411499023, "learning_rate": 4.400440044004401e-06, "loss": 0.4218, "step": 1860 }, { "epoch": 9.26, "grad_norm": 4.617894172668457, "learning_rate": 4.125412541254126e-06, "loss": 0.4002, "step": 1870 }, { "epoch": 9.31, "grad_norm": 3.750427007675171, "learning_rate": 3.8503850385038505e-06, "loss": 0.4005, "step": 1880 }, { "epoch": 9.36, "grad_norm": 4.708156585693359, "learning_rate": 3.5753575357535755e-06, "loss": 0.4067, "step": 1890 }, { "epoch": 9.41, "grad_norm": 4.125495910644531, "learning_rate": 3.3003300330033e-06, "loss": 0.3745, "step": 1900 }, { "epoch": 9.46, "grad_norm": 3.2616422176361084, "learning_rate": 3.0253025302530254e-06, "loss": 0.3914, "step": 1910 }, { "epoch": 9.5, "grad_norm": 3.8317136764526367, "learning_rate": 2.7502750275027504e-06, "loss": 0.383, "step": 1920 }, { "epoch": 9.55, "grad_norm": 4.044767379760742, "learning_rate": 2.4752475247524753e-06, "loss": 0.3713, "step": 1930 }, { "epoch": 9.6, "grad_norm": 5.957603931427002, "learning_rate": 2.2002200220022003e-06, "loss": 0.3524, "step": 1940 }, { "epoch": 9.65, "grad_norm": 3.8878390789031982, "learning_rate": 1.9251925192519253e-06, "loss": 0.3876, "step": 1950 }, { "epoch": 9.7, "grad_norm": 4.049957752227783, "learning_rate": 1.65016501650165e-06, "loss": 0.3726, "step": 1960 }, { "epoch": 9.75, "grad_norm": 3.9568910598754883, "learning_rate": 1.3751375137513752e-06, "loss": 0.397, "step": 1970 }, { "epoch": 9.8, "grad_norm": 5.049424171447754, "learning_rate": 1.1001100110011001e-06, "loss": 0.3771, "step": 1980 }, { "epoch": 9.85, "grad_norm": 3.9506354331970215, "learning_rate": 8.25082508250825e-07, "loss": 0.3816, "step": 1990 }, { "epoch": 9.9, "grad_norm": 4.543933391571045, "learning_rate": 5.500550055005501e-07, "loss": 0.3961, "step": 2000 }, { "epoch": 9.95, "grad_norm": 4.267475605010986, "learning_rate": 2.7502750275027504e-07, "loss": 0.3612, "step": 2010 }, { "epoch": 10.0, "grad_norm": 7.206649303436279, "learning_rate": 0.0, "loss": 0.3422, "step": 2020 }, { "epoch": 10.0, "eval_accuracy": 0.8397770811563915, "eval_loss": 0.35397040843963623, "eval_runtime": 5.6519, "eval_samples_per_second": 507.973, "eval_steps_per_second": 15.924, "step": 2020 }, { "epoch": 10.0, "step": 2020, "total_flos": 1.2892209231838003e+18, "train_loss": 0.45288851414576614, "train_runtime": 810.9556, "train_samples_per_second": 318.599, "train_steps_per_second": 2.491 } ], "logging_steps": 10, "max_steps": 2020, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 1.2892209231838003e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }