{ "best_metric": 0.796756082345602, "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-eurosat\\checkpoint-3062", "epoch": 14.948571428571428, "eval_steps": 500, "global_step": 3270, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.045714285714285714, "grad_norm": 13.994832992553711, "learning_rate": 1.5290519877675841e-06, "loss": 2.7494, "step": 10 }, { "epoch": 0.09142857142857143, "grad_norm": 7.420119762420654, "learning_rate": 3.0581039755351682e-06, "loss": 2.7395, "step": 20 }, { "epoch": 0.13714285714285715, "grad_norm": 5.467243194580078, "learning_rate": 4.587155963302753e-06, "loss": 2.7049, "step": 30 }, { "epoch": 0.18285714285714286, "grad_norm": 7.835602760314941, "learning_rate": 6.1162079510703365e-06, "loss": 2.6367, "step": 40 }, { "epoch": 0.22857142857142856, "grad_norm": 13.763399124145508, "learning_rate": 7.645259938837921e-06, "loss": 2.5592, "step": 50 }, { "epoch": 0.2742857142857143, "grad_norm": 10.583270072937012, "learning_rate": 9.174311926605506e-06, "loss": 2.4444, "step": 60 }, { "epoch": 0.32, "grad_norm": 19.168621063232422, "learning_rate": 1.070336391437309e-05, "loss": 2.3336, "step": 70 }, { "epoch": 0.3657142857142857, "grad_norm": 20.13259506225586, "learning_rate": 1.2232415902140673e-05, "loss": 2.1877, "step": 80 }, { "epoch": 0.4114285714285714, "grad_norm": 28.141359329223633, "learning_rate": 1.3761467889908258e-05, "loss": 2.0578, "step": 90 }, { "epoch": 0.45714285714285713, "grad_norm": 36.60035705566406, "learning_rate": 1.5290519877675842e-05, "loss": 1.9729, "step": 100 }, { "epoch": 0.5028571428571429, "grad_norm": 27.167247772216797, "learning_rate": 1.6819571865443427e-05, "loss": 1.884, "step": 110 }, { "epoch": 0.5485714285714286, "grad_norm": 21.56601905822754, "learning_rate": 1.834862385321101e-05, "loss": 1.7821, "step": 120 }, { "epoch": 0.5942857142857143, "grad_norm": 18.523468017578125, "learning_rate": 1.9877675840978592e-05, "loss": 1.8148, "step": 130 }, { "epoch": 0.64, "grad_norm": 14.976977348327637, "learning_rate": 2.140672782874618e-05, "loss": 1.6829, "step": 140 }, { "epoch": 0.6857142857142857, "grad_norm": 14.441283226013184, "learning_rate": 2.2935779816513765e-05, "loss": 1.6312, "step": 150 }, { "epoch": 0.7314285714285714, "grad_norm": 17.649080276489258, "learning_rate": 2.4464831804281346e-05, "loss": 1.6482, "step": 160 }, { "epoch": 0.7771428571428571, "grad_norm": 19.691490173339844, "learning_rate": 2.5993883792048927e-05, "loss": 1.5693, "step": 170 }, { "epoch": 0.8228571428571428, "grad_norm": 12.245804786682129, "learning_rate": 2.7522935779816515e-05, "loss": 1.6267, "step": 180 }, { "epoch": 0.8685714285714285, "grad_norm": 24.526466369628906, "learning_rate": 2.90519877675841e-05, "loss": 1.5964, "step": 190 }, { "epoch": 0.9142857142857143, "grad_norm": 14.706289291381836, "learning_rate": 3.0581039755351684e-05, "loss": 1.6017, "step": 200 }, { "epoch": 0.96, "grad_norm": 12.513092041015625, "learning_rate": 3.211009174311927e-05, "loss": 1.5891, "step": 210 }, { "epoch": 0.9965714285714286, "eval_accuracy": 0.5723019338739863, "eval_loss": 1.383291244506836, "eval_runtime": 74.6009, "eval_samples_per_second": 107.438, "eval_steps_per_second": 3.365, "step": 218 }, { "epoch": 1.0057142857142858, "grad_norm": 9.570089340209961, "learning_rate": 3.363914373088685e-05, "loss": 1.4337, "step": 220 }, { "epoch": 1.0514285714285714, "grad_norm": 29.601459503173828, "learning_rate": 3.516819571865443e-05, "loss": 1.4987, "step": 230 }, { "epoch": 1.0971428571428572, "grad_norm": 14.568526268005371, "learning_rate": 3.669724770642202e-05, "loss": 1.4594, "step": 240 }, { "epoch": 1.1428571428571428, "grad_norm": 9.901498794555664, "learning_rate": 3.822629969418961e-05, "loss": 1.4964, "step": 250 }, { "epoch": 1.1885714285714286, "grad_norm": 11.662745475769043, "learning_rate": 3.9755351681957185e-05, "loss": 1.4872, "step": 260 }, { "epoch": 1.2342857142857142, "grad_norm": 8.338129043579102, "learning_rate": 4.1284403669724776e-05, "loss": 1.3592, "step": 270 }, { "epoch": 1.28, "grad_norm": 12.640501022338867, "learning_rate": 4.281345565749236e-05, "loss": 1.4165, "step": 280 }, { "epoch": 1.3257142857142856, "grad_norm": 9.851750373840332, "learning_rate": 4.434250764525994e-05, "loss": 1.4671, "step": 290 }, { "epoch": 1.3714285714285714, "grad_norm": 9.80663013458252, "learning_rate": 4.587155963302753e-05, "loss": 1.4069, "step": 300 }, { "epoch": 1.4171428571428573, "grad_norm": 10.75167179107666, "learning_rate": 4.740061162079511e-05, "loss": 1.3966, "step": 310 }, { "epoch": 1.4628571428571429, "grad_norm": 9.546366691589355, "learning_rate": 4.892966360856269e-05, "loss": 1.3365, "step": 320 }, { "epoch": 1.5085714285714285, "grad_norm": 11.726640701293945, "learning_rate": 4.994903160040775e-05, "loss": 1.4017, "step": 330 }, { "epoch": 1.5542857142857143, "grad_norm": 9.12558650970459, "learning_rate": 4.977913693510024e-05, "loss": 1.3402, "step": 340 }, { "epoch": 1.6, "grad_norm": 7.8366804122924805, "learning_rate": 4.9609242269792734e-05, "loss": 1.3675, "step": 350 }, { "epoch": 1.6457142857142857, "grad_norm": 7.4768805503845215, "learning_rate": 4.943934760448522e-05, "loss": 1.3559, "step": 360 }, { "epoch": 1.6914285714285713, "grad_norm": 8.65304183959961, "learning_rate": 4.926945293917771e-05, "loss": 1.3973, "step": 370 }, { "epoch": 1.737142857142857, "grad_norm": 8.800814628601074, "learning_rate": 4.9099558273870206e-05, "loss": 1.2501, "step": 380 }, { "epoch": 1.782857142857143, "grad_norm": 14.658060073852539, "learning_rate": 4.892966360856269e-05, "loss": 1.287, "step": 390 }, { "epoch": 1.8285714285714287, "grad_norm": 9.586373329162598, "learning_rate": 4.8759768943255185e-05, "loss": 1.3249, "step": 400 }, { "epoch": 1.8742857142857143, "grad_norm": 12.57496166229248, "learning_rate": 4.858987427794767e-05, "loss": 1.3564, "step": 410 }, { "epoch": 1.92, "grad_norm": 9.330473899841309, "learning_rate": 4.8419979612640164e-05, "loss": 1.2967, "step": 420 }, { "epoch": 1.9657142857142857, "grad_norm": 8.545197486877441, "learning_rate": 4.8250084947332657e-05, "loss": 1.2997, "step": 430 }, { "epoch": 1.9977142857142858, "eval_accuracy": 0.6699937616968185, "eval_loss": 1.0830751657485962, "eval_runtime": 74.7614, "eval_samples_per_second": 107.208, "eval_steps_per_second": 3.357, "step": 437 }, { "epoch": 2.0114285714285716, "grad_norm": 7.490045547485352, "learning_rate": 4.808019028202514e-05, "loss": 1.258, "step": 440 }, { "epoch": 2.057142857142857, "grad_norm": 8.006925582885742, "learning_rate": 4.7910295616717635e-05, "loss": 1.168, "step": 450 }, { "epoch": 2.1028571428571428, "grad_norm": 12.528056144714355, "learning_rate": 4.774040095141013e-05, "loss": 1.2195, "step": 460 }, { "epoch": 2.1485714285714286, "grad_norm": 9.448918342590332, "learning_rate": 4.7570506286102614e-05, "loss": 1.2242, "step": 470 }, { "epoch": 2.1942857142857144, "grad_norm": 7.972462177276611, "learning_rate": 4.740061162079511e-05, "loss": 1.1925, "step": 480 }, { "epoch": 2.24, "grad_norm": 10.068849563598633, "learning_rate": 4.72307169554876e-05, "loss": 1.2521, "step": 490 }, { "epoch": 2.2857142857142856, "grad_norm": 9.536931037902832, "learning_rate": 4.706082229018009e-05, "loss": 1.2375, "step": 500 }, { "epoch": 2.3314285714285714, "grad_norm": 7.058177471160889, "learning_rate": 4.6890927624872586e-05, "loss": 1.1762, "step": 510 }, { "epoch": 2.3771428571428572, "grad_norm": 8.200982093811035, "learning_rate": 4.672103295956507e-05, "loss": 1.259, "step": 520 }, { "epoch": 2.422857142857143, "grad_norm": 6.11261510848999, "learning_rate": 4.6551138294257565e-05, "loss": 1.2291, "step": 530 }, { "epoch": 2.4685714285714284, "grad_norm": 8.771502494812012, "learning_rate": 4.638124362895006e-05, "loss": 1.2237, "step": 540 }, { "epoch": 2.5142857142857142, "grad_norm": 11.995628356933594, "learning_rate": 4.6211348963642544e-05, "loss": 1.2349, "step": 550 }, { "epoch": 2.56, "grad_norm": 8.15999698638916, "learning_rate": 4.604145429833504e-05, "loss": 1.1786, "step": 560 }, { "epoch": 2.605714285714286, "grad_norm": 7.252849578857422, "learning_rate": 4.587155963302753e-05, "loss": 1.1285, "step": 570 }, { "epoch": 2.6514285714285712, "grad_norm": 8.760148048400879, "learning_rate": 4.5701664967720016e-05, "loss": 1.074, "step": 580 }, { "epoch": 2.697142857142857, "grad_norm": 5.591124057769775, "learning_rate": 4.553177030241251e-05, "loss": 1.104, "step": 590 }, { "epoch": 2.742857142857143, "grad_norm": 7.468674659729004, "learning_rate": 4.5361875637104995e-05, "loss": 1.1936, "step": 600 }, { "epoch": 2.7885714285714287, "grad_norm": 7.683737754821777, "learning_rate": 4.519198097179749e-05, "loss": 1.1574, "step": 610 }, { "epoch": 2.8342857142857145, "grad_norm": 12.986687660217285, "learning_rate": 4.502208630648998e-05, "loss": 1.1513, "step": 620 }, { "epoch": 2.88, "grad_norm": 8.037307739257812, "learning_rate": 4.4852191641182466e-05, "loss": 1.1537, "step": 630 }, { "epoch": 2.9257142857142857, "grad_norm": 8.727031707763672, "learning_rate": 4.468229697587496e-05, "loss": 1.1823, "step": 640 }, { "epoch": 2.9714285714285715, "grad_norm": 7.474994659423828, "learning_rate": 4.451240231056745e-05, "loss": 1.1166, "step": 650 }, { "epoch": 2.998857142857143, "eval_accuracy": 0.6958203368683719, "eval_loss": 0.9937148094177246, "eval_runtime": 75.7204, "eval_samples_per_second": 105.85, "eval_steps_per_second": 3.315, "step": 656 }, { "epoch": 3.0171428571428573, "grad_norm": 5.9896016120910645, "learning_rate": 4.434250764525994e-05, "loss": 1.1378, "step": 660 }, { "epoch": 3.0628571428571427, "grad_norm": 5.852768898010254, "learning_rate": 4.417261297995243e-05, "loss": 1.0542, "step": 670 }, { "epoch": 3.1085714285714285, "grad_norm": 7.916086673736572, "learning_rate": 4.400271831464492e-05, "loss": 1.0331, "step": 680 }, { "epoch": 3.1542857142857144, "grad_norm": 8.959929466247559, "learning_rate": 4.383282364933741e-05, "loss": 1.0943, "step": 690 }, { "epoch": 3.2, "grad_norm": 6.69697904586792, "learning_rate": 4.36629289840299e-05, "loss": 1.0978, "step": 700 }, { "epoch": 3.2457142857142856, "grad_norm": 7.420085430145264, "learning_rate": 4.349303431872239e-05, "loss": 1.0588, "step": 710 }, { "epoch": 3.2914285714285714, "grad_norm": 10.829694747924805, "learning_rate": 4.332313965341488e-05, "loss": 1.0838, "step": 720 }, { "epoch": 3.337142857142857, "grad_norm": 6.954169273376465, "learning_rate": 4.3153244988107375e-05, "loss": 1.1126, "step": 730 }, { "epoch": 3.382857142857143, "grad_norm": 6.538149833679199, "learning_rate": 4.298335032279987e-05, "loss": 1.1049, "step": 740 }, { "epoch": 3.4285714285714284, "grad_norm": 11.640555381774902, "learning_rate": 4.281345565749236e-05, "loss": 1.0577, "step": 750 }, { "epoch": 3.474285714285714, "grad_norm": 7.022554397583008, "learning_rate": 4.264356099218485e-05, "loss": 1.1454, "step": 760 }, { "epoch": 3.52, "grad_norm": 9.619643211364746, "learning_rate": 4.247366632687734e-05, "loss": 1.0374, "step": 770 }, { "epoch": 3.565714285714286, "grad_norm": 8.229997634887695, "learning_rate": 4.230377166156983e-05, "loss": 1.0354, "step": 780 }, { "epoch": 3.611428571428571, "grad_norm": 8.94089126586914, "learning_rate": 4.213387699626232e-05, "loss": 1.1144, "step": 790 }, { "epoch": 3.657142857142857, "grad_norm": 7.949928283691406, "learning_rate": 4.196398233095481e-05, "loss": 1.0678, "step": 800 }, { "epoch": 3.702857142857143, "grad_norm": 6.979115009307861, "learning_rate": 4.1794087665647304e-05, "loss": 1.0945, "step": 810 }, { "epoch": 3.7485714285714287, "grad_norm": 8.669636726379395, "learning_rate": 4.162419300033979e-05, "loss": 1.0521, "step": 820 }, { "epoch": 3.7942857142857145, "grad_norm": 6.849328994750977, "learning_rate": 4.145429833503228e-05, "loss": 1.0513, "step": 830 }, { "epoch": 3.84, "grad_norm": 7.107924461364746, "learning_rate": 4.1284403669724776e-05, "loss": 1.0477, "step": 840 }, { "epoch": 3.8857142857142857, "grad_norm": 6.533900737762451, "learning_rate": 4.111450900441726e-05, "loss": 1.0513, "step": 850 }, { "epoch": 3.9314285714285715, "grad_norm": 6.826801776885986, "learning_rate": 4.0944614339109755e-05, "loss": 1.0932, "step": 860 }, { "epoch": 3.977142857142857, "grad_norm": 6.449113845825195, "learning_rate": 4.077471967380224e-05, "loss": 1.0464, "step": 870 }, { "epoch": 4.0, "eval_accuracy": 0.7231441048034934, "eval_loss": 0.9180014729499817, "eval_runtime": 74.9581, "eval_samples_per_second": 106.926, "eval_steps_per_second": 3.349, "step": 875 }, { "epoch": 4.022857142857143, "grad_norm": 6.743358135223389, "learning_rate": 4.0604825008494734e-05, "loss": 1.0845, "step": 880 }, { "epoch": 4.0685714285714285, "grad_norm": 6.779079437255859, "learning_rate": 4.043493034318723e-05, "loss": 1.0415, "step": 890 }, { "epoch": 4.114285714285714, "grad_norm": 8.849162101745605, "learning_rate": 4.026503567787971e-05, "loss": 1.0487, "step": 900 }, { "epoch": 4.16, "grad_norm": 7.157181739807129, "learning_rate": 4.0095141012572206e-05, "loss": 1.0647, "step": 910 }, { "epoch": 4.2057142857142855, "grad_norm": 8.016735076904297, "learning_rate": 3.99252463472647e-05, "loss": 1.0493, "step": 920 }, { "epoch": 4.251428571428572, "grad_norm": 8.14920711517334, "learning_rate": 3.9755351681957185e-05, "loss": 1.0228, "step": 930 }, { "epoch": 4.297142857142857, "grad_norm": 6.980301856994629, "learning_rate": 3.958545701664968e-05, "loss": 0.9977, "step": 940 }, { "epoch": 4.3428571428571425, "grad_norm": 6.754297256469727, "learning_rate": 3.941556235134217e-05, "loss": 1.025, "step": 950 }, { "epoch": 4.388571428571429, "grad_norm": 6.082598686218262, "learning_rate": 3.9245667686034656e-05, "loss": 0.9734, "step": 960 }, { "epoch": 4.434285714285714, "grad_norm": 9.236204147338867, "learning_rate": 3.907577302072715e-05, "loss": 0.9897, "step": 970 }, { "epoch": 4.48, "grad_norm": 6.07050085067749, "learning_rate": 3.890587835541964e-05, "loss": 1.0549, "step": 980 }, { "epoch": 4.525714285714286, "grad_norm": 8.898210525512695, "learning_rate": 3.8735983690112135e-05, "loss": 1.0703, "step": 990 }, { "epoch": 4.571428571428571, "grad_norm": 6.365227699279785, "learning_rate": 3.856608902480463e-05, "loss": 0.9674, "step": 1000 }, { "epoch": 4.617142857142857, "grad_norm": 6.180821895599365, "learning_rate": 3.8396194359497114e-05, "loss": 0.9661, "step": 1010 }, { "epoch": 4.662857142857143, "grad_norm": 7.927034854888916, "learning_rate": 3.822629969418961e-05, "loss": 1.025, "step": 1020 }, { "epoch": 4.708571428571428, "grad_norm": 6.064193248748779, "learning_rate": 3.80564050288821e-05, "loss": 1.0494, "step": 1030 }, { "epoch": 4.7542857142857144, "grad_norm": 7.2729668617248535, "learning_rate": 3.7886510363574586e-05, "loss": 1.0002, "step": 1040 }, { "epoch": 4.8, "grad_norm": 8.703302383422852, "learning_rate": 3.771661569826708e-05, "loss": 1.001, "step": 1050 }, { "epoch": 4.845714285714286, "grad_norm": 5.9366583824157715, "learning_rate": 3.7546721032959565e-05, "loss": 0.8903, "step": 1060 }, { "epoch": 4.8914285714285715, "grad_norm": 10.52004337310791, "learning_rate": 3.737682636765206e-05, "loss": 1.0021, "step": 1070 }, { "epoch": 4.937142857142857, "grad_norm": 6.183305740356445, "learning_rate": 3.720693170234455e-05, "loss": 0.9574, "step": 1080 }, { "epoch": 4.982857142857143, "grad_norm": 7.366205215454102, "learning_rate": 3.7037037037037037e-05, "loss": 0.982, "step": 1090 }, { "epoch": 4.996571428571428, "eval_accuracy": 0.743231441048035, "eval_loss": 0.8399370312690735, "eval_runtime": 75.6387, "eval_samples_per_second": 105.964, "eval_steps_per_second": 3.318, "step": 1093 }, { "epoch": 5.0285714285714285, "grad_norm": 6.932504177093506, "learning_rate": 3.686714237172953e-05, "loss": 1.0264, "step": 1100 }, { "epoch": 5.074285714285715, "grad_norm": 5.84490966796875, "learning_rate": 3.669724770642202e-05, "loss": 0.9594, "step": 1110 }, { "epoch": 5.12, "grad_norm": 6.8914475440979, "learning_rate": 3.652735304111451e-05, "loss": 0.918, "step": 1120 }, { "epoch": 5.1657142857142855, "grad_norm": 6.1007161140441895, "learning_rate": 3.6357458375807e-05, "loss": 1.0036, "step": 1130 }, { "epoch": 5.211428571428572, "grad_norm": 8.02859115600586, "learning_rate": 3.6187563710499494e-05, "loss": 0.9655, "step": 1140 }, { "epoch": 5.257142857142857, "grad_norm": 7.857163906097412, "learning_rate": 3.601766904519198e-05, "loss": 0.9567, "step": 1150 }, { "epoch": 5.3028571428571425, "grad_norm": 7.495018005371094, "learning_rate": 3.584777437988447e-05, "loss": 0.9226, "step": 1160 }, { "epoch": 5.348571428571429, "grad_norm": 8.65245532989502, "learning_rate": 3.567787971457696e-05, "loss": 0.8992, "step": 1170 }, { "epoch": 5.394285714285714, "grad_norm": 8.458373069763184, "learning_rate": 3.550798504926945e-05, "loss": 0.9539, "step": 1180 }, { "epoch": 5.44, "grad_norm": 6.742293357849121, "learning_rate": 3.5338090383961945e-05, "loss": 0.9382, "step": 1190 }, { "epoch": 5.485714285714286, "grad_norm": 6.219183921813965, "learning_rate": 3.516819571865443e-05, "loss": 0.9673, "step": 1200 }, { "epoch": 5.531428571428571, "grad_norm": 6.372159004211426, "learning_rate": 3.4998301053346924e-05, "loss": 0.9558, "step": 1210 }, { "epoch": 5.577142857142857, "grad_norm": 9.132979393005371, "learning_rate": 3.482840638803942e-05, "loss": 0.9738, "step": 1220 }, { "epoch": 5.622857142857143, "grad_norm": 10.370519638061523, "learning_rate": 3.465851172273191e-05, "loss": 0.9576, "step": 1230 }, { "epoch": 5.668571428571429, "grad_norm": 6.819110870361328, "learning_rate": 3.44886170574244e-05, "loss": 0.9086, "step": 1240 }, { "epoch": 5.714285714285714, "grad_norm": 5.857959747314453, "learning_rate": 3.431872239211689e-05, "loss": 0.8758, "step": 1250 }, { "epoch": 5.76, "grad_norm": 8.476204872131348, "learning_rate": 3.414882772680938e-05, "loss": 0.9147, "step": 1260 }, { "epoch": 5.805714285714286, "grad_norm": 6.7867865562438965, "learning_rate": 3.3978933061501874e-05, "loss": 0.9843, "step": 1270 }, { "epoch": 5.851428571428571, "grad_norm": 8.020210266113281, "learning_rate": 3.380903839619436e-05, "loss": 1.0334, "step": 1280 }, { "epoch": 5.897142857142857, "grad_norm": 7.1987199783325195, "learning_rate": 3.363914373088685e-05, "loss": 1.0116, "step": 1290 }, { "epoch": 5.942857142857143, "grad_norm": 6.633023738861084, "learning_rate": 3.3469249065579346e-05, "loss": 0.9476, "step": 1300 }, { "epoch": 5.988571428571428, "grad_norm": 7.582513332366943, "learning_rate": 3.329935440027183e-05, "loss": 0.9472, "step": 1310 }, { "epoch": 5.997714285714285, "eval_accuracy": 0.7535870243293824, "eval_loss": 0.8126731514930725, "eval_runtime": 71.4321, "eval_samples_per_second": 112.204, "eval_steps_per_second": 3.514, "step": 1312 }, { "epoch": 6.034285714285715, "grad_norm": 8.917913436889648, "learning_rate": 3.3129459734964325e-05, "loss": 0.8892, "step": 1320 }, { "epoch": 6.08, "grad_norm": 11.498839378356934, "learning_rate": 3.295956506965682e-05, "loss": 0.8989, "step": 1330 }, { "epoch": 6.1257142857142854, "grad_norm": 6.905866622924805, "learning_rate": 3.2789670404349304e-05, "loss": 0.8923, "step": 1340 }, { "epoch": 6.171428571428572, "grad_norm": 5.6410417556762695, "learning_rate": 3.26197757390418e-05, "loss": 0.9276, "step": 1350 }, { "epoch": 6.217142857142857, "grad_norm": 8.219051361083984, "learning_rate": 3.244988107373428e-05, "loss": 0.9223, "step": 1360 }, { "epoch": 6.2628571428571425, "grad_norm": 6.403746128082275, "learning_rate": 3.2279986408426776e-05, "loss": 0.9082, "step": 1370 }, { "epoch": 6.308571428571429, "grad_norm": 5.092130184173584, "learning_rate": 3.211009174311927e-05, "loss": 0.8609, "step": 1380 }, { "epoch": 6.354285714285714, "grad_norm": 7.074675559997559, "learning_rate": 3.1940197077811755e-05, "loss": 0.908, "step": 1390 }, { "epoch": 6.4, "grad_norm": 7.3398332595825195, "learning_rate": 3.177030241250425e-05, "loss": 0.9756, "step": 1400 }, { "epoch": 6.445714285714286, "grad_norm": 8.28463363647461, "learning_rate": 3.160040774719674e-05, "loss": 0.9251, "step": 1410 }, { "epoch": 6.491428571428571, "grad_norm": 11.502705574035645, "learning_rate": 3.1430513081889227e-05, "loss": 0.8992, "step": 1420 }, { "epoch": 6.537142857142857, "grad_norm": 6.942375659942627, "learning_rate": 3.126061841658172e-05, "loss": 0.878, "step": 1430 }, { "epoch": 6.582857142857143, "grad_norm": 6.975459098815918, "learning_rate": 3.1090723751274206e-05, "loss": 0.928, "step": 1440 }, { "epoch": 6.628571428571428, "grad_norm": 6.6826372146606445, "learning_rate": 3.09208290859667e-05, "loss": 0.9126, "step": 1450 }, { "epoch": 6.674285714285714, "grad_norm": 6.843193054199219, "learning_rate": 3.075093442065919e-05, "loss": 0.94, "step": 1460 }, { "epoch": 6.72, "grad_norm": 9.00674819946289, "learning_rate": 3.0581039755351684e-05, "loss": 0.9406, "step": 1470 }, { "epoch": 6.765714285714286, "grad_norm": 7.362587928771973, "learning_rate": 3.0411145090044174e-05, "loss": 0.9414, "step": 1480 }, { "epoch": 6.811428571428571, "grad_norm": 10.040757179260254, "learning_rate": 3.0241250424736666e-05, "loss": 0.9372, "step": 1490 }, { "epoch": 6.857142857142857, "grad_norm": 6.630922794342041, "learning_rate": 3.0071355759429153e-05, "loss": 0.918, "step": 1500 }, { "epoch": 6.902857142857143, "grad_norm": 6.996939182281494, "learning_rate": 2.9901461094121645e-05, "loss": 0.8634, "step": 1510 }, { "epoch": 6.948571428571428, "grad_norm": 6.364753723144531, "learning_rate": 2.9731566428814138e-05, "loss": 0.94, "step": 1520 }, { "epoch": 6.994285714285715, "grad_norm": 7.604902267456055, "learning_rate": 2.9561671763506628e-05, "loss": 0.8751, "step": 1530 }, { "epoch": 6.998857142857143, "eval_accuracy": 0.7639426076107299, "eval_loss": 0.7851645350456238, "eval_runtime": 71.3545, "eval_samples_per_second": 112.327, "eval_steps_per_second": 3.518, "step": 1531 }, { "epoch": 7.04, "grad_norm": 6.300110816955566, "learning_rate": 2.939177709819912e-05, "loss": 0.9023, "step": 1540 }, { "epoch": 7.085714285714285, "grad_norm": 6.873245716094971, "learning_rate": 2.9221882432891607e-05, "loss": 0.8445, "step": 1550 }, { "epoch": 7.131428571428572, "grad_norm": 6.225979328155518, "learning_rate": 2.90519877675841e-05, "loss": 0.8429, "step": 1560 }, { "epoch": 7.177142857142857, "grad_norm": 9.389466285705566, "learning_rate": 2.8882093102276592e-05, "loss": 0.8686, "step": 1570 }, { "epoch": 7.222857142857142, "grad_norm": 6.56587028503418, "learning_rate": 2.871219843696908e-05, "loss": 0.9168, "step": 1580 }, { "epoch": 7.268571428571429, "grad_norm": 7.2763447761535645, "learning_rate": 2.854230377166157e-05, "loss": 0.9107, "step": 1590 }, { "epoch": 7.314285714285714, "grad_norm": 10.647456169128418, "learning_rate": 2.8372409106354064e-05, "loss": 0.8785, "step": 1600 }, { "epoch": 7.36, "grad_norm": 9.847870826721191, "learning_rate": 2.820251444104655e-05, "loss": 0.8282, "step": 1610 }, { "epoch": 7.405714285714286, "grad_norm": 6.836136341094971, "learning_rate": 2.8032619775739043e-05, "loss": 0.881, "step": 1620 }, { "epoch": 7.451428571428571, "grad_norm": 7.8010687828063965, "learning_rate": 2.7862725110431533e-05, "loss": 0.8969, "step": 1630 }, { "epoch": 7.497142857142857, "grad_norm": 8.551609992980957, "learning_rate": 2.7692830445124026e-05, "loss": 0.9146, "step": 1640 }, { "epoch": 7.542857142857143, "grad_norm": 6.829668998718262, "learning_rate": 2.7522935779816515e-05, "loss": 0.8377, "step": 1650 }, { "epoch": 7.588571428571429, "grad_norm": 5.782020568847656, "learning_rate": 2.7353041114509004e-05, "loss": 0.8956, "step": 1660 }, { "epoch": 7.634285714285714, "grad_norm": 7.081970691680908, "learning_rate": 2.7183146449201497e-05, "loss": 0.8707, "step": 1670 }, { "epoch": 7.68, "grad_norm": 7.2627739906311035, "learning_rate": 2.701325178389399e-05, "loss": 0.8865, "step": 1680 }, { "epoch": 7.725714285714286, "grad_norm": 6.740649700164795, "learning_rate": 2.6843357118586476e-05, "loss": 0.8183, "step": 1690 }, { "epoch": 7.771428571428571, "grad_norm": 6.93267822265625, "learning_rate": 2.667346245327897e-05, "loss": 0.8898, "step": 1700 }, { "epoch": 7.817142857142857, "grad_norm": 8.19253921508789, "learning_rate": 2.6503567787971462e-05, "loss": 0.8982, "step": 1710 }, { "epoch": 7.862857142857143, "grad_norm": 5.707729816436768, "learning_rate": 2.6333673122663948e-05, "loss": 0.888, "step": 1720 }, { "epoch": 7.908571428571428, "grad_norm": 6.924159049987793, "learning_rate": 2.616377845735644e-05, "loss": 0.8658, "step": 1730 }, { "epoch": 7.954285714285715, "grad_norm": 8.230816841125488, "learning_rate": 2.5993883792048927e-05, "loss": 0.8772, "step": 1740 }, { "epoch": 8.0, "grad_norm": 7.3947906494140625, "learning_rate": 2.582398912674142e-05, "loss": 0.9107, "step": 1750 }, { "epoch": 8.0, "eval_accuracy": 0.7713038053649407, "eval_loss": 0.7643583416938782, "eval_runtime": 72.4816, "eval_samples_per_second": 110.58, "eval_steps_per_second": 3.463, "step": 1750 }, { "epoch": 8.045714285714286, "grad_norm": 6.424936294555664, "learning_rate": 2.5654094461433913e-05, "loss": 0.8515, "step": 1760 }, { "epoch": 8.09142857142857, "grad_norm": 7.372068881988525, "learning_rate": 2.5484199796126402e-05, "loss": 0.7997, "step": 1770 }, { "epoch": 8.137142857142857, "grad_norm": 6.683503150939941, "learning_rate": 2.5314305130818895e-05, "loss": 0.8021, "step": 1780 }, { "epoch": 8.182857142857143, "grad_norm": 7.226657390594482, "learning_rate": 2.5144410465511388e-05, "loss": 0.8367, "step": 1790 }, { "epoch": 8.228571428571428, "grad_norm": 6.432008743286133, "learning_rate": 2.4974515800203874e-05, "loss": 0.8582, "step": 1800 }, { "epoch": 8.274285714285714, "grad_norm": 6.051323890686035, "learning_rate": 2.4804621134896367e-05, "loss": 0.8166, "step": 1810 }, { "epoch": 8.32, "grad_norm": 10.92369270324707, "learning_rate": 2.4634726469588856e-05, "loss": 0.8797, "step": 1820 }, { "epoch": 8.365714285714287, "grad_norm": 9.526762962341309, "learning_rate": 2.4464831804281346e-05, "loss": 0.8032, "step": 1830 }, { "epoch": 8.411428571428571, "grad_norm": 7.965165138244629, "learning_rate": 2.4294937138973835e-05, "loss": 0.8453, "step": 1840 }, { "epoch": 8.457142857142857, "grad_norm": 7.171777248382568, "learning_rate": 2.4125042473666328e-05, "loss": 0.8668, "step": 1850 }, { "epoch": 8.502857142857144, "grad_norm": 7.443463325500488, "learning_rate": 2.3955147808358818e-05, "loss": 0.8412, "step": 1860 }, { "epoch": 8.548571428571428, "grad_norm": 5.963488578796387, "learning_rate": 2.3785253143051307e-05, "loss": 0.8126, "step": 1870 }, { "epoch": 8.594285714285714, "grad_norm": 7.679189682006836, "learning_rate": 2.36153584777438e-05, "loss": 0.8548, "step": 1880 }, { "epoch": 8.64, "grad_norm": 7.266505718231201, "learning_rate": 2.3445463812436293e-05, "loss": 0.8217, "step": 1890 }, { "epoch": 8.685714285714285, "grad_norm": 6.643305778503418, "learning_rate": 2.3275569147128782e-05, "loss": 0.8284, "step": 1900 }, { "epoch": 8.731428571428571, "grad_norm": 6.7400922775268555, "learning_rate": 2.3105674481821272e-05, "loss": 0.8595, "step": 1910 }, { "epoch": 8.777142857142858, "grad_norm": 9.218864440917969, "learning_rate": 2.2935779816513765e-05, "loss": 0.8636, "step": 1920 }, { "epoch": 8.822857142857142, "grad_norm": 6.2901434898376465, "learning_rate": 2.2765885151206254e-05, "loss": 0.8293, "step": 1930 }, { "epoch": 8.868571428571428, "grad_norm": 7.57029390335083, "learning_rate": 2.2595990485898744e-05, "loss": 0.8275, "step": 1940 }, { "epoch": 8.914285714285715, "grad_norm": 6.863813400268555, "learning_rate": 2.2426095820591233e-05, "loss": 0.7866, "step": 1950 }, { "epoch": 8.96, "grad_norm": 7.338642597198486, "learning_rate": 2.2256201155283726e-05, "loss": 0.8464, "step": 1960 }, { "epoch": 8.996571428571428, "eval_accuracy": 0.7830318153462258, "eval_loss": 0.7322039604187012, "eval_runtime": 69.8506, "eval_samples_per_second": 114.745, "eval_steps_per_second": 3.593, "step": 1968 }, { "epoch": 9.005714285714285, "grad_norm": 5.92184591293335, "learning_rate": 2.2086306489976216e-05, "loss": 0.7986, "step": 1970 }, { "epoch": 9.051428571428572, "grad_norm": 8.04157829284668, "learning_rate": 2.1916411824668705e-05, "loss": 0.8227, "step": 1980 }, { "epoch": 9.097142857142858, "grad_norm": 6.019360065460205, "learning_rate": 2.1746517159361194e-05, "loss": 0.7563, "step": 1990 }, { "epoch": 9.142857142857142, "grad_norm": 7.545748233795166, "learning_rate": 2.1576622494053687e-05, "loss": 0.8087, "step": 2000 }, { "epoch": 9.188571428571429, "grad_norm": 7.692215442657471, "learning_rate": 2.140672782874618e-05, "loss": 0.7899, "step": 2010 }, { "epoch": 9.234285714285715, "grad_norm": 7.4185566902160645, "learning_rate": 2.123683316343867e-05, "loss": 0.7823, "step": 2020 }, { "epoch": 9.28, "grad_norm": 8.05014705657959, "learning_rate": 2.106693849813116e-05, "loss": 0.767, "step": 2030 }, { "epoch": 9.325714285714286, "grad_norm": 7.103221893310547, "learning_rate": 2.0897043832823652e-05, "loss": 0.7722, "step": 2040 }, { "epoch": 9.371428571428572, "grad_norm": 6.289785861968994, "learning_rate": 2.072714916751614e-05, "loss": 0.7829, "step": 2050 }, { "epoch": 9.417142857142856, "grad_norm": 5.672107696533203, "learning_rate": 2.055725450220863e-05, "loss": 0.7582, "step": 2060 }, { "epoch": 9.462857142857143, "grad_norm": 7.584166049957275, "learning_rate": 2.038735983690112e-05, "loss": 0.8066, "step": 2070 }, { "epoch": 9.508571428571429, "grad_norm": 6.826247215270996, "learning_rate": 2.0217465171593613e-05, "loss": 0.8539, "step": 2080 }, { "epoch": 9.554285714285715, "grad_norm": 7.450297832489014, "learning_rate": 2.0047570506286103e-05, "loss": 0.8866, "step": 2090 }, { "epoch": 9.6, "grad_norm": 7.801323890686035, "learning_rate": 1.9877675840978592e-05, "loss": 0.7857, "step": 2100 }, { "epoch": 9.645714285714286, "grad_norm": 5.812144756317139, "learning_rate": 1.9707781175671085e-05, "loss": 0.7891, "step": 2110 }, { "epoch": 9.691428571428572, "grad_norm": 6.948761940002441, "learning_rate": 1.9537886510363575e-05, "loss": 0.8282, "step": 2120 }, { "epoch": 9.737142857142857, "grad_norm": 7.478163719177246, "learning_rate": 1.9367991845056068e-05, "loss": 0.7716, "step": 2130 }, { "epoch": 9.782857142857143, "grad_norm": 7.083802700042725, "learning_rate": 1.9198097179748557e-05, "loss": 0.8072, "step": 2140 }, { "epoch": 9.82857142857143, "grad_norm": 8.924737930297852, "learning_rate": 1.902820251444105e-05, "loss": 0.795, "step": 2150 }, { "epoch": 9.874285714285714, "grad_norm": 6.02655553817749, "learning_rate": 1.885830784913354e-05, "loss": 0.8785, "step": 2160 }, { "epoch": 9.92, "grad_norm": 6.204999923706055, "learning_rate": 1.868841318382603e-05, "loss": 0.8836, "step": 2170 }, { "epoch": 9.965714285714286, "grad_norm": 9.425383567810059, "learning_rate": 1.8518518518518518e-05, "loss": 0.8398, "step": 2180 }, { "epoch": 9.997714285714286, "eval_accuracy": 0.7797878976918278, "eval_loss": 0.7243014574050903, "eval_runtime": 70.1724, "eval_samples_per_second": 114.219, "eval_steps_per_second": 3.577, "step": 2187 }, { "epoch": 10.01142857142857, "grad_norm": 6.657647132873535, "learning_rate": 1.834862385321101e-05, "loss": 0.7659, "step": 2190 }, { "epoch": 10.057142857142857, "grad_norm": 6.205779075622559, "learning_rate": 1.81787291879035e-05, "loss": 0.7612, "step": 2200 }, { "epoch": 10.102857142857143, "grad_norm": 4.940152168273926, "learning_rate": 1.800883452259599e-05, "loss": 0.7277, "step": 2210 }, { "epoch": 10.14857142857143, "grad_norm": 6.750416278839111, "learning_rate": 1.783893985728848e-05, "loss": 0.7334, "step": 2220 }, { "epoch": 10.194285714285714, "grad_norm": 8.511019706726074, "learning_rate": 1.7669045191980972e-05, "loss": 0.846, "step": 2230 }, { "epoch": 10.24, "grad_norm": 6.814949989318848, "learning_rate": 1.7499150526673462e-05, "loss": 0.7721, "step": 2240 }, { "epoch": 10.285714285714286, "grad_norm": 8.27193546295166, "learning_rate": 1.7329255861365955e-05, "loss": 0.7794, "step": 2250 }, { "epoch": 10.331428571428571, "grad_norm": 6.475657939910889, "learning_rate": 1.7159361196058444e-05, "loss": 0.7965, "step": 2260 }, { "epoch": 10.377142857142857, "grad_norm": 7.63599967956543, "learning_rate": 1.6989466530750937e-05, "loss": 0.768, "step": 2270 }, { "epoch": 10.422857142857143, "grad_norm": 6.7202043533325195, "learning_rate": 1.6819571865443427e-05, "loss": 0.7671, "step": 2280 }, { "epoch": 10.468571428571428, "grad_norm": 11.612401962280273, "learning_rate": 1.6649677200135916e-05, "loss": 0.7752, "step": 2290 }, { "epoch": 10.514285714285714, "grad_norm": 10.456415176391602, "learning_rate": 1.647978253482841e-05, "loss": 0.8036, "step": 2300 }, { "epoch": 10.56, "grad_norm": 5.8782782554626465, "learning_rate": 1.63098878695209e-05, "loss": 0.7652, "step": 2310 }, { "epoch": 10.605714285714285, "grad_norm": 8.532390594482422, "learning_rate": 1.6139993204213388e-05, "loss": 0.8296, "step": 2320 }, { "epoch": 10.651428571428571, "grad_norm": 8.59847354888916, "learning_rate": 1.5970098538905877e-05, "loss": 0.8239, "step": 2330 }, { "epoch": 10.697142857142858, "grad_norm": 7.728725910186768, "learning_rate": 1.580020387359837e-05, "loss": 0.7657, "step": 2340 }, { "epoch": 10.742857142857144, "grad_norm": 7.599369049072266, "learning_rate": 1.563030920829086e-05, "loss": 0.734, "step": 2350 }, { "epoch": 10.788571428571428, "grad_norm": 7.236623287200928, "learning_rate": 1.546041454298335e-05, "loss": 0.7992, "step": 2360 }, { "epoch": 10.834285714285715, "grad_norm": 9.501078605651855, "learning_rate": 1.5290519877675842e-05, "loss": 0.7977, "step": 2370 }, { "epoch": 10.88, "grad_norm": 7.179370403289795, "learning_rate": 1.5120625212368333e-05, "loss": 0.8284, "step": 2380 }, { "epoch": 10.925714285714285, "grad_norm": 6.750663757324219, "learning_rate": 1.4950730547060823e-05, "loss": 0.8197, "step": 2390 }, { "epoch": 10.971428571428572, "grad_norm": 8.519887924194336, "learning_rate": 1.4780835881753314e-05, "loss": 0.7534, "step": 2400 }, { "epoch": 10.998857142857142, "eval_accuracy": 0.7845290081097941, "eval_loss": 0.708789050579071, "eval_runtime": 70.0816, "eval_samples_per_second": 114.367, "eval_steps_per_second": 3.582, "step": 2406 }, { "epoch": 11.017142857142858, "grad_norm": 8.514420509338379, "learning_rate": 1.4610941216445803e-05, "loss": 0.804, "step": 2410 }, { "epoch": 11.062857142857142, "grad_norm": 7.491757869720459, "learning_rate": 1.4441046551138296e-05, "loss": 0.8208, "step": 2420 }, { "epoch": 11.108571428571429, "grad_norm": 8.859854698181152, "learning_rate": 1.4271151885830786e-05, "loss": 0.7703, "step": 2430 }, { "epoch": 11.154285714285715, "grad_norm": 7.596364498138428, "learning_rate": 1.4101257220523275e-05, "loss": 0.7777, "step": 2440 }, { "epoch": 11.2, "grad_norm": 4.977694988250732, "learning_rate": 1.3931362555215766e-05, "loss": 0.7842, "step": 2450 }, { "epoch": 11.245714285714286, "grad_norm": 8.050721168518066, "learning_rate": 1.3761467889908258e-05, "loss": 0.7335, "step": 2460 }, { "epoch": 11.291428571428572, "grad_norm": 6.315252304077148, "learning_rate": 1.3591573224600749e-05, "loss": 0.7898, "step": 2470 }, { "epoch": 11.337142857142856, "grad_norm": 6.380728721618652, "learning_rate": 1.3421678559293238e-05, "loss": 0.7531, "step": 2480 }, { "epoch": 11.382857142857143, "grad_norm": 6.871670246124268, "learning_rate": 1.3251783893985731e-05, "loss": 0.7604, "step": 2490 }, { "epoch": 11.428571428571429, "grad_norm": 7.661756992340088, "learning_rate": 1.308188922867822e-05, "loss": 0.7772, "step": 2500 }, { "epoch": 11.474285714285715, "grad_norm": 8.346705436706543, "learning_rate": 1.291199456337071e-05, "loss": 0.7372, "step": 2510 }, { "epoch": 11.52, "grad_norm": 5.545016288757324, "learning_rate": 1.2742099898063201e-05, "loss": 0.7428, "step": 2520 }, { "epoch": 11.565714285714286, "grad_norm": 8.73310661315918, "learning_rate": 1.2572205232755694e-05, "loss": 0.7566, "step": 2530 }, { "epoch": 11.611428571428572, "grad_norm": 7.3945794105529785, "learning_rate": 1.2402310567448183e-05, "loss": 0.7988, "step": 2540 }, { "epoch": 11.657142857142857, "grad_norm": 5.604186534881592, "learning_rate": 1.2232415902140673e-05, "loss": 0.6912, "step": 2550 }, { "epoch": 11.702857142857143, "grad_norm": 6.901523590087891, "learning_rate": 1.2062521236833164e-05, "loss": 0.7438, "step": 2560 }, { "epoch": 11.748571428571429, "grad_norm": 7.090272426605225, "learning_rate": 1.1892626571525654e-05, "loss": 0.7502, "step": 2570 }, { "epoch": 11.794285714285714, "grad_norm": 9.939922332763672, "learning_rate": 1.1722731906218146e-05, "loss": 0.7574, "step": 2580 }, { "epoch": 11.84, "grad_norm": 7.832808017730713, "learning_rate": 1.1552837240910636e-05, "loss": 0.7763, "step": 2590 }, { "epoch": 11.885714285714286, "grad_norm": 7.921093940734863, "learning_rate": 1.1382942575603127e-05, "loss": 0.8136, "step": 2600 }, { "epoch": 11.93142857142857, "grad_norm": 8.13971996307373, "learning_rate": 1.1213047910295617e-05, "loss": 0.7735, "step": 2610 }, { "epoch": 11.977142857142857, "grad_norm": 5.870953559875488, "learning_rate": 1.1043153244988108e-05, "loss": 0.7051, "step": 2620 }, { "epoch": 12.0, "eval_accuracy": 0.793512164691204, "eval_loss": 0.6982392072677612, "eval_runtime": 74.4508, "eval_samples_per_second": 107.655, "eval_steps_per_second": 3.371, "step": 2625 }, { "epoch": 12.022857142857143, "grad_norm": 9.655831336975098, "learning_rate": 1.0873258579680597e-05, "loss": 0.7839, "step": 2630 }, { "epoch": 12.06857142857143, "grad_norm": 6.195824146270752, "learning_rate": 1.070336391437309e-05, "loss": 0.7916, "step": 2640 }, { "epoch": 12.114285714285714, "grad_norm": 7.92185115814209, "learning_rate": 1.053346924906558e-05, "loss": 0.7016, "step": 2650 }, { "epoch": 12.16, "grad_norm": 5.990954875946045, "learning_rate": 1.036357458375807e-05, "loss": 0.6903, "step": 2660 }, { "epoch": 12.205714285714286, "grad_norm": 5.883810520172119, "learning_rate": 1.019367991845056e-05, "loss": 0.7879, "step": 2670 }, { "epoch": 12.251428571428571, "grad_norm": 6.014761447906494, "learning_rate": 1.0023785253143051e-05, "loss": 0.7569, "step": 2680 }, { "epoch": 12.297142857142857, "grad_norm": 6.2539191246032715, "learning_rate": 9.853890587835543e-06, "loss": 0.7547, "step": 2690 }, { "epoch": 12.342857142857143, "grad_norm": 8.04623031616211, "learning_rate": 9.683995922528034e-06, "loss": 0.7636, "step": 2700 }, { "epoch": 12.388571428571428, "grad_norm": 7.5707106590271, "learning_rate": 9.514101257220525e-06, "loss": 0.7976, "step": 2710 }, { "epoch": 12.434285714285714, "grad_norm": 7.271738529205322, "learning_rate": 9.344206591913014e-06, "loss": 0.6919, "step": 2720 }, { "epoch": 12.48, "grad_norm": 6.238006591796875, "learning_rate": 9.174311926605506e-06, "loss": 0.7343, "step": 2730 }, { "epoch": 12.525714285714285, "grad_norm": 6.735348701477051, "learning_rate": 9.004417261297995e-06, "loss": 0.6853, "step": 2740 }, { "epoch": 12.571428571428571, "grad_norm": 7.480915069580078, "learning_rate": 8.834522595990486e-06, "loss": 0.7797, "step": 2750 }, { "epoch": 12.617142857142857, "grad_norm": 7.131129741668701, "learning_rate": 8.664627930682977e-06, "loss": 0.7023, "step": 2760 }, { "epoch": 12.662857142857142, "grad_norm": 6.145063400268555, "learning_rate": 8.494733265375469e-06, "loss": 0.801, "step": 2770 }, { "epoch": 12.708571428571428, "grad_norm": 8.46693229675293, "learning_rate": 8.324838600067958e-06, "loss": 0.7693, "step": 2780 }, { "epoch": 12.754285714285714, "grad_norm": 6.422353744506836, "learning_rate": 8.15494393476045e-06, "loss": 0.7248, "step": 2790 }, { "epoch": 12.8, "grad_norm": 10.295066833496094, "learning_rate": 7.985049269452939e-06, "loss": 0.8283, "step": 2800 }, { "epoch": 12.845714285714285, "grad_norm": 7.5840959548950195, "learning_rate": 7.81515460414543e-06, "loss": 0.7081, "step": 2810 }, { "epoch": 12.891428571428571, "grad_norm": 6.741255760192871, "learning_rate": 7.645259938837921e-06, "loss": 0.7079, "step": 2820 }, { "epoch": 12.937142857142858, "grad_norm": 7.525110721588135, "learning_rate": 7.475365273530411e-06, "loss": 0.7374, "step": 2830 }, { "epoch": 12.982857142857142, "grad_norm": 7.784526348114014, "learning_rate": 7.305470608222902e-06, "loss": 0.7359, "step": 2840 }, { "epoch": 12.996571428571428, "eval_accuracy": 0.7916406737367436, "eval_loss": 0.6984859704971313, "eval_runtime": 73.0055, "eval_samples_per_second": 109.786, "eval_steps_per_second": 3.438, "step": 2843 }, { "epoch": 13.028571428571428, "grad_norm": 7.309814929962158, "learning_rate": 7.135575942915393e-06, "loss": 0.709, "step": 2850 }, { "epoch": 13.074285714285715, "grad_norm": 10.355072021484375, "learning_rate": 6.965681277607883e-06, "loss": 0.7387, "step": 2860 }, { "epoch": 13.12, "grad_norm": 7.303361415863037, "learning_rate": 6.795786612300374e-06, "loss": 0.6999, "step": 2870 }, { "epoch": 13.165714285714285, "grad_norm": 8.104413032531738, "learning_rate": 6.6258919469928655e-06, "loss": 0.7602, "step": 2880 }, { "epoch": 13.211428571428572, "grad_norm": 6.764256000518799, "learning_rate": 6.455997281685355e-06, "loss": 0.7195, "step": 2890 }, { "epoch": 13.257142857142856, "grad_norm": 6.039324760437012, "learning_rate": 6.286102616377847e-06, "loss": 0.6927, "step": 2900 }, { "epoch": 13.302857142857142, "grad_norm": 5.957076549530029, "learning_rate": 6.1162079510703365e-06, "loss": 0.7369, "step": 2910 }, { "epoch": 13.348571428571429, "grad_norm": 6.061888694763184, "learning_rate": 5.946313285762827e-06, "loss": 0.734, "step": 2920 }, { "epoch": 13.394285714285715, "grad_norm": 8.270124435424805, "learning_rate": 5.776418620455318e-06, "loss": 0.7389, "step": 2930 }, { "epoch": 13.44, "grad_norm": 5.897438049316406, "learning_rate": 5.606523955147808e-06, "loss": 0.772, "step": 2940 }, { "epoch": 13.485714285714286, "grad_norm": 10.048014640808105, "learning_rate": 5.436629289840299e-06, "loss": 0.7129, "step": 2950 }, { "epoch": 13.531428571428572, "grad_norm": 6.532159805297852, "learning_rate": 5.26673462453279e-06, "loss": 0.7329, "step": 2960 }, { "epoch": 13.577142857142857, "grad_norm": 7.861076831817627, "learning_rate": 5.09683995922528e-06, "loss": 0.659, "step": 2970 }, { "epoch": 13.622857142857143, "grad_norm": 7.23004150390625, "learning_rate": 4.926945293917771e-06, "loss": 0.7383, "step": 2980 }, { "epoch": 13.668571428571429, "grad_norm": 7.737166881561279, "learning_rate": 4.7570506286102625e-06, "loss": 0.7243, "step": 2990 }, { "epoch": 13.714285714285714, "grad_norm": 6.211805820465088, "learning_rate": 4.587155963302753e-06, "loss": 0.748, "step": 3000 }, { "epoch": 13.76, "grad_norm": 6.718061447143555, "learning_rate": 4.417261297995243e-06, "loss": 0.7005, "step": 3010 }, { "epoch": 13.805714285714286, "grad_norm": 6.454185485839844, "learning_rate": 4.247366632687734e-06, "loss": 0.806, "step": 3020 }, { "epoch": 13.85142857142857, "grad_norm": 8.608590126037598, "learning_rate": 4.077471967380225e-06, "loss": 0.7342, "step": 3030 }, { "epoch": 13.897142857142857, "grad_norm": 6.505426406860352, "learning_rate": 3.907577302072715e-06, "loss": 0.7148, "step": 3040 }, { "epoch": 13.942857142857143, "grad_norm": 7.760762691497803, "learning_rate": 3.7376826367652057e-06, "loss": 0.7208, "step": 3050 }, { "epoch": 13.98857142857143, "grad_norm": 6.736624240875244, "learning_rate": 3.5677879714576964e-06, "loss": 0.7641, "step": 3060 }, { "epoch": 13.997714285714286, "eval_accuracy": 0.796756082345602, "eval_loss": 0.6838445663452148, "eval_runtime": 74.6047, "eval_samples_per_second": 107.433, "eval_steps_per_second": 3.364, "step": 3062 }, { "epoch": 14.034285714285714, "grad_norm": 6.665014266967773, "learning_rate": 3.397893306150187e-06, "loss": 0.6983, "step": 3070 }, { "epoch": 14.08, "grad_norm": 6.425662994384766, "learning_rate": 3.2279986408426775e-06, "loss": 0.6563, "step": 3080 }, { "epoch": 14.125714285714286, "grad_norm": 6.079023838043213, "learning_rate": 3.0581039755351682e-06, "loss": 0.6838, "step": 3090 }, { "epoch": 14.17142857142857, "grad_norm": 6.835630416870117, "learning_rate": 2.888209310227659e-06, "loss": 0.6923, "step": 3100 }, { "epoch": 14.217142857142857, "grad_norm": 7.853431224822998, "learning_rate": 2.7183146449201493e-06, "loss": 0.7237, "step": 3110 }, { "epoch": 14.262857142857143, "grad_norm": 6.308788299560547, "learning_rate": 2.54841997961264e-06, "loss": 0.7469, "step": 3120 }, { "epoch": 14.308571428571428, "grad_norm": 7.8088202476501465, "learning_rate": 2.3785253143051312e-06, "loss": 0.73, "step": 3130 }, { "epoch": 14.354285714285714, "grad_norm": 6.700632095336914, "learning_rate": 2.2086306489976216e-06, "loss": 0.7349, "step": 3140 }, { "epoch": 14.4, "grad_norm": 7.2199320793151855, "learning_rate": 2.0387359836901123e-06, "loss": 0.7093, "step": 3150 }, { "epoch": 14.445714285714285, "grad_norm": 5.401243686676025, "learning_rate": 1.8688413183826028e-06, "loss": 0.6662, "step": 3160 }, { "epoch": 14.491428571428571, "grad_norm": 6.991000652313232, "learning_rate": 1.6989466530750936e-06, "loss": 0.6859, "step": 3170 }, { "epoch": 14.537142857142857, "grad_norm": 5.750977993011475, "learning_rate": 1.5290519877675841e-06, "loss": 0.6917, "step": 3180 }, { "epoch": 14.582857142857144, "grad_norm": 8.291132926940918, "learning_rate": 1.3591573224600747e-06, "loss": 0.7237, "step": 3190 }, { "epoch": 14.628571428571428, "grad_norm": 8.148652076721191, "learning_rate": 1.1892626571525656e-06, "loss": 0.795, "step": 3200 }, { "epoch": 14.674285714285714, "grad_norm": 8.365762710571289, "learning_rate": 1.0193679918450562e-06, "loss": 0.7038, "step": 3210 }, { "epoch": 14.72, "grad_norm": 8.118247032165527, "learning_rate": 8.494733265375468e-07, "loss": 0.7334, "step": 3220 }, { "epoch": 14.765714285714285, "grad_norm": 7.585115432739258, "learning_rate": 6.795786612300373e-07, "loss": 0.7761, "step": 3230 }, { "epoch": 14.811428571428571, "grad_norm": 7.77222204208374, "learning_rate": 5.096839959225281e-07, "loss": 0.6876, "step": 3240 }, { "epoch": 14.857142857142858, "grad_norm": 7.689059734344482, "learning_rate": 3.3978933061501866e-07, "loss": 0.6827, "step": 3250 }, { "epoch": 14.902857142857142, "grad_norm": 7.31766414642334, "learning_rate": 1.6989466530750933e-07, "loss": 0.7361, "step": 3260 }, { "epoch": 14.948571428571428, "grad_norm": 6.6061015129089355, "learning_rate": 0.0, "loss": 0.7372, "step": 3270 }, { "epoch": 14.948571428571428, "eval_accuracy": 0.796756082345602, "eval_loss": 0.6781123280525208, "eval_runtime": 74.4538, "eval_samples_per_second": 107.651, "eval_steps_per_second": 3.371, "step": 3270 }, { "epoch": 14.948571428571428, "step": 3270, "total_flos": 1.0403963301155365e+19, "train_loss": 0.9863547705729073, "train_runtime": 7901.2254, "train_samples_per_second": 53.135, "train_steps_per_second": 0.414 } ], "logging_steps": 10, "max_steps": 3270, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0403963301155365e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }